Initial open-source release

This commit is contained in:
Andrey Avtomonov 2026-05-10 23:12:26 +02:00
commit 1a42152e6f
1199 changed files with 257054 additions and 0 deletions

View file

@ -0,0 +1,183 @@
import { describe, expect, it } from 'vitest';
import {
REDACTED_KLO_CREDENTIAL_VALUE,
redactKloCredentialEnvelope,
redactKloCredentialValue,
redactKloScanMetadata,
redactKloScanReport,
redactKloScanWarning,
} from './credentials.js';
import type { KloCredentialEnvelope, KloScanReport, KloScanWarning } from './types.js';
// Test suite for the scan-facing redaction helpers exported by ./credentials.js.
// Fixture values that look like secrets carry "pragma: allowlist secret" markers on their lines.
describe('KLO scan credential redaction', () => {
// Reference envelopes ('env' / 'file') only point at a secret, so they must come back unchanged.
it('keeps credential references inspectable', () => {
const envReference: KloCredentialEnvelope = { kind: 'env', name: 'DATABASE_URL' };
const fileReference: KloCredentialEnvelope = { kind: 'file', path: '~/.config/klo/warehouse' };
expect(redactKloCredentialEnvelope(envReference)).toEqual(envReference);
expect(redactKloCredentialEnvelope(fileReference)).toEqual(fileReference);
});
// Resolved envelopes gain `redacted: true`; sensitive keys are masked at any depth, including inside arrays.
it('redacts resolved credential envelope values recursively', () => {
expect(
redactKloCredentialEnvelope({
kind: 'resolved',
source: 'host',
values: {
username: 'readonly',
password: 'secret-password', // pragma: allowlist secret
nested: {
api_key: 'phx_123', // pragma: allowlist secret
warehouse: 'compute_wh',
},
headers: [{ authorizationToken: 'token-value' }, { label: 'safe' }],
},
}),
).toEqual({
kind: 'resolved',
source: 'host',
redacted: true,
values: {
username: 'readonly',
password: REDACTED_KLO_CREDENTIAL_VALUE,
nested: {
api_key: REDACTED_KLO_CREDENTIAL_VALUE,
warehouse: 'compute_wh',
},
headers: [{ authorizationToken: REDACTED_KLO_CREDENTIAL_VALUE }, { label: 'safe' }],
},
});
});
// Metadata keys such as `url` and `private_key` are masked while non-sensitive siblings are kept.
it('redacts scan metadata fields that commonly contain secrets', () => {
expect(
redactKloScanMetadata({
driver: 'postgres',
url: 'postgres://user:pass@example.test/db', // pragma: allowlist secret
serviceAccountJson: {
client_email: 'reader@example.test',
private_key: 'pem-value', // pragma: allowlist secret
},
safeCount: 3,
}),
).toEqual({
driver: 'postgres',
url: REDACTED_KLO_CREDENTIAL_VALUE,
serviceAccountJson: {
client_email: 'reader@example.test',
private_key: REDACTED_KLO_CREDENTIAL_VALUE,
},
safeCount: 3,
});
});
// In free-text messages only the password part of a connection URL is replaced; metadata is masked by key.
it('redacts scan warning messages and metadata without hiding safe context', () => {
const warning: KloScanWarning = {
code: 'sampling_failed',
message: 'sample failed for postgres://reader:secret@example.test/db', // pragma: allowlist secret
recoverable: true,
metadata: {
table: 'orders',
url: 'postgres://reader:secret@example.test/db', // pragma: allowlist secret
nested: {
api_key: 'sk_test_123', // pragma: allowlist secret
schema: 'public',
},
},
};
expect(redactKloScanWarning(warning)).toEqual({
code: 'sampling_failed',
message: 'sample failed for postgres://reader:<redacted>@example.test/db',
recoverable: true,
metadata: {
table: 'orders',
url: REDACTED_KLO_CREDENTIAL_VALUE,
nested: {
api_key: REDACTED_KLO_CREDENTIAL_VALUE,
schema: 'public',
},
},
});
});
// Report-level redaction must sanitize each warning and must not mutate the input report.
it('redacts scan report warning metadata recursively', () => {
const report: KloScanReport = {
connectionId: 'warehouse',
driver: 'postgres',
syncId: 'sync-1',
runId: 'run-1',
trigger: 'cli',
mode: 'structural',
dryRun: false,
artifactPaths: {
rawSourcesDir: 'raw-sources/warehouse/live-database/sync-1',
reportPath: 'raw-sources/warehouse/live-database/sync-1/scan-report.json',
manifestShards: [],
enrichmentArtifacts: [],
},
diffSummary: {
tablesAdded: 0,
tablesModified: 0,
tablesDeleted: 0,
tablesUnchanged: 0,
columnsAdded: 0,
columnsModified: 0,
columnsDeleted: 0,
},
manifestShardsWritten: 0,
structuralSyncStats: {
tablesCreated: 0,
tablesUpdated: 0,
tablesDeleted: 0,
columnsCreated: 0,
columnsUpdated: 0,
columnsDeleted: 0,
},
enrichment: {
dataDictionary: 'skipped',
tableDescriptions: 'skipped',
columnDescriptions: 'skipped',
embeddings: 'skipped',
deterministicRelationships: 'skipped',
llmRelationshipValidation: 'skipped',
statisticalValidation: 'skipped',
},
capabilityGaps: [],
warnings: [
{
code: 'credential_redacted',
message: 'metadata redacted',
recoverable: true,
metadata: {
credentials_json: '{"private_key":"pem-value"}', // pragma: allowlist secret
safeCount: 2,
},
},
],
relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
enrichmentState: {
resumedStages: [],
completedStages: [],
failedStages: [],
},
createdAt: '2026-04-29T00:00:00.000Z',
};
const redacted = redactKloScanReport(report);
expect(redacted.warnings[0]?.metadata).toEqual({
credentials_json: REDACTED_KLO_CREDENTIAL_VALUE,
safeCount: 2,
});
// The original report still holds the raw value, proving redaction returned a copy.
expect(report.warnings[0]?.metadata).toEqual({
credentials_json: '{"private_key":"pem-value"}', // pragma: allowlist secret
safeCount: 2,
});
});
// For a bare key/value pair the key name alone decides whether the value is masked.
it('redacts standalone primitive credential values only when the field key is sensitive', () => {
expect(redactKloCredentialValue('password', 'abc')).toBe(REDACTED_KLO_CREDENTIAL_VALUE);
expect(redactKloCredentialValue('schema', 'public')).toBe('public');
});
});

View file

@ -0,0 +1,50 @@
import {
redactKloSensitiveMetadata,
redactKloSensitiveText,
redactKloSensitiveValue,
REDACTED_KLO_CREDENTIAL_VALUE,
} from '../core/redaction.js';
import type { KloCredentialEnvelope, KloScanReport, KloScanWarning } from './types.js';
export { REDACTED_KLO_CREDENTIAL_VALUE };
/**
 * Redacts a single credential field for KLO scan output.
 *
 * This is a scan-facing alias: the decision is delegated entirely to
 * `redactKloSensitiveValue` from the core redaction module.
 *
 * @param key - Name of the field the value is stored under.
 * @param value - The value to inspect.
 * @returns Whatever `redactKloSensitiveValue` returns for this key/value pair.
 */
export function redactKloCredentialValue(key: string, value: unknown): unknown {
  const sanitized = redactKloSensitiveValue(key, value);
  return sanitized;
}
/**
 * Redacts a scan metadata record.
 *
 * Scan-facing alias for `redactKloSensitiveMetadata` from the core redaction module;
 * the record is passed through unchanged and the core helper's result is returned as-is.
 *
 * @param metadata - Arbitrary key/value metadata attached to scan artifacts.
 * @returns The record produced by `redactKloSensitiveMetadata`.
 */
export function redactKloScanMetadata(metadata: Record<string, unknown>): Record<string, unknown> {
  const sanitized = redactKloSensitiveMetadata(metadata);
  return sanitized;
}
/**
 * Produces a copy of a credential envelope that is safe to log or persist.
 *
 * Only `resolved` envelopes are rewritten: the result keeps `kind` and `source`,
 * sets `redacted: true`, and replaces `values` with the output of
 * `redactKloScanMetadata`. Every other envelope kind is returned as the same
 * object instance, untouched.
 *
 * @param envelope - Envelope to sanitize.
 * @returns A new redacted envelope for `resolved`, otherwise `envelope` itself.
 */
export function redactKloCredentialEnvelope(envelope: KloCredentialEnvelope): KloCredentialEnvelope {
  if (envelope.kind === 'resolved') {
    const { source, values } = envelope;
    return {
      kind: 'resolved',
      source,
      redacted: true,
      values: redactKloScanMetadata(values),
    };
  }
  return envelope;
}
/**
 * Returns a copy of a scan warning with secrets removed.
 *
 * The message always goes through `redactKloSensitiveText`. When the warning has
 * truthy `metadata`, that record is replaced by the result of `redactKloScanMetadata`;
 * otherwise whatever `metadata` property the warning had (or lacked) is carried
 * over by the spread unchanged. The input warning is never mutated.
 *
 * @param warning - Warning to sanitize.
 * @returns A new warning object.
 */
export function redactKloScanWarning(warning: KloScanWarning): KloScanWarning {
  return {
    ...warning,
    message: redactKloSensitiveText(warning.message),
    // Only override `metadata` when there is something to redact.
    ...(warning.metadata ? { metadata: redactKloScanMetadata(warning.metadata) } : {}),
  };
}
/**
 * Returns a shallow copy of a scan report whose warnings have all been passed
 * through `redactKloScanWarning`.
 *
 * Only the `warnings` array is replaced; every other report field is copied by
 * reference. The input report and its warnings are not mutated.
 *
 * @param report - Report to sanitize.
 * @returns A new report object with redacted warnings.
 */
export function redactKloScanReport(report: KloScanReport): KloScanReport {
  const sanitizedWarnings = report.warnings.map((entry) => redactKloScanWarning(entry));
  return { ...report, warnings: sanitizedWarnings };
}

View file

@ -0,0 +1,114 @@
import { describe, expect, it } from 'vitest';
import {
defaultKloDataDictionarySettings,
isKloDataDictionaryCandidate,
shouldKloSampleColumnForDictionary,
} from './data-dictionary.js';
// Shared shorthand: the built-in exclusion patterns used by most assertions below.
const defaultPatterns = defaultKloDataDictionarySettings.excludePatterns;
// Test suite for the data-dictionary sampling policy in ./data-dictionary.js.
describe('KLO scan data dictionary policy', () => {
// Type matching is case-insensitive and tolerates length/precision suffixes such as "(50)".
it('includes text-like and boolean categorical types', () => {
expect(isKloDataDictionaryCandidate('varchar(50)', 'status', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('VARCHAR', 'category', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('text', 'region', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('string', 'payment_method', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('nvarchar(100)', 'tier', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('enum', 'status', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('boolean', 'active', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('bool', 'verified', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('character varying(50)', 'region', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('character(1)', 'flag', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('ntext', 'category', defaultPatterns)).toBe(true);
});
// Numeric and temporal types are rejected regardless of the column name.
it('excludes non-categorical primitive types', () => {
expect(isKloDataDictionaryCandidate('integer', 'count', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('bigint', 'total', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('timestamp', 'created', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('date', 'birth', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('numeric', 'amount', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('decimal(10,2)', 'price', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('float', 'rate', defaultPatterns)).toBe(false);
});
// Text-typed columns are still rejected when their name matches a default exclusion pattern.
it('excludes configured high-cardinality or sensitive name patterns', () => {
expect(isKloDataDictionaryCandidate('varchar', 'user_id', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'session_uuid', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'api_key', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'password_hash', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'auth_token', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'id', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'created_at', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'birth_date', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('text', 'description', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('text', 'email_body', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'image_url', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'email', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'phone_number', defaultPatterns)).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'street_address', defaultPatterns)).toBe(false);
});
// Guards against the default patterns being too broad for ordinary categorical names.
it('keeps business categorical names eligible', () => {
expect(isKloDataDictionaryCandidate('varchar', 'status', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'region', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'country', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'payment_method', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'currency', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'plan', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'category', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'tier', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'gender', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'language', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'order_type', defaultPatterns)).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'order_status', defaultPatterns)).toBe(true);
});
// Custom patterns replace the defaults; patterns that are not valid regular expressions are ignored.
it('respects host-provided exclusion patterns and skips invalid regex patterns', () => {
expect(isKloDataDictionaryCandidate('varchar', 'company_size', ['company'])).toBe(false);
expect(isKloDataDictionaryCandidate('varchar', 'status', ['company'])).toBe(true);
expect(isKloDataDictionaryCandidate('varchar', 'status', ['[invalid', '(unclosed'])).toBe(true);
});
// Walks the skip reasons one by one (already populated, empty, high cardinality) and ends with the sample case.
it('skips columns that already have persisted dictionary state', () => {
expect(
shouldKloSampleColumnForDictionary({
columnType: 'varchar',
columnName: 'status',
sampleValues: ['paid'],
cardinality: null,
settings: defaultKloDataDictionarySettings,
}),
).toEqual({ sample: false, reason: 'already_populated' });
expect(
shouldKloSampleColumnForDictionary({
columnType: 'varchar',
columnName: 'empty_status',
sampleValues: null,
cardinality: 0,
settings: defaultKloDataDictionarySettings,
}),
).toEqual({ sample: false, reason: 'empty_column' });
// 300 exceeds the default cardinalityThreshold of 200.
expect(
shouldKloSampleColumnForDictionary({
columnType: 'varchar',
columnName: 'customer_name',
sampleValues: null,
cardinality: 300,
settings: defaultKloDataDictionarySettings,
}),
).toEqual({ sample: false, reason: 'high_cardinality' });
// No stored values and unknown cardinality: the column should be sampled.
expect(
shouldKloSampleColumnForDictionary({
columnType: 'varchar',
columnName: 'status',
sampleValues: null,
cardinality: null,
settings: defaultKloDataDictionarySettings,
}),
).toEqual({ sample: true });
});
});

View file

@ -0,0 +1,109 @@
/** Tunable settings for the data-dictionary sampling policy. */
export interface KloDataDictionarySettings {
/** Columns whose known cardinality is strictly greater than this are skipped as `high_cardinality`. */
cardinalityThreshold: number;
// NOTE(review): not read anywhere in this module — presumably the cap on distinct values persisted per column; confirm with the consumer.
maxValuesToStore: number;
// NOTE(review): not read anywhere in this module — presumably the number of rows sampled per column; confirm with the consumer.
sampleSize: number;
// NOTE(review): not read anywhere in this module — presumably toggles use of database statistics for cardinality; confirm with the consumer.
useDbStatistics: boolean;
/** Regular-expression sources tested case-insensitively against the column name; any match excludes the column. */
excludePatterns: string[];
}
/**
 * Default data-dictionary settings.
 *
 * `excludePatterns` entries are regular-expression sources matched case-insensitively
 * against column names by `isKloDataDictionaryCandidate`.
 */
export const defaultKloDataDictionarySettings: KloDataDictionarySettings = {
cardinalityThreshold: 200,
maxValuesToStore: 100,
sampleSize: 10000,
useDbStatistics: true,
excludePatterns: [
// Names ending in an identifier-like or secret-like suffix, or that are exactly "id" / "uuid".
'_id$',
'_uuid$',
'_key$',
'_hash$',
'_token$',
'^id$',
'^uuid$',
// Names ending in a temporal suffix.
'_at$',
'_date$',
'_time$',
// Names ending in a free-text word ("note" or "notes" both match).
'description$',
'comment$',
'notes?$',
'message$',
'body$',
'content$',
// Names ending in a URL or path suffix.
'_url$',
'_path$',
// Names ending in "email" or "address", or starting with "phone".
'email$',
'^phone',
'address$',
],
};
/**
 * Why a column is not sampled for the data dictionary:
 * - `not_candidate`: type or name rejected by `isKloDataDictionaryCandidate`
 * - `already_populated`: the column already has at least one stored sample value
 * - `empty_column`: known cardinality is exactly 0
 * - `high_cardinality`: known cardinality exceeds the configured threshold
 */
export type KloDataDictionarySkipReason =
| 'not_candidate'
| 'already_populated'
| 'empty_column'
| 'high_cardinality';
/** Outcome of `shouldKloSampleColumnForDictionary`. */
export interface KloDataDictionarySampleDecision {
/** `true` when the column should be sampled. */
sample: boolean;
/** Set by `shouldKloSampleColumnForDictionary` only when `sample` is `false`. */
reason?: KloDataDictionarySkipReason;
}
/** Per-column input to `shouldKloSampleColumnForDictionary`. */
export interface KloDataDictionaryColumnState {
/** Column type name as reported by the source, e.g. `varchar(50)`. */
columnType: string;
/** Column name; matched against `settings.excludePatterns`. */
columnName: string;
/** Previously stored dictionary values; a non-empty list means the column is already populated. */
sampleValues?: readonly string[] | null;
/** Known distinct-value count; `null`/`undefined` means unknown and disables the cardinality checks. */
cardinality?: number | null;
/** Policy settings (threshold and exclusion patterns) to apply. */
settings: KloDataDictionarySettings;
}
// Type names accepted as categorical. The match is anchored at the START only, so any
// suffix such as "(50)" or " varying(50)" is allowed after the recognised prefix.
const categoricalCandidateTypes = /^(n?varchar|n?char|n?text|string|character|enum|bool(ean)?)/i;

/**
 * Decides whether a column is eligible for data-dictionary sampling based on its
 * type and name alone.
 *
 * A column qualifies when its type starts with a text-like, enum, or boolean type
 * name and its name matches none of the exclusion patterns. Each pattern is compiled
 * as a case-insensitive regular expression; a pattern that is not valid regex syntax
 * is ignored rather than failing the whole check.
 *
 * @param columnType - Type name as reported by the source (case-insensitive).
 * @param columnName - Column name (case-insensitive).
 * @param excludePatterns - Regex sources; defaults to the built-in exclusion list.
 * @returns `true` when the column may be sampled.
 */
export function isKloDataDictionaryCandidate(
  columnType: string,
  columnName: string,
  excludePatterns: readonly string[] = defaultKloDataDictionarySettings.excludePatterns,
): boolean {
  const normalizedType = columnType.toLowerCase();
  const normalizedName = columnName.toLowerCase();

  if (!categoricalCandidateTypes.test(normalizedType)) {
    return false;
  }

  const isExcluded = excludePatterns.some((source) => {
    try {
      return new RegExp(source, 'i').test(normalizedName);
    } catch {
      // Invalid host-provided pattern: treat it as "does not match".
      return false;
    }
  });
  return !isExcluded;
}
/**
 * Decides whether a column should be sampled for the data dictionary.
 *
 * Checks run in a fixed order and the first one that applies wins:
 * 1. stored sample values exist            -> `already_populated`
 * 2. known cardinality is exactly 0        -> `empty_column`
 * 3. known cardinality exceeds threshold   -> `high_cardinality`
 * 4. type/name rejected by the candidate policy -> `not_candidate`
 *
 * An unknown cardinality (`null`/`undefined`) skips checks 2 and 3.
 *
 * @param input - Column type, name, persisted state, and policy settings.
 * @returns `{ sample: true }`, or `{ sample: false, reason }` naming the first failed check.
 */
export function shouldKloSampleColumnForDictionary(
  input: KloDataDictionaryColumnState,
): KloDataDictionarySampleDecision {
  const { columnType, columnName, settings } = input;
  const storedValues = input.sampleValues ?? [];
  const knownCardinality = input.cardinality ?? null;

  const skip = (reason: KloDataDictionarySkipReason): KloDataDictionarySampleDecision => ({
    sample: false,
    reason,
  });

  if (storedValues.length > 0) {
    return skip('already_populated');
  }
  if (knownCardinality === 0) {
    return skip('empty_column');
  }
  if (knownCardinality !== null && knownCardinality > settings.cardinalityThreshold) {
    return skip('high_cardinality');
  }
  if (!isKloDataDictionaryCandidate(columnType, columnName, settings.excludePatterns)) {
    return skip('not_candidate');
  }
  return { sample: true };
}

View file

@ -0,0 +1,318 @@
import { describe, expect, it, vi } from 'vitest';
// Partially mock the 'ai' package: every export keeps its real implementation except
// `generateText`, which becomes a bare mock that createLlmProvider() configures per test.
vi.mock('ai', async (importOriginal) => {
const actual = await importOriginal<typeof import('ai')>();
return { ...actual, generateText: vi.fn() };
});
import { generateText } from 'ai';
import {
buildKloColumnDescriptionPrompt,
buildKloDataSourceDescriptionPrompt,
buildKloTableDescriptionPrompt,
type KloDescriptionCachePort,
KloDescriptionGenerator,
} from './description-generation.js';
import { createKloConnectorCapabilities, type KloScanConnector } from './types.js';
/**
 * Builds an in-memory `KloDescriptionCachePort` for tests.
 *
 * Keys are dot-joined identifier parts with empty/null parts dropped; connection
 * keys use the `__connection:` prefix. `get` and `set` are mocks so tests can assert
 * on calls. The `initial` record only seeds the store and is never written to.
 *
 * @param initial - Entries the cache should contain from the start.
 */
function createCache(initial: Record<string, string> = {}): KloDescriptionCachePort {
  const store = new Map<string, string>(Object.entries(initial));
  const joinParts = (parts: Array<string | null | undefined>): string => parts.filter(Boolean).join('.');

  const get = vi.fn(async (key: string) => store.get(key) ?? null);
  const set = vi.fn(async (key: string, value: string) => {
    store.set(key, value);
  });

  return {
    buildTableKey: (table) => joinParts([table.catalog, table.db, table.name]),
    buildColumnKey: (table, columnName) => joinParts([table.catalog, table.db, table.name, columnName]),
    buildConnectionKey: (connectionName) => `__connection:${connectionName}`,
    get,
    set,
  };
}
/**
 * Builds a stub LLM provider for tests and primes the mocked `generateText`
 * (from the mocked 'ai' module) to resolve with `{ text }`.
 *
 * All provider members are mocks. `getModel` returns a fixed model descriptor,
 * `activeBackend` returns 'anthropic', and `promptCachingConfig` returns a fresh
 * config object on every call with caching disabled.
 *
 * @param text - Text every mocked `generateText` call resolves to.
 */
function createLlmProvider(text = 'generated description') {
  vi.mocked(generateText).mockResolvedValue({ text } as never);

  // Factory (not a shared constant) so each call hands out its own object.
  const buildPromptCachingConfig = () => ({
    enabled: false,
    systemTtl: '1h',
    toolsTtl: '1h',
    historyTtl: '5m',
    cacheSystem: true,
    cacheTools: true,
    cacheHistory: true,
    vertexFallbackTo5m: false,
  });

  const provider = {
    getModel: vi.fn().mockReturnValue({ modelId: 'claude-sonnet-4-6', provider: 'anthropic' }),
    getModelByName: vi.fn(),
    cacheMarker: vi.fn(),
    repairToolCallHandler: vi.fn(),
    thinkingProviderOptions: vi.fn(),
    telemetryConfig: vi.fn(),
    promptCachingConfig: vi.fn(buildPromptCachingConfig),
    activeBackend: vi.fn(() => 'anthropic'),
  };
  return provider as any;
}
/**
 * Builds a fake scan connector for description-generation tests.
 *
 * It advertises table sampling, column sampling, and nested analysis. The sampling
 * methods are mocks returning small fixed fixtures; `introspect` rejects because
 * description generation is expected never to call it.
 */
function createConnector(): KloScanConnector {
return {
id: 'test-connector',
driver: 'postgres',
capabilities: createKloConnectorCapabilities({
tableSampling: true,
columnSampling: true,
nestedAnalysis: true,
}),
// Fails loudly if description generation ever falls back to structural introspection.
introspect: vi.fn(async () => {
throw new Error('introspection is not used by description generation');
}),
// Column fixture: two distinct values plus one null.
sampleColumn: vi.fn(async () => ({
values: ['paid', 'refunded', null],
nullCount: 1,
distinctCount: 2,
})),
// Table fixture: three columns, two rows.
sampleTable: vi.fn(async () => ({
headers: ['id', 'status', 'amount'],
rows: [
[1, 'paid', 20],
[2, 'refunded', 10],
],
totalRows: 2,
})),
};
}
// Tests for the pure prompt-building functions (no LLM or connector involved).
describe('KLO description prompt builders', () => {
// Checks the tagged sections, that 'ai' and 'user' descriptions are left out, and the nested-data note.
it('builds column prompts with sample values, source descriptions, and nested BigQuery guidance', () => {
const prompt = buildKloColumnDescriptionPrompt({
columnName: 'payload',
columnValues: [{ nested: true }, '[1,2]'],
tableContext: 'Table: events | Columns: payload | Data source: BIGQUERY',
dataSourceType: 'BIGQUERY',
supportsNestedAnalysis: true,
rawDescriptions: { db: 'Raw event payload', ai: 'Old AI text', user: 'User text' },
});
expect(prompt).toContain(
'<table_context> Table: events | Columns: payload | Data source: BIGQUERY </table_context>',
);
expect(prompt).toContain('<column_name> payload </column_name>');
// Object values are stringified with String(), hence "[object Object]".
expect(prompt).toContain('<sample_values> [object Object], [1,2] </sample_values>');
expect(prompt).toContain('<db_documentation> Raw event payload </db_documentation>');
expect(prompt).not.toContain('Old AI text');
expect(prompt).not.toContain('User text');
expect(prompt).toContain('nested/structured data');
});
// Table prompts list per-column sample values; data-source prompts summarise each table's shape.
it('builds table and data-source prompts from sampled rows', () => {
const sample = {
headers: ['id', 'status'],
rows: [
[1, 'paid'],
[2, 'refunded'],
],
totalRows: 2,
};
expect(
buildKloTableDescriptionPrompt({
tableName: 'orders',
sampleData: sample,
dataSourceType: 'POSTGRESQL',
rawDescriptions: { dbt: 'Fact table for commerce orders' },
}),
).toContain('status: paid, refunded');
expect(
buildKloDataSourceDescriptionPrompt({
tableSamples: [['orders', sample]],
dataSourceType: 'POSTGRESQL',
}),
).toContain('orders (2 columns, 2 sample rows)');
});
});
// Tests for KloDescriptionGenerator using the mocked `generateText` and fake sampling ports.
describe('KloDescriptionGenerator', () => {
// 'status' is generated from its pre-fetched values; 'cached_status' is served from the cache and reported as skipped.
it('generates column descriptions with pre-fetched values, cache hits, and word-limit metadata', async () => {
const cache = createCache({ 'warehouse.public.orders.cached_status': 'Cached status description' });
const llmProvider = createLlmProvider('Payment state');
const connector = createConnector();
const generator = new KloDescriptionGenerator({
llmProvider,
cache,
settings: {
columnMaxWords: 12,
tableMaxWords: 18,
dataSourceMaxWords: 24,
temperature: 0.2,
concurrencyLimit: 2,
},
});
const result = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: 'warehouse',
db: 'public',
name: 'orders',
columns: [
{ name: 'status', sampleValues: ['paid', 'refunded'], rawDescriptions: { db: 'Payment lifecycle' } },
{ name: 'cached_status', sampleValues: ['open'] },
],
},
skipExisting: false,
existingDescriptions: {},
});
expect(result).toEqual({
columnDescriptions: [
['status', 'Payment state'],
['cached_status', 'Cached status description'],
],
processedColumns: ['status'],
skippedColumns: ['cached_status'],
});
// Pre-fetched sample values mean the connector is never asked to sample.
expect(connector.sampleColumn).not.toHaveBeenCalled();
// The configured temperature and the column word limit must reach the LLM call.
expect(generateText).toHaveBeenCalledWith(
expect.objectContaining({
temperature: 0.2,
messages: expect.arrayContaining([
expect.objectContaining({
role: 'user',
content: expect.stringContaining('Please provide a concise description in 12 words or less.'),
}),
]),
}),
);
});
// Without pre-fetched values the generator asks the connector for 50 sample values.
it('samples through the connector when column values are not pre-fetched', async () => {
const connector = createConnector();
const generator = new KloDescriptionGenerator({
llmProvider: createLlmProvider('Current order state'),
settings: {
columnMaxWords: 12,
tableMaxWords: 18,
dataSourceMaxWords: 24,
},
});
const result = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [{ name: 'status' }],
},
});
expect(connector.sampleColumn).toHaveBeenCalledWith(
{
connectionId: 'conn-1',
table: { catalog: null, db: 'public', name: 'orders' },
column: 'status',
limit: 50,
},
{ runId: 'run-1' },
);
expect(result.columnDescriptions).toEqual([['status', 'Current order state']]);
});
// A minimal sampling port (id + sample methods, no `introspect`) is accepted as the connector.
it('samples through a description sampling port without requiring structural introspection', async () => {
const sampler = {
id: 'description-sampler:conn-1',
sampleColumn: vi.fn(async () => ({
values: ['paid', 'refunded'],
nullCount: null,
distinctCount: null,
})),
sampleTable: vi.fn(async () => ({
headers: ['id', 'status'],
rows: [[1, 'paid']],
totalRows: 1,
})),
};
const generator = new KloDescriptionGenerator({
llmProvider: createLlmProvider('Generated through sampler'),
settings: {
columnMaxWords: 12,
tableMaxWords: 18,
dataSourceMaxWords: 24,
},
});
const result = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector: sampler,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [{ name: 'status' }],
},
});
expect(result.columnDescriptions).toEqual([['status', 'Generated through sampler']]);
expect(sampler.sampleColumn).toHaveBeenCalledWith(
{
connectionId: 'conn-1',
table: { catalog: null, db: 'public', name: 'orders' },
column: 'status',
limit: 50,
},
{ runId: 'run-1' },
);
expect('introspect' in sampler).toBe(false);
});
// Generated table and data-source descriptions are written to the cache under their respective keys.
it('generates and caches table and data-source descriptions', async () => {
const cache = createCache();
const connector = createConnector();
const generator = new KloDescriptionGenerator({
llmProvider: createLlmProvider('Commerce orders'),
cache,
settings: {
columnMaxWords: 12,
tableMaxWords: 18,
dataSourceMaxWords: 24,
concurrencyLimit: 2,
},
});
await expect(
generator.generateTableDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
table: { catalog: 'warehouse', db: 'public', name: 'orders', rawDescriptions: { db: 'Raw orders' } },
}),
).resolves.toBe('Commerce orders');
await expect(
generator.generateDataSourceDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
tables: [
{ catalog: 'warehouse', db: 'public', name: 'orders' },
{ catalog: 'warehouse', db: 'public', name: 'customers' },
],
connectionName: 'Warehouse',
}),
).resolves.toBe('Commerce orders');
expect(cache.set).toHaveBeenCalledWith('warehouse.public.orders', 'Commerce orders');
expect(cache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders');
});
});

View file

@ -0,0 +1,582 @@
import type { KloLlmProvider } from '@klo/llm';
import { generateKloText } from '../llm/index.js';
import type {
KloColumnSampleInput,
KloColumnSampleResult,
KloScanContext,
KloScanLoggerPort,
KloTableRef,
KloTableSampleInput,
KloTableSampleResult,
} from './types.js';
/**
 * Storage port for generated descriptions.
 *
 * The port owns the key scheme: callers obtain keys from the `build*Key` methods
 * and pass them to `get` / `set`.
 */
export interface KloDescriptionCachePort {
/** Key for a table-level description. */
buildTableKey(table: KloTableRef): string;
/** Key for the description of one column of `table`. */
buildColumnKey(table: KloTableRef, columnName: string): string;
/** Key for a connection (data-source) level description. */
buildConnectionKey(connectionName: string): string;
// NOTE(review): `null` presumably signals a cache miss — confirm against implementations.
get(key: string): Promise<string | null>;
set(key: string, value: string): Promise<void>;
}
/**
 * Minimal data-access port needed for description generation.
 *
 * Both sampling methods are optional, so an implementation may support only
 * column sampling, only table sampling, or neither.
 */
export interface KloDescriptionSamplingPort {
/** Identifier of the sampling source. */
id: string;
sampleColumn?(input: KloColumnSampleInput, ctx: KloScanContext): Promise<KloColumnSampleResult>;
sampleTable?(input: KloTableSampleInput, ctx: KloScanContext): Promise<KloTableSampleResult>;
}
/** Caller-facing settings for `KloDescriptionGenerator`. */
export interface KloDescriptionGenerationSettings {
/** Word limit for column descriptions. */
columnMaxWords: number;
/** Word limit for table descriptions. */
tableMaxWords: number;
/** Word limit for data-source descriptions. */
dataSourceMaxWords: number;
/** Optional sampling temperature; omitted from the resolved settings when not provided. */
temperature?: number;
/** Maximum number of concurrent tasks; the generator defaults this to 5. */
concurrencyLimit?: number;
}
/** Internal form of the settings after defaults are applied: `concurrencyLimit` is always set. */
interface ResolvedKloDescriptionGenerationSettings {
columnMaxWords: number;
tableMaxWords: number;
dataSourceMaxWords: number;
temperature?: number;
concurrencyLimit: number;
}
/** A column to describe. */
export interface KloDescriptionColumn {
name: string;
type?: string;
/** Existing descriptions keyed by source name (e.g. `db`, `dbt`, `ai`, `user`). */
rawDescriptions?: Record<string, string>;
// NOTE(review): presumably pre-fetched values that let the generator skip sampling — the consuming code is outside this view.
sampleValues?: unknown[];
}
/** A table reference together with the columns to describe. */
export interface KloDescriptionColumnTable extends KloTableRef {
columns: KloDescriptionColumn[];
}
/** A table reference plus any existing table-level descriptions keyed by source name. */
export interface KloDescriptionTableInput extends KloTableRef {
rawDescriptions?: Record<string, string>;
}
/** Result of generating descriptions for the columns of one table. */
export interface KloColumnAnalysisResult {
/** `[columnName, description]` pairs; the description may be `null`. */
columnDescriptions: Array<[string, string | null]>;
// NOTE(review): presumably the columns a description was freshly generated for — confirm in the generator.
processedColumns: string[];
// NOTE(review): presumably the columns that were not sent to the LLM (e.g. cache hits) — confirm in the generator.
skippedColumns: string[];
}
/** Input to `buildKloColumnDescriptionPrompt`. */
export interface KloColumnDescriptionPromptInput {
columnName: string;
/** Sample values for the column; the prompt shows at most five of them. */
columnValues: unknown[];
/** Pre-rendered one-line context about the owning table, embedded verbatim in the prompt. */
tableContext: string;
/** Data source type label, e.g. `BIGQUERY` or `POSTGRESQL`. */
dataSourceType: string;
/** Together with `dataSourceType === 'BIGQUERY'`, enables the nested-data note. */
supportsNestedAnalysis: boolean;
/** Existing descriptions keyed by source name; `ai` and `user` entries are not included in the prompt. */
rawDescriptions?: Record<string, string>;
}
/** Input to `buildKloTableDescriptionPrompt`. */
export interface KloTableDescriptionPromptInput {
tableName: string;
/** Sampled headers and rows; the prompt uses at most 10 headers and the first 3 rows. */
sampleData: KloTableSampleResult;
/** Data source type label; `BIGQUERY` adds an extra note to the prompt. */
dataSourceType: string;
/** Existing descriptions keyed by source name; `ai` and `user` entries are not included in the prompt. */
rawDescriptions?: Record<string, string>;
}
/** Input to `buildKloDataSourceDescriptionPrompt`. */
export interface KloDataSourceDescriptionPromptInput {
/** `[tableName, sample]` pairs; only the header and row counts of each sample appear in the prompt. */
tableSamples: Array<[string, KloTableSampleResult]>;
/** Data source type label; `BIGQUERY` adds an extra note to the prompt. */
dataSourceType: string;
}
/** Input to `KloDescriptionGenerator.generateColumnDescriptions`. */
export interface KloGenerateColumnDescriptionsInput {
connectionId: string;
/** Sampling source for the connection being described. */
connector: KloDescriptionSamplingPort;
context: KloScanContext;
dataSourceType: string;
supportsNestedAnalysis: boolean;
/** Table whose columns should be described. */
table: KloDescriptionColumnTable;
// NOTE(review): handling of `skipExisting` / `existingDescriptions` lives in the generator body, which is outside this view.
skipExisting?: boolean;
existingDescriptions?: Record<string, string | null>;
}
/** Input to `KloDescriptionGenerator.generateTableDescription`. */
export interface KloGenerateTableDescriptionInput {
connectionId: string;
/** Sampling source for the connection being described. */
connector: KloDescriptionSamplingPort;
context: KloScanContext;
dataSourceType: string;
/** Table to describe, with any existing descriptions. */
table: KloDescriptionTableInput;
}
/** Input to `KloDescriptionGenerator.generateDataSourceDescription`. */
export interface KloGenerateDataSourceDescriptionInput {
connectionId: string;
/** Sampling source for the connection being described. */
connector: KloDescriptionSamplingPort;
context: KloScanContext;
dataSourceType: string;
/** Tables that represent the data source. */
tables: KloTableRef[];
// NOTE(review): presumably the name passed to `buildConnectionKey` when caching — confirm in the generator.
connectionName?: string;
}
/** Constructor options for `KloDescriptionGenerator`. */
export interface KloDescriptionGeneratorOptions {
/** Provider used for LLM calls. */
llmProvider: KloLlmProvider;
/** Optional description cache. */
cache?: KloDescriptionCachePort;
/** Optional logger. */
logger?: KloScanLoggerPort;
/** Word limits plus optional temperature and concurrency limit. */
settings: KloDescriptionGenerationSettings;
}
// Per-column outcome record used inside this module.
// NOTE(review): the code that produces and consumes it is outside this view; the exact
// meaning of `processed` vs `skipped` should be confirmed there.
interface ColumnTaskResult {
columnName: string;
description: string | null;
processed: boolean;
skipped: boolean;
}
/**
 * Selects the existing descriptions that may be quoted in a prompt.
 *
 * Entries keyed `ai` or `user` are left out, as are entries with empty text.
 * Insertion order of the remaining entries is preserved.
 *
 * @param rawDescriptions - Descriptions keyed by source name, or `undefined`.
 * @returns `[source, text]` pairs; empty when nothing qualifies.
 */
function descriptionSources(rawDescriptions: Record<string, string> | undefined): Array<[string, string]> {
  const usable: Array<[string, string]> = [];
  for (const [source, text] of Object.entries(rawDescriptions ?? {})) {
    if (source === 'ai' || source === 'user' || !text) {
      continue;
    }
    usable.push([source, text]);
  }
  return usable;
}
/**
 * Extracts a printable message from a caught value.
 *
 * @param error - Anything that was thrown.
 * @returns `error.message` for `Error` instances, otherwise `String(error)`.
 */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
/**
 * Copies only the identifying fields (`catalog`, `db`, `name`) of a table into a
 * fresh object, dropping any extra properties carried by wider table shapes.
 *
 * @param table - Any object that is at least a table reference.
 * @returns A new plain table reference.
 */
function toTableRef(table: KloTableRef): KloTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
/**
 * Runs `worker` over every item with at most `concurrencyLimit` invocations in
 * flight, returning results in input order.
 *
 * The limit is clamped to the range `[1, items.length]`; a fractional limit is
 * rounded down and `NaN` is treated as 1 (sequential). Every index is passed to
 * the worker, including items whose value is `undefined`, so the result array
 * always has exactly one entry per input item.
 *
 * If a worker rejects, the returned promise rejects with that error. Runners that
 * are still active keep pulling remaining items until the list is exhausted.
 *
 * @param items - Inputs to process.
 * @param concurrencyLimit - Maximum number of concurrent worker calls.
 * @param worker - Async function applied to each item and its index.
 * @returns Worker results, positionally aligned with `items`.
 */
async function runWithConcurrency<TInput, TOutput>(
  items: readonly TInput[],
  concurrencyLimit: number,
  worker: (item: TInput, index: number) => Promise<TOutput>,
): Promise<TOutput[]> {
  const results = new Array<TOutput>(items.length);
  if (items.length === 0) {
    return results;
  }

  // NaN would propagate through Math.min/Math.max and produce zero runners,
  // silently skipping all work; fall back to sequential execution instead.
  const requested = Number.isNaN(concurrencyLimit) ? 1 : Math.floor(concurrencyLimit);
  const runnerCount = Math.max(1, Math.min(requested, items.length));

  let nextIndex = 0;
  const runner = async (): Promise<void> => {
    while (nextIndex < items.length) {
      // Claim the index synchronously so no two runners process the same item.
      const index = nextIndex;
      nextIndex += 1;
      // index < items.length, so the element exists; it may still legitimately be
      // `undefined` when TInput allows it, and must be processed all the same.
      results[index] = await worker(items[index] as TInput, index);
    }
  };

  await Promise.all(Array.from({ length: runnerCount }, () => runner()));
  return results;
}
/**
 * Appends the standard word-limit instruction to a prompt, separated from it by
 * one blank line.
 *
 * @param prompt - Prompt text to extend.
 * @param maxWords - Word limit quoted in the instruction.
 * @returns The prompt followed by the instruction.
 */
export function appendKloWordLimitInstruction(prompt: string, maxWords: number): string {
  const instruction = 'Please provide a concise description in ' + maxWords + ' words or less.';
  return [prompt, instruction].join('\n\n');
}
/**
 * Builds the LLM prompt used to describe a single database column.
 *
 * The prompt contains the table context, the column name, up to five non-null
 * sample values, and any existing descriptions returned by `descriptionSources`
 * (wrapped in `<source_documentation>` tags). For BigQuery sources that support
 * nested analysis, a note is appended when any shown sample value looks nested
 * (its string form contains `nested`, `{`, or `[`).
 *
 * @param input - Column name, sample values, table context, and source metadata.
 * @returns The trimmed prompt text (without the word-limit instruction).
 */
export function buildKloColumnDescriptionPrompt(input: KloColumnDescriptionPromptInput): string {
  // Drop null/undefined BEFORE capping at five, so leading NULLs do not use up
  // the sample budget and leave the prompt without any example values.
  const sampleValues = input.columnValues
    .filter((value) => value !== null && value !== undefined)
    .slice(0, 5);
  // NOTE(review): String() renders plain objects as "[object Object]". That output is
  // pinned by the existing prompt-builder test, so it is kept here; consider
  // JSON.stringify for object values in a follow-up that updates the test too.
  const valuesStr = sampleValues.map((value) => String(value)).join(', ');
  let prompt = `Analyze this database column and provide a concise description:
<table_context> ${input.tableContext} </table_context>
<column_name> ${input.columnName} </column_name>
<sample_values> ${valuesStr} </sample_values>
`;
  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    prompt += '\nExisting descriptions from other sources:\n';
    for (const [source, text] of sources) {
      prompt += `<${source}_documentation> ${text} </${source}_documentation>\n`;
    }
    prompt +=
      '\nSynthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.\n';
  }
  prompt += `
Provide a brief description of what this column contains without repeating the column name.
Focus on the data's meaning and business purpose. Start directly with the content description.
Example:
"first names of individuals, likely employees or contacts" instead of "The column contains first names..."
"Job titles or roles of individuals..." instead of "This column contains job titles..."
`;
  if (input.dataSourceType === 'BIGQUERY' && input.supportsNestedAnalysis) {
    // Heuristic on the string form; "[object Object]" also matches through its "[".
    const hasNestedData = sampleValues.some((value) => {
      const text = String(value);
      return text.includes('nested') || text.includes('{') || text.includes('[');
    });
    if (hasNestedData) {
      prompt +=
        '\nNote: This column contains nested/structured data (JSON, STRUCT, or ARRAY) - describe its general business purpose and data organization.';
    }
  }
  return prompt.trim();
}
/**
 * Builds the LLM prompt used to describe a table from a row sample.
 *
 * For at most the first 10 headers, the prompt lists the non-null values found in
 * the first 3 sample rows. Existing descriptions returned by `descriptionSources`
 * are quoted when present, and BigQuery sources get an extra note about complex
 * types.
 *
 * @param input - Table name, sampled data, data source type, and existing descriptions.
 * @returns The trimmed prompt text (without the word-limit instruction).
 */
export function buildKloTableDescriptionPrompt(input: KloTableDescriptionPromptInput): string {
  const { headers, rows } = input.sampleData;
  const previewRows = rows.slice(0, 3);
  const columnInfo = headers.slice(0, 10).map((header, columnIndex) => {
    const cells = previewRows
      .map((row) => row[columnIndex])
      .filter((cell) => cell !== null && cell !== undefined);
    return `${header}: ${cells.map((cell) => String(cell)).join(', ')}`;
  });

  const sections: string[] = [
    `
Analyze this database table and provide a concise description:
Table: ${input.tableName}
Columns and sample data: ${columnInfo.join(' | ')}
Total rows in sample: ${rows.length}
Data source type: ${input.dataSourceType}
`,
  ];

  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    sections.push('\n Existing descriptions from other sources:\n');
    sections.push(...sources.map(([source, text]) => ` ${source}: ${text}\n`));
    sections.push(
      '\n Synthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.\n',
    );
  }

  if (input.dataSourceType === 'BIGQUERY') {
    sections.push(
      "\nNote (Don't include this note in the final answer.): This is a BigQuery table which may contain nested structures, arrays, or other complex data types.",
    );
  }

  sections.push(`
Provide a brief description of what this table represents and its business purpose.
Do NOT list or describe individual columns or fields.
Start directly with the content description without mentioning the table name.
Focus on the data's meaning and business purpose.
Example: "Information about healthcare professionals used for workforce management" instead of "The blahblah table contains information about healthcare professionals including their names, titles..."
`);

  return sections.join('').trim();
}
/**
 * Builds the LLM prompt used to describe a whole data source.
 *
 * Each sampled table is summarised as `name (N columns, M sample rows)`; the
 * sampled values themselves are not included. BigQuery sources get an extra note
 * about dataset characteristics.
 *
 * @param input - `[tableName, sample]` pairs and the data source type.
 * @returns The trimmed prompt text (without the word-limit instruction).
 */
export function buildKloDataSourceDescriptionPrompt(input: KloDataSourceDescriptionPromptInput): string {
  const tableSummaries: string[] = [];
  for (const [tableName, sampleData] of input.tableSamples) {
    tableSummaries.push(
      `${tableName} (${sampleData.headers.length} columns, ${sampleData.rows.length} sample rows)`,
    );
  }

  const overview = `
Analyze this database and provide a concise description:
Tables: ${tableSummaries.join(' | ')}
Total tables analyzed: ${input.tableSamples.length}
Data source type: ${input.dataSourceType}
`;
  const bigQueryNote =
    input.dataSourceType === 'BIGQUERY'
      ? "\nNote (Don't include this note in the final answer): This is a BigQuery dataset which may contain large-scale analytics data, nested structures, and complex data types."
      : '';
  const guidance = `
Provide a direct, concise description of what this database represents and its business purpose.
Do NOT start with phrases like "This database appears to represent" or "This BigQuery dataset".
Start directly with the domain or business area description.
Focus on the overall data model and its intended use.
Example: "Healthcare-related database with a focus on patient management..." instead of "This database appears to represent a healthcare-related system..."
`;

  return (overview + bigQueryNote + guidance).trim();
}
/**
 * Generates natural-language descriptions for columns, tables, and whole data
 * sources by sampling data through a scan connector and prompting an LLM.
 *
 * Results are read from and written to the optional description cache. Only
 * text that the LLM actually produced is cached: the placeholder strings
 * returned when the LLM call fails or yields an empty answer are handed back to
 * the caller but never stored, so a transient failure is not replayed as a
 * "description" on later runs.
 */
export class KloDescriptionGenerator {
  private readonly llmProvider: KloLlmProvider;
  private readonly cache?: KloDescriptionCachePort;
  private readonly logger?: KloScanLoggerPort;
  private readonly settings: ResolvedKloDescriptionGenerationSettings;

  /**
   * @param options - LLM provider, optional cache and logger, and generation
   *   settings. `concurrencyLimit` falls back to 5 when it is not supplied.
   */
  constructor(options: KloDescriptionGeneratorOptions) {
    this.llmProvider = options.llmProvider;
    this.cache = options.cache;
    this.logger = options.logger;
    this.settings = {
      columnMaxWords: options.settings.columnMaxWords,
      tableMaxWords: options.settings.tableMaxWords,
      dataSourceMaxWords: options.settings.dataSourceMaxWords,
      // Only add the `temperature` key when a value was provided, so it stays absent otherwise.
      ...(options.settings.temperature !== undefined ? { temperature: options.settings.temperature } : {}),
      concurrencyLimit: options.settings.concurrencyLimit ?? 5,
    };
  }

  /**
   * Generates a description for every column of `input.table`, running up to
   * `concurrencyLimit` column tasks at a time.
   *
   * @returns One `[columnName, description]` pair per task result, plus the
   *   names of the columns that were processed and the ones that were skipped.
   *   Columns that were neither (no sample values, no sampling support, or an
   *   error) appear only in `columnDescriptions`.
   */
  async generateColumnDescriptions(input: KloGenerateColumnDescriptionsInput): Promise<KloColumnAnalysisResult> {
    const columnsToProcess = input.table.columns;
    const tableContext = `Table: ${input.table.name} | Columns: ${columnsToProcess.map((column) => column.name).join(', ')} | Data source: ${input.dataSourceType}`;
    const results = await runWithConcurrency(columnsToProcess, this.settings.concurrencyLimit, async (column) =>
      this.generateOneColumnDescription(input, column, tableContext),
    );

    const columnDescriptions: Array<[string, string | null]> = [];
    const processedColumns: string[] = [];
    const skippedColumns: string[] = [];
    for (const result of results) {
      columnDescriptions.push([result.columnName, result.description]);
      if (result.skipped) {
        skippedColumns.push(result.columnName);
      } else if (result.processed) {
        processedColumns.push(result.columnName);
      }
    }

    return {
      columnDescriptions,
      processedColumns,
      skippedColumns,
    };
  }

  /**
   * Generates a description for a single table from a 20-row sample.
   *
   * @returns The cached description when one exists; otherwise the generated
   *   text. Returns `'Table not found'` when the connector cannot sample tables
   *   or when sampling, prompt building, or the cache write throws.
   */
  async generateTableDescription(input: KloGenerateTableDescriptionInput): Promise<string> {
    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildTableKey(tableRef);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    if (!input.connector.sampleTable) {
      this.logger?.warn('KLO scan connector does not support table sampling for table description generation', {
        connectorId: input.connector.id,
        table: input.table.name,
      });
      return 'Table not found';
    }

    try {
      const sampleData = await input.connector.sampleTable(
        {
          connectionId: input.connectionId,
          table: tableRef,
          limit: 20,
        },
        input.context,
      );
      const prompt = buildKloTableDescriptionPrompt({
        tableName: input.table.name,
        sampleData,
        dataSourceType: input.dataSourceType,
        rawDescriptions: input.table.rawDescriptions,
      });
      const generated = await this.generateAiDescription(
        prompt,
        this.settings.tableMaxWords,
        'klo-table-description',
      );
      // Never persist failure placeholders; they would shadow a later successful run.
      if (cacheKey && generated.cacheable) {
        await this.cache?.set(cacheKey, generated.text);
      }
      return generated.text;
    } catch (error) {
      this.logger?.error(`Error generating table description: ${errorMessage(error)}`);
      return 'Table not found';
    }
  }

  /**
   * Generates a description for the whole data source from 5-row samples of at
   * most the first 10 tables in `input.tables`.
   *
   * The cache is consulted only when `input.connectionName` is set. Tables
   * whose sampling throws are logged and left out of the prompt.
   *
   * @returns The cached or generated description, or a fixed message when there
   *   are no tables, no table could be sampled, or building the prompt or
   *   writing the cache throws.
   */
  async generateDataSourceDescription(input: KloGenerateDataSourceDescriptionInput): Promise<string> {
    if (input.tables.length === 0) {
      return 'No tables found in database';
    }

    const cacheKey = input.connectionName ? this.cache?.buildConnectionKey(input.connectionName) : undefined;
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    if (!input.connector.sampleTable) {
      this.logger?.warn('KLO scan connector does not support table sampling for data-source description generation', {
        connectorId: input.connector.id,
      });
      return 'No accessible tables found in database';
    }

    const tablesToAnalyze = input.tables.slice(0, 10);
    const tableSamples = await runWithConcurrency(tablesToAnalyze, this.settings.concurrencyLimit, async (table) => {
      try {
        // `sampleTable` was checked above; it is invoked through the connector so `this` stays bound.
        const sampleData = await input.connector.sampleTable!(
          {
            connectionId: input.connectionId,
            table: toTableRef(table),
            limit: 5,
          },
          input.context,
        );
        return [table.name, sampleData] as [string, KloTableSampleResult];
      } catch (error) {
        this.logger?.warn(`Failed to sample table '${table.name}' for data source analysis - ${errorMessage(error)}`);
        return null;
      }
    });

    const accessibleSamples = tableSamples.filter(
      (sample): sample is [string, KloTableSampleResult] => sample !== null,
    );
    if (accessibleSamples.length === 0) {
      return 'No accessible tables found in database';
    }

    try {
      const prompt = buildKloDataSourceDescriptionPrompt({
        tableSamples: accessibleSamples,
        dataSourceType: input.dataSourceType,
      });
      const generated = await this.generateAiDescription(
        prompt,
        this.settings.dataSourceMaxWords,
        'klo-data-source-description',
      );
      // Never persist failure placeholders; they would shadow a later successful run.
      if (cacheKey && generated.cacheable) {
        await this.cache?.set(cacheKey, generated.text);
      }
      return generated.text;
    } catch (error) {
      this.logger?.error(`Error generating data source description: ${errorMessage(error)}`);
      return 'Failed to generate data source description';
    }
  }

  /**
   * Produces the task result for one column.
   *
   * Order of checks: an existing description (when `skipExisting` is set), then
   * the cache, then sample values carried on the column, then a 50-value sample
   * from the connector. Existing and cached descriptions are reported as
   * `skipped`. A column with no non-null sample values gets a `null`
   * description. Errors are logged and reported in the description text.
   */
  private async generateOneColumnDescription(
    input: KloGenerateColumnDescriptionsInput,
    column: KloDescriptionColumn,
    tableContext: string,
  ): Promise<ColumnTaskResult> {
    const existingDescription = input.existingDescriptions?.[column.name];
    if (input.skipExisting && existingDescription) {
      return {
        columnName: column.name,
        description: existingDescription,
        skipped: true,
        processed: false,
      };
    }

    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildColumnKey(tableRef, column.name);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return {
          columnName: column.name,
          description: cached,
          skipped: true,
          processed: false,
        };
      }
    }

    try {
      let columnValues = column.sampleValues;
      if (!columnValues || columnValues.length === 0) {
        if (!input.connector.sampleColumn) {
          this.logger?.warn('KLO scan connector does not support column sampling for column description generation', {
            connectorId: input.connector.id,
            table: input.table.name,
            column: column.name,
          });
          return {
            columnName: column.name,
            description: null,
            skipped: false,
            processed: false,
          };
        }
        const sample = await input.connector.sampleColumn(
          {
            connectionId: input.connectionId,
            table: tableRef,
            column: column.name,
            limit: 50,
          },
          input.context,
        );
        columnValues = sample.values;
      }

      const nonNullValues = (columnValues ?? []).filter((value) => value !== null && value !== undefined);
      if (nonNullValues.length === 0) {
        return {
          columnName: column.name,
          description: null,
          skipped: false,
          processed: false,
        };
      }

      const prompt = buildKloColumnDescriptionPrompt({
        columnName: column.name,
        columnValues: nonNullValues,
        tableContext,
        dataSourceType: input.dataSourceType,
        supportsNestedAnalysis: input.supportsNestedAnalysis,
        rawDescriptions: column.rawDescriptions,
      });
      const generated = await this.generateAiDescription(
        prompt,
        this.settings.columnMaxWords,
        'klo-column-description',
      );
      // Never persist failure placeholders; they would shadow a later successful run.
      if (cacheKey && generated.cacheable) {
        await this.cache?.set(cacheKey, generated.text);
      }
      return {
        columnName: column.name,
        description: generated.text,
        skipped: false,
        processed: true,
      };
    } catch (error) {
      this.logger?.error(`Error analyzing column '${column.name}': ${errorMessage(error)}`);
      return {
        columnName: column.name,
        description: `Error generating description: ${errorMessage(error)}`,
        skipped: false,
        processed: false,
      };
    }
  }

  /**
   * Sends `prompt` (with the word-limit instruction appended) to the LLM.
   *
   * Never throws: a failed call is logged and reported through the returned
   * text instead.
   *
   * @param _operationName - Label supplied by call sites; not used by this method.
   * @returns `text` is the trimmed LLM answer, or a placeholder when the answer
   *   was empty or the call threw. `cacheable` is `true` only for a real,
   *   non-empty LLM answer.
   */
  private async generateAiDescription(
    prompt: string,
    maxWords: number,
    _operationName: string,
  ): Promise<{ text: string; cacheable: boolean }> {
    try {
      const text = await generateKloText({
        llmProvider: this.llmProvider,
        role: 'candidateExtraction',
        prompt: appendKloWordLimitInstruction(prompt, maxWords),
        temperature: this.settings.temperature,
      });
      const description = text.trim();
      if (description) {
        return { text: description, cacheable: true };
      }
      return { text: 'Failed to generate description', cacheable: false };
    } catch (error) {
      this.logger?.error(`Error generating AI description: ${errorMessage(error)}`);
      return { text: `Error generating description: ${errorMessage(error)}`, cacheable: false };
    }
  }
}

View file

@ -0,0 +1,47 @@
import { describe, expect, it } from 'vitest';
import { buildKloColumnEmbeddingText } from './embedding-text.js';
// Specs for buildKloColumnEmbeddingText: full context, minimal input, and the default sample cap.
describe('KLO scan embedding text', () => {
  // Identity fields shared by the `orders.status` cases below.
  const statusColumn = {
    tableName: 'orders',
    columnName: 'status',
    columnType: 'varchar',
  };

  it('builds column embedding text with table, description, FK, and sample-value context', () => {
    const text = buildKloColumnEmbeddingText({
      ...statusColumn,
      resolvedDescription: 'Payment lifecycle state',
      sampleValues: ['paid', 'refunded', 'pending'],
      resolvedTableDescription: 'Customer orders',
      foreignKeys: {
        outgoing: [{ toTable: 'customers', toColumn: 'id' }],
        incoming: [{ fromTable: 'refunds', fromColumn: 'order_status' }],
      },
      maxSampleValues: 2,
    });

    expect(text).toBe(
      'orders.status (varchar). Table: Customer orders. Payment lifecycle state. FK -> customers.id. FK <- refunds.order_status. Values: paid, refunded',
    );
  });

  it('omits optional sections when the scan has no enrichment context yet', () => {
    const text = buildKloColumnEmbeddingText({
      tableName: 'orders',
      columnName: 'id',
      columnType: 'integer',
      resolvedDescription: null,
    });

    expect(text).toBe('orders.id (integer)');
  });

  it('keeps all available sample values when no explicit max is supplied', () => {
    const text = buildKloColumnEmbeddingText({
      ...statusColumn,
      resolvedDescription: null,
      sampleValues: ['paid', 'refunded'],
    });

    expect(text).toBe('orders.status (varchar). Values: paid, refunded');
  });
});

View file

@ -0,0 +1,45 @@
/** Foreign-key context rendered into a column's embedding text. */
export interface KloColumnEmbeddingForeignKeys {
  /** Rendered as `FK -> toTable.toColumn`, one section per entry. */
  outgoing: Array<{ toTable: string; toColumn: string }>;
  /** Rendered as `FK <- fromTable.fromColumn`, one section per entry. */
  incoming: Array<{ fromTable: string; fromColumn: string }>;
}

/** Input for {@link buildKloColumnEmbeddingText}. */
export interface KloColumnEmbeddingTextInput {
  tableName: string;
  columnName: string;
  columnType: string;
  /** Column description; the section is omitted when null or empty. */
  resolvedDescription: string | null;
  /** Sample values; the `Values:` section is omitted when none are shown. */
  sampleValues?: readonly string[] | null;
  /** Table description; the `Table:` section is omitted when null or empty. */
  resolvedTableDescription?: string | null;
  foreignKeys?: KloColumnEmbeddingForeignKeys | null;
  /**
   * Maximum number of sample values to include. Defaults to 20. Values that are
   * zero, negative, or `NaN` include no sample values at all.
   */
  maxSampleValues?: number;
}

/**
 * Builds the text that is embedded for a column.
 *
 * Sections are joined with `'. '` in this order: `table.column (type)`, table
 * description, column description, outgoing foreign keys, incoming foreign
 * keys, and sample values. Only the first section is always present.
 *
 * @param input - Column identity plus optional enrichment context.
 * @returns The joined embedding text.
 */
export function buildKloColumnEmbeddingText(input: KloColumnEmbeddingTextInput): string {
  const parts: string[] = [`${input.tableName}.${input.columnName} (${input.columnType})`];

  if (input.resolvedTableDescription) {
    parts.push(`Table: ${input.resolvedTableDescription}`);
  }
  if (input.resolvedDescription) {
    parts.push(input.resolvedDescription);
  }

  if (input.foreignKeys) {
    for (const fk of input.foreignKeys.outgoing) {
      parts.push(`FK -> ${fk.toTable}.${fk.toColumn}`);
    }
    for (const fk of input.foreignKeys.incoming) {
      parts.push(`FK <- ${fk.fromTable}.${fk.fromColumn}`);
    }
  }

  // Clamp at zero: a negative end index would make `slice` drop values from the
  // tail instead of limiting the count. `NaN` passes through and slices to nothing.
  const sampleLimit = Math.max(0, input.maxSampleValues ?? 20);
  const shownValues = input.sampleValues?.slice(0, sampleLimit) ?? [];
  // Skip the section entirely rather than emitting a dangling "Values: ".
  if (shownValues.length > 0) {
    parts.push(`Values: ${shownValues.join(', ')}`);
  }

  return parts.join('. ');
}

View file

@ -0,0 +1,175 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import {
completedKloScanEnrichmentStateSummary,
computeKloScanEnrichmentInputHash,
summarizeKloScanEnrichmentState,
} from './enrichment-state.js';
import { SqliteLocalScanEnrichmentStateStore } from './sqlite-local-enrichment-state-store.js';
import type { KloSchemaSnapshot } from './types.js';
// Minimal structural snapshot shared by the hashing specs below: one `public.orders`
// table with a single integer primary-key column and no foreign keys.
const snapshot: KloSchemaSnapshot = {
connectionId: 'warehouse',
driver: 'postgres',
extractedAt: '2026-04-29T12:00:00.000Z',
scope: { schemas: ['public'] },
metadata: {},
tables: [
{
catalog: null,
db: 'public',
name: 'orders',
kind: 'table',
comment: null,
estimatedRows: 1,
foreignKeys: [],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: null,
},
],
},
],
};
// Specs for input hashing, the SQLite-backed stage store, and the state summary helpers.
describe('scan enrichment state', () => {
let tempDir: string;
let store: SqliteLocalScanEnrichmentStateStore;
// Each spec gets its own temporary directory and a store backed by a fresh database file in it.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'klo-scan-enrichment-state-'));
store = new SqliteLocalScanEnrichmentStateStore({ dbPath: join(tempDir, 'db.sqlite') });
});
// NOTE(review): the store is not explicitly closed before its directory is removed —
// confirm it does not keep a file handle open (relevant on platforms that lock open files).
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('computes stable input hashes without depending on object key order', () => {
const first = computeKloScanEnrichmentInputHash({
snapshot,
mode: 'enriched',
detectRelationships: true,
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
});
// Same content as `first`, but with `providerIdentity` keys in a different order.
const second = computeKloScanEnrichmentInputHash({
snapshot: { ...snapshot, metadata: {} },
mode: 'enriched',
detectRelationships: true,
providerIdentity: { llmModel: 'a', embeddingDimensions: 8, provider: 'deterministic' },
});
const firstTable = snapshot.tables[0];
if (!firstTable) {
throw new Error('Expected test snapshot table');
}
// Renaming the table is a real content change and must produce a different hash.
const changed = computeKloScanEnrichmentInputHash({
snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] },
mode: 'enriched',
detectRelationships: true,
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
});
// 64 lowercase hex characters: a hex-encoded SHA-256 digest.
expect(first).toMatch(/^[a-f0-9]{64}$/);
expect(second).toBe(first);
expect(changed).not.toBe(first);
});
it('persists completed stages and ignores stale hashes', async () => {
const inputHash = computeKloScanEnrichmentInputHash({
snapshot,
mode: 'enriched',
detectRelationships: true,
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8 },
});
await store.saveCompletedStage({
runId: 'scan-run-1',
connectionId: 'warehouse',
syncId: 'sync-1',
mode: 'enriched',
stage: 'descriptions',
inputHash,
output: [{ table: { catalog: null, db: 'public', name: 'orders' }, tableDescription: 'Orders' }],
updatedAt: '2026-04-29T12:01:00.000Z',
});
// A lookup with the matching hash returns the stored stage and its output.
await expect(
store.findCompletedStage({
runId: 'scan-run-1',
stage: 'descriptions',
inputHash,
}),
).resolves.toMatchObject({
runId: 'scan-run-1',
stage: 'descriptions',
status: 'completed',
output: [{ table: { catalog: null, db: 'public', name: 'orders' }, tableDescription: 'Orders' }],
});
// The same run and stage with a different hash must not be reused.
await expect(
store.findCompletedStage({
runId: 'scan-run-1',
stage: 'descriptions',
inputHash: 'different-hash',
}),
).resolves.toBeNull();
});
it('records failed stages without making them reusable', async () => {
await store.saveFailedStage({
runId: 'scan-run-2',
connectionId: 'warehouse',
syncId: 'sync-2',
mode: 'enriched',
stage: 'embeddings',
inputHash: 'hash-2',
errorMessage: 'embedding service timed out',
updatedAt: '2026-04-29T12:02:00.000Z',
});
// A failed stage is never returned by the completed-stage lookup, even with a matching hash.
await expect(
store.findCompletedStage({
runId: 'scan-run-2',
stage: 'embeddings',
inputHash: 'hash-2',
}),
).resolves.toBeNull();
// It is still visible when listing every stage recorded for the run.
await expect(store.listRunStages('scan-run-2')).resolves.toEqual([
expect.objectContaining({
runId: 'scan-run-2',
stage: 'embeddings',
status: 'failed',
errorMessage: 'embedding service timed out',
}),
]);
});
it('summarizes resumed, completed, and failed stages for reports', () => {
expect(
summarizeKloScanEnrichmentState({
resumedStages: ['descriptions'],
completedStages: ['descriptions', 'embeddings'],
failedStages: ['relationships'],
}),
).toEqual({
resumedStages: ['descriptions'],
completedStages: ['descriptions', 'embeddings'],
failedStages: ['relationships'],
});
// The no-argument helper yields a summary with every stage list empty.
expect(completedKloScanEnrichmentStateSummary()).toEqual({
resumedStages: [],
completedStages: [],
failedStages: [],
});
});
});

View file

@ -0,0 +1,108 @@
import { createHash } from 'node:crypto';
import type { KloScanEnrichmentStage, KloScanEnrichmentStateSummary, KloScanMode, KloSchemaSnapshot } from './types.js';
/**
 * Enrichment stages in canonical order. `uniqueStages` in this module iterates
 * this list, so normalized summaries report stages in this order.
 */
export const KLO_SCAN_ENRICHMENT_STAGES: readonly KloScanEnrichmentStage[] = [
'descriptions',
'embeddings',
'relationships',
] as const;
/** Key used to look up a previously completed stage: run, stage, and the hash of that run's inputs. */
export interface KloScanEnrichmentStageLookup {
runId: string;
stage: KloScanEnrichmentStage;
inputHash: string;
}
/** Record of a stage that finished successfully; carries the stage output and no error message. */
export interface KloScanEnrichmentCompletedStage<TOutput = unknown> {
runId: string;
connectionId: string;
syncId: string;
mode: KloScanMode;
stage: KloScanEnrichmentStage;
inputHash: string;
status: 'completed';
output: TOutput;
errorMessage: null;
updatedAt: string;
}
/** Record of a stage that failed; carries the error message and no output. */
export interface KloScanEnrichmentFailedStage {
runId: string;
connectionId: string;
syncId: string;
mode: KloScanMode;
stage: KloScanEnrichmentStage;
inputHash: string;
status: 'failed';
output: null;
errorMessage: string;
updatedAt: string;
}
/** Any persisted stage record; narrow on `status` to tell completed from failed. */
export type KloScanEnrichmentStageRecord<TOutput = unknown> =
| KloScanEnrichmentCompletedStage<TOutput>
| KloScanEnrichmentFailedStage;
/** Persistence port for per-run enrichment stage records. */
export interface KloScanEnrichmentStateStore {
// Resolves to the completed record for the lookup key, or null when there is none.
// NOTE(review): presumably a hash mismatch or a failed record must also yield null — confirm against implementations.
findCompletedStage<TOutput = unknown>(
input: KloScanEnrichmentStageLookup,
): Promise<KloScanEnrichmentCompletedStage<TOutput> | null>;
// Callers omit `status` and `errorMessage`; both are fixed by the completed record type.
saveCompletedStage<TOutput = unknown>(
input: Omit<KloScanEnrichmentCompletedStage<TOutput>, 'status' | 'errorMessage'>,
): Promise<void>;
// Callers omit `status` and `output`; both are fixed by the failed record type.
saveFailedStage(input: Omit<KloScanEnrichmentFailedStage, 'status' | 'output'>): Promise<void>;
// Returns the stage records (completed or failed) stored for the given run.
listRunStages(runId: string): Promise<KloScanEnrichmentStageRecord[]>;
}
/** Everything that feeds the enrichment input hash; the whole object is serialized and hashed. */
export interface ComputeKloScanEnrichmentInputHashInput {
snapshot: KloSchemaSnapshot;
mode: KloScanMode;
detectRelationships: boolean;
providerIdentity: Record<string, unknown>;
relationshipSettings?: unknown;
}
/**
 * Serializes `value` to JSON with a canonical, environment-independent shape so
 * that equal content always yields the same string.
 *
 * - Object keys are ordered by UTF-16 code unit (the default `Array#sort`
 *   order), which does not depend on the runtime locale.
 * - Properties whose value is `undefined`, a function, or a symbol are omitted,
 *   matching `JSON.stringify`; an omitted optional field and one explicitly set
 *   to `undefined` therefore serialize identically.
 * - Such values inside arrays (and array holes) serialize as `null`, matching
 *   `JSON.stringify`, so they still occupy their position.
 *
 * @param value - Any JSON-like value.
 * @returns The canonical JSON text; always a string.
 */
function stableJson(value: unknown): string {
  if (Array.isArray(value)) {
    // Array.from visits holes as `undefined`, which then serialize as `null`.
    return `[${Array.from(value, (item) => stableJson(item)).join(',')}]`;
  }

  if (value !== null && typeof value === 'object') {
    const record = value as Record<string, unknown>;
    const keys = Object.keys(record)
      .filter((key) => {
        const item = record[key];
        return item !== undefined && typeof item !== 'function' && typeof item !== 'symbol';
      })
      .sort();
    return `{${keys.map((key) => `${JSON.stringify(key)}:${stableJson(record[key])}`).join(',')}}`;
  }

  // JSON.stringify yields undefined for undefined/functions/symbols despite its declared type.
  const serialized: string | undefined = JSON.stringify(value);
  return serialized ?? 'null';
}
/**
 * Computes the SHA-256 hash of the enrichment inputs.
 *
 * The whole `input` object is serialized with `stableJson` before hashing.
 *
 * @param input - Snapshot, scan mode, and provider/relationship settings.
 * @returns The digest as a hex string.
 */
export function computeKloScanEnrichmentInputHash(input: ComputeKloScanEnrichmentInputHashInput): string {
  const canonicalJson = stableJson(input);
  const hasher = createHash('sha256');
  hasher.update(canonicalJson);
  return hasher.digest('hex');
}
/**
 * Normalizes a stage list: duplicates are removed and the result follows the
 * order of `KLO_SCAN_ENRICHMENT_STAGES`. Values not in that list are dropped.
 *
 * @param stages - Stages in any order, possibly with repeats.
 * @returns A new array holding each listed canonical stage once.
 */
function uniqueStages(stages: KloScanEnrichmentStage[]): KloScanEnrichmentStage[] {
  // Walking the canonical list visits each stage once, which deduplicates and orders in one pass.
  return KLO_SCAN_ENRICHMENT_STAGES.filter((canonicalStage) => stages.includes(canonicalStage));
}
/**
 * Creates a state summary in which no stage is resumed, completed, or failed.
 *
 * @returns A new summary object with three empty stage lists.
 */
export function completedKloScanEnrichmentStateSummary(): KloScanEnrichmentStateSummary {
  const emptySummary: KloScanEnrichmentStateSummary = {
    resumedStages: [],
    completedStages: [],
    failedStages: [],
  };
  return emptySummary;
}
/**
 * Returns a copy of the summary with each stage list passed through
 * `uniqueStages`.
 *
 * @param input - Summary whose stage lists may be unordered or contain repeats.
 * @returns A new summary object; `input` is not modified.
 */
export function summarizeKloScanEnrichmentState(input: KloScanEnrichmentStateSummary): KloScanEnrichmentStateSummary {
  const { resumedStages, completedStages, failedStages } = input;
  const normalizedResumed = uniqueStages(resumedStages);
  const normalizedCompleted = uniqueStages(completedStages);
  const normalizedFailed = uniqueStages(failedStages);
  return {
    resumedStages: normalizedResumed,
    completedStages: normalizedCompleted,
    failedStages: normalizedFailed,
  };
}

View file

@ -0,0 +1,42 @@
import { describe, expect, it } from 'vitest';
import {
failedKloScanEnrichmentSummary,
kloScanErrorMessage,
skippedKloScanEnrichmentSummary,
} from './enrichment-summary.js';
// Specs for the failure summaries per scan mode and for the error-message formatter.
describe('KLO scan enrichment summaries', () => {
  it('keeps structural scans skipped when no enrichment was requested', () => {
    const summary = failedKloScanEnrichmentSummary('structural', false);

    expect(summary).toEqual(skippedKloScanEnrichmentSummary);
  });

  it('marks relationship stages failed when relationship detection fails', () => {
    const expectedSummary = {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'failed',
    };

    expect(failedKloScanEnrichmentSummary('relationships', true)).toEqual(expectedSummary);
  });

  it('marks every enriched-only stage failed when full enrichment fails', () => {
    const expectedSummary = {
      dataDictionary: 'failed',
      tableDescriptions: 'failed',
      columnDescriptions: 'failed',
      embeddings: 'failed',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'failed',
      statisticalValidation: 'failed',
    };

    expect(failedKloScanEnrichmentSummary('enriched', true)).toEqual(expectedSummary);
  });

  it('formats unknown thrown values for scan warnings', () => {
    const fromError = kloScanErrorMessage(new Error('gateway timeout'));
    const fromString = kloScanErrorMessage('plain failure');
    const fromObject = kloScanErrorMessage({ code: 'E_SCAN' });

    expect(fromError).toBe('gateway timeout');
    expect(fromString).toBe('plain failure');
    expect(fromObject).toBe('{"code":"E_SCAN"}');
  });
});

View file

@ -0,0 +1,52 @@
import type { KloScanEnrichmentSummary, KloScanMode } from './types.js';
/** Summary in which every enrichment stage is marked `'skipped'`. */
export const skippedKloScanEnrichmentSummary: KloScanEnrichmentSummary = {
  dataDictionary: 'skipped',
  tableDescriptions: 'skipped',
  columnDescriptions: 'skipped',
  embeddings: 'skipped',
  deterministicRelationships: 'skipped',
  llmRelationshipValidation: 'skipped',
  statisticalValidation: 'skipped',
};

/**
 * Builds the enrichment summary to report when a scan's enrichment failed.
 *
 * - `'enriched'` mode: every stage is `'failed'`.
 * - `'relationships'` mode, or any other mode with `detectRelationships` set:
 *   `deterministicRelationships` and `statisticalValidation` are `'failed'`,
 *   everything else is `'skipped'`.
 * - Otherwise: every stage is `'skipped'`.
 *
 * @param mode - Scan mode that was running.
 * @param detectRelationships - Whether relationship detection was requested.
 * @returns A new summary object on every call. It is never the shared
 *   `skippedKloScanEnrichmentSummary` instance, so callers may mutate it.
 */
export function failedKloScanEnrichmentSummary(
  mode: KloScanMode,
  detectRelationships = false,
): KloScanEnrichmentSummary {
  if (mode === 'enriched') {
    return {
      dataDictionary: 'failed',
      tableDescriptions: 'failed',
      columnDescriptions: 'failed',
      embeddings: 'failed',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'failed',
      statisticalValidation: 'failed',
    };
  }

  if (mode === 'relationships' || detectRelationships) {
    return {
      ...skippedKloScanEnrichmentSummary,
      deterministicRelationships: 'failed',
      statisticalValidation: 'failed',
    };
  }

  // Copy rather than alias the exported constant, consistent with the branches above.
  return { ...skippedKloScanEnrichmentSummary };
}
/**
 * Turns any thrown value into a message string for scan warnings.
 *
 * `Error` instances yield their `message`, strings are returned unchanged, and
 * other values are JSON-serialized. Values JSON cannot represent (`undefined`,
 * functions, symbols, circular structures, bigints) fall back to `String(error)`.
 *
 * @param error - The caught value.
 * @returns A message string; never `undefined`.
 */
export function kloScanErrorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === 'string') {
    return error;
  }

  try {
    // JSON.stringify yields undefined for undefined/functions/symbols despite its declared type.
    const serialized: string | undefined = JSON.stringify(error);
    return serialized ?? String(error);
  } catch {
    // Circular structures and bigints make JSON.stringify throw.
    return String(error);
  }
}

View file

@ -0,0 +1,159 @@
import { describe, expect, it } from 'vitest';
import type {
KloColumnSampleUpdate,
KloDescriptionUpdate,
KloEmbeddingUpdate,
KloEnrichedSchema,
KloJoinUpdate,
KloRelationshipEndpoint,
KloRelationshipUpdate,
KloScanMetadataStore,
KloStructuralSyncPlan,
} from './enrichment-types.js';
// Contract specs: the literals below must type-check against the enrichment types;
// the runtime assertions only confirm that values round-trip unchanged.
describe('KLO scan enrichment contracts', () => {
it('models an enriched schema with reusable table, column, and relationship metadata', () => {
const schema: KloEnrichedSchema = {
connectionId: 'warehouse',
tables: [
{
id: 'table-orders',
ref: { catalog: 'analytics', db: 'public', name: 'orders' },
enabled: true,
descriptions: { db: 'Raw orders', ai: 'Customer orders' },
columns: [
{
id: 'column-orders-status',
tableId: 'table-orders',
tableRef: { catalog: 'analytics', db: 'public', name: 'orders' },
name: 'status',
nativeType: 'varchar',
normalizedType: 'string',
dimensionType: 'string',
nullable: false,
primaryKey: false,
parentColumnId: null,
descriptions: { db: 'Status code' },
embedding: [0.1, 0.2],
sampleValues: ['paid', 'refunded'],
cardinality: 2,
},
],
},
],
relationships: [
{
id: 'rel-orders-customers',
source: 'formal',
from: {
tableId: 'table-orders',
columnIds: ['column-orders-customer-id'],
table: { catalog: 'analytics', db: 'public', name: 'orders' },
columns: ['customer_id'],
},
to: {
tableId: 'table-customers',
columnIds: ['column-customers-id'],
table: { catalog: 'analytics', db: 'public', name: 'customers' },
columns: ['id'],
},
relationshipType: 'many_to_one',
confidence: 1,
isPrimaryKeyReference: true,
},
],
};
// NOTE(review): these unguarded `[0]` accesses assume indexed access is not typed as
// possibly undefined — confirm against the compiler options used for spec files.
expect(schema.tables[0].columns[0].sampleValues).toEqual(['paid', 'refunded']);
expect(schema.relationships[0].source).toBe('formal');
});
it('models metadata-store updates without requiring a concrete store implementation', async () => {
const structuralPlan: KloStructuralSyncPlan = {
connectionId: 'warehouse',
snapshotId: 'snapshot-1',
operations: [{ kind: 'create_table', table: 'orders' }],
};
const descriptionUpdate: KloDescriptionUpdate = {
connectionId: 'warehouse',
table: { catalog: 'analytics', db: 'public', name: 'orders' },
source: 'ai',
tableDescription: 'Customer orders',
columnDescriptions: { status: 'Payment lifecycle state' },
};
const sampleUpdate: KloColumnSampleUpdate = {
columnId: 'column-orders-status',
sampleValues: ['paid', 'refunded'],
cardinality: 2,
};
const embeddingUpdate: KloEmbeddingUpdate = {
columnId: 'column-orders-status',
text: 'orders.status (varchar). Values: paid, refunded',
embedding: [0.25, 0.75],
};
const relationshipUpdate: KloRelationshipUpdate = {
connectionId: 'warehouse',
accepted: [],
rejected: [],
skipped: [{ reason: 'missing parent table', relationshipId: 'candidate-1' }],
};
// In-memory stub: each update method asserts it received the fixture it was called with.
const store: KloScanMetadataStore = {
loadSchema: async () => null,
applyStructuralPlan: async (plan) => ({
connectionId: plan.connectionId,
tables: [],
relationships: [],
}),
updateDescriptions: async (input) => {
expect(input).toEqual(descriptionUpdate);
},
updateColumnSamples: async (input) => {
expect(input).toEqual([sampleUpdate]);
},
updateColumnEmbeddings: async (input) => {
expect(input).toEqual([embeddingUpdate]);
},
updateInferredRelationships: async (input) => {
expect(input).toEqual(relationshipUpdate);
},
};
await expect(store.loadSchema('warehouse')).resolves.toBeNull();
await expect(store.applyStructuralPlan(structuralPlan)).resolves.toEqual({
connectionId: 'warehouse',
tables: [],
relationships: [],
});
await expect(store.updateDescriptions(descriptionUpdate)).resolves.toBeUndefined();
await expect(store.updateColumnSamples([sampleUpdate])).resolves.toBeUndefined();
await expect(store.updateColumnEmbeddings([embeddingUpdate])).resolves.toBeUndefined();
await expect(store.updateInferredRelationships(relationshipUpdate)).resolves.toBeUndefined();
});
});
// Contract spec: composite keys are carried as ordered column arrays on endpoints and join updates.
describe('relationship tuple contracts', () => {
  it('represents relationship endpoints and join updates as ordered column tuples', () => {
    const orderLinesTable = { catalog: null, db: 'public', name: 'order_lines' };

    const endpoint: KloRelationshipEndpoint = {
      tableId: 'public.order_lines',
      columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
      table: orderLinesTable,
      columns: ['order_id', 'line_number'],
    };

    const update: KloJoinUpdate = {
      connectionId: 'warehouse',
      fromTable: 'order_line_allocations',
      fromColumns: ['order_id', 'line_number'],
      toTable: 'order_lines',
      toColumns: ['order_id', 'line_number'],
      relationship: 'many_to_one',
      author: 'klo',
      authorEmail: 'klo@example.com',
    };

    const { columns, columnIds } = endpoint;
    const { fromColumns, toColumns } = update;

    expect(columns).toEqual(['order_id', 'line_number']);
    expect(columnIds).toEqual(['public.order_lines.order_id', 'public.order_lines.line_number']);
    expect(fromColumns).toEqual(['order_id', 'line_number']);
    expect(toColumns).toEqual(['order_id', 'line_number']);
  });
});

View file

@ -0,0 +1,130 @@
import type { KloSchemaDimensionType, KloTableRef } from './types.js';
/**
 * Origin of a description. The `(string & {})` member accepts any other string
 * while keeping the listed literals available for editor completion.
 */
export type KloDescriptionSource = 'ai' | 'db' | 'dbt' | 'user' | (string & {});
/** How a relationship is known. */
export type KloRelationshipSource = 'formal' | 'inferred' | 'manual';
/** Cardinality of a relationship, read from the `from` endpoint to the `to` endpoint. */
export type KloRelationshipType = 'many_to_one' | 'one_to_many' | 'one_to_one';
/** A column together with its enrichment data (descriptions, embedding, samples). */
export interface KloEnrichedColumn {
id: string;
tableId: string;
tableRef: KloTableRef;
name: string;
nativeType: string;
normalizedType: string;
dimensionType: KloSchemaDimensionType;
nullable: boolean;
primaryKey: boolean;
// NOTE(review): presumably the id of the enclosing column for nested fields, null at top level — confirm.
parentColumnId: string | null;
// Description text keyed by the source it came from; sources may be absent.
descriptions: Partial<Record<KloDescriptionSource, string>>;
embedding: number[] | null;
sampleValues: string[] | null;
cardinality: number | null;
}
/** A table together with its per-source descriptions and enriched columns. */
export interface KloEnrichedTable {
id: string;
ref: KloTableRef;
enabled: boolean;
// Description text keyed by the source it came from; sources may be absent.
descriptions: Partial<Record<KloDescriptionSource, string>>;
columns: KloEnrichedColumn[];
}
/** One side of a relationship: a table and one or more of its columns. */
export interface KloRelationshipEndpoint {
tableId: string;
// NOTE(review): `columnIds` and `columns` look like parallel arrays (same order and length) — confirm.
columnIds: string[];
table: KloTableRef;
columns: string[];
}
/** A relationship between two endpoints, with its origin, cardinality, and confidence. */
export interface KloEnrichedRelationship {
id: string;
source: KloRelationshipSource;
from: KloRelationshipEndpoint;
to: KloRelationshipEndpoint;
relationshipType: KloRelationshipType;
// NOTE(review): scale is not defined here; the spec in this package uses 1 for a formal key — confirm the range.
confidence: number;
isPrimaryKeyReference: boolean;
}
/** Enriched view of one connection: its tables and the relationships between them. */
export interface KloEnrichedSchema {
connectionId: string;
tables: KloEnrichedTable[];
relationships: KloEnrichedRelationship[];
}
/** Structural changes to apply to a connection's stored schema for a given snapshot. */
export interface KloStructuralSyncPlan {
connectionId: string;
snapshotId: string;
// Operation shape is left open at this level; each entry is an untyped record.
operations: Array<Record<string, unknown>>;
}
/** Description changes for one table and/or its columns, attributed to a single source. */
export interface KloDescriptionUpdate {
connectionId: string;
table: KloTableRef;
source: KloDescriptionSource;
tableDescription?: string;
// Keyed by column name. NOTE(review): a null value presumably clears the description — confirm.
columnDescriptions?: Record<string, string | null>;
}
// NOTE(review): this constant is not exported and is not referenced anywhere in this
// module — confirm whether it should be exported, used to type the field maps below, or removed.
const PREFERRED_METADATA_FIELD_NAMES = [
'tags',
'constraints',
'enum_values',
'freshness',
'tests',
'lineage',
] as const;
/** Free-form metadata fields for one table and/or its columns, attributed to a single source. */
export interface KloMetadataUpdate {
connectionId: string;
table: KloTableRef;
source: KloDescriptionSource;
tableFields?: Record<string, unknown>;
// NOTE(review): outer key is presumably the column name, inner keys the field names — confirm.
columnFields?: Record<string, Record<string, unknown>>;
}
/** A join between two tables over ordered column lists, with authorship attribution. */
export interface KloJoinUpdate {
connectionId: string;
fromTable: string;
// NOTE(review): `fromColumns` and `toColumns` are presumably matched by position — confirm.
fromColumns: string[];
toTable: string;
toColumns: string[];
relationship: KloRelationshipType;
author: string;
authorEmail: string;
}
/** New sample values and cardinality for one column. */
export interface KloColumnSampleUpdate {
columnId: string;
sampleValues: string[] | null;
cardinality: number | null;
}
/** New embedding for one column, together with the text that was embedded. */
export interface KloEmbeddingUpdate {
columnId: string;
text: string;
embedding: number[];
}
/** A relationship candidate that was not evaluated, with the reason why. */
export interface KloSkippedRelationship {
relationshipId: string;
reason: string;
}
/** Outcome of relationship inference for a connection, split into accepted, rejected, and skipped. */
export interface KloRelationshipUpdate {
connectionId: string;
accepted: KloEnrichedRelationship[];
rejected: KloEnrichedRelationship[];
skipped: KloSkippedRelationship[];
}
/** Persistence port for the enriched schema and the updates produced by a scan. */
export interface KloScanMetadataStore {
// Resolves to the stored schema for the connection, or null when there is none.
loadSchema(connectionId: string): Promise<KloEnrichedSchema | null>;
// Applies the structural plan and resolves to the resulting schema.
applyStructuralPlan(plan: KloStructuralSyncPlan): Promise<KloEnrichedSchema>;
updateDescriptions(input: KloDescriptionUpdate): Promise<void>;
// The two column updates below take a batch (one entry per column).
updateColumnSamples(input: KloColumnSampleUpdate[]): Promise<void>;
updateColumnEmbeddings(input: KloEmbeddingUpdate[]): Promise<void>;
updateInferredRelationships(input: KloRelationshipUpdate): Promise<void>;
}

View file

@ -0,0 +1,400 @@
export {
REDACTED_KLO_CREDENTIAL_VALUE,
redactKloCredentialEnvelope,
redactKloCredentialValue,
redactKloScanMetadata,
redactKloScanReport,
redactKloScanWarning,
} from './credentials.js';
export type {
KloDataDictionaryColumnState,
KloDataDictionarySampleDecision,
KloDataDictionarySettings,
KloDataDictionarySkipReason,
} from './data-dictionary.js';
export {
defaultKloDataDictionarySettings,
isKloDataDictionaryCandidate,
shouldKloSampleColumnForDictionary,
} from './data-dictionary.js';
export type {
KloColumnAnalysisResult,
KloColumnDescriptionPromptInput,
KloDataSourceDescriptionPromptInput,
KloDescriptionCachePort,
KloDescriptionColumn,
KloDescriptionColumnTable,
KloDescriptionGenerationSettings,
KloDescriptionGeneratorOptions,
KloDescriptionSamplingPort,
KloDescriptionTableInput,
KloGenerateColumnDescriptionsInput,
KloGenerateDataSourceDescriptionInput,
KloGenerateTableDescriptionInput,
KloTableDescriptionPromptInput,
} from './description-generation.js';
export {
appendKloWordLimitInstruction,
buildKloColumnDescriptionPrompt,
buildKloDataSourceDescriptionPrompt,
buildKloTableDescriptionPrompt,
KloDescriptionGenerator,
} from './description-generation.js';
export type { KloColumnEmbeddingForeignKeys, KloColumnEmbeddingTextInput } from './embedding-text.js';
export { buildKloColumnEmbeddingText } from './embedding-text.js';
export type {
ComputeKloScanEnrichmentInputHashInput,
KloScanEnrichmentCompletedStage,
KloScanEnrichmentFailedStage,
KloScanEnrichmentStageLookup,
KloScanEnrichmentStageRecord,
KloScanEnrichmentStateStore,
} from './enrichment-state.js';
export {
completedKloScanEnrichmentStateSummary,
computeKloScanEnrichmentInputHash,
KLO_SCAN_ENRICHMENT_STAGES,
summarizeKloScanEnrichmentState,
} from './enrichment-state.js';
export {
failedKloScanEnrichmentSummary,
kloScanErrorMessage,
skippedKloScanEnrichmentSummary,
} from './enrichment-summary.js';
export type {
KloColumnSampleUpdate,
KloDescriptionSource,
KloDescriptionUpdate,
KloEmbeddingUpdate,
KloEnrichedColumn,
KloEnrichedRelationship,
KloEnrichedSchema,
KloEnrichedTable,
KloRelationshipEndpoint,
KloRelationshipSource,
KloRelationshipType,
KloRelationshipUpdate,
KloScanMetadataStore,
KloSkippedRelationship,
KloStructuralSyncPlan,
} from './enrichment-types.js';
export type {
DeterministicLocalScanEnrichmentProviderOptions,
KloLocalScanEnrichmentInput,
KloLocalScanEnrichmentProviders,
KloLocalScanEnrichmentResult,
} from './local-enrichment.js';
export {
createDeterministicLocalScanEnrichmentProviders,
runLocalScanEnrichment,
snapshotToKloEnrichedSchema,
} from './local-enrichment.js';
export type {
WriteLocalScanEnrichmentArtifactsInput,
WriteLocalScanEnrichmentArtifactsResult,
WriteLocalScanManifestShardsInput,
WriteLocalScanManifestShardsResult,
} from './local-enrichment-artifacts.js';
export {
writeLocalScanEnrichmentArtifacts,
writeLocalScanManifestShards,
} from './local-enrichment-artifacts.js';
export type {
LocalScanMcpOptions,
LocalScanRunResult,
LocalScanStatusResponse,
RunLocalScanOptions,
} from './local-scan.js';
export { getLocalScanReport, getLocalScanStatus, runLocalScan } from './local-scan.js';
export type { ReadLocalScanStructuralSnapshotInput } from './local-structural-artifacts.js';
export { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
export type {
KloEnrichmentScanPhaseResult,
KloScanOrchestratorOptions,
KloScanOrchestratorRunInput,
KloScanOrchestratorRunResult,
KloStructuralScanPhaseResult,
} from './orchestrator.js';
export { KloScanOrchestrator } from './orchestrator.js';
export type {
KloRelationshipArtifactStatus,
ReadLocalScanRelationshipArtifactsResult,
} from './relationship-artifacts.js';
export { readLocalScanRelationshipArtifacts } from './relationship-artifacts.js';
export type {
KloRelationshipBenchmarkReport,
KloRelationshipBenchmarkReportCase,
KloRelationshipBenchmarkReportCaseStatus,
} from './relationship-benchmark-report.js';
export {
buildKloRelationshipBenchmarkReport,
formatKloRelationshipBenchmarkReportMarkdown,
} from './relationship-benchmark-report.js';
export type {
KloRelationshipBenchmarkCaseResult,
KloRelationshipBenchmarkDetectedLink,
KloRelationshipBenchmarkDetectedPk,
KloRelationshipBenchmarkDetector,
KloRelationshipBenchmarkDetectorInput,
KloRelationshipBenchmarkDetectorResult,
KloRelationshipBenchmarkExpectedLink,
KloRelationshipBenchmarkExpectedLinks,
KloRelationshipBenchmarkExpectedPk,
KloRelationshipBenchmarkFixture,
KloRelationshipBenchmarkMetrics,
KloRelationshipBenchmarkMode,
KloRelationshipBenchmarkStatus,
KloRelationshipBenchmarkSuiteResult,
KloRelationshipBenchmarkTier,
} from './relationship-benchmarks.js';
export {
currentKloRelationshipBenchmarkDetector,
kloRelationshipBenchmarkDetectorWithLlm,
KLO_RELATIONSHIP_BENCHMARK_MODES,
KLO_RELATIONSHIP_BENCHMARK_TIERS,
loadKloRelationshipBenchmarkFixture,
loadKloRelationshipBenchmarkFixtures,
maskKloRelationshipBenchmarkSnapshot,
runKloRelationshipBenchmarkCase,
runKloRelationshipBenchmarkSuite,
} from './relationship-benchmarks.js';
export type {
ApplyKloRelationshipValidationBudgetInput,
KloRelationshipBudgetedCandidate,
KloRelationshipValidationBudget,
KloRelationshipValidationBudgetResult,
} from './relationship-budget.js';
export {
applyKloRelationshipValidationBudget,
defaultKloRelationshipValidationBudget,
} from './relationship-budget.js';
export type {
KloRelationshipDiscoveryCandidate,
KloRelationshipDiscoveryCandidateEvidence,
KloRelationshipDiscoveryCandidateOptions,
KloRelationshipDiscoveryCandidateSource,
KloRelationshipDiscoveryCandidateStatus,
KloRelationshipInferredTargetPk,
} from './relationship-candidates.js';
export {
generateKloRelationshipDiscoveryCandidates,
inferKloRelationshipTargetPks,
mergeKloRelationshipDiscoveryCandidates,
} from './relationship-candidates.js';
export type {
DiscoverKloCompositeRelationshipsInput,
DiscoverKloCompositeRelationshipsResult,
KloCompositePrimaryKeyCandidate,
KloCompositeRelationshipCandidate,
KloCompositeRelationshipStatus,
KloCompositeRelationshipTupleEndpoint,
KloCompositeRelationshipValidationEvidence,
} from './relationship-composite-candidates.js';
export { discoverKloCompositeRelationships } from './relationship-composite-candidates.js';
export type {
BuildKloRelationshipArtifactsInput,
BuildKloRelationshipDiagnosticsInput,
EmptyKloRelationshipProfileArtifactInput,
KloRelationshipArtifact,
KloRelationshipArtifactEdge,
KloRelationshipArtifactEndpoint,
KloRelationshipDiagnosticsArtifact,
KloRelationshipDiagnosticsSummary,
KloRelationshipDiagnosticsThresholds,
KloRelationshipDiagnosticsValidation,
} from './relationship-diagnostics.js';
export {
buildKloRelationshipArtifacts,
buildKloRelationshipDiagnostics,
emptyKloRelationshipProfileArtifact,
} from './relationship-diagnostics.js';
export type {
BuildKloRelationshipFeedbackCalibrationReportInput,
CalibrateLocalRelationshipFeedbackLabelsInput,
KloRelationshipFeedbackCalibrationBucket,
KloRelationshipFeedbackCalibrationLabel,
KloRelationshipFeedbackCalibrationReport,
} from './relationship-feedback-calibration.js';
export {
buildKloRelationshipFeedbackCalibrationReport,
calibrateLocalRelationshipFeedbackLabels,
formatKloRelationshipFeedbackCalibrationMarkdown,
} from './relationship-feedback-calibration.js';
export type {
ExportLocalRelationshipFeedbackLabelsInput,
ExportLocalRelationshipFeedbackLabelsResult,
KloRelationshipFeedbackDecisionFilter,
KloRelationshipFeedbackExportWarning,
KloRelationshipFeedbackLabel,
} from './relationship-feedback-export.js';
export {
exportLocalRelationshipFeedbackLabels,
formatKloRelationshipFeedbackLabelsJsonl,
} from './relationship-feedback-export.js';
export {
collectKloFormalMetadataRelationships,
type KloFormalMetadataRelationshipCollection,
} from './relationship-formal-metadata.js';
export type {
KloRelationshipGraphResolutionResult,
KloRelationshipGraphResolverSettings,
KloResolvedRelationshipDiscoveryCandidate,
KloResolvedRelationshipGraphEvidence,
KloResolvedRelationshipPk,
KloResolvedRelationshipPkEvidence,
KloResolvedRelationshipStatus,
ResolveKloRelationshipGraphInput,
} from './relationship-graph-resolver.js';
export { resolveKloRelationshipGraph } from './relationship-graph-resolver.js';
export type {
KloRelationshipLlmProposalGenerateText,
KloRelationshipLlmProposalResult,
KloRelationshipLlmProposalSettings,
ProposeKloRelationshipCandidatesWithLlmInput,
} from './relationship-llm-proposal.js';
export { proposeKloRelationshipCandidatesWithLlm } from './relationship-llm-proposal.js';
export type {
KloRelationshipLocalityCandidateTable,
LocalKloRelationshipCandidateTablesInput,
} from './relationship-locality.js';
export { localCandidateTables } from './relationship-locality.js';
export type {
KloRelationshipNormalizedName,
KloRelationshipTokenInput,
} from './relationship-name-similarity.js';
export {
normalizeKloRelationshipName,
pluralizeKloRelationshipToken,
singularizeKloRelationshipToken,
tokenizeKloRelationshipName,
tokenSimilarity,
} from './relationship-name-similarity.js';
export type {
DiscoverKloRelationshipsInput,
DiscoverKloRelationshipsResult,
} from './relationship-discovery.js';
export { discoverKloRelationships } from './relationship-discovery.js';
export type {
KloRelationshipColumnProfile,
KloRelationshipProfileArtifact,
KloRelationshipReadOnlyExecutor,
KloRelationshipTableProfile,
ProfileKloRelationshipSchemaInput,
} from './relationship-profiling.js';
export {
formatKloRelationshipTableRef,
profileKloRelationshipSchema,
quoteKloRelationshipIdentifier,
} from './relationship-profiling.js';
export type {
AppliedRelationshipReviewDecision,
ApplyLocalScanRelationshipReviewDecisionsInput,
ApplyLocalScanRelationshipReviewDecisionsResult,
} from './relationship-review-apply.js';
export { applyLocalScanRelationshipReviewDecisions } from './relationship-review-apply.js';
export type {
KloRelationshipReviewDecisionArtifact,
KloRelationshipReviewDecisionEntry,
KloRelationshipReviewDecisionValue,
WriteLocalScanRelationshipReviewDecisionInput,
WriteLocalScanRelationshipReviewDecisionResult,
} from './relationship-review-decisions.js';
export { writeLocalScanRelationshipReviewDecision } from './relationship-review-decisions.js';
export type {
KloRelationshipFixtureOrigin,
KloRelationshipScoreBreakdown,
KloRelationshipScoreSignal,
KloRelationshipScoreWeights,
KloRelationshipScoringCalibrationObservation,
KloRelationshipSignalVector,
} from './relationship-scoring.js';
export {
calibrateWeightsFromSyntheticFixtures,
defaultKloRelationshipScoreWeights,
KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS,
normalizeKloRelationshipScoreWeights,
scoreKloRelationshipCandidate,
} from './relationship-scoring.js';
export type {
AdviseLocalRelationshipFeedbackThresholdsInput,
BuildKloRelationshipThresholdAdviceReportInput,
KloRelationshipThresholdAdviceCandidate,
KloRelationshipThresholdAdviceReport,
KloRelationshipThresholdAdviceStatus,
} from './relationship-threshold-advice.js';
export {
adviseLocalRelationshipFeedbackThresholds,
buildKloRelationshipThresholdAdviceReport,
formatKloRelationshipThresholdAdviceMarkdown,
} from './relationship-threshold-advice.js';
export type {
KloRelationshipValidationEvidence,
KloRelationshipValidationSettings,
KloValidatedRelationshipDiscoveryCandidate,
KloValidatedRelationshipStatus,
ValidateKloRelationshipDiscoveryCandidatesInput,
} from './relationship-validation.js';
export { validateKloRelationshipDiscoveryCandidates } from './relationship-validation.js';
export type { SqliteLocalScanEnrichmentStateStoreOptions } from './sqlite-local-enrichment-state-store.js';
export { SqliteLocalScanEnrichmentStateStore } from './sqlite-local-enrichment-state-store.js';
export type { KloColumnTypeMapping } from './type-normalization.js';
export {
inferKloDimensionType,
kloColumnTypeMappingFromNative,
normalizeKloNativeType,
} from './type-normalization.js';
export type {
KloColumnSampleInput,
KloColumnSampleResult,
KloColumnStatsInput,
KloColumnStatsResult,
KloConnectionDriver,
KloConnectorCapabilities,
KloCredentialEnvelope,
KloCredentialEnvReference,
KloCredentialFileReference,
KloEmbeddingPort,
KloEventPropertyDiscovery,
KloEventPropertyDiscoveryInput,
KloEventPropertyValuesInput,
KloEventPropertyValuesResult,
KloEventStreamDiscoveryPort,
KloEventTypeDiscovery,
KloEventTypeDiscoveryInput,
KloNetworkEndpoint,
KloNetworkTunnelPort,
KloNetworkTunnelRequest,
KloOptionalConnectorCapabilities,
KloProgressPort,
KloProgressUpdateOptions,
KloQueryResult,
KloReadOnlyQueryInput,
KloResolvedCredentialEnvelope,
KloScanArtifactPaths,
KloScanConnector,
KloScanContext,
KloScanDiffSummary,
KloScanEnrichmentStage,
KloScanEnrichmentStateSummary,
KloScanEnrichmentSummary,
KloScanInput,
KloScanLoggerPort,
KloScanMode,
KloScanRelationshipSummary,
KloScanReport,
KloScanTrigger,
KloScanWarning,
KloScanWarningCode,
KloSchemaColumn,
KloSchemaDimensionType,
KloSchemaForeignKey,
KloSchemaScope,
KloSchemaSnapshot,
KloSchemaTable,
KloSchemaTableKind,
KloStructuralSyncStats,
KloTableRef,
KloTableSampleInput,
KloTableSampleResult,
} from './types.js';
export { createKloConnectorCapabilities } from './types.js';

View file

@ -0,0 +1,852 @@
import { mkdtemp, readFile, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import YAML from 'yaml';
import { initKloProject, type KloLocalProject } from '../project/index.js';
import type { KloLocalScanEnrichmentResult } from './local-enrichment.js';
import { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards } from './local-enrichment-artifacts.js';
import type { KloSchemaSnapshot } from './types.js';
/** Builds one non-nullable integer column entry for the fixture snapshot. */
function integerColumn(name: string, primaryKey: boolean, comment: string) {
  return {
    name,
    nativeType: 'integer',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey,
    comment,
  } as const;
}

/**
 * Schema fixture shared by the tests in this file: `public.customers` and
 * `public.orders`, where `orders.customer_id` carries a formal foreign key
 * (`orders_customer_id_fkey`) to `customers.id`.
 */
const snapshot: KloSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [
    {
      catalog: null,
      db: 'public',
      name: 'customers',
      kind: 'table',
      comment: 'DB customer table',
      estimatedRows: 2,
      foreignKeys: [],
      columns: [integerColumn('id', true, 'DB customer id')],
    },
    {
      catalog: null,
      db: 'public',
      name: 'orders',
      kind: 'table',
      comment: 'DB orders table',
      estimatedRows: 3,
      foreignKeys: [
        {
          fromColumn: 'customer_id',
          toCatalog: null,
          toDb: 'public',
          toTable: 'customers',
          toColumn: 'id',
          constraintName: 'orders_customer_id_fkey',
        },
      ],
      columns: [
        integerColumn('id', true, 'DB order id'),
        integerColumn('customer_id', false, 'DB customer id'),
      ],
    },
  ],
};
/**
 * Builds a fully populated enrichment result for the shared `snapshot` fixture:
 * AI descriptions for both tables, two column embeddings, one accepted
 * `orders.customer_id -> customers.id` relationship, a relationship profile,
 * and one resolved LLM-proposed relationship with validation and graph evidence.
 *
 * Every call returns a new object graph (the small factories below never share
 * references), so callers may spread, override, or mutate the result.
 */
function enrichment(): KloLocalScanEnrichmentResult {
  const relationshipId = 'public.orders:public.orders.customer_id->public.customers:public.customers.id';
  const ordersRef = () => ({ catalog: null, db: 'public', name: 'orders' });
  const customersRef = () => ({ catalog: null, db: 'public', name: 'customers' });
  const ordersCustomerIdEndpoint = () => ({
    tableId: 'public.orders',
    columnIds: ['public.orders.customer_id'],
    table: ordersRef(),
    columns: ['customer_id'],
  });
  const customersIdEndpoint = () => ({
    tableId: 'public.customers',
    columnIds: ['public.customers.id'],
    table: customersRef(),
    columns: ['id'],
  });

  return {
    snapshot,
    summary: {
      dataDictionary: 'completed',
      tableDescriptions: 'completed',
      columnDescriptions: 'completed',
      embeddings: 'completed',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'skipped',
    },
    relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
    state: {
      resumedStages: [],
      completedStages: ['descriptions', 'embeddings', 'relationships'],
      failedStages: [],
    },
    warnings: [],
    descriptionUpdates: [
      {
        table: ordersRef(),
        tableDescription: 'AI orders table',
        columnDescriptions: {
          id: 'AI order id',
          customer_id: 'AI customer reference',
        },
      },
      {
        table: customersRef(),
        tableDescription: 'AI customers table',
        columnDescriptions: {
          id: 'AI customer id',
        },
      },
    ],
    embeddingUpdates: [
      { columnId: 'public.orders.id', text: 'orders id', embedding: [0.1, 0.2] },
      { columnId: 'public.orders.customer_id', text: 'orders customer_id', embedding: [0.3, 0.4] },
    ],
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: relationshipId,
          source: 'inferred',
          from: ordersCustomerIdEndpoint(),
          to: customersIdEndpoint(),
          relationshipType: 'many_to_one',
          confidence: 0.95,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
    relationshipProfile: {
      connectionId: 'warehouse',
      driver: 'postgres',
      sqlAvailable: true,
      queryCount: 6,
      tables: [{ table: customersRef(), rowCount: 2 }],
      columns: {
        'customers.id': {
          table: customersRef(),
          column: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          rowCount: 2,
          nullCount: 0,
          distinctCount: 2,
          uniquenessRatio: 1,
          nullRate: 0,
          sampleValues: ['1', '2'],
          minTextLength: 1,
          maxTextLength: 1,
        },
      },
      warnings: [],
    },
    resolvedRelationships: [
      {
        id: relationshipId,
        source: 'llm_proposal',
        status: 'accepted',
        from: ordersCustomerIdEndpoint(),
        to: customersIdEndpoint(),
        relationshipType: 'many_to_one',
        confidence: 0.92,
        pkScore: 0.95,
        fkScore: 0.91,
        score: 0.9,
        evidence: {
          sourceColumnBase: 'buyer',
          targetTableBase: 'customer',
          targetColumnBase: 'id',
          targetKeyScore: 0.88,
          nameScore: 0.45,
          reasons: ['llm_proposal', 'llm_pk_proposal'],
          llmConfidence: 0.89,
          llmRationale: 'Buyer reference values align with customer identifiers.',
        },
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          sourceNullRate: 0,
          targetNullRate: 0,
          childDistinct: 2,
          parentDistinct: 2,
          overlap: 2,
          checkedValues: 2,
          reasons: ['validation_passed'],
        },
        graph: {
          targetPkScore: 0.95,
          incomingCandidateCount: 1,
          conflictRank: 1,
          reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'],
        },
      },
    ],
    compositeRelationships: null,
  };
}
describe('writeLocalScanEnrichmentArtifacts', () => {
let tempDir: string;
let project: KloLocalProject;

// Each test runs against its own project, created under a fresh temporary directory.
beforeEach(async () => {
  tempDir = await mkdtemp(join(tmpdir(), 'klo-local-enrichment-artifacts-'));
  const projectDir = join(tempDir, 'project');
  project = await initKloProject({ projectDir, projectName: 'warehouse' });
});

// Remove the whole temporary tree once the test has finished.
afterEach(() => rm(tempDir, { recursive: true, force: true }));
it('writes enrichment artifacts and manifest shards while preserving external descriptions', async () => {
  const readProjectFile = (relativePath: string): Promise<string> =>
    readFile(join(project.projectDir, relativePath), 'utf-8');

  // Seed an existing shard carrying user-pinned descriptions, stale AI text, and a manual join.
  const seededShard = {
    tables: {
      orders: {
        table: 'public.orders',
        descriptions: { user: 'Pinned analyst description', ai: 'Old AI description' },
        columns: [
          {
            name: 'id',
            type: 'number',
            descriptions: { user: 'Pinned id description', ai: 'Old AI id' },
          },
          { name: 'customer_id', type: 'number' },
        ],
        joins: [
          {
            to: 'customers',
            on: 'orders.id = customers.id',
            relationship: 'many_to_one',
            source: 'manual',
          },
        ],
      },
    },
  };
  await project.fileStore.writeFile(
    'semantic-layer/warehouse/_schema/public.yaml',
    YAML.stringify(seededShard, { indent: 2, lineWidth: 0 }),
    'klo',
    'klo@example.com',
    'Seed manifest shard',
  );

  const result = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-1',
    driver: 'postgres',
    enrichment: enrichment(),
    dryRun: false,
    relationshipSettings: {
      enabled: true,
      llmProposals: false,
      validationRequiredForManifest: true,
      acceptThreshold: 0.91,
      reviewThreshold: 0.61,
      maxLlmTablesPerBatch: 12,
      maxCandidatesPerColumn: 7,
      profileSampleRows: 500,
      validationConcurrency: 2,
    },
  });

  // The returned paths list every artifact plus the single rewritten shard.
  expect(result).toEqual({
    enrichmentArtifacts: [
      'raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json',
      'raw-sources/warehouse/live-database/sync-1/enrichment/embeddings.json',
      'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json',
      'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
      'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json',
    ],
    manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
    manifestShardsWritten: 1,
  });

  await expect(
    readProjectFile('raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json'),
  ).resolves.toContain('AI orders table');

  // relationships.json: the accepted entry carries the resolved (LLM-proposed) evidence.
  const relationshipsJson = await readProjectFile(
    'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json',
  );
  const relationshipsArtifact = JSON.parse(relationshipsJson) as {
    accepted: Array<{
      id: string;
      status: string;
      source: string;
      pkScore: number;
      fkScore: number;
      evidence: unknown;
      reasons: string[];
      validation: unknown;
      graph: unknown;
    }>;
    review: unknown[];
    rejected: unknown[];
    skipped: unknown[];
  };
  expect(relationshipsArtifact.accepted).toHaveLength(1);
  expect(relationshipsArtifact.accepted[0]).toMatchObject({
    id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
    status: 'accepted',
    source: 'llm_proposal',
    pkScore: 0.95,
    fkScore: 0.91,
    evidence: expect.objectContaining({
      llmConfidence: 0.89,
      llmRationale: 'Buyer reference values align with customer identifiers.',
    }),
    reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
    validation: expect.objectContaining({ reasons: ['validation_passed'] }),
    graph: expect.objectContaining({
      reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'],
    }),
  });
  expect(relationshipsArtifact.review).toEqual([]);
  expect(relationshipsArtifact.rejected).toEqual([]);
  expect(relationshipsArtifact.skipped).toEqual([]);

  // relationship-profile.json mirrors the profile supplied by the enrichment result.
  const profileJson = await readProjectFile(
    'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
  );
  expect(JSON.parse(profileJson)).toMatchObject({
    connectionId: 'warehouse',
    driver: 'postgres',
    sqlAvailable: true,
    queryCount: 6,
    warnings: [],
  });

  // relationship-diagnostics.json echoes the thresholds and policy passed in relationshipSettings.
  const diagnosticsJson = await readProjectFile(
    'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json',
  );
  expect(JSON.parse(diagnosticsJson)).toMatchObject({
    connectionId: 'warehouse',
    summary: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
    noAcceptedReason: null,
    candidateCountsBySource: { llm_proposal: 1 },
    validation: { available: true, sqlAvailable: true, queryCount: 6 },
    thresholds: { acceptThreshold: 0.91, reviewThreshold: 0.61 },
    policy: {
      validationRequiredForManifest: true,
      maxCandidatesPerColumn: 7,
      profileSampleRows: 500,
      validationConcurrency: 2,
    },
    profileWarnings: [],
  });

  // Manifest shard: user text is kept, db/ai text is refreshed, and the manual join survives.
  const manifestYaml = await readProjectFile('semantic-layer/warehouse/_schema/public.yaml');
  const manifest = YAML.parse(manifestYaml) as {
    tables: {
      orders: {
        descriptions: Record<string, string>;
        columns: Array<{ name: string; descriptions?: Record<string, string> }>;
        joins: Array<{ to: string; on: string; source: string }>;
      };
    };
  };
  const ordersEntry = manifest.tables.orders;
  expect(ordersEntry.descriptions).toEqual({
    user: 'Pinned analyst description',
    db: 'DB orders table',
    ai: 'AI orders table',
  });
  expect(ordersEntry.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
    user: 'Pinned id description',
    db: 'DB order id',
    ai: 'AI order id',
  });
  expect(ordersEntry.joins).toEqual(
    expect.arrayContaining([
      expect.objectContaining({
        to: 'customers',
        on: 'orders.customer_id = customers.id',
        source: 'formal',
      }),
      expect.objectContaining({
        to: 'customers',
        on: 'orders.id = customers.id',
        source: 'manual',
      }),
    ]),
  );
});
it('writes formal accepted relationships into relationship artifacts and manifest shards', async () => {
  const ordersCustomerId = {
    tableId: 'public.orders',
    columnIds: ['public.orders.customer_id'],
    table: { catalog: null, db: 'public', name: 'orders' },
    columns: ['customer_id'],
  };
  const customersId = {
    tableId: 'public.customers',
    columnIds: ['public.customers.id'],
    table: { catalog: null, db: 'public', name: 'customers' },
    columns: ['id'],
  };

  // Same base fixture, but the only accepted relationship comes from formal metadata
  // and nothing was resolved by the discovery pipeline.
  const formalEnrichment: KloLocalScanEnrichmentResult = {
    ...enrichment(),
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
          source: 'formal',
          from: ordersCustomerId,
          to: customersId,
          relationshipType: 'many_to_one',
          confidence: 1,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
    resolvedRelationships: [],
    compositeRelationships: null,
  };

  const result = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    driver: 'sqlite',
    syncId: 'sync-formal',
    enrichment: formalEnrichment,
    relationshipSettings: {
      enabled: true,
      llmProposals: false,
      validationRequiredForManifest: true,
      acceptThreshold: 0.85,
      reviewThreshold: 0.55,
      maxLlmTablesPerBatch: 40,
      maxCandidatesPerColumn: 25,
      profileSampleRows: 10000,
      validationConcurrency: 4,
    },
    dryRun: false,
  });

  const relationshipsFile = await project.fileStore.readFile(
    'raw-sources/warehouse/live-database/sync-formal/enrichment/relationships.json',
  );
  const relationships = JSON.parse(relationshipsFile.content) as {
    accepted: Array<{ source: string; reasons: string[] }>;
  };
  expect(relationships.accepted).toEqual([
    expect.objectContaining({
      source: 'formal',
      reasons: ['formal_metadata_accepted'],
    }),
  ]);

  const [manifestPath] = result.manifestShards;
  if (!manifestPath) {
    throw new Error('Expected manifest shard path');
  }
  const manifestFile = await project.fileStore.readFile(manifestPath);
  const manifest = YAML.parse(manifestFile.content) as {
    tables: { orders: { joins: Array<{ to: string; on: string; source: string }> } };
  };
  expect(manifest.tables.orders.joins).toEqual(
    expect.arrayContaining([
      expect.objectContaining({
        to: 'customers',
        on: 'orders.customer_id = customers.id',
        source: 'formal',
      }),
    ]),
  );
});
it('writes manually applied relationship joins with manual source', async () => {
  const result = await writeLocalScanManifestShards({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-manual',
    driver: 'postgres',
    snapshot,
    dryRun: false,
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: 'public.orders:(public.orders.customer_id)->public.customers:(public.customers.id)',
          source: 'manual',
          from: {
            tableId: 'public.orders',
            columnIds: ['public.orders.customer_id'],
            table: { catalog: null, db: 'public', name: 'orders' },
            columns: ['customer_id'],
          },
          to: {
            tableId: 'public.customers',
            columnIds: ['public.customers.id'],
            table: { catalog: null, db: 'public', name: 'customers' },
            columns: ['id'],
          },
          relationshipType: 'many_to_one',
          confidence: 1,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
  });
  expect(result.manifestShardsWritten).toBe(1);

  // Resolve the shard through project.projectDir with 'utf-8', as the other tests in this
  // suite do, rather than rebuilding the project path from tempDir by hand.
  const shardYaml = await readFile(
    join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
    'utf-8',
  );
  // Give the parsed YAML an explicit shape so the assertion below is not checked against `any`.
  const shard = YAML.parse(shardYaml) as {
    tables: {
      orders: { joins: Array<{ to: string; on: string; relationship: string; source: string }> };
    };
  };
  expect(shard.tables.orders.joins).toContainEqual({
    to: 'customers',
    on: 'orders.customer_id = customers.id',
    relationship: 'many_to_one',
    source: 'manual',
  });
});
it('writes accepted composite relationships to relationship artifacts and manifest shards', async () => {
  const relationshipId = 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)';

  // Fixture builders; each call yields fresh objects so nothing is shared between entries.
  const keyColumn = (name: string) =>
    ({
      name,
      nativeType: 'integer',
      normalizedType: 'integer',
      dimensionType: 'number',
      nullable: false,
      primaryKey: false,
      comment: null,
    }) as const;
  const twoKeyTable = (name: string) => ({
    catalog: null,
    db: 'public',
    name,
    kind: 'table' as const,
    comment: null,
    estimatedRows: 2,
    foreignKeys: [],
    columns: [keyColumn('order_id'), keyColumn('line_number')],
  });
  const allocationsEndpoint = () => ({
    tableId: 'public.order_line_allocations',
    columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
    table: { catalog: null, db: 'public', name: 'order_line_allocations' },
    columns: ['order_id', 'line_number'],
  });
  const orderLinesEndpoint = () => ({
    tableId: 'public.order_lines',
    columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
    table: { catalog: null, db: 'public', name: 'order_lines' },
    columns: ['order_id', 'line_number'],
  });

  // Two tables with no declared keys that share an (order_id, line_number) pair.
  const compositeSnapshot: KloSchemaSnapshot = {
    connectionId: 'warehouse',
    driver: 'postgres',
    extractedAt: '2026-05-07T12:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: {},
    tables: [twoKeyTable('order_lines'), twoKeyTable('order_line_allocations')],
  };

  const compositeEnrichment: KloLocalScanEnrichmentResult = Object.assign(enrichment(), {
    snapshot: compositeSnapshot,
    relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
    descriptionUpdates: [],
    embeddingUpdates: [],
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [
        {
          id: relationshipId,
          source: 'inferred',
          from: allocationsEndpoint(),
          to: orderLinesEndpoint(),
          relationshipType: 'many_to_one',
          confidence: 0.95,
          isPrimaryKeyReference: true,
        },
      ],
      rejected: [],
      skipped: [],
    },
    resolvedRelationships: [],
    compositeRelationships: [
      {
        id: relationshipId,
        source: 'composite_profile_match',
        status: 'accepted',
        from: allocationsEndpoint(),
        to: orderLinesEndpoint(),
        relationshipType: 'many_to_one',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          childDistinct: 2,
          parentDistinct: 2,
          overlap: 2,
          reasons: ['composite_validation_passed'],
        },
      },
    ],
  });

  const result = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    driver: 'postgres',
    syncId: 'sync-composite',
    enrichment: compositeEnrichment,
    relationshipSettings: {
      enabled: true,
      llmProposals: false,
      validationRequiredForManifest: true,
      acceptThreshold: 0.85,
      reviewThreshold: 0.55,
      maxLlmTablesPerBatch: 40,
      maxCandidatesPerColumn: 25,
      profileSampleRows: 10000,
      validationConcurrency: 4,
    },
    dryRun: false,
  });

  const relationshipsFile = await project.fileStore.readFile(
    'raw-sources/warehouse/live-database/sync-composite/enrichment/relationships.json',
  );
  const relationships = JSON.parse(relationshipsFile.content) as {
    accepted: Array<{ from: { columns: string[] }; to: { columns: string[] }; reasons: string[] }>;
  };
  expect(relationships.accepted[0]).toMatchObject({
    from: { columns: ['order_id', 'line_number'] },
    to: { columns: ['order_id', 'line_number'] },
    reasons: ['composite_validation_passed'],
  });

  const [manifestPath] = result.manifestShards;
  if (!manifestPath) {
    throw new Error('Expected manifest shard path');
  }
  const manifestFile = await project.fileStore.readFile(manifestPath);
  const manifest = YAML.parse(manifestFile.content) as {
    tables: { order_line_allocations: { joins: Array<{ to: string; on: string; source: string }> } };
  };
  // The multi-column key is rendered as a single AND-joined condition.
  expect(manifest.tables.order_line_allocations.joins).toEqual([
    {
      to: 'order_lines',
      on: 'order_line_allocations.order_id = order_lines.order_id AND order_line_allocations.line_number = order_lines.line_number',
      relationship: 'many_to_one',
      source: 'inferred',
    },
  ]);
});
it('writes structural manifest shards without enrichment artifacts', async () => {
  const readProjectFile = (relativePath: string): Promise<string> =>
    readFile(join(project.projectDir, relativePath), 'utf-8');

  // Seed a shard with pinned user text, stale AI text, and a manual join.
  const seededShard = {
    tables: {
      orders: {
        table: 'public.orders',
        descriptions: { user: 'Pinned structural description', ai: 'Old generated text' },
        columns: [
          {
            name: 'id',
            type: 'number',
            descriptions: { user: 'Pinned structural id', ai: 'Old generated id' },
          },
          { name: 'customer_id', type: 'number' },
        ],
        joins: [
          {
            to: 'customers',
            on: 'orders.id = customers.id',
            relationship: 'many_to_one',
            source: 'manual',
          },
        ],
      },
    },
  };
  await project.fileStore.writeFile(
    'semantic-layer/warehouse/_schema/public.yaml',
    YAML.stringify(seededShard, { indent: 2, lineWidth: 0 }),
    'klo',
    'klo@example.com',
    'Seed structural manifest shard',
  );

  const result = await writeLocalScanManifestShards({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-structural-1',
    driver: 'postgres',
    snapshot,
    dryRun: false,
  });
  expect(result).toEqual({
    manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
    manifestShardsWritten: 1,
  });

  // A structural-only write must not produce any enrichment artifact.
  await expect(
    readProjectFile('raw-sources/warehouse/live-database/sync-structural-1/enrichment/descriptions.json'),
  ).rejects.toMatchObject({ code: 'ENOENT' });

  const manifestYaml = await readProjectFile('semantic-layer/warehouse/_schema/public.yaml');
  const manifest = YAML.parse(manifestYaml) as {
    tables: {
      orders: {
        descriptions: Record<string, string>;
        columns: Array<{ name: string; descriptions?: Record<string, string> }>;
        joins: Array<{ to: string; on: string; source: string }>;
      };
    };
  };
  const ordersEntry = manifest.tables.orders;
  // With no description updates supplied, only the user and db descriptions remain.
  expect(ordersEntry.descriptions).toEqual({
    user: 'Pinned structural description',
    db: 'DB orders table',
  });
  expect(ordersEntry.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
    user: 'Pinned structural id',
    db: 'DB order id',
  });
  expect(ordersEntry.joins).toEqual(
    expect.arrayContaining([
      expect.objectContaining({
        to: 'customers',
        on: 'orders.customer_id = customers.id',
        source: 'formal',
      }),
      expect.objectContaining({
        to: 'customers',
        on: 'orders.id = customers.id',
        source: 'manual',
      }),
    ]),
  );
});
it('returns planned empty paths without writing files during dry runs', async () => {
  const dryRunResult = await writeLocalScanEnrichmentArtifacts({
    project,
    connectionId: 'warehouse',
    syncId: 'sync-dry-run',
    driver: 'postgres',
    enrichment: enrichment(),
    dryRun: true,
  });

  // A dry run reports nothing as written...
  expect(dryRunResult).toEqual({
    enrichmentArtifacts: [],
    manifestShards: [],
    manifestShardsWritten: 0,
  });

  // ...and leaves no artifact on disk.
  const descriptionsPath = join(
    project.projectDir,
    'raw-sources/warehouse/live-database/sync-dry-run/enrichment/descriptions.json',
  );
  await expect(readFile(descriptionsPath, 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' });
});
});

View file

@ -0,0 +1,417 @@
import YAML from 'yaml';
import {
buildLiveDatabaseManifestShards,
type LiveDatabaseManifestExistingDescriptions,
type LiveDatabaseManifestJoinData,
type LiveDatabaseManifestJoinEntry,
type LiveDatabaseManifestShard,
type LiveDatabaseManifestTableData,
} from '../ingest/index.js';
import type { KloScanRelationshipConfig } from '../project/config.js';
import type { KloLocalProject } from '../project/index.js';
import type { KloLocalScanEnrichmentResult } from './local-enrichment.js';
import {
buildKloRelationshipArtifacts,
buildKloRelationshipDiagnostics,
emptyKloRelationshipProfileArtifact,
} from './relationship-diagnostics.js';
import type { KloConnectionDriver, KloSchemaColumn, KloSchemaSnapshot, KloSchemaTable } from './types.js';
// Adapter segment used in raw-source artifact paths and in scan commit messages.
const LIVE_DATABASE_ADAPTER = 'live-database';
// Author name and email passed to `fileStore.writeFile` for every file this module writes.
const LOCAL_AUTHOR = 'klo';
const LOCAL_AUTHOR_EMAIL = 'klo@example.com';
// Directory name, below the connection's semantic-layer folder, that holds manifest shards.
const SCHEMA_DIR = '_schema';
// Root folder of semantic-layer files; combined with the connection id in `schemaDir`.
const SL_DIR_PREFIX = 'semantic-layer';
/** Input accepted by `writeLocalScanManifestShards`. */
export interface WriteLocalScanManifestShardsInput {
/** Project whose `fileStore` is read for existing shards and written with the new ones. */
project: KloLocalProject;
/** Connection id; becomes a path segment of the schema directory. */
connectionId: string;
/** Sync id; only recorded in the commit message of each shard write. */
syncId: string;
/** Connection driver; upper-cased and forwarded as the manifest `connectionType`. */
driver: KloConnectionDriver;
/** Schema snapshot supplying tables, columns, DB comments and foreign keys. */
snapshot: KloSchemaSnapshot;
/** When true, nothing is read or written and an empty result is returned. */
dryRun: boolean;
/** Optional generated descriptions; emitted under the `ai` description key. */
descriptionUpdates?: KloLocalScanEnrichmentResult['descriptionUpdates'];
/** Optional relationship update; its `accepted` entries become manifest joins. */
relationshipUpdate?: KloLocalScanEnrichmentResult['relationshipUpdate'];
}
/** Outcome of a manifest shard write. */
export interface WriteLocalScanManifestShardsResult {
/** Paths of the shard files that were written, ordered by shard key; empty on dry runs. */
manifestShards: string[];
/** Number of entries in `manifestShards`. */
manifestShardsWritten: number;
}
/** Input accepted by `writeLocalScanEnrichmentArtifacts`. */
export interface WriteLocalScanEnrichmentArtifactsInput {
/** Project whose `fileStore` receives the artifacts and manifest shards. */
project: KloLocalProject;
/** Connection id; used in artifact paths and in the relationship artifacts. */
connectionId: string;
/** Sync id; used in the artifact directory and in commit messages. */
syncId: string;
/** Connection driver; forwarded to the manifest writer and to the empty-profile fallback. */
driver: KloConnectionDriver;
/** Enrichment result whose summary decides which optional artifacts are written. */
enrichment: KloLocalScanEnrichmentResult;
/** When true, nothing is written and an empty result is returned. */
dryRun: boolean;
/** Optional relationship settings; when present, thresholds and policy values are copied into the diagnostics. */
relationshipSettings?: KloScanRelationshipConfig;
}
/** Manifest shard result extended with the enrichment artifact paths. */
export interface WriteLocalScanEnrichmentArtifactsResult extends WriteLocalScanManifestShardsResult {
/** Paths of the JSON artifacts written below the sync's enrichment directory; empty on dry runs. */
enrichmentArtifacts: string[];
}
/** State recovered from manifest shards already on disk, keyed by table name. */
interface ExistingManifestState {
/** Table- and column-level descriptions found in existing shards. */
descriptions: Map<string, LiveDatabaseManifestExistingDescriptions>;
/** Existing `manual` / `inferred` joins that still point at tables and columns in the snapshot. */
preservedJoins: Map<string, LiveDatabaseManifestJoinEntry[]>;
}
/** Shorthand for the description updates carried by an enrichment result. */
type LocalDescriptionUpdates = KloLocalScanEnrichmentResult['descriptionUpdates'];
/**
 * Returns the project-relative directory that holds the enrichment artifacts
 * of one sync for one connection.
 */
function artifactDir(connectionId: string, syncId: string): string {
  const segments = ['raw-sources', connectionId, LIVE_DATABASE_ADAPTER, syncId, 'enrichment'];
  return segments.join('/');
}
/** Returns the project-relative directory that holds a connection's manifest shards. */
function schemaDir(connectionId: string): string {
  return [SL_DIR_PREFIX, connectionId, SCHEMA_DIR].join('/');
}
/**
 * Collects the descriptions known for a table.
 *
 * The DB comment is stored under `db` and the generated description, if any,
 * under `ai`. Returns `undefined` when neither is available.
 *
 * NOTE(review): updates are matched on table name only; confirm that
 * same-named tables in different schemas cannot reach this function together.
 */
function tableDescription(
  table: KloSchemaTable,
  descriptionUpdates: LocalDescriptionUpdates = [],
): Record<string, string> | undefined {
  const generated = descriptionUpdates.find(({ table: target }) => target.name === table.name)?.tableDescription;

  const entries: Array<[string, string]> = [];
  if (table.comment) {
    entries.push(['db', table.comment]);
  }
  if (generated) {
    entries.push(['ai', generated]);
  }

  return entries.length === 0 ? undefined : Object.fromEntries(entries);
}
/**
 * Collects the descriptions known for a column.
 *
 * The DB comment is stored under `db` and the generated description, if any,
 * under `ai`. Returns `undefined` when neither is available.
 *
 * The generated description is read as an own property and must be a string,
 * so column names that collide with `Object.prototype` members (for example
 * `constructor` or `toString`) no longer pick up an inherited function.
 *
 * NOTE(review): updates are matched on table name only; confirm that
 * same-named tables in different schemas cannot reach this function together.
 */
function columnDescription(
  table: KloSchemaTable,
  column: KloSchemaColumn,
  descriptionUpdates: LocalDescriptionUpdates = [],
): Record<string, string> | undefined {
  const update = descriptionUpdates.find((candidate) => candidate.table.name === table.name);
  const generatedByColumn: unknown = update?.columnDescriptions;

  let aiDescription: string | null = null;
  if (
    typeof generatedByColumn === 'object' &&
    generatedByColumn !== null &&
    Object.prototype.hasOwnProperty.call(generatedByColumn, column.name)
  ) {
    const candidate = (generatedByColumn as Record<string, unknown>)[column.name];
    if (typeof candidate === 'string') {
      aiDescription = candidate;
    }
  }

  const descriptions: Record<string, string> = {};
  if (column.comment) {
    descriptions.db = column.comment;
  }
  if (aiDescription) {
    descriptions.ai = aiDescription;
  }
  return Object.keys(descriptions).length > 0 ? descriptions : undefined;
}
/**
 * Converts every snapshot table into the table data expected by the manifest
 * shard builder.
 *
 * Columns carry `pk: true` only for primary keys and `nullable: false` only
 * when the snapshot says so explicitly; both keys are omitted otherwise.
 */
function snapshotTablesToManifestData(
  snapshot: KloSchemaSnapshot,
  descriptionUpdates: LocalDescriptionUpdates = [],
): LiveDatabaseManifestTableData[] {
  const manifestTables: LiveDatabaseManifestTableData[] = [];

  for (const table of snapshot.tables) {
    const descriptions = tableDescription(table, descriptionUpdates);
    const columns = table.columns.map((column) => ({
      name: column.name,
      type: column.dimensionType,
      ...(column.primaryKey ? { pk: true } : {}),
      ...(column.nullable === false ? { nullable: false } : {}),
      descriptions: columnDescription(table, column, descriptionUpdates),
    }));

    manifestTables.push({
      name: table.name,
      catalog: table.catalog,
      db: table.db,
      descriptions,
      columns,
    });
  }

  return manifestTables;
}
/**
 * Turns every declared foreign key in the snapshot into a `formal`
 * many-to-one join, one join per foreign-key entry.
 */
function formalJoins(snapshot: KloSchemaSnapshot): LiveDatabaseManifestJoinData[] {
  return snapshot.tables.flatMap((table) =>
    table.foreignKeys.map(
      (foreignKey): LiveDatabaseManifestJoinData => ({
        fromTable: table.name,
        fromColumns: [foreignKey.fromColumn],
        toTable: foreignKey.toTable,
        toColumns: [foreignKey.toColumn],
        relationship: 'many_to_one',
        source: 'formal',
      }),
    ),
  );
}
/**
 * Maps the accepted relationships of an update to manifest join data.
 * A missing update, or one without `accepted`, yields an empty list.
 */
function acceptedRelationshipJoins(
  relationshipUpdate: KloLocalScanEnrichmentResult['relationshipUpdate'] | undefined,
): LiveDatabaseManifestJoinData[] {
  const accepted = relationshipUpdate?.accepted ?? [];
  const joins: LiveDatabaseManifestJoinData[] = [];

  for (const { from, to, relationshipType, source } of accepted) {
    joins.push({
      fromTable: from.table.name,
      fromColumns: from.columns,
      toTable: to.table.name,
      toColumns: to.columns,
      relationship: relationshipType,
      source,
    });
  }

  return joins;
}
/**
 * Builds the ordered join list handed to the manifest builder: manual
 * accepted joins first, then formal foreign-key joins, then every other
 * accepted join.
 */
function relationshipJoins(
  snapshot: KloSchemaSnapshot,
  relationshipUpdate: KloLocalScanEnrichmentResult['relationshipUpdate'] | undefined,
): LiveDatabaseManifestJoinData[] {
  const manual: LiveDatabaseManifestJoinData[] = [];
  const generated: LiveDatabaseManifestJoinData[] = [];

  for (const join of acceptedRelationshipJoins(relationshipUpdate)) {
    if (join.source === 'manual') {
      manual.push(join);
    } else {
      generated.push(join);
    }
  }

  return manual.concat(formalJoins(snapshot), generated);
}
/** Indexes the snapshot as table name -> set of column names. */
function validColumns(snapshot: KloSchemaSnapshot): Map<string, Set<string>> {
  const columnsByTable = new Map<string, Set<string>>();
  for (const { name, columns } of snapshot.tables) {
    const columnNames = new Set<string>();
    for (const column of columns) {
      columnNames.add(column.name);
    }
    columnsByTable.set(name, columnNames);
  }
  return columnsByTable;
}
/**
 * Checks that a preserved join does not reference columns that have
 * disappeared from the snapshot.
 *
 * The `on` clause is split on `AND`; each term of the simple form
 * `table.column = table.column` is checked against `columnsByTable`.
 * Terms that do not have this form, and tables missing from the index, are
 * treated as unverifiable and do not fail the join.
 *
 * Unlike the earlier version, an unverifiable term no longer ends the check:
 * later terms are still validated, and surrounding whitespace is trimmed
 * before matching.
 *
 * @returns `false` as soon as one verifiable term names a column that is not
 * in its table's column set, `true` otherwise.
 */
function joinReferencesExistingColumns(
  join: LiveDatabaseManifestJoinEntry,
  columnsByTable: Map<string, Set<string>>,
): boolean {
  const terms = join.on.split(/\s+AND\s+/iu);
  for (const term of terms) {
    const match = /^(\w+)\.(\w+)\s*=\s*(\w+)\.(\w+)$/u.exec(term.trim());
    if (!match) {
      // Unverifiable term (expression, quoted identifier, ...): keep checking the rest.
      continue;
    }
    const [, leftTable, leftColumn, rightTable, rightColumn] = match;
    if (!leftTable || !leftColumn || !rightTable || !rightColumn) {
      continue;
    }
    const leftColumns = columnsByTable.get(leftTable);
    const rightColumns = columnsByTable.get(rightTable);
    // Only tables present in the index can prove that a column is missing.
    if ((leftColumns && !leftColumns.has(leftColumn)) || (rightColumns && !rightColumns.has(rightColumn))) {
      return false;
    }
  }
  return true;
}
/**
 * Reads the manifest shards already stored for a connection and extracts what
 * must survive a rewrite: table/column descriptions, and `manual` or
 * `inferred` joins that still point at tables and columns in `snapshot`.
 *
 * Loading is best effort. If the schema directory cannot be listed, an empty
 * state is returned. A shard that cannot be read or parsed is skipped.
 * A table entry that is not an object (for example a YAML key with an empty
 * value, which parses to `null`) is skipped on its own, so it no longer
 * discards the remaining tables of the same shard.
 *
 * @param project Project whose `fileStore` is listed and read.
 * @param connectionId Connection whose schema directory is scanned.
 * @param snapshot Current schema; tables not in it are ignored.
 * @returns Descriptions and preserved joins keyed by table name.
 */
async function loadExistingManifestState(
  project: KloLocalProject,
  connectionId: string,
  snapshot: KloSchemaSnapshot,
): Promise<ExistingManifestState> {
  const descriptions = new Map<string, LiveDatabaseManifestExistingDescriptions>();
  const preservedJoins = new Map<string, LiveDatabaseManifestJoinEntry[]>();
  const validTableNames = new Set(snapshot.tables.map((table) => table.name));
  const columnsByTable = validColumns(snapshot);

  let files: string[];
  try {
    files = (await project.fileStore.listFiles(schemaDir(connectionId))).files.filter((file) => file.endsWith('.yaml'));
  } catch {
    // No readable schema directory: nothing to preserve.
    return { descriptions, preservedJoins };
  }

  for (const file of files) {
    try {
      const { content } = await project.fileStore.readFile(file);
      const shard = YAML.parse(content) as LiveDatabaseManifestShard | null;
      if (!shard?.tables) {
        continue;
      }
      for (const [tableName, entry] of Object.entries(shard.tables)) {
        // A malformed entry must not abort the other tables in this shard.
        if (!entry || typeof entry !== 'object') {
          continue;
        }
        if (!validTableNames.has(tableName)) {
          continue;
        }
        descriptions.set(tableName, {
          table: entry.descriptions ? { ...entry.descriptions } : undefined,
          columns: new Map(
            (entry.columns ?? []).flatMap((column) =>
              column.descriptions ? ([[column.name, { ...column.descriptions }]] as const) : [],
            ),
          ),
        });
        const joins = (entry.joins ?? []).filter((join) => {
          return (
            (join.source === 'manual' || join.source === 'inferred') &&
            validTableNames.has(join.to) &&
            joinReferencesExistingColumns(join, columnsByTable)
          );
        });
        if (joins.length > 0) {
          preservedJoins.set(tableName, joins);
        }
      }
    } catch {
      // Unreadable or unparseable shard: skip it and keep going (best effort).
      continue;
    }
  }
  return { descriptions, preservedJoins };
}
/**
 * Writes `value` to `path` as two-space indented JSON followed by a newline,
 * using the module's local author identity and the given commit message.
 */
async function writeJsonArtifact(
  project: KloLocalProject,
  path: string,
  value: unknown,
  commitMessage: string,
): Promise<void> {
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  await project.fileStore.writeFile(path, serialized, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, commitMessage);
}
/**
 * Rebuilds the manifest shards of a connection from a schema snapshot and
 * writes them as YAML files into the connection's schema directory.
 *
 * Descriptions and preserved joins found in existing shards are passed to the
 * shard builder together with the new data. Shards are written one after
 * another in `localeCompare` order of their key.
 *
 * @returns The written shard paths and their count; both empty/zero on a dry
 * run, which performs no reads or writes.
 */
export async function writeLocalScanManifestShards(
  input: WriteLocalScanManifestShardsInput,
): Promise<WriteLocalScanManifestShardsResult> {
  const manifestShards: string[] = [];

  if (!input.dryRun) {
    const { project, connectionId, snapshot, syncId } = input;
    const existing = await loadExistingManifestState(project, connectionId, snapshot);
    const built = buildLiveDatabaseManifestShards({
      connectionType: input.driver.toUpperCase(),
      tables: snapshotTablesToManifestData(snapshot, input.descriptionUpdates),
      joins: relationshipJoins(snapshot, input.relationshipUpdate),
      existingDescriptions: existing.descriptions,
      existingPreservedJoins: existing.preservedJoins,
      mapColumnType: (dimensionType) => dimensionType,
    });

    const orderedShards = Array.from(built.shards.entries()).sort((a, b) => a[0].localeCompare(b[0]));
    const directory = schemaDir(connectionId);

    for (const [shardKey, shard] of orderedShards) {
      const path = `${directory}/${shardKey}.yaml`;
      const yamlText = YAML.stringify(shard, { indent: 2, lineWidth: 0 });
      await project.fileStore.writeFile(
        path,
        yamlText,
        LOCAL_AUTHOR,
        LOCAL_AUTHOR_EMAIL,
        `scan(${LIVE_DATABASE_ADAPTER}): write manifest shard ${shardKey} syncId=${syncId}`,
      );
      manifestShards.push(path);
    }
  }

  return { manifestShards, manifestShardsWritten: manifestShards.length };
}
/**
 * Persists the output of a scan enrichment run and then rewrites the
 * connection's manifest shards.
 *
 * Written below the sync's enrichment directory, in this order:
 * - `descriptions.json`, only when table or column descriptions completed;
 * - `embeddings.json`, only when embeddings completed;
 * - `relationships.json`, `relationship-profile.json` and
 *   `relationship-diagnostics.json`, always.
 *
 * When the enrichment carries no relationship profile, an empty profile with
 * reason `relationship_profiling_not_run` is written instead.
 *
 * @returns Artifact paths plus the manifest shard result; everything is
 * empty/zero on a dry run, which writes nothing.
 */
export async function writeLocalScanEnrichmentArtifacts(
  input: WriteLocalScanEnrichmentArtifactsInput,
): Promise<WriteLocalScanEnrichmentArtifactsResult> {
  if (input.dryRun) {
    return { enrichmentArtifacts: [], manifestShards: [], manifestShardsWritten: 0 };
  }

  const { project, connectionId, syncId, driver, enrichment, relationshipSettings: settings } = input;
  const root = artifactDir(connectionId, syncId);
  const paths = {
    descriptions: `${root}/descriptions.json`,
    embeddings: `${root}/embeddings.json`,
    relationships: `${root}/relationships.json`,
    relationshipProfile: `${root}/relationship-profile.json`,
    relationshipDiagnostics: `${root}/relationship-diagnostics.json`,
  };
  const enrichmentArtifacts: string[] = [];

  // Optional artifacts: only stages that completed leave a file behind.
  const descriptionsCompleted =
    enrichment.summary.tableDescriptions === 'completed' || enrichment.summary.columnDescriptions === 'completed';
  if (descriptionsCompleted) {
    enrichmentArtifacts.push(paths.descriptions);
    await writeJsonArtifact(
      project,
      paths.descriptions,
      enrichment.descriptionUpdates,
      `scan(${LIVE_DATABASE_ADAPTER}): write enrichment descriptions syncId=${syncId}`,
    );
  }
  if (enrichment.summary.embeddings === 'completed') {
    enrichmentArtifacts.push(paths.embeddings);
    await writeJsonArtifact(
      project,
      paths.embeddings,
      enrichment.embeddingUpdates,
      `scan(${LIVE_DATABASE_ADAPTER}): write enrichment embeddings syncId=${syncId}`,
    );
  }

  // Relationship artifacts are always produced, falling back to empty data.
  enrichmentArtifacts.push(paths.relationships, paths.relationshipProfile, paths.relationshipDiagnostics);

  // `null` means "not resolved" and is forwarded as undefined; anything else defaults to [].
  const resolvedRelationships =
    enrichment.resolvedRelationships === null ? undefined : (enrichment.resolvedRelationships ?? []);
  const relationshipArtifacts = buildKloRelationshipArtifacts({
    connectionId,
    resolvedRelationships,
    compositeRelationships: enrichment.compositeRelationships ?? undefined,
    relationshipUpdate: enrichment.relationshipUpdate ?? {
      connectionId,
      accepted: [],
      rejected: [],
      skipped: [],
    },
  });

  const relationshipProfile =
    enrichment.relationshipProfile ??
    emptyKloRelationshipProfileArtifact({
      connectionId,
      driver,
      reason: 'relationship_profiling_not_run',
    });

  const thresholds = settings
    ? {
        acceptThreshold: settings.acceptThreshold,
        reviewThreshold: settings.reviewThreshold,
      }
    : undefined;
  const policy = settings
    ? {
        validationRequiredForManifest: settings.validationRequiredForManifest,
        maxCandidatesPerColumn: settings.maxCandidatesPerColumn,
        profileSampleRows: settings.profileSampleRows,
        validationConcurrency: settings.validationConcurrency,
      }
    : undefined;
  const relationshipDiagnostics = buildKloRelationshipDiagnostics({
    connectionId,
    artifacts: relationshipArtifacts,
    profile: relationshipProfile,
    warnings: enrichment.warnings,
    thresholds,
    policy,
  });

  await writeJsonArtifact(
    project,
    paths.relationships,
    relationshipArtifacts,
    `scan(${LIVE_DATABASE_ADAPTER}): write enrichment relationships syncId=${syncId}`,
  );
  await writeJsonArtifact(
    project,
    paths.relationshipProfile,
    relationshipProfile,
    `scan(${LIVE_DATABASE_ADAPTER}): write relationship profile syncId=${syncId}`,
  );
  await writeJsonArtifact(
    project,
    paths.relationshipDiagnostics,
    relationshipDiagnostics,
    `scan(${LIVE_DATABASE_ADAPTER}): write relationship diagnostics syncId=${syncId}`,
  );

  const { manifestShards, manifestShardsWritten } = await writeLocalScanManifestShards({
    project,
    connectionId,
    syncId,
    driver,
    snapshot: enrichment.snapshot,
    descriptionUpdates: enrichment.descriptionUpdates,
    relationshipUpdate: enrichment.relationshipUpdate,
    dryRun: false,
  });

  return { enrichmentArtifacts, manifestShards, manifestShardsWritten };
}

View file

@ -0,0 +1,742 @@
import Database from 'better-sqlite3';
import { describe, expect, it, vi } from 'vitest';
import { buildDefaultKloProjectConfig } from '../project/config.js';
import type {
KloScanEnrichmentCompletedStage,
KloScanEnrichmentFailedStage,
KloScanEnrichmentStageLookup,
KloScanEnrichmentStateStore,
} from './enrichment-state.js';
import {
createDeterministicLocalScanEnrichmentProviders,
runLocalScanEnrichment,
snapshotToKloEnrichedSchema,
} from './local-enrichment.js';
import { createLocalScanEnrichmentProvidersFromConfig } from './local-scan.js';
import {
createKloConnectorCapabilities,
type KloQueryResult,
type KloReadOnlyQueryInput,
type KloScanConnector,
type KloScanContext,
type KloSchemaSnapshot,
} from './types.js';
// Shared fixture: a Postgres snapshot of schema `public` with two tables,
// `customers` (id) and `orders` (id, customer_id). No foreign keys are
// declared, so any customers/orders relationship has to be discovered.
const snapshot: KloSchemaSnapshot = {
connectionId: 'warehouse',
driver: 'postgres',
extractedAt: '2026-04-29T12:00:00.000Z',
scope: { schemas: ['public'] },
metadata: {},
tables: [
{
catalog: null,
db: 'public',
name: 'customers',
kind: 'table',
comment: 'Customer accounts',
estimatedRows: 2,
foreignKeys: [],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Customer id',
},
],
},
{
catalog: null,
db: 'public',
name: 'orders',
kind: 'table',
comment: 'Customer orders',
estimatedRows: 3,
foreignKeys: [],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Order id',
},
{
name: 'customer_id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: 'Customer id',
},
],
},
],
};
/**
 * Creates a fresh mock Postgres connector whose `introspect` resolves to the
 * shared `snapshot` and whose sampling methods return fixed data. It
 * advertises `readOnlySql` but defines no `executeReadOnly`.
 */
function connector(): KloScanConnector {
  const capabilities = createKloConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    readOnlySql: true,
    columnStats: true,
  });
  const introspect = vi.fn(async () => snapshot);
  const sampleTable = vi.fn(async () => ({
    headers: ['id', 'customer_id'],
    rows: [[1, 10]],
    totalRows: 1,
  }));
  const sampleColumn = vi.fn(async () => ({
    values: ['10', '11'],
    nullCount: 0,
    distinctCount: 2,
  }));

  return {
    id: 'test:warehouse',
    driver: 'postgres',
    capabilities,
    introspect,
    sampleTable,
    sampleColumn,
  };
}
/**
 * Test helper that runs SQL against an in-memory SQLite database and returns
 * the result in `KloQueryResult` form.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');

  /**
   * Executes `input.sql` and returns all rows. Headers are taken from the
   * keys of the first row, so an empty result has no headers.
   */
  executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const firstRecord = records[0];
    const headers = firstRecord == null ? [] : Object.keys(firstRecord);
    const rows = records.map((record) => headers.map((header) => record[header]));
    const total = records.length;

    return Promise.resolve({
      headers,
      rows,
      totalRows: total,
      rowCount: total,
    });
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
/**
 * Builds a fresh SQLite snapshot with `accounts(id)` and
 * `orders(id, account_id)`: no primary keys, no foreign keys, no comments.
 */
function noDeclaredRelationshipSnapshot(): KloSchemaSnapshot {
  type Table = KloSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  // Every column in this fixture is a non-null, non-key INTEGER.
  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const plainTable = (name: string, estimatedRows: number, columnNames: string[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns: columnNames.map(integerColumn),
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [plainTable('accounts', 2, ['id']), plainTable('orders', 3, ['id', 'account_id'])],
  };
}
/**
 * In-memory `KloScanEnrichmentStateStore` for tests. Records are keyed by
 * `runId:stage`, so saving a stage again replaces its previous record.
 */
function memoryEnrichmentStateStore(): KloScanEnrichmentStateStore {
  type StoredStage = KloScanEnrichmentCompletedStage | KloScanEnrichmentFailedStage;
  const stagesByKey = new Map<string, StoredStage>();

  function storageKey(input: Pick<KloScanEnrichmentStageLookup, 'runId' | 'stage'>): string {
    return `${input.runId}:${input.stage}`;
  }

  const store: KloScanEnrichmentStateStore = {
    // A hit requires a completed record whose input hash matches the lookup.
    async findCompletedStage<TOutput>(input: KloScanEnrichmentStageLookup) {
      const stored = stagesByKey.get(storageKey(input));
      if (stored && stored.status === 'completed' && stored.inputHash === input.inputHash) {
        return stored as KloScanEnrichmentCompletedStage<TOutput>;
      }
      return null;
    },
    async saveCompletedStage(input) {
      const completed = { ...input, status: 'completed' as const, errorMessage: null };
      stagesByKey.set(storageKey(input), completed);
    },
    async saveFailedStage(input) {
      const failed = { ...input, status: 'failed' as const, output: null };
      stagesByKey.set(storageKey(input), failed);
    },
    async listRunStages(runId) {
      return Array.from(stagesByKey.values()).filter((stage) => stage.runId === runId);
    },
  };
  return store;
}
describe('local scan enrichment', () => {
// Snapshot -> enriched schema: both tables are kept, column order is
// preserved, and ids are built as `<db>.<table>[.<column>]`. Sample values
// and embeddings start out as null.
it('maps a scan snapshot into relationship detector schema', () => {
const schema = snapshotToKloEnrichedSchema(snapshot);
expect(schema.connectionId).toBe('warehouse');
expect(schema.tables).toHaveLength(2);
expect(schema.tables[1]?.columns.map((column) => column.name)).toEqual(['id', 'customer_id']);
expect(schema.tables[1]?.columns[1]).toMatchObject({
id: 'public.orders.customer_id',
tableId: 'public.orders',
primaryKey: false,
sampleValues: null,
embedding: null,
});
});
// Derives a snapshot where orders.account_id has a declared foreign key to
// accounts.id (which is also marked as primary key), then expects exactly one
// `formal` many-to-one relationship with confidence 1.
it('maps snapshot foreign keys into formal schema relationships', () => {
const source = noDeclaredRelationshipSnapshot();
const snapshotWithForeignKey = {
...source,
tables: source.tables.map((table) =>
table.name === 'orders'
? {
...table,
foreignKeys: [
{
fromColumn: 'account_id',
toCatalog: null,
toDb: null,
toTable: 'accounts',
toColumn: 'id',
constraintName: 'orders_account_id_fkey',
},
],
}
: table.name === 'accounts'
? {
...table,
columns: table.columns.map((column) =>
column.name === 'id' ? { ...column, primaryKey: true } : column,
),
}
: table,
),
};
const schema = snapshotToKloEnrichedSchema(snapshotWithForeignKey);
// With `db: null` the ids carry no schema prefix.
expect(schema.relationships).toEqual([
{
id: 'orders:(orders.account_id)->accounts:(accounts.id)',
source: 'formal',
from: {
tableId: 'orders',
columnIds: ['orders.account_id'],
table: { catalog: null, db: null, name: 'orders' },
columns: ['account_id'],
},
to: {
tableId: 'accounts',
columnIds: ['accounts.id'],
table: { catalog: null, db: null, name: 'accounts' },
columns: ['id'],
},
relationshipType: 'many_to_one',
confidence: 1,
isPrimaryKeyReference: true,
},
]);
});
// Relationship-only scan without providers. The mock connector advertises
// readOnlySql but has no executeReadOnly, so statistical validation is
// skipped with a recoverable warning and the one candidate stays in review.
it('runs deterministic relationship detection for relationship scans', async () => {
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'relationships',
detectRelationships: true,
connector: connector(),
context: { runId: 'scan-run-1' },
providers: null,
});
expect(result.summary).toMatchObject({
deterministicRelationships: 'completed',
llmRelationshipValidation: 'skipped',
embeddings: 'skipped',
});
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
expect(result.summary.statisticalValidation).toBe('skipped');
expect(result.warnings).toContainEqual({
code: 'relationship_validation_failed',
message: 'KLO scan connector advertises readOnlySql but does not expose executeReadOnly',
recoverable: true,
metadata: { capability: 'readOnlySql' },
});
});
// Same scan mode, but the connector can execute read-only SQL against an
// in-memory SQLite database. With real data the orders.account_id ->
// accounts.id candidate is expected to be accepted.
it('runs relationship discovery with connector SQL evidence', async () => {
const executor = new InMemorySqliteExecutor();
try {
executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts (id) VALUES (1), (2);
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
`);
const scanConnector = {
...connector(),
driver: 'sqlite' as const,
capabilities: createKloConnectorCapabilities({ readOnlySql: true, columnStats: true }),
introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
executeReadOnly: executor.executeReadOnly.bind(executor),
};
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'relationships',
detectRelationships: true,
connector: scanConnector,
context: { runId: 'scan-run-relationship-discovery' },
providers: null,
});
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
expect(result.summary.statisticalValidation).toBe('completed');
expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
expect(result.resolvedRelationships).toEqual([
expect.objectContaining({
status: 'accepted',
from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
}),
]);
expect(result.relationshipUpdate?.accepted).toHaveLength(1);
} finally {
// Always release the SQLite handle, even when an expectation fails.
executor.close();
}
});
// With `llmProposals: false` the LLM validation stage must be skipped and the
// `candidateExtraction` model must never be requested from the provider.
it('honors scan relationship config when LLM proposals are disabled', async () => {
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
const getModel = vi.fn(() => ({ modelId: 'provider/language-model', provider: 'gateway' }));
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'relationships',
detectRelationships: true,
connector: connector(),
context: { runId: 'scan-run-llm-disabled' },
providers: {
...providers,
llm: {
...providers.llm,
getModel: getModel as never,
},
},
relationshipSettings: {
...buildDefaultKloProjectConfig('warehouse').scan.relationships,
llmProposals: false,
maxLlmTablesPerBatch: 40,
},
});
expect(result.summary.llmRelationshipValidation).toBe('skipped');
expect(getModel).not.toHaveBeenCalledWith('candidateExtraction');
});
// `relationships.enabled: false` turns off every relationship stage: all three
// summaries are `skipped`, counts are zero and the relationship outputs are null.
it('skips relationship detection when scan relationships are disabled', async () => {
const settings = {
...buildDefaultKloProjectConfig('warehouse').scan.relationships,
enabled: false,
};
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
connector: connector(),
context: { runId: 'disabled-relationships' },
providers: createDeterministicLocalScanEnrichmentProviders(),
relationshipSettings: settings,
});
expect(result.summary.deterministicRelationships).toBe('skipped');
expect(result.summary.statisticalValidation).toBe('skipped');
expect(result.summary.llmRelationshipValidation).toBe('skipped');
expect(result.relationships).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
expect(result.relationshipUpdate).toBeNull();
expect(result.relationshipProfile).toBeNull();
expect(result.resolvedRelationships).toBeNull();
});
// Full enriched scan with deterministic providers: every stage completes, one
// embedding of the configured dimension is produced per column (3 columns in
// the fixture), and the returned snapshot equals the connector's snapshot.
it('runs configured deterministic enrichment with descriptions and embeddings', async () => {
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: true,
connector: connector(),
context: { runId: 'scan-run-2' },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
});
expect(result.summary).toMatchObject({
dataDictionary: 'completed',
tableDescriptions: 'completed',
columnDescriptions: 'completed',
embeddings: 'completed',
deterministicRelationships: 'completed',
});
expect(result.embeddingUpdates).toHaveLength(3);
expect(result.embeddingUpdates[0]?.embedding).toHaveLength(6);
expect(result.snapshot).toEqual(snapshot);
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
});
// Records every progress update and checks that the countable stages report
// "n/total" messages as transient updates.
it('reports enrichment progress for countable stages', async () => {
const events: Array<{ progress: number; message?: string; transient?: boolean }> = [];
// Minimal progress reporter; startPhase returns the same recorder so nested phases are captured too.
const progress = {
async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
events.push({ progress: progressValue, message, transient: options?.transient });
},
startPhase() {
return progress;
},
};
await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: true,
connector: connector(),
context: { runId: 'scan-run-progress', progress },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
});
expect(events).toEqual(
expect.arrayContaining([
expect.objectContaining({ message: 'Generating descriptions 1/2 tables', transient: true }),
expect.objectContaining({ message: 'Generating descriptions 2/2 tables', transient: true }),
expect.objectContaining({ message: 'Building embeddings 1/1 batches', transient: true }),
expect.objectContaining({ message: 'Detecting relationships' }),
]),
);
});
// Ordering check: the "Loading enrichment schema snapshot" update must already
// be recorded when the connector's introspect is invoked.
it('reports progress before enrichment connector introspection starts', async () => {
const events: Array<{ progress: number; message?: string; transient?: boolean }> = [];
const progress = {
async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
events.push({ progress: progressValue, message, transient: options?.transient });
},
startPhase() {
return progress;
},
};
const scanConnector = {
...connector(),
// The assertion runs inside introspect, i.e. at the moment the snapshot is requested.
introspect: vi.fn(async () => {
expect(events).toContainEqual(expect.objectContaining({ message: 'Loading enrichment schema snapshot' }));
return snapshot;
}),
};
await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'relationships',
detectRelationships: true,
connector: scanConnector,
context: { runId: 'scan-run-progress-before-introspection', progress },
providers: null,
});
// Guards against the inner assertion being skipped because introspect never ran.
expect(scanConnector.introspect).toHaveBeenCalled();
});
// A five-column table with an embedding provider limited to two texts per
// call: the requests must arrive in batches of 2, 2 and 1.
it('splits enrichment embedding requests by provider batch size', async () => {
const manyColumnSnapshot: KloSchemaSnapshot = {
...snapshot,
tables: [
{
catalog: null,
db: 'public',
name: 'wide_orders',
kind: 'table',
comment: 'Wide order facts',
estimatedRows: 3,
foreignKeys: [],
columns: Array.from({ length: 5 }, (_, index) => ({
name: `metric_${index + 1}`,
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number' as const,
nullable: false,
primaryKey: false,
comment: `Metric ${index + 1}`,
})),
},
],
};
const scanConnector = {
...connector(),
introspect: vi.fn(async () => manyColumnSnapshot),
};
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
// The mock rejects oversized batches, so an unsplit request fails the run.
const embedBatch = vi.fn(async (texts: string[]) => {
if (texts.length > 2) {
throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
}
return texts.map((_, index) => [index, index + 1, index + 2]);
});
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: false,
connector: scanConnector,
context: { runId: 'scan-run-batched-embeddings' },
providers: {
llm: deterministicProviders.llm,
embedding: {
dimensions: 3,
maxBatchSize: 2,
embedBatch,
},
},
});
expect(result.embeddingUpdates).toHaveLength(5);
expect(embedBatch.mock.calls.map(([texts]) => texts).map((texts) => texts.length)).toEqual([2, 2, 1]);
});
// Runs the same enrichment twice with one state store. The second run must
// resume all three stages from the store without calling the LLM or the
// embedding provider, and must return the same updates as the first run.
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
const stateStore = memoryEnrichmentStateStore();
const scanConnector = connector();
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
const first = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: true,
connector: scanConnector,
context: { runId: 'scan-run-resume-1' },
providers,
stateStore,
syncId: 'sync-resume-1',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
});
// Spies are installed after the first run so they only observe the second one.
const getModel = vi.spyOn(providers.llm, 'getModel');
const embedBatch = vi.spyOn(providers.embedding, 'embedBatch');
const second = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: true,
connector: scanConnector,
context: { runId: 'scan-run-resume-1' },
providers,
stateStore,
syncId: 'sync-resume-1',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
});
expect(first.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
expect(first.state.resumedStages).toEqual([]);
expect(second.state.resumedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
expect(second.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
expect(getModel).not.toHaveBeenCalled();
expect(embedBatch).not.toHaveBeenCalled();
expect(second.descriptionUpdates).toEqual(first.descriptionUpdates);
expect(second.embeddingUpdates).toEqual(first.embeddingUpdates);
expect(second.relationships).toEqual(first.relationships);
});
// Same run id and state store, but the second connector returns a different
// snapshot: nothing may be resumed and the LLM must be called again.
it('does not reuse completed stages when the snapshot changes', async () => {
const stateStore = memoryEnrichmentStateStore();
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
const scanConnector = connector();
await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: false,
connector: scanConnector,
context: { runId: 'scan-run-resume-hash' },
providers,
stateStore,
syncId: 'sync-resume-hash',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
});
const firstTable = snapshot.tables[0];
if (!firstTable) {
throw new Error('Expected test snapshot table');
}
// NOTE(review): `name: 'customers'` repeats the fixture's existing name, so
// the snapshot differs only because the `orders` table is left out.
const changedConnector = {
...connector(),
introspect: vi.fn(async () => ({
...snapshot,
tables: [{ ...firstTable, name: 'customers' }],
})),
};
const getModel = vi.spyOn(providers.llm, 'getModel');
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: false,
connector: changedConnector,
context: { runId: 'scan-run-resume-hash' },
providers,
stateStore,
syncId: 'sync-resume-hash',
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
});
expect(result.state.resumedStages).toEqual([]);
expect(result.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
expect(getModel).toHaveBeenCalled();
});
// `mode: 'enriched'` with `providers: null`: description and embedding stages
// are skipped with a recoverable warning, while relationship discovery still
// runs against the in-memory SQLite data even though `detectRelationships`
// is false.
it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
const executor = new InMemorySqliteExecutor();
try {
executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts (id) VALUES (1), (2);
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
`);
const scanConnector = {
...connector(),
driver: 'sqlite' as const,
capabilities: createKloConnectorCapabilities({ readOnlySql: true, columnStats: true }),
introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
executeReadOnly: executor.executeReadOnly.bind(executor),
};
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: false,
connector: scanConnector,
context: { runId: 'scan-run-providerless-enriched' },
providers: null,
});
expect(result.summary).toEqual({
dataDictionary: 'skipped',
tableDescriptions: 'skipped',
columnDescriptions: 'skipped',
embeddings: 'skipped',
deterministicRelationships: 'completed',
llmRelationshipValidation: 'skipped',
statisticalValidation: 'completed',
});
expect(result.descriptionUpdates).toEqual([]);
expect(result.embeddingUpdates).toEqual([]);
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
expect(result.relationshipUpdate?.accepted).toHaveLength(1);
expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
expect(result.resolvedRelationships).toEqual([
expect.objectContaining({
status: 'accepted',
from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
}),
]);
expect(result.warnings).toContainEqual({
code: 'scan_enrichment_backend_not_configured',
message:
'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
recoverable: true,
metadata: {
skippedStages: ['descriptions', 'embeddings'],
relationshipDetection: true,
},
});
} finally {
// Always release the SQLite handle, even when an expectation fails.
executor.close();
}
});
// Builds providers from a local scan config using injected factory mocks and
// checks that the gateway backend / OpenAI embedding settings reach the
// factories and that dimensions and batch size surface on the result.
it('resolves gateway LLM providers and OpenAI embeddings from local scan config', () => {
const createKloLlmProvider = vi.fn(() => ({
getModel: vi.fn().mockReturnValue({ modelId: 'provider/language-model', provider: 'gateway' }),
}));
const createKloEmbeddingProvider = vi.fn(() => ({
dimensions: 1536,
maxBatchSize: 8,
embed: vi.fn(),
// Computed key; evaluates to the property name `embedMany`.
[['embed', 'Many'].join('')]: vi.fn(),
}));
const providers = createLocalScanEnrichmentProvidersFromConfig(
{
mode: 'llm',
embeddings: {
backend: 'openai',
model: 'provider/embedding-model',
dimensions: 1536,
batchSize: 8,
openai: { api_key: 'env:OPENAI_API_KEY' },
},
},
{
provider: {
backend: 'gateway',
gateway: {},
},
models: { default: 'provider/language-model' },
},
{
createKloLlmProvider: createKloLlmProvider as any,
createKloEmbeddingProvider: createKloEmbeddingProvider as any,
env: { OPENAI_API_KEY: 'openai-key' },
},
);
expect(providers?.embedding.dimensions).toBe(1536);
expect(providers?.embedding.maxBatchSize).toBe(8);
expect(createKloLlmProvider).toHaveBeenCalledWith(
expect.objectContaining({ backend: 'gateway', modelSlots: { default: 'provider/language-model' } }),
);
expect(createKloEmbeddingProvider).toHaveBeenCalledWith(
expect.objectContaining({ backend: 'openai', model: 'provider/embedding-model' }),
);
});
});

View file

@ -0,0 +1,659 @@
import type { KloLlmProvider } from '@klo/llm';
import { buildDefaultKloProjectConfig, type KloScanRelationshipConfig } from '../project/config.js';
import { type KloDescriptionColumnTable, KloDescriptionGenerator } from './description-generation.js';
import { buildKloColumnEmbeddingText } from './embedding-text.js';
import {
completedKloScanEnrichmentStateSummary,
computeKloScanEnrichmentInputHash,
type KloScanEnrichmentStateStore,
summarizeKloScanEnrichmentState,
} from './enrichment-state.js';
import { skippedKloScanEnrichmentSummary } from './enrichment-summary.js';
import type {
KloEmbeddingUpdate,
KloEnrichedColumn,
KloEnrichedRelationship,
KloEnrichedSchema,
KloEnrichedTable,
KloRelationshipEndpoint,
KloRelationshipUpdate,
} from './enrichment-types.js';
import type { KloCompositeRelationshipCandidate } from './relationship-composite-candidates.js';
import type { KloResolvedRelationshipDiscoveryCandidate } from './relationship-graph-resolver.js';
import { discoverKloRelationships } from './relationship-discovery.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import type {
KloEmbeddingPort,
KloProgressPort,
KloScanConnector,
KloScanContext,
KloScanEnrichmentStage,
KloScanEnrichmentStateSummary,
KloScanEnrichmentSummary,
KloScanMode,
KloScanRelationshipSummary,
KloScanWarning,
KloSchemaColumn,
KloSchemaForeignKey,
KloSchemaSnapshot,
KloSchemaTable,
KloTableRef,
} from './types.js';
/** Optional overrides for the providers built by createDeterministicLocalScanEnrichmentProviders. */
export interface DeterministicLocalScanEnrichmentProviderOptions {
/** Length of every generated embedding vector; the factory uses 8 when this is omitted. */
embeddingDimensions?: number;
/** Batch size reported by the embedding port; the factory uses 64 when this is omitted. */
maxBatchSize?: number;
}
/** The pair of providers required for description and embedding enrichment. */
export interface KloLocalScanEnrichmentProviders {
/** LLM provider handed to the description generator and to relationship discovery. */
llm: KloLlmProvider;
/** Embedding port whose embedBatch call turns column texts into vectors. */
embedding: KloEmbeddingPort;
}
/** Arguments accepted by runLocalScanEnrichment. */
export interface KloLocalScanEnrichmentInput {
/** Connection identifier forwarded to the connector's introspect call and to relationship discovery. */
connectionId: string;
/** Scan mode; 'enriched' enables descriptions/embeddings (given providers), 'relationships' and 'enriched' enable relationship detection. */
mode: KloScanMode;
/** Requests relationship detection regardless of mode; treated as false when omitted. */
detectRelationships?: boolean;
/** Connector used to introspect the schema and passed on to the enrichment stages. */
connector: KloScanConnector;
/** Scan context; supplies the run id and an optional progress reporter. */
context: KloScanContext;
/** Providers for descriptions and embeddings; with null those stages are skipped (enriched mode adds a warning). */
providers: KloLocalScanEnrichmentProviders | null;
/** Optional store used to replay completed stages and to persist stage outcomes. */
stateStore?: KloScanEnrichmentStateStore | null;
/** Sync identifier recorded with stage state; falls back to context.runId. */
syncId?: string;
/** Extra identity data folded into the enrichment input hash; defaults to an empty object. */
providerIdentity?: Record<string, unknown>;
/** Relationship settings; falls back to the default project config's scan.relationships. */
relationshipSettings?: KloScanRelationshipConfig;
/** Clock used for stage timestamps; defaults to `() => new Date()`. */
now?: () => Date;
}
/** Everything runLocalScanEnrichment returns for one run. */
export interface KloLocalScanEnrichmentResult {
/** Schema snapshot returned by the connector's introspect call. */
snapshot: KloSchemaSnapshot;
/** Per-stage enrichment status; stages that did not run keep their 'skipped' default. */
summary: KloScanEnrichmentSummary;
/** Relationship counts; all zero when relationship detection did not run. */
relationships: KloScanRelationshipSummary;
/** Summary of which stages were resumed, completed or failed during this run. */
state: KloScanEnrichmentStateSummary;
/** Warnings collected during the run, including those reported by relationship discovery. */
warnings: KloScanWarning[];
/** One entry per described table; empty when the description stage did not run. */
descriptionUpdates: Array<{
table: KloTableRef;
tableDescription: string | null;
/** Generated column descriptions; keys are looked up by column name. */
columnDescriptions: Record<string, string | null>;
}>;
/** One entry per embedded column; empty when the embedding stage did not run. */
embeddingUpdates: KloEmbeddingUpdate[];
/** The four fields below stay null when relationship detection did not run. */
relationshipUpdate: KloRelationshipUpdate | null;
relationshipProfile: KloRelationshipProfileArtifact | null;
resolvedRelationships: KloResolvedRelationshipDiscoveryCandidate[] | null;
compositeRelationships: KloCompositeRelationshipCandidate[] | null;
}
/**
 * Builds the dotted identifier of a table from its catalog, db and name,
 * leaving out every part that is missing or empty.
 */
function tableId(table: KloSchemaTable): string {
  const segments: string[] = [];
  for (const segment of [table.catalog, table.db, table.name]) {
    if (segment) {
      segments.push(segment);
    }
  }
  return segments.join('.');
}
/** Builds a column identifier: the owning table's identifier, a dot, then the column name. */
function columnId(table: KloSchemaTable, column: KloSchemaColumn): string {
  const owner = tableId(table);
  return owner + '.' + column.name;
}
/** Reduces a snapshot table to the catalog/db/name triple that references it. */
function tableRef(table: KloSchemaTable): KloTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
/** Describes one side of a single-column relationship: the table plus exactly one of its columns. */
function endpoint(table: KloEnrichedTable, column: KloEnrichedColumn): KloRelationshipEndpoint {
  const side: KloRelationshipEndpoint = {
    tableId: table.id,
    columnIds: [column.id],
    table: table.ref,
    columns: [column.name],
  };
  return side;
}
/** Derives a relationship id of the form `<table>:(<col>,<col>)-><table>:(<col>,<col>)` from two endpoints. */
function relationshipId(from: KloRelationshipEndpoint, to: KloRelationshipEndpoint): string {
  const render = (side: KloRelationshipEndpoint): string => side.tableId + ':(' + side.columnIds.join(',') + ')';
  return render(from) + '->' + render(to);
}
/**
 * Tells whether `table` is the target a foreign key points at. The table name must match;
 * catalog and db are only compared when the foreign key specifies them (is not null).
 */
function targetMatchesForeignKey(table: KloEnrichedTable, foreignKey: KloSchemaForeignKey): boolean {
  if (table.ref.name !== foreignKey.toTable) {
    return false;
  }
  if (foreignKey.toCatalog !== null && table.ref.catalog !== foreignKey.toCatalog) {
    return false;
  }
  if (foreignKey.toDb !== null && table.ref.db !== foreignKey.toDb) {
    return false;
  }
  return true;
}
/**
 * Converts the foreign keys declared in the snapshot into 'formal' relationships between the
 * given enriched tables.
 *
 * A foreign key is dropped when its source table, source column, target table or target column
 * cannot be found among `tables`. The result is ordered by relationship id.
 */
function formalRelationshipsFromSnapshot(
  snapshot: KloSchemaSnapshot,
  tables: readonly KloEnrichedTable[],
): KloEnrichedRelationship[] {
  const enrichedById = new Map<string, KloEnrichedTable>();
  for (const table of tables) {
    enrichedById.set(table.id, table);
  }

  const declared = snapshot.tables.flatMap((snapshotTable): KloEnrichedRelationship[] => {
    const source = enrichedById.get(tableId(snapshotTable));
    if (!source) {
      return [];
    }
    const edges: KloEnrichedRelationship[] = [];
    for (const foreignKey of snapshotTable.foreignKeys) {
      const sourceColumn = source.columns.find((candidate) => candidate.name === foreignKey.fromColumn);
      // The target is the first enriched table matching the key's name/catalog/db.
      const target = tables.find((candidate) => targetMatchesForeignKey(candidate, foreignKey));
      const targetColumn = target?.columns.find((candidate) => candidate.name === foreignKey.toColumn);
      if (sourceColumn && target && targetColumn) {
        const from = endpoint(source, sourceColumn);
        const to = endpoint(target, targetColumn);
        edges.push({
          id: relationshipId(from, to),
          source: 'formal',
          from,
          to,
          relationshipType: 'many_to_one',
          confidence: 1,
          isPrimaryKeyReference: true,
        });
      }
    }
    return edges;
  });

  declared.sort((left, right) => left.id.localeCompare(right.id));
  return declared;
}
/**
 * Builds the recoverable warning emitted when an enriched scan has no enrichment providers.
 *
 * @param relationshipDetection Stored in the warning metadata to record whether relationship
 *   detection applies to the run.
 */
function providerlessEnrichedWarning(relationshipDetection: boolean): KloScanWarning {
  const metadata = {
    skippedStages: ['descriptions', 'embeddings'],
    relationshipDetection,
  };
  const warning: KloScanWarning = {
    code: 'scan_enrichment_backend_not_configured',
    message:
      'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
    recoverable: true,
    metadata,
  };
  return warning;
}
/**
 * Produces a deterministic pseudo-embedding for `text`.
 *
 * Each component folds the text's characters into a small rolling hash seeded by the component
 * position, then maps it to a value of at least -1 and below 1, rounded to four decimals.
 *
 * @param text Text to embed.
 * @param dimensions Number of components to generate.
 */
function hashEmbedding(text: string, dimensions: number): number[] {
  const componentAt = (position: number): number => {
    let state = position + 17;
    for (const symbol of text) {
      state = (state * 31 + symbol.charCodeAt(0) + position) % 1009;
    }
    return Number(((state % 200) / 100 - 1).toFixed(4));
  };
  return Array.from({ length: dimensions }, (_unused, position) => componentAt(position));
}
/**
 * Creates enrichment providers that need no external service: a stub LLM provider and an
 * embedding port backed by hashEmbedding.
 *
 * @param options Optional overrides; embeddings default to 8 dimensions and the advertised
 *   batch size defaults to 64.
 */
export function createDeterministicLocalScanEnrichmentProviders(
  options: DeterministicLocalScanEnrichmentProviderOptions = {},
): KloLocalScanEnrichmentProviders {
  const vectorLength = options.embeddingDimensions ?? 8;
  const batchLimit = options.maxBatchSize ?? 64;
  const llm = deterministicLlmProvider();
  return {
    llm,
    embedding: {
      dimensions: vectorLength,
      maxBatchSize: batchLimit,
      async embedBatch(texts) {
        return texts.map((entry) => hashEmbedding(entry, vectorLength));
      },
    },
  };
}
/**
 * Builds a stub LLM provider for deterministic scans.
 *
 * Every model lookup returns the same placeholder model object, prompt caching is reported as
 * disabled, and tool-call repair throws because the stub cannot perform it.
 */
function deterministicLlmProvider(): KloLlmProvider {
  const placeholderModel = { modelId: 'deterministic-scan', provider: 'deterministic' };
  const provider: KloLlmProvider = {
    getModel: () => placeholderModel as ReturnType<KloLlmProvider['getModel']>,
    getModelByName: () => placeholderModel as ReturnType<KloLlmProvider['getModelByName']>,
    cacheMarker: () => undefined,
    repairToolCallHandler: () => {
      throw new Error('deterministic scan provider does not support tool-call repair');
    },
    thinkingProviderOptions: () => ({}),
    telemetryConfig: () => undefined,
    // A fresh config object is produced on every call.
    promptCachingConfig: () => ({
      enabled: false,
      systemTtl: '1h',
      toolsTtl: '1h',
      historyTtl: '5m',
      cacheSystem: true,
      cacheTools: true,
      cacheHistory: true,
      vertexFallbackTo5m: false,
    }),
    activeBackend: () => 'gateway',
  };
  return provider;
}
/**
 * Converts a structural schema snapshot into the enriched schema shape.
 *
 * Database comments become the `db` description of tables and columns, embeddings are attached
 * from `embeddingsByColumnId` (null when a column has none), and declared foreign keys become
 * formal relationships.
 *
 * @param snapshot Structural snapshot to convert.
 * @param embeddingsByColumnId Embeddings keyed by column identifier; defaults to an empty map.
 */
export function snapshotToKloEnrichedSchema(
  snapshot: KloSchemaSnapshot,
  embeddingsByColumnId: ReadonlyMap<string, number[]> = new Map(),
): KloEnrichedSchema {
  const tables: KloEnrichedTable[] = [];
  for (const table of snapshot.tables) {
    const id = tableId(table);
    const ref = tableRef(table);
    const columns: KloEnrichedColumn[] = [];
    for (const column of table.columns) {
      const qualifiedColumnId = columnId(table, column);
      columns.push({
        id: qualifiedColumnId,
        tableId: id,
        tableRef: ref,
        name: column.name,
        nativeType: column.nativeType,
        normalizedType: column.normalizedType,
        dimensionType: column.dimensionType,
        nullable: column.nullable,
        primaryKey: column.primaryKey,
        parentColumnId: null,
        descriptions: column.comment ? { db: column.comment } : {},
        embedding: embeddingsByColumnId.get(qualifiedColumnId) ?? null,
        sampleValues: null,
        cardinality: null,
      });
    }
    tables.push({
      id,
      ref,
      enabled: true,
      descriptions: table.comment ? { db: table.comment } : {},
      columns,
    });
  }
  const relationships = formalRelationshipsFromSnapshot(snapshot, tables);
  return { connectionId: snapshot.connectionId, tables, relationships };
}
/**
 * Shapes a snapshot table into the input expected by the column description generator.
 * A column with a comment passes that comment both as its only sample value and as its raw
 * `db` description; a column without one carries just its name.
 */
function descriptionTable(table: KloSchemaTable): KloDescriptionColumnTable {
  const columns = table.columns.map((column) => {
    if (!column.comment) {
      return { name: column.name };
    }
    return {
      name: column.name,
      sampleValues: [column.comment],
      rawDescriptions: { db: column.comment },
    };
  });
  return { catalog: table.catalog, db: table.db, name: table.name, columns };
}
/** Returns `maxBatchSize` when it is a positive integer; otherwise falls back to 100. */
function embeddingBatchSize(maxBatchSize: number): number {
  const fallback = 100;
  if (!Number.isInteger(maxBatchSize)) {
    return fallback;
  }
  return maxBatchSize > 0 ? maxBatchSize : fallback;
}
/**
 * Generates column and table descriptions for every table in the snapshot, one table at a time.
 *
 * Progress is reported before each table is processed and once more when all tables are done;
 * an empty snapshot reports completion immediately.
 *
 * @returns One update per table, holding the table description and its column descriptions.
 */
async function generateDescriptions(input: {
  snapshot: KloSchemaSnapshot;
  connector: KloScanConnector;
  context: KloScanContext;
  providers: KloLocalScanEnrichmentProviders;
  progress?: KloProgressPort;
}): Promise<KloLocalScanEnrichmentResult['descriptionUpdates']> {
  const { snapshot, connector, context, progress } = input;
  const generator = new KloDescriptionGenerator({
    llmProvider: input.providers.llm,
    settings: {
      columnMaxWords: 16,
      tableMaxWords: 24,
      dataSourceMaxWords: 32,
      concurrencyLimit: 4,
    },
  });
  const collected: KloLocalScanEnrichmentResult['descriptionUpdates'] = [];
  const totalTables = snapshot.tables.length;
  if (totalTables === 0) {
    await progress?.update(1, 'No tables to describe');
    return collected;
  }

  let ordinal = 0;
  for (const table of snapshot.tables) {
    ordinal += 1;
    await progress?.update(ordinal / totalTables, `Generating descriptions ${ordinal}/${totalTables} tables`, {
      transient: true,
    });
    // Fields shared by the column-level and table-level generator requests.
    const shared = {
      connectionId: snapshot.connectionId,
      connector,
      context,
      dataSourceType: snapshot.driver,
    };
    const columnInput = descriptionTable(table);
    const columnResult = await generator.generateColumnDescriptions({
      ...shared,
      supportsNestedAnalysis: connector.capabilities.nestedAnalysis,
      table: columnInput,
    });
    const tableDescription = await generator.generateTableDescription({
      ...shared,
      table: {
        catalog: table.catalog,
        db: table.db,
        name: table.name,
        rawDescriptions: table.comment ? { db: table.comment } : {},
      },
    });
    collected.push({
      table: tableRef(table),
      tableDescription,
      columnDescriptions: Object.fromEntries(columnResult.columnDescriptions),
    });
  }

  await progress?.update(1, `Generated descriptions for ${totalTables} tables`);
  return collected;
}
/**
 * Builds one embedding per column of the snapshot.
 *
 * For every column an embedding text is assembled from the table/column names, the column type,
 * the generated descriptions (falling back to database comments) and the column's outgoing
 * foreign keys. The texts are embedded in batches no larger than the provider's batch size.
 *
 * @param input.snapshot Schema whose columns are embedded.
 * @param input.providers Providers; only the embedding port is used here.
 * @param input.descriptions Generated descriptions, matched to tables by catalog/db/name.
 * @param input.progress Optional progress reporter, updated before each batch and at the end.
 * @returns The embedding updates in column order, plus the same vectors keyed by column id.
 * @throws Error when the provider returns a different number of embeddings than texts sent.
 */
async function buildEmbeddings(input: {
  snapshot: KloSchemaSnapshot;
  providers: KloLocalScanEnrichmentProviders;
  descriptions: KloLocalScanEnrichmentResult['descriptionUpdates'];
  progress?: KloProgressPort;
}): Promise<{ updates: KloEmbeddingUpdate[]; byColumnId: Map<string, number[]> }> {
  // Match descriptions by the qualified table identity rather than the bare name: tables that
  // share a name across catalogs or schemas would otherwise overwrite each other here and one
  // table's embedding text would be built from another table's descriptions.
  const qualifiedTableKey = (ref: { catalog?: string | null; db?: string | null; name: string }): string =>
    [ref.catalog, ref.db, ref.name].filter((part): part is string => Boolean(part)).join('.');
  const descriptionByTable = new Map<string, (typeof input.descriptions)[number]>();
  for (const item of input.descriptions) {
    descriptionByTable.set(qualifiedTableKey(item.table), item);
  }
  const texts: Array<{ columnId: string; text: string }> = [];
  for (const table of input.snapshot.tables) {
    const tableDescriptions = descriptionByTable.get(qualifiedTableKey(table));
    for (const column of table.columns) {
      const id = columnId(table, column);
      const text = buildKloColumnEmbeddingText({
        tableName: table.name,
        columnName: column.name,
        columnType: column.nativeType,
        resolvedDescription: tableDescriptions?.columnDescriptions[column.name] ?? column.comment,
        resolvedTableDescription: tableDescriptions?.tableDescription ?? table.comment,
        sampleValues: column.comment ? [column.comment] : null,
        foreignKeys: {
          outgoing: (table.foreignKeys ?? [])
            .filter((foreignKey) => foreignKey.fromColumn === column.name)
            .map((foreignKey) => ({ toTable: foreignKey.toTable, toColumn: foreignKey.toColumn })),
          incoming: [],
        },
      });
      texts.push({ columnId: id, text });
    }
  }
  const embeddings: number[][] = [];
  const maxBatchSize = embeddingBatchSize(input.providers.embedding.maxBatchSize);
  const embeddingTexts = texts.map((item) => item.text);
  const batchCount = Math.ceil(embeddingTexts.length / maxBatchSize);
  if (batchCount === 0) {
    await input.progress?.update(1, 'No embeddings to build');
  }
  for (let offset = 0; offset < embeddingTexts.length; offset += maxBatchSize) {
    const batchIndex = Math.floor(offset / maxBatchSize) + 1;
    await input.progress?.update(batchIndex / batchCount, `Building embeddings ${batchIndex}/${batchCount} batches`, {
      transient: true,
    });
    const batch = embeddingTexts.slice(offset, offset + maxBatchSize);
    const batchEmbeddings = await input.providers.embedding.embedBatch(batch);
    // A short or long response would silently misalign vectors and columns, so fail loudly.
    if (batchEmbeddings.length !== batch.length) {
      throw new Error(`expected ${batch.length} embeddings, received ${batchEmbeddings.length}`);
    }
    embeddings.push(...batchEmbeddings);
  }
  const byColumnId = new Map<string, number[]>();
  const updates = texts.map((item, index) => {
    const embedding = embeddings[index] ?? [];
    byColumnId.set(item.columnId, embedding);
    return {
      columnId: item.columnId,
      text: item.text,
      embedding,
    };
  });
  if (batchCount > 0) {
    await input.progress?.update(1, `Built embeddings for ${updates.length} columns`);
  }
  return { updates, byColumnId };
}
/**
 * Runs a single enrichment stage with optional resume support.
 *
 * When the state store already holds a completed result for this run, stage and input hash, that
 * output is returned and the stage is recorded as both resumed and completed. Otherwise
 * `compute` runs; its output is persisted as completed, or its failure is persisted and rethrown.
 *
 * @returns The stored or freshly computed stage output.
 * @throws Whatever `compute` or the state store throws.
 */
async function runEnrichmentStage<TOutput>(input: {
  stateStore: KloScanEnrichmentStateStore | null | undefined;
  runId: string;
  connectionId: string;
  syncId: string;
  mode: KloScanMode;
  stage: KloScanEnrichmentStage;
  inputHash: string;
  now: () => Date;
  resumedStages: KloScanEnrichmentStage[];
  completedStages: KloScanEnrichmentStage[];
  failedStages: KloScanEnrichmentStage[];
  compute: () => Promise<TOutput>;
}): Promise<TOutput> {
  const { stateStore, stage } = input;

  const cached = await stateStore?.findCompletedStage<TOutput>({
    runId: input.runId,
    stage,
    inputHash: input.inputHash,
  });
  if (cached) {
    input.resumedStages.push(stage);
    input.completedStages.push(stage);
    return cached.output;
  }

  // Fields common to the completed and the failed stage record.
  const stageRecord = {
    runId: input.runId,
    connectionId: input.connectionId,
    syncId: input.syncId,
    mode: input.mode,
    stage,
    inputHash: input.inputHash,
  };

  try {
    const output = await input.compute();
    input.completedStages.push(stage);
    await stateStore?.saveCompletedStage({
      ...stageRecord,
      output,
      updatedAt: input.now().toISOString(),
    });
    return output;
  } catch (error) {
    input.failedStages.push(stage);
    await stateStore?.saveFailedStage({
      ...stageRecord,
      errorMessage: error instanceof Error ? error.message : String(error),
      updatedAt: input.now().toISOString(),
    });
    throw error;
  }
}
/** Indexes embedding updates by column id; a later update for the same column replaces an earlier one. */
function embeddingsByColumnId(updates: KloEmbeddingUpdate[]): Map<string, number[]> {
  const lookup = new Map<string, number[]>();
  for (const update of updates) {
    lookup.set(update.columnId, update.embedding);
  }
  return lookup;
}
/**
 * Runs the local enrichment pipeline for one connection.
 *
 * The connector is introspected first. Then, depending on `mode` and the supplied providers,
 * descriptions are generated, column embeddings are built and relationships are detected. Every
 * stage goes through runEnrichmentStage, so a state store (when supplied) can return previously
 * completed output for the same run and input hash.
 *
 * @param input Connection, connector, context, providers and optional overrides.
 * @returns The snapshot, per-stage summary, collected warnings and the outputs of each stage
 *   that ran; outputs of stages that did not run keep their empty or null defaults.
 * @throws Whatever the connector or a stage throws.
 */
export async function runLocalScanEnrichment(
input: KloLocalScanEnrichmentInput,
): Promise<KloLocalScanEnrichmentResult> {
const progress = input.context.progress;
await progress?.update(0, 'Loading enrichment schema snapshot');
const snapshot = await input.connector.introspect(
{
connectionId: input.connectionId,
driver: input.connector.driver,
mode: input.mode,
detectRelationships: input.detectRelationships,
},
input.context,
);
await progress?.update(0.05, `Loaded schema snapshot with ${snapshot.tables.length} tables`);
const now = input.now ?? (() => new Date());
const state = completedKloScanEnrichmentStateSummary();
const syncId = input.syncId ?? input.context.runId;
const relationshipSettings =
input.relationshipSettings ?? buildDefaultKloProjectConfig(input.connectionId).scan.relationships;
// One hash is shared by all three stages; it is the key used to look up stored stage output.
const inputHash = computeKloScanEnrichmentInputHash({
snapshot,
mode: input.mode,
detectRelationships: input.detectRelationships ?? false,
providerIdentity: input.providerIdentity ?? {},
relationshipSettings,
});
const warnings: KloScanWarning[] = [];
let descriptions: KloLocalScanEnrichmentResult['descriptionUpdates'] = [];
let embeddingUpdates: KloEmbeddingUpdate[] = [];
let schema = snapshotToKloEnrichedSchema(snapshot);
// Start from the all-skipped summary (copied) and overwrite entries as stages run.
const summary: KloScanEnrichmentSummary = { ...skippedKloScanEnrichmentSummary };
const relationshipDetectionEnabled = relationshipSettings.enabled;
// Relationship detection must be enabled in settings AND requested by the mode or the explicit flag.
const shouldDetectRelationships =
relationshipDetectionEnabled &&
(input.mode === 'relationships' || input.mode === 'enriched' || (input.detectRelationships ?? false));
// Enriched mode without providers: record a recoverable warning instead of failing.
if (input.mode === 'enriched' && !input.providers) {
warnings.push(providerlessEnrichedWarning(shouldDetectRelationships));
}
if (input.mode === 'enriched' && input.providers) {
const providers = input.providers;
// NOTE(review): the startPhase argument looks like the share of overall progress given to the phase — confirm against KloProgressPort.
const descriptionProgress = progress?.startPhase(0.45);
descriptions = await runEnrichmentStage({
stateStore: input.stateStore,
runId: input.context.runId,
connectionId: input.connectionId,
syncId,
mode: input.mode,
stage: 'descriptions',
inputHash,
now,
resumedStages: state.resumedStages,
completedStages: state.completedStages,
failedStages: state.failedStages,
compute: () =>
generateDescriptions({
snapshot,
connector: input.connector,
context: input.context,
providers,
progress: descriptionProgress,
}),
});
const embeddingProgress = progress?.startPhase(0.2);
// The embedding stage consumes the descriptions produced (or resumed) just above.
embeddingUpdates = await runEnrichmentStage({
stateStore: input.stateStore,
runId: input.context.runId,
connectionId: input.connectionId,
syncId,
mode: input.mode,
stage: 'embeddings',
inputHash,
now,
resumedStages: state.resumedStages,
completedStages: state.completedStages,
failedStages: state.failedStages,
compute: async () => {
const embeddings = await buildEmbeddings({
snapshot,
providers,
descriptions,
progress: embeddingProgress,
});
return embeddings.updates;
},
});
// Rebuild the enriched schema so its columns carry embeddings before relationship discovery.
schema = snapshotToKloEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
// The data dictionary is reported as completed only when the connector exposes sampleColumn.
summary.dataDictionary = input.connector.sampleColumn ? 'completed' : 'skipped';
summary.tableDescriptions = 'completed';
summary.columnDescriptions = 'completed';
summary.embeddings = 'completed';
}
let relationshipUpdate: KloRelationshipUpdate | null = null;
let relationshipProfile: KloRelationshipProfileArtifact | null = null;
let resolvedRelationships: KloResolvedRelationshipDiscoveryCandidate[] | null = null;
let compositeRelationships: KloCompositeRelationshipCandidate[] | null = null;
let relationships: KloScanRelationshipSummary = { accepted: 0, review: 0, rejected: 0, skipped: 0 };
if (shouldDetectRelationships) {
const relationshipProgress = progress?.startPhase(0.25);
const relationshipStage = await runEnrichmentStage({
stateStore: input.stateStore,
runId: input.context.runId,
connectionId: input.connectionId,
syncId,
mode: input.mode,
stage: 'relationships',
inputHash,
now,
resumedStages: state.resumedStages,
completedStages: state.completedStages,
failedStages: state.failedStages,
compute: async () => {
await relationshipProgress?.update(0, 'Detecting relationships');
const detection = await discoverKloRelationships({
connectionId: input.connectionId,
driver: snapshot.driver,
connector: input.connector,
schema,
context: input.context,
settings: relationshipSettings,
// Discovery receives a null LLM provider when no providers were supplied.
llmProvider: input.providers?.llm ?? null,
});
await relationshipProgress?.update(
1,
`Relationship detection found ${detection.relationships.accepted} accepted, ${detection.relationships.review} review`,
);
// Everything returned here is what the stage persists and what a resumed run gets back.
return {
relationshipUpdate: detection.relationshipUpdate,
relationshipProfile: detection.profile,
resolvedRelationships: detection.resolvedRelationships,
compositeRelationships: detection.compositeRelationships,
relationships: detection.relationships,
statisticalValidation: detection.statisticalValidation,
llmRelationshipValidation: detection.llmRelationshipValidation,
warnings: detection.warnings,
};
},
});
summary.deterministicRelationships = 'completed';
summary.llmRelationshipValidation = relationshipStage.llmRelationshipValidation;
summary.statisticalValidation = relationshipStage.statisticalValidation;
relationshipUpdate = relationshipStage.relationshipUpdate;
relationshipProfile = relationshipStage.relationshipProfile;
resolvedRelationships = relationshipStage.resolvedRelationships;
compositeRelationships = relationshipStage.compositeRelationships;
relationships = relationshipStage.relationships;
warnings.push(...relationshipStage.warnings);
}
await progress?.update(1, 'Enrichment complete');
return {
snapshot,
summary,
relationships,
state: summarizeKloScanEnrichmentState(state),
warnings,
descriptionUpdates: descriptions,
embeddingUpdates,
relationshipUpdate,
relationshipProfile,
resolvedRelationships,
compositeRelationships,
};
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,516 @@
import type { createKloEmbeddingProvider, createKloLlmProvider } from '@klo/llm';
import {
createDefaultLocalIngestAdapters,
getLocalStageOnlyIngestStatus,
type LocalIngestRunRecord,
runLocalStageOnlyIngest,
type SourceAdapter,
} from '../ingest/index.js';
import {
createLocalKloEmbeddingProviderFromConfig,
createLocalKloLlmProviderFromConfig,
KloScanEmbeddingPortAdapter,
} from '../llm/index.js';
import type { KloProjectLlmConfig, KloScanEnrichmentConfig, KloScanRelationshipConfig } from '../project/config.js';
import type { KloLocalProject } from '../project/index.js';
import { kloLocalStateDbPath } from '../project/local-state-db.js';
import { redactKloScanReport } from './credentials.js';
import { completedKloScanEnrichmentStateSummary } from './enrichment-state.js';
import { failedKloScanEnrichmentSummary, kloScanErrorMessage } from './enrichment-summary.js';
import {
createDeterministicLocalScanEnrichmentProviders,
type KloLocalScanEnrichmentProviders,
runLocalScanEnrichment,
} from './local-enrichment.js';
import { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards } from './local-enrichment-artifacts.js';
import { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
import { SqliteLocalScanEnrichmentStateStore } from './sqlite-local-enrichment-state-store.js';
import type {
KloConnectionDriver,
KloProgressPort,
KloScanConnector,
KloScanEnrichmentStateSummary,
KloScanMode,
KloScanReport,
KloScanTrigger,
} from './types.js';
/** Options accepted by runLocalScan. */
export interface RunLocalScanOptions {
/** Local project whose config, file store and state database are used. */
project: KloLocalProject;
/** Connection to scan; must be present in the project's configured connections. */
connectionId: string;
/** Scan mode; defaults to 'structural'. */
mode?: KloScanMode;
/** Requests relationship detection even in structural mode (which then requires a connector). */
detectRelationships?: boolean;
/** Dry run flag, treated as false when omitted; a dry run gets no report path and no enrichment state store. */
dryRun?: boolean;
/** Trigger recorded in the report; defaults to 'cli'. */
trigger?: KloScanTrigger;
/** Forwarded to the default ingest adapters when `adapters` is not given. */
databaseIntrospectionUrl?: string;
/** Ingest adapters to use instead of the defaults. */
adapters?: SourceAdapter[];
/** Job id forwarded to the stage-only ingest run. */
jobId?: string;
/** Clock used for the report timestamp; the current time is used when omitted. */
now?: () => Date;
/** Ready-made scan connector; takes precedence over `createConnector`. */
connector?: KloScanConnector;
/** Factory used to build a connector for the connection when `connector` is not given. */
createConnector?: (connectionId: string) => KloScanConnector | Promise<KloScanConnector>;
/** Enrichment providers; when left undefined they are derived from the project config. */
enrichmentProviders?: KloLocalScanEnrichmentProviders | null;
/** Enrichment state store; when left undefined a SQLite store at the project's state DB path is created. */
enrichmentStateStore?: SqliteLocalScanEnrichmentStateStore | null;
/** Optional progress reporter. */
progress?: KloProgressPort;
}
/** Value that runLocalScan resolves to. */
export interface LocalScanRunResult {
runId: string;
/** Always the literal 'done'. */
status: 'done';
/** Always true. */
done: true;
connectionId: string;
mode: KloScanMode;
dryRun: boolean;
syncId: string;
/** The scan report produced for the run. */
report: KloScanReport;
}
/** Status payload describing a scan run. */
export interface LocalScanStatusResponse {
runId: string;
/** Status value, typed after the underlying ingest run record's status. */
status: LocalIngestRunRecord['status'];
done: boolean;
connectionId: string;
mode: KloScanMode;
dryRun: boolean;
syncId: string;
// NOTE(review): the range of `progress` is not visible here — presumably 0..1 like the progress port; confirm.
progress: number;
startedAt: string;
completedAt: string;
/** Path of the stored scan report, or null when there is none. */
reportPath: string | null;
/** Warnings, typed after the scan report's warnings. */
warnings: KloScanReport['warnings'];
}
// NOTE(review): presumably the options for the MCP-facing scan entry point; its consumer is not visible here — confirm.
export interface LocalScanMcpOptions {
/** Ingest adapters to use. */
adapters?: SourceAdapter[];
databaseIntrospectionUrl?: string;
/** Produces a job id each time it is called. */
jobIdFactory?: () => string;
/** Clock override. */
now?: () => Date;
/** Factory that builds a scan connector for a connection id. */
createConnector?: (connectionId: string) => KloScanConnector | Promise<KloScanConnector>;
}
// Name of the ingest adapter used for scans; also a path segment of the raw-sources directory.
const LIVE_DATABASE_ADAPTER = 'live-database';
// File name of the scan report stored inside the raw-sources directory.
const SCAN_REPORT_FILE = 'scan-report.json';
// Author name and email passed to the file store when the scan report is written.
const LOCAL_AUTHOR = 'klo';
const LOCAL_AUTHOR_EMAIL = 'klo@example.com';
/**
 * Normalizes a configured driver name to a supported scan driver.
 *
 * Matching is case-insensitive and 'sqlite3' is mapped to 'sqlite'; every other supported name
 * is returned in lower case.
 *
 * @throws Error when the driver is missing or not one of the supported names.
 */
function normalizeDriver(driver: string | undefined): KloConnectionDriver {
  const candidate = (driver ?? '').toLowerCase();
  switch (candidate) {
    case 'sqlite3':
      return 'sqlite';
    case 'postgres':
    case 'postgresql':
    case 'sqlite':
    case 'mysql':
    case 'clickhouse':
    case 'sqlserver':
    case 'bigquery':
    case 'snowflake':
    case 'posthog':
      return candidate;
    default:
      throw new Error(
        `Standalone klo scan supports postgres/postgresql/sqlite/mysql/clickhouse/sqlserver/bigquery/snowflake/posthog in this phase, received "${driver ?? 'unknown'}"`,
      );
  }
}
/** Counts the paths that start with 'tables/' and end with '.json'. */
function tablePathCount(paths: string[]): number {
  let matches = 0;
  for (const candidate of paths) {
    if (candidate.startsWith('tables/') && candidate.endsWith('.json')) {
      matches += 1;
    }
  }
  return matches;
}
/** Returns the raw-sources directory for one sync of a connection: `raw-sources/<connection>/<adapter>/<sync>`. */
function rawSourcesDir(connectionId: string, syncId: string): string {
  const connectionRoot = 'raw-sources/' + connectionId;
  return connectionRoot + '/' + LIVE_DATABASE_ADAPTER + '/' + syncId;
}
/** Returns the path of the scan report file inside the sync's raw-sources directory. */
function scanReportPath(connectionId: string, syncId: string): string {
  const directory = rawSourcesDir(connectionId, syncId);
  return directory + '/' + SCAN_REPORT_FILE;
}
/**
 * Guards against scan modes this runner does not handle.
 *
 * @throws Error unless `mode` is 'structural', 'relationships' or 'enriched'.
 */
function assertSupportedMode(mode: KloScanMode): void {
  switch (mode) {
    case 'structural':
    case 'relationships':
    case 'enriched':
      return;
    default:
      throw new Error(`Unsupported KLO scan mode: ${mode}`);
  }
}
/**
 * Picks the scan connector for a run.
 *
 * A purely structural scan without relationship detection needs no connector and yields null.
 * Otherwise an explicit `options.connector` wins over `options.createConnector`.
 *
 * @throws Error when a connector is needed but neither option was supplied.
 */
async function resolveScanConnector(options: RunLocalScanOptions, mode: KloScanMode): Promise<KloScanConnector | null> {
  const connectorNeeded = mode !== 'structural' || Boolean(options.detectRelationships);
  if (!connectorNeeded) {
    return null;
  }
  if (options.connector) {
    return options.connector;
  }
  if (!options.createConnector) {
    throw new Error('klo scan --enrich and --detect-relationships require a native standalone scan connector');
  }
  return options.createConnector(options.connectionId);
}
/**
 * Injectable dependencies for createLocalScanEnrichmentProvidersFromConfig, which passes the
 * whole object on to the local LLM and embedding provider builders.
 */
interface LocalScanEnrichmentProviderDeps {
/** Replacement for the LLM provider factory. */
createKloLlmProvider?: typeof createKloLlmProvider;
/** Replacement for the embedding provider factory. */
createKloEmbeddingProvider?: typeof createKloEmbeddingProvider;
/** Environment variables made available to the provider builders. */
env?: NodeJS.ProcessEnv;
}
/**
 * Resolves enrichment providers from the project's scan and LLM configuration.
 *
 * - mode 'deterministic': the built-in deterministic providers.
 * - mode 'llm' with an embeddings config: providers built from the LLM and embedding configs.
 * - anything else, or when either provider cannot be built: null.
 *
 * @param config Scan enrichment configuration.
 * @param llmConfig Project LLM configuration.
 * @param deps Optional injected factories and environment, passed to both provider builders.
 */
export function createLocalScanEnrichmentProvidersFromConfig(
  config: KloScanEnrichmentConfig,
  llmConfig: KloProjectLlmConfig,
  deps: LocalScanEnrichmentProviderDeps = {},
): KloLocalScanEnrichmentProviders | null {
  switch (config.mode) {
    case 'deterministic':
      return createDeterministicLocalScanEnrichmentProviders();
    case 'llm':
      break;
    default:
      return null;
  }
  const embeddingsConfig = config.embeddings;
  if (!embeddingsConfig) {
    return null;
  }
  // Both builders run before either result is checked.
  const llm = createLocalKloLlmProviderFromConfig(llmConfig, deps);
  const embeddingProvider = createLocalKloEmbeddingProviderFromConfig(embeddingsConfig, deps);
  if (llm && embeddingProvider) {
    return { llm, embedding: new KloScanEmbeddingPortAdapter(embeddingProvider) };
  }
  return null;
}
/**
 * Chooses the enrichment state store for a run: none for dry runs, the caller's store (which
 * may be null) when one was supplied, otherwise a SQLite store at the project's state DB path.
 */
function createLocalScanEnrichmentStateStore(options: RunLocalScanOptions): SqliteLocalScanEnrichmentStateStore | null {
  const { dryRun, enrichmentStateStore } = options;
  if (dryRun) {
    return null;
  }
  return enrichmentStateStore === undefined
    ? new SqliteLocalScanEnrichmentStateStore({ dbPath: kloLocalStateDbPath(options.project) })
    : enrichmentStateStore;
}
/**
 * Summarizes the provider-related configuration as a plain record: enrichment mode, embedding
 * model/dimensions/batch size, default LLM model, whether a gateway base URL is set, and the
 * relationship settings. Missing values are reported as null.
 */
function localScanProviderIdentity(
  config: KloScanEnrichmentConfig,
  llmConfig: KloProjectLlmConfig,
  relationships: KloScanRelationshipConfig,
): Record<string, unknown> {
  const embeddings = config.embeddings;
  const identity: Record<string, unknown> = {
    mode: config.mode,
    embeddingDimensions: embeddings?.dimensions ?? null,
    llmModel: llmConfig.models.default ?? null,
    embeddingModel: embeddings?.model ?? null,
    batchSize: embeddings?.batchSize ?? null,
    baseUrlConfigured: !!llmConfig.provider.gateway?.base_url,
    relationships,
  };
  return identity;
}
/**
 * Builds the initial scan report from a finished ingest run.
 *
 * Table counts come from the ingest record's diff paths. Column counts, structural sync stats
 * and relationship counts start at zero, and every enrichment stage starts as 'skipped'. A dry
 * run gets no raw-sources directory and no report path.
 */
function reportFromIngest(input: {
  record: LocalIngestRunRecord;
  driver: KloConnectionDriver;
  mode: KloScanMode;
  dryRun: boolean;
  trigger: KloScanTrigger;
  createdAt: string;
}): KloScanReport {
  const { record, dryRun } = input;
  const reportPath = dryRun ? null : scanReportPath(record.connectionId, record.syncId);
  const sourcesDir = dryRun ? null : rawSourcesDir(record.connectionId, record.syncId);
  const { added, modified, deleted, unchanged } = record.diffPaths;

  const report: KloScanReport = {
    connectionId: record.connectionId,
    driver: input.driver,
    syncId: record.syncId,
    runId: record.runId,
    trigger: input.trigger,
    mode: input.mode,
    dryRun,
    artifactPaths: {
      rawSourcesDir: sourcesDir,
      reportPath,
      manifestShards: [],
      enrichmentArtifacts: [],
    },
    diffSummary: {
      tablesAdded: tablePathCount(added),
      tablesModified: tablePathCount(modified),
      tablesDeleted: tablePathCount(deleted),
      tablesUnchanged: tablePathCount(unchanged),
      columnsAdded: 0,
      columnsModified: 0,
      columnsDeleted: 0,
    },
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 0,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'skipped',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'skipped',
    },
    capabilityGaps: [],
    warnings: [],
    relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
    enrichmentState: completedKloScanEnrichmentStateSummary(),
    createdAt: input.createdAt,
  };
  return report;
}
/**
 * Writes the report as pretty-printed JSON (with a trailing newline) to its report path in the
 * project's file store. Does nothing when the report has no report path.
 */
async function writeScanReport(project: KloLocalProject, report: KloScanReport): Promise<void> {
  const { reportPath } = report.artifactPaths;
  if (!reportPath) {
    return;
  }
  const serialized = `${JSON.stringify(report, null, 2)}\n`;
  const commitMessage = `scan(${LIVE_DATABASE_ADAPTER}): ${report.runId} syncId=${report.syncId}`;
  await project.fileStore.writeFile(reportPath, serialized, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, commitMessage);
}
/**
 * Derives the report's diff summary from an ingest record: table counts come from the record's
 * diff paths, column counts are always zero.
 */
function scanDiffSummaryFromRecord(record: LocalIngestRunRecord): KloScanReport['diffSummary'] {
  const { added, modified, deleted, unchanged } = record.diffPaths;
  const summary: KloScanReport['diffSummary'] = {
    tablesAdded: tablePathCount(added),
    tablesModified: tablePathCount(modified),
    tablesDeleted: tablePathCount(deleted),
    tablesUnchanged: tablePathCount(unchanged),
    columnsAdded: 0,
    columnsModified: 0,
    columnsDeleted: 0,
  };
  return summary;
}
/** True when the record has a previous run and its diff summary shows nothing added, modified or deleted. */
function hasNoContentChanges(record: LocalIngestRunRecord): boolean {
  if (record.previousRunId === null) {
    return false;
  }
  const { added, modified, deleted } = record.diffSummary;
  return added === 0 && modified === 0 && deleted === 0;
}
/**
 * Formats a one-line progress message stating how many tables changed (added, modified or
 * deleted) out of all tables seen, with singular/plural nouns chosen by count.
 */
function scanChangeSummary(diffSummary: KloScanReport['diffSummary']): string {
  const { tablesAdded, tablesModified, tablesDeleted, tablesUnchanged } = diffSummary;
  const changedTables = tablesAdded + tablesModified + tablesDeleted;
  const totalTables = changedTables + tablesUnchanged;
  let changeNoun = 'changes';
  if (changedTables === 1) {
    changeNoun = 'change';
  }
  let tableNoun = 'tables';
  if (totalTables === 1) {
    tableNoun = 'table';
  }
  return `Semantic layer comparison found ${changedTables} ${changeNoun} across ${totalTables} ${tableNoun}`;
}
/**
 * Loads a previously stored scan report for the given connection and sync.
 *
 * Best effort: any failure while reading or parsing the file yields null. The parsed JSON is
 * not validated against the report shape.
 */
async function readScanReport(
  project: KloLocalProject,
  connectionId: string,
  syncId: string,
): Promise<KloScanReport | null> {
  try {
    const location = scanReportPath(connectionId, syncId);
    const { content } = await project.fileStore.readFile(location);
    return JSON.parse(content) as KloScanReport;
  } catch {
    return null;
  }
}
/**
 * Runs a local scan for one configured connection and returns the final report.
 *
 * Visible flow:
 *  1. Resolve the scan mode (default `'structural'`), the connector, and the
 *     connection config; throw when the connection is not configured.
 *  2. Run a stage-only ingest with the live-database adapter.
 *  3. Build a report from the ingest record, optionally reusing artifacts from
 *     an existing report when nothing changed and no connector is in play.
 *  4. Write structural manifest shards (skipped for dry runs and reused reports).
 *  5. When a connector is available, run enrichment; an enrichment failure is
 *     downgraded to a recoverable `enrichment_failed` warning.
 *  6. Redact the report, persist it (unless reused), and return it.
 *
 * @param options - Scan options (project, connection id, mode, adapters, progress, ...).
 * @returns A result whose `status` is always `'done'` together with the redacted report.
 * @throws Error when `options.connectionId` is not present in the project config.
 */
export async function runLocalScan(options: RunLocalScanOptions): Promise<LocalScanRunResult> {
const mode = options.mode ?? 'structural';
assertSupportedMode(mode);
await options.progress?.update(0.05, 'Preparing scan');
// NOTE(review): the connector is resolved before the connection is checked for existence below.
const connector = await resolveScanConnector(options, mode);
const connection = options.project.config.connections[options.connectionId];
if (!connection) {
throw new Error(`Connection "${options.connectionId}" is not configured in klo.yaml`);
}
const driver = normalizeDriver(connection.driver);
// Caller-supplied adapters win; otherwise build the defaults for this project.
const adapters =
options.adapters ??
createDefaultLocalIngestAdapters(options.project, { databaseIntrospectionUrl: options.databaseIntrospectionUrl });
// Providers are only resolved when a connector exists and enrichment or relationship
// detection was requested. The `!== undefined` check lets an explicit `null` from the
// caller suppress the config-derived providers.
const enrichmentProviders =
connector && (mode !== 'structural' || options.detectRelationships)
? options.enrichmentProviders !== undefined
? options.enrichmentProviders
: createLocalScanEnrichmentProvidersFromConfig(options.project.config.scan.enrichment, options.project.config.llm)
: null;
await options.progress?.update(0.15, 'Inspecting database schema');
// The ingest trigger is fixed to 'manual_resync'; the report trigger below is separate.
const record = await runLocalStageOnlyIngest({
project: options.project,
adapters,
adapter: LIVE_DATABASE_ADAPTER,
connectionId: options.connectionId,
trigger: 'manual_resync',
jobId: options.jobId,
now: options.now,
dryRun: options.dryRun,
});
await options.progress?.update(0.55, scanChangeSummary(scanDiffSummaryFromRecord(record)));
let report = reportFromIngest({
record,
driver,
mode,
dryRun: options.dryRun ?? false,
trigger: options.trigger ?? 'cli',
createdAt: (options.now?.() ?? new Date()).toISOString(),
});
let reusedExistingScanArtifacts = false;
// Only look for an existing report when this is a real (non-dry) run without a
// connector and the ingest found no content changes.
const existingReport =
!report.dryRun && !connector && hasNoContentChanges(record)
? await readScanReport(options.project, record.connectionId, record.syncId)
: null;
// Reuse the stored artifacts only when the stored report matches this run's mode and dry-run flag.
if (existingReport && existingReport.mode === mode && existingReport.dryRun === report.dryRun) {
report.artifactPaths = existingReport.artifactPaths;
report.capabilityGaps = existingReport.capabilityGaps;
report.warnings = existingReport.warnings;
report.relationships = existingReport.relationships;
report.enrichment = existingReport.enrichment;
report.enrichmentState = existingReport.enrichmentState;
reusedExistingScanArtifacts = true;
}
const enrichmentStateStore = connector ? createLocalScanEnrichmentStateStore(options) : null;
let enrichmentState: KloScanEnrichmentStateSummary = completedKloScanEnrichmentStateSummary();
// Structural manifest shards are rebuilt from the raw sources persisted by the ingest.
if (!reusedExistingScanArtifacts && !report.dryRun && report.artifactPaths.rawSourcesDir) {
await options.progress?.update(0.7, 'Writing schema artifacts');
const structuralSnapshot = await readLocalScanStructuralSnapshot({
project: options.project,
connectionId: options.connectionId,
driver,
rawSourcesDir: report.artifactPaths.rawSourcesDir,
extractedAtFallback: report.createdAt,
});
const manifestArtifacts = await writeLocalScanManifestShards({
project: options.project,
connectionId: options.connectionId,
syncId: record.syncId,
driver,
snapshot: structuralSnapshot,
dryRun: false,
});
report.artifactPaths.manifestShards = manifestArtifacts.manifestShards;
report.manifestShardsWritten = manifestArtifacts.manifestShardsWritten;
}
if (connector) {
try {
await options.progress?.update(
0.82,
mode === 'relationships' || options.detectRelationships
? 'Detecting relationships'
: 'Enriching schema metadata',
);
const enrichment = await runLocalScanEnrichment({
connectionId: options.connectionId,
mode,
detectRelationships: options.detectRelationships,
connector,
context: { runId: record.runId, progress: options.progress?.startPhase(0.18) },
providers: enrichmentProviders,
stateStore: enrichmentStateStore,
syncId: record.syncId,
providerIdentity: localScanProviderIdentity(
options.project.config.scan.enrichment,
options.project.config.llm,
options.project.config.scan.relationships,
),
relationshipSettings: options.project.config.scan.relationships,
now: options.now,
});
const artifacts = await writeLocalScanEnrichmentArtifacts({
project: options.project,
connectionId: options.connectionId,
syncId: record.syncId,
driver,
enrichment,
dryRun: options.dryRun ?? false,
relationshipSettings: options.project.config.scan.relationships,
});
// Enrichment output replaces the structural manifest shard list and count on the report.
report.enrichment = enrichment.summary;
report.relationships = enrichment.relationships;
enrichmentState = enrichment.state;
report.enrichmentState = enrichmentState;
report.warnings.push(...enrichment.warnings);
report.artifactPaths.enrichmentArtifacts = artifacts.enrichmentArtifacts;
report.artifactPaths.manifestShards = artifacts.manifestShards;
report.manifestShardsWritten = artifacts.manifestShardsWritten;
} catch (error) {
// Enrichment failures do not fail the scan: they are recorded on the report instead.
const message = kloScanErrorMessage(error);
report.enrichment = failedKloScanEnrichmentSummary(mode, options.detectRelationships ?? false);
// NOTE(review): a rejection from listRunStages here would propagate out of runLocalScan — confirm intended.
const stages = await enrichmentStateStore?.listRunStages(record.runId);
if (stages) {
// Rebuild the state summary from the recorded stages: 'completed' stages are kept
// as completed, every other status is reported as failed.
enrichmentState = completedKloScanEnrichmentStateSummary();
for (const stage of stages) {
if (stage.status === 'completed') {
enrichmentState.completedStages.push(stage.stage);
} else {
enrichmentState.failedStages.push(stage.stage);
}
}
report.enrichmentState = enrichmentState;
}
report.warnings.push({
code: 'enrichment_failed',
message: `KLO scan enrichment failed after structural scan completed: ${message}`,
recoverable: true,
metadata: { mode, detectRelationships: options.detectRelationships ?? false },
});
}
}
// Redact before the report is persisted or returned.
report = redactKloScanReport(report);
// A reused report is not written again.
if (!reusedExistingScanArtifacts) {
await writeScanReport(options.project, report);
}
await options.progress?.update(1, 'Scan completed');
return {
runId: record.runId,
status: 'done',
done: true,
connectionId: record.connectionId,
mode,
dryRun: options.dryRun ?? false,
syncId: record.syncId,
report,
};
}
/**
 * Fetches the persisted scan report that belongs to a live-database ingest run.
 *
 * @param project - Project to look the run and report up in.
 * @param runId - Identifier of the ingest run.
 * @returns The stored report with `runId`, `syncId`, and `diffSummary` replaced
 *   by values derived from the run status, or `null` when the run is unknown,
 *   was produced by a different adapter, or has no readable report.
 */
export async function getLocalScanReport(project: KloLocalProject, runId: string): Promise<KloScanReport | null> {
  const ingestStatus = await getLocalStageOnlyIngestStatus(project, runId);
  if (!ingestStatus || ingestStatus.adapter !== LIVE_DATABASE_ADAPTER) {
    return null;
  }

  const storedReport = await readScanReport(project, ingestStatus.connectionId, ingestStatus.syncId);
  // Identity and diff fields always come from the run status, not from the stored file.
  return storedReport
    ? {
        ...storedReport,
        runId: ingestStatus.runId,
        syncId: ingestStatus.syncId,
        diffSummary: scanDiffSummaryFromRecord(ingestStatus),
      }
    : null;
}
/**
 * Builds the status response for a local live-database scan run.
 *
 * Run-level fields (`status`, `done`, `progress`, timestamps, ids) come from the
 * ingest status; `mode`, `dryRun`, `reportPath`, and `warnings` come from the
 * persisted scan report when one can be read, with defaults otherwise.
 *
 * @param project - Project to look the run and report up in.
 * @param runId - Identifier of the ingest run.
 * @returns The status response, or `null` when the run is unknown or was not
 *   produced by the live-database adapter.
 */
export async function getLocalScanStatus(
  project: KloLocalProject,
  runId: string,
): Promise<LocalScanStatusResponse | null> {
  const status = await getLocalStageOnlyIngestStatus(project, runId);
  if (!status || status.adapter !== LIVE_DATABASE_ADAPTER) {
    return null;
  }
  // Read the persisted report with the status we already hold. Going through
  // getLocalScanReport would repeat the status lookup (extra I/O, and a window
  // in which the two lookups could disagree); the fields it overrides
  // (runId, syncId, diffSummary) are not consumed here.
  const report = await readScanReport(project, status.connectionId, status.syncId);
  return {
    runId: status.runId,
    status: status.status,
    done: status.done,
    connectionId: status.connectionId,
    mode: report?.mode ?? 'structural',
    dryRun: report?.dryRun ?? false,
    syncId: status.syncId,
    progress: status.progress,
    startedAt: status.startedAt,
    completedAt: status.completedAt,
    reportPath: report?.artifactPaths.reportPath ?? null,
    warnings: report?.warnings ?? [],
  };
}

View file

@ -0,0 +1,196 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { initKloProject, type KloLocalProject } from '../project/index.js';
import { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
// Spec for readLocalScanStructuralSnapshot: each test seeds raw live-database
// artifacts into a freshly initialised project and reads them back as a snapshot.
describe('readLocalScanStructuralSnapshot', () => {
let tempDir: string;
let project: KloLocalProject;
// Every test gets its own temporary project directory.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'klo-local-structural-artifacts-'));
project = await initKloProject({
projectDir: join(tempDir, 'project'),
projectName: 'warehouse',
});
});
// Remove the temporary directory; `force` keeps cleanup from failing if it is already gone.
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('rebuilds a canonical snapshot from persisted live-database raw files', async () => {
const rawRoot = 'raw-sources/warehouse/live-database/sync-1';
// Seed connection.json with an explicit extractedAt and metadata.
await project.fileStore.writeFile(
`${rawRoot}/connection.json`,
`${JSON.stringify(
{
connectionId: 'warehouse',
extractedAt: '2026-04-29T12:00:00.000Z',
metadata: { source: 'sqlite-smoke' },
tableCount: 2,
},
null,
2,
)}\n`,
'klo',
'klo@example.com',
'Seed connection artifact',
);
// Seed a table artifact with a single primary-key column and no foreign keys.
await project.fileStore.writeFile(
`${rawRoot}/tables/customers.json`,
`${JSON.stringify(
{
name: 'customers',
catalog: null,
db: 'public',
kind: 'table',
comment: 'Customer table',
estimatedRows: 12,
columns: [
{
name: 'id',
nativeType: 'INTEGER',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Customer id',
},
],
foreignKeys: [],
},
null,
2,
)}\n`,
'klo',
'klo@example.com',
'Seed customers artifact',
);
// Seed a second table artifact that references customers.id through a foreign key.
await project.fileStore.writeFile(
`${rawRoot}/tables/orders.json`,
`${JSON.stringify(
{
name: 'orders',
catalog: null,
db: 'public',
kind: 'table',
comment: null,
estimatedRows: 20,
columns: [
{
name: 'id',
nativeType: 'INTEGER',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: null,
},
{
name: 'customer_id',
nativeType: 'INTEGER',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
],
foreignKeys: [
{
fromColumn: 'customer_id',
toCatalog: null,
toDb: 'public',
toTable: 'customers',
toColumn: 'id',
constraintName: null,
},
],
},
null,
2,
)}\n`,
'klo',
'klo@example.com',
'Seed orders artifact',
);
const snapshot = await readLocalScanStructuralSnapshot({
project,
connectionId: 'warehouse',
driver: 'sqlite',
rawSourcesDir: rawRoot,
extractedAtFallback: '2026-04-29T13:00:00.000Z',
});
// extractedAt must come from connection.json (12:00), not from the fallback (13:00);
// tables are expected in the order customers, orders.
expect(snapshot).toMatchObject({
connectionId: 'warehouse',
driver: 'sqlite',
extractedAt: '2026-04-29T12:00:00.000Z',
metadata: { source: 'sqlite-smoke' },
tables: [
{
db: 'public',
name: 'customers',
comment: 'Customer table',
columns: [
{
name: 'id',
nativeType: 'INTEGER',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Customer id',
},
],
},
{
db: 'public',
name: 'orders',
foreignKeys: [
{
fromColumn: 'customer_id',
toCatalog: null,
toDb: 'public',
toTable: 'customers',
toColumn: 'id',
constraintName: null,
},
],
},
],
});
});
it('uses the scan report timestamp when connection.json omits extractedAt', async () => {
const rawRoot = 'raw-sources/warehouse/live-database/sync-2';
// connection.json deliberately has no extractedAt field.
await project.fileStore.writeFile(
`${rawRoot}/connection.json`,
'{"connectionId":"warehouse","metadata":{}}\n',
'klo',
'klo@example.com',
'Seed connection artifact without extractedAt',
);
await project.fileStore.writeFile(
`${rawRoot}/tables/orders.json`,
'{"name":"orders","catalog":null,"db":null,"kind":"table","comment":null,"estimatedRows":null,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n',
'klo',
'klo@example.com',
'Seed orders artifact',
);
const snapshot = await readLocalScanStructuralSnapshot({
project,
connectionId: 'warehouse',
driver: 'postgres',
rawSourcesDir: rawRoot,
extractedAtFallback: '2026-04-29T13:00:00.000Z',
});
// The fallback timestamp is used because the artifact has no extractedAt.
expect(snapshot.extractedAt).toBe('2026-04-29T13:00:00.000Z');
});
});

View file

@ -0,0 +1,125 @@
import type { KloLocalProject } from '../project/index.js';
import type {
KloConnectionDriver,
KloSchemaColumn,
KloSchemaForeignKey,
KloSchemaSnapshot,
KloSchemaTable,
} from './types.js';
/** Input for {@link readLocalScanStructuralSnapshot}. */
export interface ReadLocalScanStructuralSnapshotInput {
/** Project whose file store holds the raw artifacts. */
project: KloLocalProject;
/** Used as the snapshot's connection id when connection.json has no string `connectionId`. */
connectionId: string;
/** Copied onto the snapshot as-is. */
driver: KloConnectionDriver;
/** Directory that contains `connection.json` and the `tables` folder. */
rawSourcesDir: string;
/** Used as the snapshot's `extractedAt` when connection.json has no string `extractedAt`. */
extractedAtFallback: string;
}
/**
 * Shape read from `connection.json`. Every field is `unknown` because the file
 * content is parsed JSON that has not been validated; readers must narrow each
 * field before using it.
 */
interface LiveDatabaseConnectionArtifact {
connectionId?: unknown;
extractedAt?: unknown;
metadata?: unknown;
scope?: unknown;
}
/**
 * Type guard for plain keyed objects.
 *
 * @param value - Any value.
 * @returns `true` for non-null values whose `typeof` is `'object'` and that
 *   are not arrays; `false` for everything else.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== 'object') {
    return false;
  }
  return !Array.isArray(value);
}
/**
 * Coerces an untrusted metadata value into a record.
 *
 * @param value - Value read from an artifact.
 * @returns The same object when it passes `isRecord`, otherwise a new empty object.
 */
function metadataRecord(value: unknown): Record<string, unknown> {
  if (isRecord(value)) {
    return value;
  }
  return {};
}
/**
 * Normalises an optional artifact field into `string | null | undefined`.
 *
 * @param value - Value read from an artifact.
 * @returns `undefined` when the value is absent, the value itself when it is a
 *   string, and `null` for any other type.
 */
function optionalStringOrNull(value: unknown): string | null | undefined {
  switch (typeof value) {
    case 'undefined':
      return undefined;
    case 'string':
      return value;
    default:
      return null;
  }
}
/**
 * Validates one raw column entry of a table artifact and converts it to a schema column.
 *
 * @param rawColumn - Untrusted column value taken from parsed JSON.
 * @param path - Path of the table artifact, used only in the error message.
 * @returns The column; `nullable` and `primaryKey` are `true` only when the raw
 *   value is exactly `true`, and a missing or non-string `comment` becomes `null`.
 * @throws Error when the value is not a record, when `name`, `nativeType`, or
 *   `normalizedType` is not a string, or when `dimensionType` is not one of
 *   `'time'`, `'string'`, `'number'`, `'boolean'`.
 */
function parseColumn(rawColumn: unknown, path: string): KloSchemaColumn {
  if (!isRecord(rawColumn)) {
    throw new Error(`Invalid KLO schema column artifact: ${path}`);
  }

  const { name, nativeType, normalizedType, dimensionType, nullable, primaryKey, comment } = rawColumn;

  if (typeof name !== 'string' || typeof nativeType !== 'string' || typeof normalizedType !== 'string') {
    throw new Error(`Invalid KLO schema column artifact: ${path}`);
  }
  if (
    dimensionType !== 'time' &&
    dimensionType !== 'string' &&
    dimensionType !== 'number' &&
    dimensionType !== 'boolean'
  ) {
    throw new Error(`Invalid KLO schema column artifact: ${path}`);
  }

  return {
    name,
    nativeType,
    normalizedType,
    dimensionType,
    nullable: nullable === true,
    primaryKey: primaryKey === true,
    comment: optionalStringOrNull(comment) ?? null,
  };
}
/**
 * Validates one raw foreign-key entry of a table artifact and converts it to a schema foreign key.
 *
 * @param rawForeignKey - Untrusted foreign-key value taken from parsed JSON.
 * @param path - Path of the table artifact, used only in the error message.
 * @returns The foreign key; `toCatalog`, `toDb`, and `constraintName` become
 *   `null` when missing or not a string.
 * @throws Error when the value is not a record or when `fromColumn`, `toTable`,
 *   or `toColumn` is not a string.
 */
function parseForeignKey(rawForeignKey: unknown, path: string): KloSchemaForeignKey {
  if (!isRecord(rawForeignKey)) {
    throw new Error(`Invalid KLO schema foreign key artifact: ${path}`);
  }

  const { fromColumn, toCatalog, toDb, toTable, toColumn, constraintName } = rawForeignKey;

  if (typeof fromColumn !== 'string' || typeof toTable !== 'string' || typeof toColumn !== 'string') {
    throw new Error(`Invalid KLO schema foreign key artifact: ${path}`);
  }

  return {
    fromColumn,
    toCatalog: optionalStringOrNull(toCatalog) ?? null,
    toDb: optionalStringOrNull(toDb) ?? null,
    toTable,
    toColumn,
    constraintName: optionalStringOrNull(constraintName) ?? null,
  };
}
/**
 * Parses the JSON text of a table artifact into a schema table.
 *
 * @param raw - JSON text of the artifact file.
 * @param path - Path of the artifact, used only in error messages.
 * @returns The table. An unrecognised `kind` falls back to `'table'`, a
 *   non-numeric `estimatedRows` becomes `null`, and a non-array `foreignKeys`
 *   becomes an empty list.
 * @throws SyntaxError when `raw` is not valid JSON (raised by `JSON.parse`).
 * @throws Error when the parsed value is not a record, `name` is not a string,
 *   `columns` is not an array, or any column / foreign key entry is invalid.
 */
function parseTable(raw: string, path: string): KloSchemaTable {
  const artifact: unknown = JSON.parse(raw);
  if (!isRecord(artifact)) {
    throw new Error(`Invalid KLO schema table artifact: ${path}`);
  }

  const {
    name,
    columns: rawColumns,
    foreignKeys: rawForeignKeys,
    kind: rawKind,
    estimatedRows: rawEstimatedRows,
  } = artifact;
  if (typeof name !== 'string' || !Array.isArray(rawColumns)) {
    throw new Error(`Invalid KLO schema table artifact: ${path}`);
  }

  let kind: KloSchemaTable['kind'] = 'table';
  if (rawKind === 'view' || rawKind === 'external' || rawKind === 'event_stream') {
    kind = rawKind;
  }

  // Columns are parsed before foreign keys so an invalid column is reported first.
  const columns = rawColumns.map((column) => parseColumn(column, path));
  const foreignKeys = Array.isArray(rawForeignKeys)
    ? rawForeignKeys.map((foreignKey) => parseForeignKey(foreignKey, path))
    : [];

  return {
    catalog: optionalStringOrNull(artifact.catalog) ?? null,
    db: optionalStringOrNull(artifact.db) ?? null,
    name,
    kind,
    comment: optionalStringOrNull(artifact.comment) ?? null,
    estimatedRows: typeof rawEstimatedRows === 'number' ? rawEstimatedRows : null,
    columns,
    foreignKeys,
  };
}
/**
 * Rebuilds a schema snapshot from the raw artifacts persisted under `rawSourcesDir`.
 *
 * Reads `<rawSourcesDir>/connection.json` plus every `.json` file listed under
 * `<rawSourcesDir>/tables`, in lexicographic path order.
 *
 * @param input - Project, raw sources directory, and fallback values.
 * @returns The snapshot. `connectionId` and `extractedAt` fall back to the
 *   input values when connection.json does not provide strings; `scope` and
 *   `metadata` fall back to empty objects when they are not records.
 * @throws SyntaxError when connection.json or a table artifact is not valid JSON.
 * @throws Error when a table artifact fails validation, or whatever the file
 *   store raises when a file cannot be read or listed.
 */
export async function readLocalScanStructuralSnapshot(
  input: ReadLocalScanStructuralSnapshotInput,
): Promise<KloSchemaSnapshot> {
  const connectionRaw = await input.project.fileStore.readFile(`${input.rawSourcesDir}/connection.json`);
  const parsedConnection: unknown = JSON.parse(connectionRaw.content);
  // JSON.parse can legally return null, an array, or a primitive. Narrow instead of
  // asserting the type, so a `null` document uses the same per-field fallbacks as a
  // document with missing fields instead of crashing on property access.
  const connection: LiveDatabaseConnectionArtifact = isRecord(parsedConnection) ? parsedConnection : {};
  const listedTables = await input.project.fileStore.listFiles(`${input.rawSourcesDir}/tables`);
  // Sorting makes the table order independent of the listing order.
  const tablePaths = listedTables.files.filter((path) => path.endsWith('.json')).sort();
  const tables: KloSchemaTable[] = [];
  for (const path of tablePaths) {
    const tableRaw = await input.project.fileStore.readFile(path);
    tables.push(parseTable(tableRaw.content, path));
  }
  return {
    connectionId: typeof connection.connectionId === 'string' ? connection.connectionId : input.connectionId,
    driver: input.driver,
    extractedAt: typeof connection.extractedAt === 'string' ? connection.extractedAt : input.extractedAtFallback,
    scope: isRecord(connection.scope) ? connection.scope : {},
    metadata: metadataRecord(connection.metadata),
    tables,
  };
}

View file

@ -0,0 +1,376 @@
import { describe, expect, it, vi } from 'vitest';
import {
createKloConnectorCapabilities,
type KloScanConnector,
type KloScanContext,
type KloScanEnrichmentStateSummary,
type KloScanInput,
KloScanOrchestrator,
type KloSchemaSnapshot,
} from './index.js';
/**
 * Test fixture: builds a fresh single-table postgres snapshot for the
 * `warehouse` connection on every call, so tests never share object identity.
 */
function snapshot(): KloSchemaSnapshot {
  const ordersTable: KloSchemaSnapshot['tables'][number] = {
    catalog: null,
    db: 'public',
    name: 'orders',
    kind: 'table',
    comment: 'Orders table',
    estimatedRows: null,
    columns: [
      {
        name: 'id',
        nativeType: 'integer',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey: true,
        comment: 'Order id',
      },
    ],
    foreignKeys: [],
  };

  return {
    connectionId: 'warehouse',
    driver: 'postgres',
    extractedAt: '2026-04-29T00:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: { source: 'test' },
    tables: [ordersTable],
  };
}
/**
 * Test fixture: builds a postgres scan connector whose `introspect` is a mock
 * that resolves to a fresh `snapshot()`.
 *
 * @param capabilities - Capabilities to advertise; defaults to table and column sampling.
 */
function connector(
  capabilities = createKloConnectorCapabilities({ tableSampling: true, columnSampling: true }),
): KloScanConnector {
  const introspect = vi.fn(async () => snapshot());
  return { id: 'connector-1', driver: 'postgres', capabilities, introspect };
}
/**
 * Test fixture: builds a scan context with run id `scan-run-1` and a logger
 * whose four level methods are independent mocks.
 */
function context(): KloScanContext {
  const logger = {
    debug: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
  };
  return { runId: 'scan-run-1', logger };
}
// Baseline structural scan input; individual tests spread it and override
// `mode`, `detectRelationships`, or `dryRun`.
const input: KloScanInput = {
connectionId: 'warehouse',
driver: 'postgres',
mode: 'structural',
};
// Spec for KloScanOrchestrator.run, driven through mocked connector and host callbacks.
describe('KloScanOrchestrator', () => {
it('runs structural scans through connector introspection and structural host callback', async () => {
const scanConnector = connector();
const scanContext = context();
const runStructural = vi.fn(async (scanSnapshot: KloSchemaSnapshot) => ({
result: { synced: true },
diffSummary: { tablesAdded: scanSnapshot.tables.length, columnsAdded: 1 },
structuralSyncStats: { tablesCreated: 1, columnsCreated: 1 },
artifactPaths: { manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'] },
}));
// Fixed clock and sync id keep `createdAt` and `syncId` deterministic.
const result = await new KloScanOrchestrator({
now: () => new Date('2026-04-29T00:10:00.000Z'),
syncIdFactory: () => 'sync-1',
}).run({
connector: scanConnector,
input,
trigger: 'schema_scan',
context: scanContext,
runStructural,
});
expect(scanConnector.introspect).toHaveBeenCalledWith(input, scanContext);
expect(runStructural).toHaveBeenCalledWith(snapshot(), scanContext);
expect(result.snapshot.connectionId).toBe('warehouse');
expect(result.structural.result).toEqual({ synced: true });
expect(result.enrichment).toBeNull();
// manifestShardsWritten is expected to be derived from the single manifest shard path,
// and every enrichment stage is expected to be 'skipped' for a structural scan.
expect(result.report).toMatchObject({
connectionId: 'warehouse',
driver: 'postgres',
syncId: 'sync-1',
runId: 'scan-run-1',
trigger: 'schema_scan',
mode: 'structural',
dryRun: false,
diffSummary: {
tablesAdded: 1,
columnsAdded: 1,
},
structuralSyncStats: {
tablesCreated: 1,
columnsCreated: 1,
},
manifestShardsWritten: 1,
artifactPaths: {
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
},
enrichment: {
dataDictionary: 'skipped',
columnDescriptions: 'skipped',
tableDescriptions: 'skipped',
embeddings: 'skipped',
deterministicRelationships: 'skipped',
llmRelationshipValidation: 'skipped',
statisticalValidation: 'skipped',
},
enrichmentState: {
resumedStages: [],
completedStages: [],
failedStages: [],
},
createdAt: '2026-04-29T00:10:00.000Z',
});
});
it('runs enriched scans through structural and enrichment host callbacks', async () => {
// All four optional capabilities are enabled, so no capability gaps are expected.
const scanConnector = connector(
createKloConnectorCapabilities({
tableSampling: true,
columnSampling: true,
columnStats: true,
readOnlySql: true,
}),
);
const scanContext = context();
const result = await new KloScanOrchestrator({ syncIdFactory: () => 'sync-2' }).run({
connector: scanConnector,
input: { ...input, mode: 'enriched', detectRelationships: true },
trigger: 'schema_scan',
context: scanContext,
runStructural: vi.fn(async () => ({
result: { schemaId: 'schema-1' },
structuralSyncStats: { tablesCreated: 1 },
})),
runEnrichment: vi.fn(async () => ({
result: { enriched: true },
enrichment: {
dataDictionary: 'completed',
columnDescriptions: 'completed',
tableDescriptions: 'completed',
embeddings: 'completed',
deterministicRelationships: 'completed',
statisticalValidation: 'completed',
} as const,
relationships: { accepted: 2, rejected: 1 },
})),
});
expect(result.enrichment?.result).toEqual({ enriched: true });
expect(result.report.enrichment.columnDescriptions).toBe('completed');
// Relationship counters that the enrichment phase omitted are expected to be 0.
expect(result.report.relationships).toEqual({ accepted: 2, review: 0, rejected: 1, skipped: 0 });
expect(result.report.capabilityGaps).toEqual([]);
expect(result.report.warnings).toEqual([]);
});
it('reports host enrichment state summaries from enriched scan phases', async () => {
const scanConnector = connector(
createKloConnectorCapabilities({
tableSampling: true,
columnSampling: true,
columnStats: true,
readOnlySql: true,
}),
);
// The input stage lists are unsorted and contain a duplicate on purpose.
const enrichmentState: Partial<KloScanEnrichmentStateSummary> = {
resumedStages: ['relationships', 'descriptions', 'descriptions'],
completedStages: ['embeddings', 'descriptions', 'relationships'],
failedStages: [],
};
const result = await new KloScanOrchestrator({ syncIdFactory: () => 'sync-state' }).run({
connector: scanConnector,
input: { ...input, mode: 'enriched', detectRelationships: true },
trigger: 'schema_scan',
context: context(),
runStructural: vi.fn(async () => ({ result: { synced: true } })),
runEnrichment: vi.fn(async () => ({
result: { enriched: true },
enrichmentState,
})),
});
// The report is expected to carry de-duplicated, alphabetically ordered stage lists.
expect(result.report.enrichmentState).toEqual({
resumedStages: ['descriptions', 'relationships'],
completedStages: ['descriptions', 'embeddings', 'relationships'],
failedStages: [],
});
});
it('records recoverable warnings for missing optional capabilities during enriched scans', async () => {
// The connector is built with default capabilities, so every enriched requirement is expected as a gap.
const result = await new KloScanOrchestrator({ syncIdFactory: () => 'sync-3' }).run({
connector: connector(createKloConnectorCapabilities()),
input: { ...input, mode: 'enriched', detectRelationships: true },
trigger: 'schema_scan',
context: context(),
runStructural: vi.fn(async () => ({ result: {} })),
runEnrichment: vi.fn(async () => ({ result: {} })),
});
expect(result.report.capabilityGaps).toEqual(['tableSampling', 'columnSampling', 'columnStats', 'readOnlySql']);
expect(result.report.warnings.map((warning) => warning.code)).toEqual([
'connector_capability_missing',
'connector_capability_missing',
'connector_capability_missing',
'connector_capability_missing',
]);
expect(result.report.warnings.every((warning) => warning.recoverable)).toBe(true);
});
it('redacts structural and enrichment warning metadata before returning reports', async () => {
const result = await new KloScanOrchestrator({ syncIdFactory: () => 'sync-redacted' }).run({
connector: connector(
createKloConnectorCapabilities({
tableSampling: true,
columnSampling: true,
columnStats: true,
readOnlySql: true,
}),
),
input: { ...input, mode: 'enriched' },
trigger: 'schema_scan',
context: context(),
// Both phases return warnings whose metadata carries a secret-looking value.
runStructural: vi.fn(async () => ({
result: {},
warnings: [
{
code: 'sampling_failed',
message: 'structural warning',
recoverable: true,
metadata: {
url: 'postgres://reader:secret@example.test/db', // pragma: allowlist secret
table: 'orders',
},
} as const,
],
})),
runEnrichment: vi.fn(async () => ({
result: {},
warnings: [
{
code: 'embedding_unavailable',
message: 'enrichment warning',
recoverable: true,
metadata: {
nested: {
api_key: 'sk_test_123', // pragma: allowlist secret
schema: 'public',
},
},
} as const,
],
})),
});
// Secret-looking values are replaced, non-secret siblings are kept, and structural
// warnings are expected before enrichment warnings.
expect(result.report.warnings).toEqual([
{
code: 'sampling_failed',
message: 'structural warning',
recoverable: true,
metadata: {
url: '<redacted>',
table: 'orders',
},
},
{
code: 'embedding_unavailable',
message: 'enrichment warning',
recoverable: true,
metadata: {
nested: {
api_key: '<redacted>',
schema: 'public',
},
},
},
]);
});
it('keeps structural results when the enrichment phase fails after structural sync', async () => {
const scanConnector = connector(
createKloConnectorCapabilities({
tableSampling: true,
columnSampling: true,
columnStats: true,
readOnlySql: true,
}),
);
const runStructural = vi.fn(async () => ({
result: { synced: true },
artifactPaths: {
rawSourcesDir: 'raw-sources/warehouse/live-database/sync-failed-enrichment',
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
},
manifestShardsWritten: 1,
}));
// The enrichment callback always rejects.
const runEnrichment = vi.fn(async () => {
throw new Error('AI Gateway timed out');
});
const result = await new KloScanOrchestrator({
now: () => new Date('2026-04-29T18:00:00.000Z'),
syncIdFactory: () => 'sync-failed-enrichment',
}).run({
connector: scanConnector,
input: { ...input, mode: 'enriched', detectRelationships: true },
trigger: 'schema_scan',
context: context(),
runStructural,
runEnrichment,
});
// run() is expected to resolve: structural output is kept and the failure becomes a warning.
expect(result.structural.result).toEqual({ synced: true });
expect(result.enrichment).toBeNull();
expect(result.report.artifactPaths.manifestShards).toEqual(['semantic-layer/warehouse/_schema/public.yaml']);
expect(result.report.manifestShardsWritten).toBe(1);
expect(result.report.enrichment).toEqual({
dataDictionary: 'failed',
tableDescriptions: 'failed',
columnDescriptions: 'failed',
embeddings: 'failed',
deterministicRelationships: 'failed',
llmRelationshipValidation: 'failed',
statisticalValidation: 'failed',
});
expect(result.report.warnings).toEqual([
{
code: 'enrichment_failed',
message: 'KLO scan enrichment failed after structural scan completed: AI Gateway timed out',
recoverable: true,
metadata: {
mode: 'enriched',
detectRelationships: true,
},
},
]);
});
it('marks dry-run reports without changing host callback behavior', async () => {
const runStructural = vi.fn(async () => ({ result: { planned: true }, manifestShardsWritten: 0 }));
const result = await new KloScanOrchestrator({ syncIdFactory: () => 'sync-4' }).run({
connector: connector(),
input: { ...input, dryRun: true },
trigger: 'cli',
context: context(),
runStructural,
});
// The structural callback is still invoked for a dry run; only the report flag differs.
expect(runStructural).toHaveBeenCalledTimes(1);
expect(result.report.dryRun).toBe(true);
expect(result.report.trigger).toBe('cli');
});
});

View file

@ -0,0 +1,297 @@
import { redactKloScanReport } from './credentials.js';
import { completedKloScanEnrichmentStateSummary, summarizeKloScanEnrichmentState } from './enrichment-state.js';
import {
failedKloScanEnrichmentSummary,
kloScanErrorMessage,
skippedKloScanEnrichmentSummary,
} from './enrichment-summary.js';
import type {
KloConnectorCapabilities,
KloScanArtifactPaths,
KloScanConnector,
KloScanContext,
KloScanDiffSummary,
KloScanEnrichmentSummary,
KloScanEnrichmentStateSummary,
KloScanInput,
KloScanRelationshipSummary,
KloScanReport,
KloScanTrigger,
KloScanWarning,
KloSchemaSnapshot,
KloStructuralSyncStats,
} from './types.js';
/** Name of any connector capability other than `structuralIntrospection`. */
type CapabilityGap = keyof Omit<KloConnectorCapabilities, 'structuralIntrospection'>;
/**
 * Value returned by the host's `runStructural` callback.
 *
 * Only `result` is required; the optional fields are partial contributions
 * that the orchestrator merges into the scan report.
 */
export interface KloStructuralScanPhaseResult<TResult = unknown> {
/** Host-defined payload, returned to the caller unchanged. */
result: TResult;
diffSummary?: Partial<KloScanDiffSummary>;
structuralSyncStats?: Partial<KloStructuralSyncStats>;
/** When omitted, the count is derived from `artifactPaths.manifestShards`. */
manifestShardsWritten?: number;
artifactPaths?: Partial<KloScanArtifactPaths>;
relationships?: Partial<KloScanRelationshipSummary>;
warnings?: KloScanWarning[];
}
/**
 * Value returned by the host's `runEnrichment` callback.
 *
 * Only `result` is required; the optional fields are partial contributions
 * that the orchestrator merges into the scan report.
 */
export interface KloEnrichmentScanPhaseResult<TResult = unknown> {
/** Host-defined payload, returned to the caller unchanged. */
result: TResult;
enrichment?: Partial<KloScanEnrichmentSummary>;
enrichmentState?: Partial<KloScanEnrichmentStateSummary>;
/** When omitted, the count is derived from `artifactPaths.manifestShards`. */
manifestShardsWritten?: number;
artifactPaths?: Partial<KloScanArtifactPaths>;
relationships?: Partial<KloScanRelationshipSummary>;
warnings?: KloScanWarning[];
}
/** Arguments for `KloScanOrchestrator.run`. */
export interface KloScanOrchestratorRunInput<TStructuralResult = unknown, TEnrichmentResult = unknown> {
/** Connector used for introspection and for the capability-gap check. */
connector: KloScanConnector;
input: KloScanInput;
trigger: KloScanTrigger;
context: KloScanContext;
/** Explicit sync id; when omitted the orchestrator's `syncIdFactory` supplies one. */
syncId?: string;
/** Host callback that receives the introspected snapshot. */
runStructural: (
snapshot: KloSchemaSnapshot,
context: KloScanContext,
) => Promise<KloStructuralScanPhaseResult<TStructuralResult>>;
/**
 * Optional host callback for enrichment. It is only invoked when the mode is
 * not `'structural'` or relationship detection is requested.
 */
runEnrichment?: (
snapshot: KloSchemaSnapshot,
structural: KloStructuralScanPhaseResult<TStructuralResult>,
context: KloScanContext,
) => Promise<KloEnrichmentScanPhaseResult<TEnrichmentResult>>;
}
/** Outcome of `KloScanOrchestrator.run`. */
export interface KloScanOrchestratorRunResult<TStructuralResult = unknown, TEnrichmentResult = unknown> {
snapshot: KloSchemaSnapshot;
structural: KloStructuralScanPhaseResult<TStructuralResult>;
/** `null` when enrichment did not run or its callback threw. */
enrichment: KloEnrichmentScanPhaseResult<TEnrichmentResult> | null;
report: KloScanReport;
}
/** Constructor options for `KloScanOrchestrator`; both exist mainly to make runs deterministic. */
export interface KloScanOrchestratorOptions {
/** Clock used for the report's `createdAt`; defaults to `() => new Date()`. */
now?: () => Date;
/** Produces the sync id when the run input has none; defaults to the context's `runId`. */
syncIdFactory?: (input: KloScanInput, context: KloScanContext) => string;
}
// All-zero diff summary used as the base that partial phase results are spread over.
const emptyDiffSummary: KloScanDiffSummary = {
tablesAdded: 0,
tablesModified: 0,
tablesDeleted: 0,
tablesUnchanged: 0,
columnsAdded: 0,
columnsModified: 0,
columnsDeleted: 0,
};
// All-zero structural sync stats used as the base that partial phase results are spread over.
const emptyStructuralSyncStats: KloStructuralSyncStats = {
tablesCreated: 0,
tablesUpdated: 0,
tablesDeleted: 0,
columnsCreated: 0,
columnsUpdated: 0,
columnsDeleted: 0,
};
// Artifact paths with nothing recorded; used as the base in mergeArtifactPaths,
// which replaces both arrays with freshly built ones.
const emptyArtifactPaths: KloScanArtifactPaths = {
rawSourcesDir: null,
reportPath: null,
manifestShards: [],
enrichmentArtifacts: [],
};
/**
 * Completes a partial diff summary with zero defaults.
 *
 * @param input - Counters reported by the structural phase, if any.
 * @returns A new summary where every counter missing from `input` is 0.
 */
function mergeDiffSummary(input?: Partial<KloScanDiffSummary>): KloScanDiffSummary {
  const reported = input ?? {};
  return { ...emptyDiffSummary, ...reported };
}
/**
 * Completes partial structural sync stats with zero defaults.
 *
 * @param input - Stats reported by the structural phase, if any.
 * @returns A new stats object where every counter missing from `input` is 0.
 */
function mergeStructuralSyncStats(input?: Partial<KloStructuralSyncStats>): KloStructuralSyncStats {
  const reported = input ?? {};
  return { ...emptyStructuralSyncStats, ...reported };
}
/**
 * Completes a partial enrichment summary, starting from the "skipped" summary.
 *
 * @param input - Stage statuses reported by the enrichment phase, if any.
 * @returns A new summary where stages missing from `input` keep their value
 *   from `skippedKloScanEnrichmentSummary`.
 */
function mergeEnrichmentSummary(input?: Partial<KloScanEnrichmentSummary>): KloScanEnrichmentSummary {
  const reported = input ?? {};
  return { ...skippedKloScanEnrichmentSummary, ...reported };
}
/**
 * Normalises the enrichment state reported by the host.
 *
 * @param input - Partial state from the enrichment phase, if any.
 * @returns The result of `completedKloScanEnrichmentStateSummary()` when no
 *   state was reported; otherwise the reported lists (missing ones treated as
 *   empty) passed through `summarizeKloScanEnrichmentState`.
 */
function mergeEnrichmentState(input?: Partial<KloScanEnrichmentStateSummary>): KloScanEnrichmentStateSummary {
  return input
    ? summarizeKloScanEnrichmentState({
        resumedStages: input.resumedStages ?? [],
        completedStages: input.completedStages ?? [],
        failedStages: input.failedStages ?? [],
      })
    : completedKloScanEnrichmentStateSummary();
}
/**
 * Combines the artifact paths reported by the structural and enrichment phases.
 *
 * Scalar fields follow last-writer-wins (defaults, then structural, then
 * enrichment). The two list fields are concatenated, structural entries first,
 * without de-duplication.
 *
 * @param structural - Paths from the structural phase, if any.
 * @param enrichment - Paths from the enrichment phase, if any.
 * @returns A new artifact-paths object with freshly allocated arrays.
 */
function mergeArtifactPaths(
  structural?: Partial<KloScanArtifactPaths>,
  enrichment?: Partial<KloScanArtifactPaths>,
): KloScanArtifactPaths {
  const manifestShards = [...(structural?.manifestShards ?? []), ...(enrichment?.manifestShards ?? [])];
  const enrichmentArtifacts = [
    ...(structural?.enrichmentArtifacts ?? []),
    ...(enrichment?.enrichmentArtifacts ?? []),
  ];
  const scalarsAndDefaults = { ...emptyArtifactPaths, ...structural, ...enrichment };
  return { ...scalarsAndDefaults, manifestShards, enrichmentArtifacts };
}
/**
 * Adds up the relationship counters reported by both scan phases.
 *
 * @param structural - Counters from the structural phase, if any.
 * @param enrichment - Counters from the enrichment phase, if any.
 * @returns A summary where each counter is the sum of both phases, with
 *   missing counters counted as 0.
 */
function mergeRelationshipSummary(
  structural?: Partial<KloScanRelationshipSummary>,
  enrichment?: Partial<KloScanRelationshipSummary>,
): KloScanRelationshipSummary {
  const total = (counter: 'accepted' | 'review' | 'rejected' | 'skipped'): number =>
    (structural?.[counter] ?? 0) + (enrichment?.[counter] ?? 0);

  return {
    accepted: total('accepted'),
    review: total('review'),
    rejected: total('rejected'),
    skipped: total('skipped'),
  };
}
/**
 * Determines how many manifest shards a scan phase wrote.
 *
 * @param phase - Phase result carrying an optional explicit count and optional artifact paths.
 * @returns The explicit `manifestShardsWritten` when present (including 0),
 *   otherwise the length of `artifactPaths.manifestShards`, otherwise 0.
 */
function manifestShardsWritten(phase: {
  manifestShardsWritten?: number;
  artifactPaths?: Partial<KloScanArtifactPaths>;
}): number {
  const { manifestShardsWritten: explicitCount, artifactPaths } = phase;
  if (explicitCount !== undefined && explicitCount !== null) {
    return explicitCount;
  }
  return artifactPaths?.manifestShards?.length ?? 0;
}
/**
 * Lists the optional connector capabilities a scan configuration relies on.
 *
 * @param mode - Requested scan mode.
 * @param detectRelationships - Whether relationship detection was requested.
 * @returns Capability names without duplicates, in first-required order:
 *   enriched mode needs sampling, column stats, and read-only SQL;
 *   relationships mode or relationship detection needs column stats and read-only SQL.
 */
function requiredCapabilities(mode: KloScanInput['mode'], detectRelationships: boolean | undefined): CapabilityGap[] {
  const wanted: CapabilityGap[] = [];
  if (mode === 'enriched') {
    wanted.push('tableSampling', 'columnSampling', 'columnStats', 'readOnlySql');
  }
  if (mode === 'relationships' || detectRelationships) {
    wanted.push('columnStats', 'readOnlySql');
  }
  // A Set keeps first-insertion order while dropping repeats.
  return [...new Set(wanted)];
}
/**
 * Finds the required capabilities that a connector does not provide.
 *
 * @param capabilities - Capabilities advertised by the connector.
 * @param input - Scan input; a missing `mode` is treated as `'structural'`.
 * @returns The required capability names whose flag on `capabilities` is falsy.
 */
function capabilityGaps(capabilities: KloConnectorCapabilities, input: KloScanInput): CapabilityGap[] {
  const scanMode = input.mode ?? 'structural';
  const needed = requiredCapabilities(scanMode, input.detectRelationships);
  return needed.filter((capability) => !capabilities[capability]);
}
/**
 * Turns each missing capability into a recoverable scan warning.
 *
 * @param gaps - Names of the missing capabilities.
 * @returns One `connector_capability_missing` warning per gap, in input order.
 */
function warningsForCapabilityGaps(gaps: CapabilityGap[]): KloScanWarning[] {
  const warnings: KloScanWarning[] = [];
  for (const gap of gaps) {
    warnings.push({
      code: 'connector_capability_missing',
      message: `KLO scan connector is missing optional capability: ${gap}`,
      recoverable: true,
      metadata: { capability: gap },
    });
  }
  return warnings;
}
/**
 * Stops the scan when the context's abort signal has fired.
 *
 * @param context - Scan context; a missing `signal` counts as "not aborted".
 * @throws Error with message `KLO scan aborted` when `context.signal.aborted` is truthy.
 */
function assertNotAborted(context: KloScanContext): void {
  const aborted = context.signal?.aborted;
  if (!aborted) {
    return;
  }
  throw new Error('KLO scan aborted');
}
/**
 * Coordinates a single KLO scan: connector introspection, the structural phase,
 * an optional enrichment phase, and assembly of the redacted scan report.
 */
export class KloScanOrchestrator {
  private readonly now: () => Date;
  private readonly syncIdFactory: (input: KloScanInput, context: KloScanContext) => string;

  /**
   * @param options Optional clock and sync-id factory. Defaults: the current time,
   *   and the context's `runId` as the sync id.
   */
  constructor(options: KloScanOrchestratorOptions = {}) {
    const { now, syncIdFactory } = options;
    this.now = now ?? (() => new Date());
    this.syncIdFactory = syncIdFactory ?? ((_, context) => context.runId);
  }

  /**
   * Runs the scan phases in order and returns the snapshot, phase results and report.
   *
   * The abort signal is checked before introspection, before the structural phase
   * and before enrichment. A failure thrown by the enrichment phase is not
   * rethrown: it is recorded as a recoverable `enrichment_failed` warning and the
   * report is still produced.
   *
   * @param input Connector, scan input, context, trigger and phase callbacks.
   * @returns Snapshot, structural result, enrichment result (or `null`) and the report.
   * @throws Error `KLO scan aborted` when the signal is aborted at a checkpoint;
   *   errors from introspection or the structural phase propagate unchanged.
   */
  async run<TStructuralResult = unknown, TEnrichmentResult = unknown>(
    input: KloScanOrchestratorRunInput<TStructuralResult, TEnrichmentResult>,
  ): Promise<KloScanOrchestratorRunResult<TStructuralResult, TEnrichmentResult>> {
    const { connector, context, input: scanInput } = input;
    const mode = scanInput.mode ?? 'structural';
    const syncId = input.syncId ?? this.syncIdFactory(scanInput, context);
    const gaps = capabilityGaps(connector.capabilities, scanInput);
    // Orchestrator-level warnings; phase warnings are appended when the report is built.
    const warnings = warningsForCapabilityGaps(gaps);

    context.logger?.info('Starting KLO scan', {
      connectionId: scanInput.connectionId,
      connectorId: connector.id,
      mode,
      trigger: input.trigger,
    });

    assertNotAborted(context);
    const snapshot = await connector.introspect(scanInput, context);

    assertNotAborted(context);
    const structural = await input.runStructural(snapshot, context);

    let enrichment: KloEnrichmentScanPhaseResult<TEnrichmentResult> | null = null;
    let failedEnrichment: KloScanEnrichmentSummary | null = null;

    // Enrichment is wanted for any non-structural mode or when relationship
    // detection was explicitly requested.
    const enrichmentRequested = mode !== 'structural' || scanInput.detectRelationships;
    if (enrichmentRequested) {
      if (!input.runEnrichment) {
        // Requested but no phase callback supplied: report it instead of failing.
        failedEnrichment = failedKloScanEnrichmentSummary(mode, scanInput.detectRelationships ?? false);
        warnings.push({
          code: 'connector_capability_missing',
          message: 'KLO scan requested enrichment or relationship detection, but no enrichment phase was provided',
          recoverable: true,
          metadata: { mode, detectRelationships: scanInput.detectRelationships ?? false },
        });
      } else {
        assertNotAborted(context);
        try {
          enrichment = await input.runEnrichment(snapshot, structural, context);
        } catch (error) {
          // The structural result is kept; the enrichment failure becomes a warning.
          const message = kloScanErrorMessage(error);
          failedEnrichment = failedKloScanEnrichmentSummary(mode, scanInput.detectRelationships ?? false);
          warnings.push({
            code: 'enrichment_failed',
            message: `KLO scan enrichment failed after structural scan completed: ${message}`,
            recoverable: true,
            metadata: { mode, detectRelationships: scanInput.detectRelationships ?? false },
          });
          context.logger?.warn('KLO scan enrichment failed after structural scan completed', {
            connectionId: scanInput.connectionId,
            runId: context.runId,
            mode,
            error: message,
          });
        }
      }
    }

    const structuralShardCount = manifestShardsWritten(structural);
    const enrichmentShardCount = enrichment ? manifestShardsWritten(enrichment) : 0;
    const manifestShardCount = structuralShardCount + enrichmentShardCount;

    const report: KloScanReport = redactKloScanReport({
      connectionId: scanInput.connectionId,
      driver: scanInput.driver,
      syncId,
      runId: context.runId,
      trigger: input.trigger,
      mode,
      dryRun: scanInput.dryRun ?? false,
      artifactPaths: mergeArtifactPaths(structural.artifactPaths, enrichment?.artifactPaths),
      diffSummary: mergeDiffSummary(structural.diffSummary),
      manifestShardsWritten: manifestShardCount,
      structuralSyncStats: mergeStructuralSyncStats(structural.structuralSyncStats),
      enrichment: mergeEnrichmentSummary(enrichment?.enrichment ?? failedEnrichment ?? undefined),
      capabilityGaps: gaps,
      warnings: [...warnings, ...(structural.warnings ?? []), ...(enrichment?.warnings ?? [])],
      relationships: mergeRelationshipSummary(structural.relationships, enrichment?.relationships),
      enrichmentState: mergeEnrichmentState(enrichment?.enrichmentState),
      createdAt: this.now().toISOString(),
    });

    context.logger?.info('Completed KLO scan', {
      connectionId: report.connectionId,
      runId: report.runId,
      syncId: report.syncId,
      warnings: report.warnings.length,
    });

    return { snapshot, structural, enrichment, report };
  }
}

View file

@ -0,0 +1,310 @@
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { dirname, join } from 'node:path';
import { runLocalStageOnlyIngest, type SourceAdapter } from '../ingest/index.js';
import { initKloProject, loadKloProject } from '../project/index.js';
import { describe, expect, it } from 'vitest';
import { readLocalScanRelationshipArtifacts } from './relationship-artifacts.js';
import type { KloRelationshipArtifact, KloRelationshipDiagnosticsArtifact } from './relationship-diagnostics.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import type { KloScanReport } from './types.js';
/**
 * Writes `content` as UTF-8 to `relativePath` inside `projectDir`, creating any
 * missing parent directories first.
 */
async function writeProjectFile(projectDir: string, relativePath: string, content: string): Promise<void> {
  const targetPath = join(projectDir, relativePath);
  const parentDir = dirname(targetPath);
  await mkdir(parentDir, { recursive: true });
  await writeFile(targetPath, content, 'utf-8');
}
/**
 * Overwrites `klo.yaml` in `projectDir` with a config that declares a read-only
 * sqlite connection named `warehouse` and the `live-database` ingest adapter.
 *
 * Nested keys use 2-space steps so that `driver`, `path` and `readonly` are
 * children of `connections.warehouse`. With a single level of indentation for
 * every nested line they parse as siblings of `warehouse`, leaving the
 * connection itself empty.
 */
async function writeWarehouseConfig(projectDir: string): Promise<void> {
  const configLines = [
    'project: warehouse',
    'connections:',
    '  warehouse:',
    '    driver: sqlite',
    '    path: warehouse.db',
    '    readonly: true',
    'ingest:',
    '  adapters:',
    '    - live-database',
    '', // trailing newline after join
  ];
  await writeFile(join(projectDir, 'klo.yaml'), configLines.join('\n'), 'utf-8');
}
/**
 * Test double for the `live-database` source adapter.
 *
 * `fetch` stages a connection file, an empty foreign-key file and a single
 * `orders` table; `detect` rewrites the connection file and reports a match;
 * `chunk` returns one work unit covering the `orders` table.
 */
function liveDatabaseAdapter(): SourceAdapter {
  const connectionJson = '{"connectionId":"warehouse"}\n';
  const foreignKeysJson = '{"foreignKeys":[]}\n';
  const ordersTableJson =
    '{"name":"orders","db":"public","columns":[{"name":"id","type":"integer","nullable":false,"primaryKey":true}]}\n';

  return {
    source: 'live-database',
    skillNames: ['live_database_ingest'],
    fetch: async (_pullConfig, stagedDir) => {
      const tablesDir = join(stagedDir, 'tables');
      await mkdir(tablesDir, { recursive: true });
      await writeFile(join(stagedDir, 'connection.json'), connectionJson, 'utf-8');
      await writeFile(join(stagedDir, 'foreign-keys.json'), foreignKeysJson, 'utf-8');
      await writeFile(join(stagedDir, 'tables', 'orders.json'), ordersTableJson, 'utf-8');
    },
    detect: async (stagedDir) => {
      await writeFile(join(stagedDir, 'connection.json'), connectionJson, 'utf-8');
      return true;
    },
    chunk: async () => {
      const ordersUnit = {
        unitKey: 'live-database-public-orders',
        rawFiles: ['tables/orders.json'],
        dependencyPaths: ['connection.json', 'foreign-keys.json'],
        peerFileIndex: [],
      };
      return { workUnits: [ordersUnit] };
    },
  };
}
/**
 * Initializes a `warehouse` project in `projectDir`, writes its config, and runs
 * a stage-only `live-database` ingest under `runId` with a fixed clock.
 *
 * @returns The loaded project (loaded before the ingest runs).
 */
async function createLiveDatabaseRun(projectDir: string, runId: string) {
  const frozenNow = () => new Date('2026-05-07T10:00:00.000Z');

  await initKloProject({ projectDir, projectName: 'warehouse' });
  await writeWarehouseConfig(projectDir);

  const project = await loadKloProject({ projectDir });
  const adapters = [liveDatabaseAdapter()];
  await runLocalStageOnlyIngest({
    project,
    adapters,
    adapter: 'live-database',
    connectionId: 'warehouse',
    jobId: runId,
    now: frozenNow,
  });

  return project;
}
/**
 * Builds a scan report fixture for run `scan-run-review` on the `warehouse`
 * connection in `relationships` mode.
 *
 * @param enrichmentArtifacts Paths stored under `artifactPaths.enrichmentArtifacts`.
 * @param syncId Sync id; also used to derive the raw-sources directory and report path.
 */
function scanReport(enrichmentArtifacts: string[], syncId = '2026-05-07-100000-scan-run-review'): KloScanReport {
  const rawSourcesDir = `raw-sources/warehouse/live-database/${syncId}`;
  const reportPath = `${rawSourcesDir}/scan-report.json`;

  const report: KloScanReport = {
    connectionId: 'warehouse',
    driver: 'sqlite',
    syncId,
    runId: 'scan-run-review',
    trigger: 'cli',
    mode: 'relationships',
    dryRun: false,
    artifactPaths: {
      rawSourcesDir,
      reportPath,
      manifestShards: [],
      enrichmentArtifacts,
    },
    diffSummary: {
      tablesAdded: 0,
      tablesModified: 0,
      tablesDeleted: 0,
      tablesUnchanged: 2,
      columnsAdded: 0,
      columnsModified: 0,
      columnsDeleted: 0,
    },
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 0,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'skipped',
    },
    capabilityGaps: [],
    warnings: [],
    relationships: { accepted: 0, review: 1, rejected: 1, skipped: 0 },
    enrichmentState: {
      resumedStages: [],
      completedStages: ['relationships'],
      failedStages: [],
    },
    createdAt: '2026-05-07T10:00:00.000Z',
  };
  return report;
}
// Relationship artifact fixture that the tests serialize to
// enrichment/relationships.json. It holds no accepted or skipped candidates,
// one `review` candidate (orders.customer_id -> customers.id) and one
// `rejected` candidate (orders.note_id -> notes.id).
const relationshipArtifact: KloRelationshipArtifact = {
  connectionId: 'warehouse',
  accepted: [],
  review: [
    {
      id: 'orders:orders.customer_id->customers:customers.id',
      status: 'review',
      source: 'deterministic_name',
      from: {
        tableId: 'orders',
        columnIds: ['orders.customer_id'],
        table: { catalog: null, db: 'public', name: 'orders' },
        columns: ['customer_id'],
      },
      to: {
        tableId: 'customers',
        columnIds: ['customers.id'],
        table: { catalog: null, db: 'public', name: 'customers' },
        columns: ['id'],
      },
      relationshipType: 'many_to_one',
      confidence: 0.62,
      pkScore: 0.91,
      fkScore: 0.62,
      score: 0.62,
      evidence: { sources: ['table_suffix'] },
      validation: { status: 'unavailable' },
      graph: { reasons: ['validation_unavailable_review_only'] },
      reasons: ['validation_unavailable_review_only', 'fk_score_review'],
    },
  ],
  rejected: [
    {
      id: 'orders:orders.note_id->notes:notes.id',
      status: 'rejected',
      source: 'deterministic_name',
      from: {
        tableId: 'orders',
        columnIds: ['orders.note_id'],
        table: { catalog: null, db: 'public', name: 'orders' },
        columns: ['note_id'],
      },
      to: {
        tableId: 'notes',
        columnIds: ['notes.id'],
        table: { catalog: null, db: 'public', name: 'notes' },
        columns: ['id'],
      },
      relationshipType: 'many_to_one',
      confidence: 0.2,
      pkScore: 0.4,
      fkScore: 0.2,
      score: 0.2,
      evidence: { sources: ['exact_column_match'] },
      validation: { status: 'failed' },
      graph: { reasons: ['low_source_coverage'] },
      reasons: ['low_source_coverage'],
    },
  ],
  skipped: [],
};
// Diagnostics artifact fixture that the tests serialize to
// enrichment/relationship-diagnostics.json. Its summary counts (0 accepted,
// 1 review, 1 rejected, 0 skipped) describe a run where validation was not
// available (`validation.available` and `sqlAvailable` are both false).
const diagnosticsArtifact: KloRelationshipDiagnosticsArtifact = {
  connectionId: 'warehouse',
  generatedAt: '2026-05-07T10:00:00.000Z',
  summary: { accepted: 0, review: 1, rejected: 1, skipped: 0 },
  noAcceptedReason: 'relationship candidates require review before manifest writes',
  candidateCountsBySource: { deterministic_name: 2 },
  validation: { available: false, sqlAvailable: false, queryCount: 0 },
  thresholds: { acceptThreshold: 0.85, reviewThreshold: 0.55 },
  policy: {
    validationRequiredForManifest: true,
    maxCandidatesPerColumn: 25,
    profileSampleRows: 10000,
    validationConcurrency: 4,
  },
  warnings: [],
  profileWarnings: ['KLO scan connector cannot run read-only SQL relationship validation'],
};
// Profile artifact fixture that the tests serialize to
// enrichment/relationship-profile.json: an empty profile (no tables, no
// columns, zero queries) with `sqlAvailable: false` and a single warning.
const profileArtifact: KloRelationshipProfileArtifact = {
  connectionId: 'warehouse',
  driver: 'sqlite',
  sqlAvailable: false,
  tables: [],
  columns: {},
  queryCount: 0,
  warnings: ['KLO scan connector cannot run read-only SQL relationship validation'],
};
// Tests for readLocalScanRelationshipArtifacts. Each test works in its own
// temporary project directory and removes it in a `finally` block.
describe('local scan relationship artifact reader', () => {
  // Happy path: stage a run, write the scan report plus all three enrichment
  // artifacts, then read them back through the reader.
  it('loads relationship, diagnostics, and profile artifacts for a scan run', async () => {
    const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-artifacts-'));
    try {
      const project = await createLiveDatabaseRun(projectDir, 'scan-run-review');
      const syncId = '2026-05-07-100000-scan-run-review';
      // The report references the three artifacts by project-relative path.
      const report = scanReport(
        [
          `raw-sources/warehouse/live-database/${syncId}/enrichment/relationships.json`,
          `raw-sources/warehouse/live-database/${syncId}/enrichment/relationship-profile.json`,
          `raw-sources/warehouse/live-database/${syncId}/enrichment/relationship-diagnostics.json`,
        ],
        syncId,
      );
      await writeProjectFile(projectDir, report.artifactPaths.reportPath ?? '', `${JSON.stringify(report, null, 2)}\n`);
      await writeProjectFile(
        projectDir,
        `raw-sources/warehouse/live-database/${syncId}/enrichment/relationships.json`,
        `${JSON.stringify(relationshipArtifact, null, 2)}\n`,
      );
      await writeProjectFile(
        projectDir,
        `raw-sources/warehouse/live-database/${syncId}/enrichment/relationship-diagnostics.json`,
        `${JSON.stringify(diagnosticsArtifact, null, 2)}\n`,
      );
      await writeProjectFile(
        projectDir,
        `raw-sources/warehouse/live-database/${syncId}/enrichment/relationship-profile.json`,
        `${JSON.stringify(profileArtifact, null, 2)}\n`,
      );
      const result = await readLocalScanRelationshipArtifacts(project, 'scan-run-review');
      expect(result).toMatchObject({
        runId: 'scan-run-review',
        connectionId: 'warehouse',
        syncId,
        paths: {
          relationships: `raw-sources/warehouse/live-database/${syncId}/enrichment/relationships.json`,
          diagnostics: `raw-sources/warehouse/live-database/${syncId}/enrichment/relationship-diagnostics.json`,
          profile: `raw-sources/warehouse/live-database/${syncId}/enrichment/relationship-profile.json`,
        },
      });
      expect(result?.relationships.review[0]).toMatchObject({
        id: 'orders:orders.customer_id->customers:customers.id',
        status: 'review',
        reasons: ['validation_unavailable_review_only', 'fk_score_review'],
      });
      expect(result?.diagnostics?.noAcceptedReason).toBe('relationship candidates require review before manifest writes');
      expect(result?.profile?.sqlAvailable).toBe(false);
    } finally {
      await rm(projectDir, { recursive: true, force: true });
    }
  });
  // A freshly initialized project with no scan report for the run id resolves to null.
  it('returns null when the scan run has no report', async () => {
    const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-artifacts-missing-run-'));
    try {
      await initKloProject({ projectDir, projectName: 'warehouse' });
      const project = await loadKloProject({ projectDir });
      await expect(readLocalScanRelationshipArtifacts(project, 'missing-run')).resolves.toBeNull();
    } finally {
      await rm(projectDir, { recursive: true, force: true });
    }
  });
  // A report exists but lists no enrichment artifacts, so the reader must reject
  // with the relationships.json-specific message.
  it('throws a focused error when a scan report does not reference relationships.json', async () => {
    const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-artifacts-missing-artifact-'));
    try {
      const project = await createLiveDatabaseRun(projectDir, 'scan-run-review');
      const report = scanReport([]);
      await writeProjectFile(projectDir, report.artifactPaths.reportPath ?? '', `${JSON.stringify(report, null, 2)}\n`);
      await expect(readLocalScanRelationshipArtifacts(project, 'scan-run-review')).rejects.toThrow(
        'Scan report "scan-run-review" does not reference relationships.json',
      );
    } finally {
      await rm(projectDir, { recursive: true, force: true });
    }
  });
});

View file

@ -0,0 +1,75 @@
import type { KloLocalProject } from '../project/index.js';
import { getLocalScanReport } from './local-scan.js';
import type { KloRelationshipArtifact, KloRelationshipDiagnosticsArtifact } from './relationship-diagnostics.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import type { KloScanReport } from './types.js';
/**
 * Relationship status values plus the `'all'` option.
 * NOTE(review): no use of this type is visible in this file; presumably a
 * filter value for callers that list relationships — confirm against callers.
 */
export type KloRelationshipArtifactStatus = 'accepted' | 'review' | 'rejected' | 'skipped' | 'all';
/** Relationship artifacts loaded for one scan run, together with the report that referenced them. */
export interface ReadLocalScanRelationshipArtifactsResult {
  /** Run id the artifacts were requested for. */
  runId: string;
  /** Connection id copied from the scan report. */
  connectionId: string;
  /** Sync id copied from the scan report. */
  syncId: string;
  /** The scan report the artifact paths were taken from. */
  report: KloScanReport;
  /** Parsed contents of the required `relationships.json` artifact. */
  relationships: KloRelationshipArtifact;
  /** Parsed diagnostics artifact, or `null` when it is not referenced or could not be read. */
  diagnostics: KloRelationshipDiagnosticsArtifact | null;
  /** Parsed profile artifact, or `null` when it is not referenced or could not be read. */
  profile: KloRelationshipProfileArtifact | null;
  /** Artifact paths as listed in the report; optional ones are `null` when not listed. */
  paths: {
    relationships: string;
    diagnostics: string | null;
    profile: string | null;
  };
}
/**
 * Returns the first enrichment artifact path in the report that ends with
 * `/enrichment/<fileName>`, or `null` when none does.
 */
function findArtifactPath(report: KloScanReport, fileName: string): string | null {
  const suffix = `/enrichment/${fileName}`;
  for (const candidate of report.artifactPaths.enrichmentArtifacts) {
    if (candidate.endsWith(suffix)) {
      return candidate;
    }
  }
  return null;
}
/**
 * Reads `path` through the project's file store and parses its content as JSON.
 * The parsed value is cast to `T` without validation.
 *
 * @throws Whatever the file store read or `JSON.parse` throws.
 */
async function readJsonArtifact<T>(project: KloLocalProject, path: string): Promise<T> {
  const { content } = await project.fileStore.readFile(path);
  return JSON.parse(content) as T;
}
/**
 * Best-effort variant of `readJsonArtifact`: resolves to `null` when `path` is
 * empty/null or when reading or parsing fails for any reason.
 */
async function readOptionalJsonArtifact<T>(project: KloLocalProject, path: string | null): Promise<T | null> {
  if (path) {
    try {
      return await readJsonArtifact<T>(project, path);
    } catch {
      // Optional artifact: any read/parse failure is treated as "not available".
    }
  }
  return null;
}
/**
 * Loads the relationship artifacts referenced by the scan report of `runId`.
 *
 * `relationships.json` is required; the diagnostics and profile artifacts are
 * optional and come back as `null` when not referenced or unreadable.
 *
 * @returns The loaded artifacts, or `null` when no scan report exists for `runId`.
 * @throws Error when the report does not reference `relationships.json`, or when
 *   reading/parsing that required artifact fails.
 */
export async function readLocalScanRelationshipArtifacts(
  project: KloLocalProject,
  runId: string,
): Promise<ReadLocalScanRelationshipArtifactsResult | null> {
  const report = await getLocalScanReport(project, runId);
  if (!report) {
    return null;
  }

  const relationshipsPath = findArtifactPath(report, 'relationships.json');
  if (!relationshipsPath) {
    throw new Error(`Scan report "${runId}" does not reference relationships.json`);
  }
  const diagnosticsPath = findArtifactPath(report, 'relationship-diagnostics.json');
  const profilePath = findArtifactPath(report, 'relationship-profile.json');

  // Read in the same order as the result fields: required artifact first.
  const relationships = await readJsonArtifact<KloRelationshipArtifact>(project, relationshipsPath);
  const diagnostics = await readOptionalJsonArtifact<KloRelationshipDiagnosticsArtifact>(project, diagnosticsPath);
  const profile = await readOptionalJsonArtifact<KloRelationshipProfileArtifact>(project, profilePath);

  const { connectionId, syncId } = report;
  return {
    runId,
    connectionId,
    syncId,
    report,
    relationships,
    diagnostics,
    profile,
    paths: {
      relationships: relationshipsPath,
      diagnostics: diagnosticsPath,
      profile: profilePath,
    },
  };
}

View file

@ -0,0 +1,451 @@
import { describe, expect, it } from 'vitest';
import {
buildKloRelationshipBenchmarkReport,
formatKloRelationshipBenchmarkReportMarkdown,
} from './relationship-benchmark-report.js';
import type {
KloRelationshipBenchmarkCaseResult,
KloRelationshipBenchmarkFixture,
KloRelationshipBenchmarkSuiteResult,
} from './relationship-benchmarks.js';
/**
 * Overrides accepted by `caseResult`: every case-result field is optional, and
 * `metrics` may itself be partial so a test can override individual metrics.
 */
type CaseResultOverrides = Omit<Partial<KloRelationshipBenchmarkCaseResult>, 'metrics'> & {
  metrics?: Partial<KloRelationshipBenchmarkCaseResult['metrics']>;
};
/**
 * Builds a benchmark case result fixture. Top-level fields fall back to
 * defaults when the override is null/undefined; `metrics` overrides are merged
 * key-by-key on top of the default metrics.
 */
function caseResult(overrides: CaseResultOverrides = {}): KloRelationshipBenchmarkCaseResult {
  const baseMetrics: KloRelationshipBenchmarkCaseResult['metrics'] = {
    pkPrecision: 1,
    pkRecall: 0.5,
    pkF1: 0.6666666666666666,
    fkPrecision: 1,
    fkRecall: 1,
    fkF1: 1,
    acceptedFalsePositiveCount: 0,
    reviewRecall: 0,
    acceptedOrReviewRecall: 1,
    runtimeSeconds: 0.012345,
    sqlQueries: 14,
    llmCalls: 0,
  };
  const metricOverrides = overrides.metrics ?? {};

  const result: KloRelationshipBenchmarkCaseResult = {
    fixtureId: overrides.fixtureId ?? 'demo_b2b_no_declared_constraints',
    mode: overrides.mode ?? 'declared_pks_and_declared_fks_removed',
    metrics: { ...baseMetrics, ...metricOverrides },
    expected: overrides.expected ?? {
      pk: ['accounts.(id)', 'users.(id)'],
      fk: ['users.(account_id)->accounts.(id)'],
    },
    predicted: overrides.predicted ?? {
      pk: ['accounts.(id)'],
      fk: ['users.(account_id)->accounts.(id)'],
      acceptedFk: ['users.(account_id)->accounts.(id)'],
      reviewFk: [],
    },
    falsePositives: overrides.falsePositives ?? { pk: [], fk: [] },
    falseNegatives: overrides.falseNegatives ?? { pk: ['users.(id)'], fk: [] },
    skippedComposite: overrides.skippedComposite ?? { pk: [], fk: [] },
    validationBlocked: overrides.validationBlocked ?? false,
  };
  return result;
}
/**
 * Builds a benchmark fixture definition for tests. Missing fields fall back to
 * a synthetic smoke-tier sqlite fixture; `thresholdEligible` and
 * `validationBudget` are passed through unchanged (possibly `undefined`).
 */
function fixture(overrides: Partial<KloRelationshipBenchmarkFixture> = {}): KloRelationshipBenchmarkFixture {
  const fallbackSnapshot: KloRelationshipBenchmarkFixture['snapshot'] = {
    connectionId: 'demo_b2b',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [],
  };
  const fallbackModes: KloRelationshipBenchmarkFixture['defaultModes'] = [
    'declared_pks_and_declared_fks_removed',
    'validation_disabled',
  ];

  const built: KloRelationshipBenchmarkFixture = {
    id: overrides.id ?? 'demo_b2b_no_declared_constraints',
    name: overrides.name ?? 'Packaged B2B demo with declared PK and FK metadata masked',
    tier: overrides.tier ?? 'smoke',
    origin: overrides.origin ?? 'synthetic',
    thresholdEligible: overrides.thresholdEligible,
    validationBudget: overrides.validationBudget,
    snapshot: overrides.snapshot ?? fallbackSnapshot,
    expected: overrides.expected ?? { expectedPks: [], expectedLinks: [] },
    defaultModes: overrides.defaultModes ?? fallbackModes,
    dataPath: overrides.dataPath ?? '/tmp/demo.sqlite',
    columnEmbeddings: overrides.columnEmbeddings ?? {},
  };
  return built;
}
// Tests for buildKloRelationshipBenchmarkReport and its Markdown formatter.
// Each test hand-builds a suite result (cases + aggregate) and the matching
// fixtures, then checks the structured report and/or the rendered Markdown.
describe('relationship benchmark report', () => {
  // Three requested modes: one has a normal result, one is validation-blocked,
  // and one has no result because the fixture's defaultModes do not include it.
  it('classifies run, validation-blocked, and not-run benchmark cases', () => {
    const suite: KloRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult(),
        caseResult({
          mode: 'validation_disabled',
          validationBlocked: true,
          metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
          predicted: {
            pk: ['accounts.(id)'],
            fk: ['users.(account_id)->accounts.(id)'],
            acceptedFk: [],
            reviewFk: ['users.(account_id)->accounts.(id)'],
          },
        }),
      ],
      validationBlockedCases: ['demo_b2b_no_declared_constraints:validation_disabled'],
      aggregate: {
        caseCount: 2,
        headlineCaseCount: 1,
        headlinePkRecall: 0.5,
        headlineFkRecall: 1,
        headlineAcceptedOrReviewRecall: 1,
        meanPkRecall: 0.5,
        meanFkRecall: 0.5,
        meanAcceptedOrReviewRecall: 1,
      },
    };
    const report = buildKloRelationshipBenchmarkReport({
      fixtures: [fixture()],
      suite,
      modes: ['declared_pks_and_declared_fks_removed', 'validation_disabled', 'profiling_disabled'],
    });
    expect(report.headline).toEqual({
      caseCount: 2,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      acceptedFalsePositiveCount: 0,
      validationBlockedCount: 1,
    });
    expect(report.cases.map((item) => `${item.fixtureId}:${item.mode}:${item.status}`)).toEqual([
      'demo_b2b_no_declared_constraints:declared_pks_and_declared_fks_removed:run',
      'demo_b2b_no_declared_constraints:validation_disabled:validation_blocked',
      'demo_b2b_no_declared_constraints:profiling_disabled:not_run',
    ]);
    expect(report.cases[2]?.reason).toBe('mode not selected by fixture defaultModes');
  });
  // A fixture with a validation budget and one review-only FK gets a reason
  // that names `validation_unattempted` with the review count.
  it('surfaces validation budget review candidates in the report reason', () => {
    const suite: KloRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult({
          fixtureId: 'scale_stress_no_declared_constraints',
          metrics: { fkRecall: 0.5, acceptedOrReviewRecall: 1 },
          predicted: {
            pk: ['dim_entity_00.(entity_00_key)'],
            fk: [
              'fact_activity_000.(entity_00_key)->dim_entity_00.(entity_00_key)',
              'fact_activity_001.(entity_00_key)->dim_entity_00.(entity_00_key)',
            ],
            acceptedFk: ['fact_activity_000.(entity_00_key)->dim_entity_00.(entity_00_key)'],
            reviewFk: ['fact_activity_001.(entity_00_key)->dim_entity_00.(entity_00_key)'],
          },
        }),
      ],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 1,
        headlineCaseCount: 0,
        headlinePkRecall: 1,
        headlineFkRecall: 0.5,
        headlineAcceptedOrReviewRecall: 1,
        meanPkRecall: 1,
        meanFkRecall: 0.5,
        meanAcceptedOrReviewRecall: 1,
      },
    };
    const report = buildKloRelationshipBenchmarkReport({
      fixtures: [
        fixture({
          id: 'scale_stress_no_declared_constraints',
          name: 'Scale stress fixture',
          tier: 'row_bearing',
          validationBudget: 800,
          defaultModes: ['declared_pks_and_declared_fks_removed'],
        }),
      ],
      suite,
      modes: ['declared_pks_and_declared_fks_removed'],
    });
    expect(report.cases[0]?.reason).toBe('review candidate validation reasons: validation_unattempted (1)');
    expect(formatKloRelationshipBenchmarkReportMarkdown(report)).toContain('validation_unattempted');
  });
  // tuningEligible comes from the benchmark suite's eligibility helper: only the
  // unblocked product-tier case is eligible, even though the smoke fixture is
  // also marked thresholdEligible.
  it('uses benchmark suite eligibility for product and smoke report rows', () => {
    const productCase = caseResult({ fixtureId: 'product_curated' });
    const productBlocked = caseResult({
      fixtureId: 'product_curated',
      mode: 'validation_disabled',
      validationBlocked: true,
      metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
    });
    const smokeCase = caseResult({ fixtureId: 'smoke_even_if_marked' });
    const suite: KloRelationshipBenchmarkSuiteResult = {
      cases: [productCase, productBlocked, smokeCase],
      validationBlockedCases: ['product_curated:validation_disabled'],
      aggregate: {
        caseCount: 3,
        headlineCaseCount: 1,
        headlinePkRecall: 0.5,
        headlineFkRecall: 1,
        headlineAcceptedOrReviewRecall: 1,
        meanPkRecall: 0.5,
        meanFkRecall: 0.6666666666666666,
        meanAcceptedOrReviewRecall: 1,
      },
    };
    const report = buildKloRelationshipBenchmarkReport({
      fixtures: [
        fixture({
          id: 'product_curated',
          name: 'Curated product fixture',
          tier: 'product',
          thresholdEligible: true,
          defaultModes: ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
        }),
        fixture({
          id: 'smoke_even_if_marked',
          name: 'Marked smoke fixture',
          tier: 'smoke',
          thresholdEligible: true,
          defaultModes: ['declared_pks_and_declared_fks_removed'],
        }),
      ],
      suite,
      modes: ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
    });
    expect(report.cases.map((item) => `${item.fixtureId}:${item.mode}:${item.tuningEligible}`)).toEqual([
      'product_curated:declared_pks_and_declared_fks_removed:true',
      'product_curated:validation_disabled:false',
      'smoke_even_if_marked:declared_pks_and_declared_fks_removed:false',
      'smoke_even_if_marked:validation_disabled:false',
    ]);
    expect(formatKloRelationshipBenchmarkReportMarkdown(report)).toContain(
      '| product_curated | product | declared_pks_and_declared_fks_removed | run | yes |',
    );
  });
  // Markdown output: title, one table row with 3-decimal metrics, and one
  // bullet per false-negative PK/FK.
  it('formats a compact Markdown report with false negatives and blocked modes', () => {
    const suite: KloRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult({
          metrics: { fkRecall: 0, acceptedOrReviewRecall: 0 },
          falseNegatives: { pk: ['users.(id)'], fk: ['users.(account_id)->accounts.(id)'] },
        }),
      ],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 1,
        headlineCaseCount: 1,
        headlinePkRecall: 0.5,
        headlineFkRecall: 0,
        headlineAcceptedOrReviewRecall: 0,
        meanPkRecall: 0.5,
        meanFkRecall: 0,
        meanAcceptedOrReviewRecall: 0,
      },
    };
    const markdown = formatKloRelationshipBenchmarkReportMarkdown(
      buildKloRelationshipBenchmarkReport({
        fixtures: [fixture()],
        suite,
        modes: ['declared_pks_and_declared_fks_removed'],
      }),
    );
    expect(markdown).toContain('# KLO Relationship Discovery Benchmark Evidence');
    expect(markdown).toContain(
      '| demo_b2b_no_declared_constraints | smoke | declared_pks_and_declared_fks_removed | run | no | 0.500 | 0.000 | 0.000 | 0 |',
    );
    expect(markdown).toContain(
      '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(id)',
    );
    expect(markdown).toContain(
      '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(account_id)->accounts.(id)',
    );
  });
  // The default-mode case has no false negatives, so the headline FK section
  // must read "none" while the embeddings_disabled failures still appear under
  // Failure Details.
  it('keeps headline failures separate from non-headline failure details', () => {
    const suite: KloRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult({
          fixtureId: 'product_curated',
          falseNegatives: { pk: [], fk: [] },
          metrics: { pkRecall: 1, fkRecall: 1, acceptedOrReviewRecall: 1 },
        }),
        caseResult({
          fixtureId: 'product_curated',
          mode: 'embeddings_disabled',
          falseNegatives: {
            pk: ['customers.(id)'],
            fk: ['orders.(buyer_ref)->customers.(id)'],
          },
          metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
        }),
      ],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 2,
        headlineCaseCount: 1,
        headlinePkRecall: 1,
        headlineFkRecall: 1,
        headlineAcceptedOrReviewRecall: 1,
        meanPkRecall: 0.75,
        meanFkRecall: 0.5,
        meanAcceptedOrReviewRecall: 0.5,
      },
    };
    const markdown = formatKloRelationshipBenchmarkReportMarkdown(
      buildKloRelationshipBenchmarkReport({
        fixtures: [
          fixture({
            id: 'product_curated',
            name: 'Curated product fixture',
            tier: 'product',
            thresholdEligible: true,
            defaultModes: ['declared_pks_and_declared_fks_removed', 'embeddings_disabled'],
          }),
        ],
        suite,
        modes: ['declared_pks_and_declared_fks_removed', 'embeddings_disabled'],
      }),
    );
    expect(markdown).toContain('## Failure Details');
    expect(markdown).toContain('### Headline False Negative FKs\n\n- none');
    expect(markdown).toContain(
      '- `product_curated` / `embeddings_disabled` / `run`: orders.(buyer_ref)->customers.(id)',
    );
    expect(markdown).toContain('- `product_curated` / `embeddings_disabled` / `run`: customers.(id)');
  });
  // A headline case with one PK and one FK false negative must produce the
  // "Headline Failure Context" section with both counts and both bullets.
  it('formats headline failure context from remaining headline false negatives', () => {
    const suite: KloRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult({
          fixtureId: 'public_headline_fixture',
          metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
          falseNegatives: {
            pk: ['parent_table.(opaque_key)'],
            fk: ['child_table.(parent_table_id)->parent_table.(opaque_key)'],
          },
        }),
      ],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 1,
        headlineCaseCount: 1,
        headlinePkRecall: 0.5,
        headlineFkRecall: 0,
        headlineAcceptedOrReviewRecall: 0,
        meanPkRecall: 0.5,
        meanFkRecall: 0,
        meanAcceptedOrReviewRecall: 0,
      },
    };
    const markdown = formatKloRelationshipBenchmarkReportMarkdown(
      buildKloRelationshipBenchmarkReport({
        fixtures: [
          fixture({
            id: 'public_headline_fixture',
            name: 'Public headline fixture',
            tier: 'row_bearing',
            thresholdEligible: true,
            defaultModes: ['declared_pks_and_declared_fks_removed'],
          }),
        ],
        suite,
        modes: ['declared_pks_and_declared_fks_removed'],
      }),
    );
    expect(markdown).toContain('## Headline Failure Context');
    expect(markdown).toContain('- Remaining headline false-negative PKs: 1');
    expect(markdown).toContain('- Remaining headline false-negative FKs: 1');
    expect(markdown).toContain(
      '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: parent_table.(opaque_key)',
    );
    expect(markdown).toContain(
      '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: child_table.(parent_table_id)->parent_table.(opaque_key)',
    );
  });
  // Composite keys listed in skippedComposite are carried into the report and
  // rendered in their own section, while still being listed as false negatives.
  it('formats skipped composite ground truth separately from false-negative details', () => {
    const compositePk = 'order_lines.(order_id,line_number)';
    const compositeFk = 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)';
    const suite: KloRelationshipBenchmarkSuiteResult = {
      cases: [
        caseResult({
          fixtureId: 'composite_keys_no_declared_constraints',
          metrics: { pkRecall: 0, fkRecall: 0, acceptedOrReviewRecall: 0 },
          expected: {
            pk: [compositePk],
            fk: [compositeFk],
          },
          predicted: {
            pk: [],
            fk: [],
            acceptedFk: [],
            reviewFk: [],
          },
          falseNegatives: {
            pk: [compositePk],
            fk: [compositeFk],
          },
          skippedComposite: {
            pk: [compositePk],
            fk: [compositeFk],
          },
        }),
      ],
      validationBlockedCases: [],
      aggregate: {
        caseCount: 1,
        headlineCaseCount: 1,
        headlinePkRecall: 0,
        headlineFkRecall: 0,
        headlineAcceptedOrReviewRecall: 0,
        meanPkRecall: 0,
        meanFkRecall: 0,
        meanAcceptedOrReviewRecall: 0,
      },
    };
    const report = buildKloRelationshipBenchmarkReport({
      fixtures: [
        fixture({
          id: 'composite_keys_no_declared_constraints',
          name: 'Composite key fixture with no declared constraints',
          tier: 'row_bearing',
          defaultModes: ['declared_pks_and_declared_fks_removed'],
        }),
      ],
      suite,
      modes: ['declared_pks_and_declared_fks_removed'],
    });
    expect(report.cases[0]?.skippedComposite).toEqual({
      pk: [compositePk],
      fk: [compositeFk],
    });
    const markdown = formatKloRelationshipBenchmarkReportMarkdown(report);
    expect(markdown).toContain('## Composite Ground Truth Skips');
    expect(markdown).toContain(
      '### Skipped Composite PKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_lines.(order_id,line_number)',
    );
    expect(markdown).toContain(
      '### Skipped Composite FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
    );
    expect(markdown).toContain(
      '### Headline False Negative FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
    );
  });
});

View file

@ -0,0 +1,363 @@
import { isKloRelationshipBenchmarkTuningEligible } from './relationship-benchmarks.js';
import type {
KloRelationshipBenchmarkCaseResult,
KloRelationshipBenchmarkFixture,
KloRelationshipBenchmarkMode,
KloRelationshipBenchmarkSuiteResult,
} from './relationship-benchmarks.js';
/**
 * Outcome of one fixture/mode pair in the report: `run` (a result exists and
 * was not validation-blocked), `validation_blocked` (the result is flagged
 * `validationBlocked`), or `not_run` (the suite has no result for the pair).
 */
export type KloRelationshipBenchmarkReportCaseStatus = 'run' | 'validation_blocked' | 'not_run';
/** One report row: a fixture evaluated (or not) under a single benchmark mode. */
export interface KloRelationshipBenchmarkReportCase {
  fixtureId: string;
  fixtureName: string;
  /** Fixture tier, copied from the fixture definition. */
  tier: string;
  mode: KloRelationshipBenchmarkMode;
  status: KloRelationshipBenchmarkReportCaseStatus;
  /** Human-readable explanation for the row's status, or `null` when there is none. */
  reason: string | null;
  /** Eligibility as computed by the benchmark suite helper; always `false` for `not_run` rows. */
  tuningEligible: boolean;
  /** Metrics copied from the case result; every value is `null` for `not_run` rows. */
  metrics: {
    pkRecall: number | null;
    fkRecall: number | null;
    acceptedOrReviewRecall: number | null;
    acceptedFalsePositiveCount: number | null;
    sqlQueries: number | null;
    llmCalls: number | null;
    runtimeSeconds: number | null;
  };
  /** False-positive PK/FK identifiers from the case result; empty for `not_run` rows. */
  falsePositives: {
    pk: string[];
    fk: string[];
  };
  /** False-negative PK/FK identifiers from the case result; empty for `not_run` rows. */
  falseNegatives: {
    pk: string[];
    fk: string[];
  };
  /** Skipped composite PK/FK identifiers from the case result; empty for `not_run` rows. */
  skippedComposite: {
    pk: string[];
    fk: string[];
  };
}
/** Benchmark report: headline aggregates plus one row per fixture/mode pair. */
export interface KloRelationshipBenchmarkReport {
  /** ISO timestamp; supplied by the caller or taken from the current time when built. */
  generatedAt: string;
  /**
   * Headline numbers. The first five fields are copied from the suite aggregate;
   * `acceptedFalsePositiveCount` is summed over all suite cases and
   * `validationBlockedCount` is the length of the suite's blocked-case list.
   */
  headline: {
    caseCount: number;
    headlineCaseCount: number;
    headlinePkRecall: number;
    headlineFkRecall: number;
    headlineAcceptedOrReviewRecall: number;
    acceptedFalsePositiveCount: number;
    validationBlockedCount: number;
  };
  /** Rows ordered by fixture, then by requested mode. */
  cases: KloRelationshipBenchmarkReportCase[];
}
/** Lookup key for a fixture/mode pair, formatted as `<fixtureId>:<mode>`. */
function key(fixtureId: string, mode: KloRelationshipBenchmarkMode): string {
  return [fixtureId, mode].join(':');
}
/** Formats a metric with three decimals, or `-` when the value is `null`. */
function fixed(value: number | null): string {
  if (value === null) {
    return '-';
  }
  return value.toFixed(3);
}
/**
 * Derives the explanatory reason for a case that has a benchmark result.
 *
 * A validation-blocked result takes precedence. Otherwise, when the fixture
 * declares a validation budget and the result has review-only FKs, the reason
 * reports them as `validation_unattempted` with their count.
 *
 * @returns The reason text, or `null` when neither condition applies.
 */
function reportCaseReason(input: {
  fixture: KloRelationshipBenchmarkFixture;
  result: KloRelationshipBenchmarkCaseResult;
}): string | null {
  const { fixture, result } = input;
  if (result.validationBlocked) {
    return 'validation unavailable for this benchmark mode';
  }

  const reviewCount = result.predicted.reviewFk.length;
  const hasBudget = fixture.validationBudget !== undefined;
  return hasBudget && reviewCount > 0
    ? `review candidate validation reasons: validation_unattempted (${reviewCount})`
    : null;
}
/**
 * Converts a benchmark case result into a report row for the given fixture and mode.
 *
 * Status is `validation_blocked` when the result is flagged as blocked and `run`
 * otherwise. Metrics are copied field by field; the false-positive,
 * false-negative and skipped-composite lists are passed through by reference.
 */
function reportCaseFromResult(input: {
  fixture: KloRelationshipBenchmarkFixture;
  mode: KloRelationshipBenchmarkMode;
  result: KloRelationshipBenchmarkCaseResult;
}): KloRelationshipBenchmarkReportCase {
  const { fixture, mode, result } = input;
  const blocked = result.validationBlocked;
  const status = blocked ? 'validation_blocked' : 'run';
  const reason = reportCaseReason({ fixture, result });
  const tuningEligible = isKloRelationshipBenchmarkTuningEligible({
    fixture,
    mode,
    validationBlocked: blocked,
  });
  const source = result.metrics;

  return {
    fixtureId: fixture.id,
    fixtureName: fixture.name,
    tier: fixture.tier,
    mode,
    status,
    reason,
    tuningEligible,
    metrics: {
      pkRecall: source.pkRecall,
      fkRecall: source.fkRecall,
      acceptedOrReviewRecall: source.acceptedOrReviewRecall,
      acceptedFalsePositiveCount: source.acceptedFalsePositiveCount,
      sqlQueries: source.sqlQueries,
      llmCalls: source.llmCalls,
      runtimeSeconds: source.runtimeSeconds,
    },
    falsePositives: result.falsePositives,
    falseNegatives: result.falseNegatives,
    skippedComposite: result.skippedComposite,
  };
}
/**
 * Builds the placeholder report row for a fixture/mode pair that has no
 * benchmark result: status `not_run`, never tuning-eligible, all metrics
 * `null`, and empty failure lists.
 */
function notRunCase(input: {
  fixture: KloRelationshipBenchmarkFixture;
  mode: KloRelationshipBenchmarkMode;
  reason: string;
}): KloRelationshipBenchmarkReportCase {
  const { fixture, mode, reason } = input;
  // Each list pair is a fresh object so rows never share mutable arrays.
  const emptyLists = (): { pk: string[]; fk: string[] } => ({ pk: [], fk: [] });

  return {
    fixtureId: fixture.id,
    fixtureName: fixture.name,
    tier: fixture.tier,
    mode,
    status: 'not_run',
    reason,
    tuningEligible: false,
    metrics: {
      pkRecall: null,
      fkRecall: null,
      acceptedOrReviewRecall: null,
      acceptedFalsePositiveCount: null,
      sqlQueries: null,
      llmCalls: null,
      runtimeSeconds: null,
    },
    falsePositives: emptyLists(),
    falseNegatives: emptyLists(),
    skippedComposite: emptyLists(),
  };
}
/**
 * Assembles the benchmark report from a suite result.
 *
 * One report case is emitted per fixture and requested mode, fixture-major and in input order.
 * Pairs without a suite result become `not_run` entries whose reason says whether the mode is
 * listed in the fixture's `defaultModes`.
 *
 * @param input.fixtures Fixtures to report on.
 * @param input.suite Suite result providing per-case results and aggregate numbers.
 * @param input.modes Modes to report for every fixture.
 * @param input.generatedAt Optional timestamp; defaults to the current time as an ISO string.
 * @returns The report with headline numbers and per-case entries.
 */
export function buildKloRelationshipBenchmarkReport(input: {
  fixtures: readonly KloRelationshipBenchmarkFixture[];
  suite: KloRelationshipBenchmarkSuiteResult;
  modes: readonly KloRelationshipBenchmarkMode[];
  generatedAt?: string;
}): KloRelationshipBenchmarkReport {
  const { suite } = input;

  // Index suite results by fixture/mode; a later duplicate replaces an earlier one.
  const resultsByKey = new Map<string, KloRelationshipBenchmarkCaseResult>();
  let acceptedFalsePositiveCount = 0;
  for (const caseResult of suite.cases) {
    resultsByKey.set(key(caseResult.fixtureId, caseResult.mode), caseResult);
    acceptedFalsePositiveCount += caseResult.metrics.acceptedFalsePositiveCount;
  }

  const cases: KloRelationshipBenchmarkReportCase[] = input.fixtures.flatMap((fixture) => {
    const selectedModes = new Set(fixture.defaultModes);
    return input.modes.map((mode) => {
      const result = resultsByKey.get(key(fixture.id, mode));
      if (result) {
        return reportCaseFromResult({ fixture, mode, result });
      }
      const reason = selectedModes.has(mode)
        ? 'mode produced no benchmark result'
        : 'mode not selected by fixture defaultModes';
      return notRunCase({ fixture, mode, reason });
    });
  });

  const { aggregate } = suite;
  return {
    generatedAt: input.generatedAt ?? new Date().toISOString(),
    headline: {
      caseCount: aggregate.caseCount,
      headlineCaseCount: aggregate.headlineCaseCount,
      headlinePkRecall: aggregate.headlinePkRecall,
      headlineFkRecall: aggregate.headlineFkRecall,
      headlineAcceptedOrReviewRecall: aggregate.headlineAcceptedOrReviewRecall,
      acceptedFalsePositiveCount,
      validationBlockedCount: suite.validationBlockedCases.length,
    },
    cases,
  };
}
/** Picks the list of failure keys to report for a single report case. */
type KloRelationshipBenchmarkFailureSelector = (
  item: KloRelationshipBenchmarkReportCase,
) => readonly string[];

/**
 * Renders one Markdown bullet per selected failure value, sorted deterministically.
 *
 * Bullets are ordered by the locale comparison of `fixtureId:mode:status:value`.
 *
 * @param input.cases Report cases to scan.
 * @param input.select Selector returning the failure values of one case.
 * @returns Sorted Markdown bullet lines; empty when nothing was selected.
 */
function sortedFailureLines(input: {
  cases: readonly KloRelationshipBenchmarkReportCase[];
  select: KloRelationshipBenchmarkFailureSelector;
}): string[] {
  const entries: Array<{ sortKey: string; line: string }> = [];
  for (const item of input.cases) {
    for (const value of input.select(item)) {
      entries.push({
        sortKey: `${item.fixtureId}:${item.mode}:${item.status}:${value}`,
        line: `- \`${item.fixtureId}\` / \`${item.mode}\` / \`${item.status}\`: ${value}`,
      });
    }
  }
  entries.sort((left, right) => left.sortKey.localeCompare(right.sortKey));
  return entries.map((entry) => entry.line);
}
/**
 * Renders a titled Markdown subsection listing the selected failures.
 *
 * @param input.title Text of the `###` heading.
 * @param input.cases Report cases to scan.
 * @param input.select Selector returning the failure values of one case.
 * @returns A blank line, the heading, a blank line, then the sorted bullets or `- none`.
 */
function failureBlock(input: {
  title: string;
  cases: readonly KloRelationshipBenchmarkReportCase[];
  select: KloRelationshipBenchmarkFailureSelector;
}): string[] {
  const { title, cases, select } = input;
  const bullets = sortedFailureLines({ cases, select });
  const body = bullets.length > 0 ? bullets : ['- none'];
  return ['', `### ${title}`, '', ...body];
}
/**
 * Renders the "Headline Failure Context" section of the Markdown report.
 *
 * Only tuning-eligible cases are considered. The section states how many false-negative PKs
 * and FKs remain and then lists each of them, or `- none`.
 *
 * @param report Report whose cases are summarized.
 * @returns The Markdown lines of the section, starting with a blank line.
 */
function headlineFailureContextBlocks(report: KloRelationshipBenchmarkReport): string[] {
  const headlineCases = report.cases.filter((item) => item.tuningEligible);
  const missesFor = (select: KloRelationshipBenchmarkFailureSelector): string[] =>
    sortedFailureLines({ cases: headlineCases, select });
  const orNone = (lines: string[]): string[] => (lines.length > 0 ? lines : ['- none']);

  const remainingPkMisses = missesFor((item) => item.falseNegatives.pk);
  const remainingFkMisses = missesFor((item) => item.falseNegatives.fk);

  const intro = [
    '',
    '## Headline Failure Context',
    '',
    'Remaining headline misses after this run are listed here so recall gains and still-open algorithmic gaps are visible in the regenerated evidence report.',
    '',
    `- Remaining headline false-negative PKs: ${remainingPkMisses.length}`,
    `- Remaining headline false-negative FKs: ${remainingFkMisses.length}`,
  ];
  const pkSection = ['', '### Remaining Headline False Negative PKs', '', ...orNone(remainingPkMisses)];
  const fkSection = ['', '### Remaining Headline False Negative FKs', '', ...orNone(remainingFkMisses)];
  return [...intro, ...pkSection, ...fkSection];
}
/**
 * Renders the "Failure Details" section of the Markdown report.
 *
 * False positives and false negatives are listed separately for PKs and FKs, first for
 * tuning-eligible ("Headline") cases and then for all remaining ("Other") cases.
 *
 * @param report Report whose cases are listed.
 * @returns The Markdown lines of the section, starting with a blank line.
 */
function failureDetailBlocks(report: KloRelationshipBenchmarkReport): string[] {
  const headlineCases = report.cases.filter((item) => item.tuningEligible);
  const otherCases = report.cases.filter((item) => !item.tuningEligible);

  // Subsections in the exact order they appear in the rendered report.
  const sections: Array<{
    title: string;
    cases: readonly KloRelationshipBenchmarkReportCase[];
    select: KloRelationshipBenchmarkFailureSelector;
  }> = [
    { title: 'Headline False Positive PKs', cases: headlineCases, select: (item) => item.falsePositives.pk },
    { title: 'Headline False Positive FKs', cases: headlineCases, select: (item) => item.falsePositives.fk },
    { title: 'Headline False Negative PKs', cases: headlineCases, select: (item) => item.falseNegatives.pk },
    { title: 'Headline False Negative FKs', cases: headlineCases, select: (item) => item.falseNegatives.fk },
    { title: 'Other False Positive PKs', cases: otherCases, select: (item) => item.falsePositives.pk },
    { title: 'Other False Positive FKs', cases: otherCases, select: (item) => item.falsePositives.fk },
    { title: 'Other False Negative PKs', cases: otherCases, select: (item) => item.falseNegatives.pk },
    { title: 'Other False Negative FKs', cases: otherCases, select: (item) => item.falseNegatives.fk },
  ];

  const lines: string[] = ['', '## Failure Details'];
  for (const section of sections) {
    lines.push(...failureBlock(section));
  }
  return lines;
}
/**
 * Renders the "Composite Ground Truth Skips" section of the Markdown report.
 *
 * Lists the `skippedComposite` PK and FK keys of tuning-eligible cases.
 *
 * @param report Report whose cases are listed.
 * @returns The Markdown lines of the section, starting with a blank line.
 */
function compositeSkipBlocks(report: KloRelationshipBenchmarkReport): string[] {
  const headlineCases = report.cases.filter((item) => item.tuningEligible);
  const skippedPks = failureBlock({
    title: 'Skipped Composite PKs',
    cases: headlineCases,
    select: (item) => item.skippedComposite.pk,
  });
  const skippedFks = failureBlock({
    title: 'Skipped Composite FKs',
    cases: headlineCases,
    select: (item) => item.skippedComposite.fk,
  });
  return ['', '## Composite Ground Truth Skips', ...skippedPks, ...skippedFks];
}
/**
 * Formats a benchmark report as a Markdown document.
 *
 * The document contains the headline numbers, one table row per report case, and then the
 * headline failure context, failure detail and composite skip sections.
 *
 * @param report Report to render.
 * @returns The Markdown text, ending with a blank line followed by a newline.
 */
export function formatKloRelationshipBenchmarkReportMarkdown(report: KloRelationshipBenchmarkReport): string {
  const { headline } = report;
  const preamble = [
    '# KLO Relationship Discovery Benchmark Evidence',
    '',
    `Generated: ${report.generatedAt}`,
    '',
    '## Headline',
    '',
    `- Cases run: ${headline.caseCount}`,
    `- Headline cases: ${headline.headlineCaseCount}`,
    `- Headline PK recall: ${fixed(headline.headlinePkRecall)}`,
    `- Headline FK recall: ${fixed(headline.headlineFkRecall)}`,
    `- Headline accepted-or-review recall: ${fixed(headline.headlineAcceptedOrReviewRecall)}`,
    `- Accepted false positives: ${headline.acceptedFalsePositiveCount}`,
    `- Validation-blocked cases: ${headline.validationBlockedCount}`,
    '',
    '## Cases',
    '',
    '| Fixture | Tier | Mode | Status | Tuning Eligible | PK Recall | FK Recall | Accepted+Review Recall | Accepted FP | Reason |',
    '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | --- |',
  ];

  const tableRows = report.cases.map((item) => {
    const cells = [
      item.fixtureId,
      item.tier,
      item.mode,
      item.status,
      item.tuningEligible ? 'yes' : 'no',
      fixed(item.metrics.pkRecall),
      fixed(item.metrics.fkRecall),
      fixed(item.metrics.acceptedOrReviewRecall),
      // Cases that were not run carry null counts and render as a dash.
      String(item.metrics.acceptedFalsePositiveCount ?? '-'),
      item.reason ?? '',
    ];
    return `| ${cells.join(' | ')} |`;
  });

  const document = [
    ...preamble,
    ...tableRows,
    ...headlineFailureContextBlocks(report),
    ...failureDetailBlocks(report),
    ...compositeSkipBlocks(report),
    '',
  ];
  return `${document.join('\n')}\n`;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,902 @@
import { createHash } from 'node:crypto';
import { mkdtemp, readdir, readFile, stat, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { gunzipSync } from 'node:zlib';
import Database from 'better-sqlite3';
import YAML from 'yaml';
import { z } from 'zod';
import type { KloEnrichedRelationship, KloEnrichedSchema, KloRelationshipType } from './enrichment-types.js';
import { snapshotToKloEnrichedSchema } from './local-enrichment.js';
import type { KloRelationshipDiscoveryCandidate } from './relationship-candidates.js';
import {
generateKloRelationshipDiscoveryCandidates,
mergeKloRelationshipDiscoveryCandidates,
} from './relationship-candidates.js';
import type { KloLlmProvider } from '@klo/llm';
import { proposeKloRelationshipCandidatesWithLlm } from './relationship-llm-proposal.js';
import {
discoverKloCompositeRelationships,
type KloCompositePrimaryKeyCandidate,
type KloCompositeRelationshipCandidate,
} from './relationship-composite-candidates.js';
import { emptyKloRelationshipProfileArtifact } from './relationship-diagnostics.js';
import { collectKloFormalMetadataRelationships } from './relationship-formal-metadata.js';
import { resolveKloRelationshipGraph } from './relationship-graph-resolver.js';
import { type KloRelationshipReadOnlyExecutor, profileKloRelationshipSchema } from './relationship-profiling.js';
import type { KloRelationshipValidationBudget } from './relationship-budget.js';
import type { KloRelationshipFixtureOrigin } from './relationship-scoring.js';
import { validateKloRelationshipDiscoveryCandidates } from './relationship-validation.js';
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanContext, KloSchemaSnapshot } from './types.js';
/**
 * Names of every benchmark mode.
 *
 * `maskKloRelationshipBenchmarkSnapshot` leaves declared keys untouched for `metadata_present`,
 * strips only declared FKs or only declared PKs for the two single-removal modes, and strips
 * both for `declared_pks_and_declared_fks_removed` and for all four `*_disabled` modes.
 */
export const KLO_RELATIONSHIP_BENCHMARK_MODES = [
'metadata_present',
'declared_fks_removed',
'declared_pks_removed',
'declared_pks_and_declared_fks_removed',
'llm_disabled',
'profiling_disabled',
'validation_disabled',
'embeddings_disabled',
] as const;
/** Union of the mode names listed in `KLO_RELATIONSHIP_BENCHMARK_MODES`. */
export type KloRelationshipBenchmarkMode = (typeof KLO_RELATIONSHIP_BENCHMARK_MODES)[number];
/** Names of every fixture tier; the tier feeds into tuning eligibility. */
export const KLO_RELATIONSHIP_BENCHMARK_TIERS = ['unit', 'row_bearing', 'schema_only', 'smoke', 'product'] as const;
/** Union of the tier names listed in `KLO_RELATIONSHIP_BENCHMARK_TIERS`. */
export type KloRelationshipBenchmarkTier = (typeof KLO_RELATIONSHIP_BENCHMARK_TIERS)[number];
/** Decision attached to a detected primary key or link. */
export type KloRelationshipBenchmarkStatus = 'accepted' | 'review' | 'rejected';
/** A ground-truth primary key; more than one column denotes a composite key. */
export interface KloRelationshipBenchmarkExpectedPk {
table: string;
columns: string[];
}
/** A ground-truth link from one table's columns to another table's columns. */
export interface KloRelationshipBenchmarkExpectedLink {
fromTable: string;
fromColumns: string[];
toTable: string;
toColumns: string[];
relationship: KloRelationshipType;
}
/** Ground truth of a fixture, loaded from its `expected-links.yaml`. */
export interface KloRelationshipBenchmarkExpectedLinks {
expectedPks: KloRelationshipBenchmarkExpectedPk[];
expectedLinks: KloRelationshipBenchmarkExpectedLink[];
}
/** A loaded benchmark fixture: its configuration, schema snapshot, ground truth and optional data. */
export interface KloRelationshipBenchmarkFixture {
id: string;
name: string;
tier: KloRelationshipBenchmarkTier;
origin: KloRelationshipFixtureOrigin;
// Explicit tuning-eligibility override; ignored for the `smoke` and `schema_only` tiers.
thresholdEligible?: boolean;
// Fixture config accepts 'all' or a non-negative integer; detectors treat a missing budget as 'all'.
validationBudget?: KloRelationshipValidationBudget;
snapshot: KloSchemaSnapshot;
expected: KloRelationshipBenchmarkExpectedLinks;
defaultModes: KloRelationshipBenchmarkMode[];
// Path to the fixture's SQLite data file (possibly an extracted temporary copy), or null when absent.
dataPath: string | null;
// Numeric vectors read from `column-embeddings.json`, keyed by column id; empty when unavailable.
columnEmbeddings: Record<string, number[]>;
}
/** A primary key reported by a detector, with its score and decision. */
export interface KloRelationshipBenchmarkDetectedPk {
table: string;
columns: string[];
score: number;
status: KloRelationshipBenchmarkStatus;
}
/** A link reported by a detector, with its score, decision and originating source. */
export interface KloRelationshipBenchmarkDetectedLink {
fromTable: string;
fromColumns: string[];
toTable: string;
toColumns: string[];
relationship: KloRelationshipType;
score: number;
status: KloRelationshipBenchmarkStatus;
source: string;
}
/** Everything a detector returns for one fixture/mode run. */
export interface KloRelationshipBenchmarkDetectorResult {
pks: KloRelationshipBenchmarkDetectedPk[];
links: KloRelationshipBenchmarkDetectedLink[];
// True when the run could not validate candidates; such cases are never tuning eligible.
validationBlocked: boolean;
sqlQueries: number;
llmCalls: number;
runtimeSeconds: number;
}
/** Input handed to a detector for one fixture/mode run. */
export interface KloRelationshipBenchmarkDetectorInput {
fixtureId: string;
mode: KloRelationshipBenchmarkMode;
// Snapshot after mode-specific masking, and the enriched schema derived from it.
snapshot: KloSchemaSnapshot;
schema: KloEnrichedSchema;
dataPath: string | null;
validationBudget?: KloRelationshipValidationBudget;
}
/** Strategy interface for relationship detection under benchmark. */
export interface KloRelationshipBenchmarkDetector {
detect(input: KloRelationshipBenchmarkDetectorInput): Promise<KloRelationshipBenchmarkDetectorResult>;
}
/**
 * Scores of one benchmark case as computed by `scoreBenchmarkCase`.
 *
 * Every ratio evaluates to 1 when its denominator is 0.
 */
export interface KloRelationshipBenchmarkMetrics {
// PK scores compare all detected PK keys against the expected PK keys.
pkPrecision: number;
pkRecall: number;
pkF1: number;
// FK precision and recall count links with status `accepted` only.
fkPrecision: number;
fkRecall: number;
fkF1: number;
// Number of accepted links that are not in the expected set.
acceptedFalsePositiveCount: number;
// Share of expected links found among `review` links, and among `accepted` or `review` links.
reviewRecall: number;
acceptedOrReviewRecall: number;
runtimeSeconds: number;
sqlQueries: number;
llmCalls: number;
}
/** Scored outcome of one fixture/mode run; all key lists are sorted and de-duplicated strings. */
export interface KloRelationshipBenchmarkCaseResult {
fixtureId: string;
mode: KloRelationshipBenchmarkMode;
metrics: KloRelationshipBenchmarkMetrics;
expected: {
pk: string[];
fk: string[];
};
predicted: {
pk: string[];
// `fk` holds links of every status; `acceptedFk` and `reviewFk` are the per-status subsets.
fk: string[];
acceptedFk: string[];
reviewFk: string[];
};
falsePositives: {
pk: string[];
// Accepted links that are not expected.
fk: string[];
};
falseNegatives: {
pk: string[];
// Expected links that are neither accepted nor in review.
fk: string[];
};
// Expected composite (multi-column) keys that were not detected.
skippedComposite: {
pk: string[];
fk: string[];
};
validationBlocked: boolean;
}
/** Results of a whole benchmark suite run. */
export interface KloRelationshipBenchmarkSuiteResult {
cases: KloRelationshipBenchmarkCaseResult[];
// The report uses this list's length as its validation-blocked case count.
validationBlockedCases: string[];
// NOTE(review): computed outside this section; the `headline*` values presumably cover
// tuning-eligible cases only and the `mean*` values all cases — confirm against the suite runner.
aggregate: {
caseCount: number;
headlineCaseCount: number;
headlinePkRecall: number;
headlineFkRecall: number;
headlineAcceptedOrReviewRecall: number;
meanPkRecall: number;
meanFkRecall: number;
meanAcceptedOrReviewRecall: number;
};
}
/**
 * Read-only query executor over a fixture's SQLite file that counts the queries it runs.
 *
 * The database is opened with `readonly` and `fileMustExist` set; call `close()` when done.
 */
class KloRelationshipBenchmarkSqliteExecutor implements KloRelationshipReadOnlyExecutor {
  /** Number of `executeReadOnly` calls so far, including calls that subsequently failed. */
  queryCount = 0;
  private readonly db: Database.Database;

  /** @param dataPath Path of the SQLite file to open. */
  constructor(dataPath: string) {
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /**
   * Runs `input.sql` and returns its rows in positional form.
   *
   * Headers are the property names of the first row, so an empty result has no headers.
   *
   * @param input Query to execute; only `sql` is used.
   * @param _ctx Scan context; unused.
   * @returns Headers, positional rows, and the row count as both `totalRows` and `rowCount`.
   */
  async executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const headers = Object.keys(records[0] ?? {});
    const rows = records.map((record) => headers.map((header) => record[header]));
    const count = records.length;
    return { headers, rows, totalRows: count, rowCount: count };
  }

  /** Closes the underlying database connection. */
  close(): void {
    this.db.close();
  }
}
/**
 * Reads a fixture text file, falling back to its gzip-compressed `.gz` sibling.
 *
 * @param fixtureDir Directory containing the fixture files.
 * @param fileName Name of the uncompressed file.
 * @returns The UTF-8 contents of the file.
 * @throws Any read error other than a missing uncompressed file, including a missing `.gz` file.
 */
async function fixtureText(fixtureDir: string, fileName: string): Promise<string> {
  const plainPath = join(fixtureDir, fileName);
  try {
    return await readFile(plainPath, 'utf-8');
  } catch (error) {
    const missing = (error as NodeJS.ErrnoException).code === 'ENOENT';
    if (!missing) {
      throw error;
    }
    // Only a missing plain file triggers the compressed fallback.
    const archive = await readFile(`${plainPath}.gz`);
    return gunzipSync(archive).toString('utf-8');
  }
}
/**
 * Locates the fixture's SQLite data file, extracting `data.sqlite.gz` when needed.
 *
 * An existing `data.sqlite` entry decides the outcome on its own: its path when it is a regular
 * file, `null` otherwise. Only when it is missing is `data.sqlite.gz` decompressed into a newly
 * created directory under the OS temp directory. That directory is not removed here.
 *
 * @param fixtureDir Directory containing the fixture files.
 * @returns Path of a usable SQLite file, or `null` when the fixture has no data.
 * @throws Any file-system error other than `ENOENT`.
 */
async function fixtureDataPath(fixtureDir: string): Promise<string | null> {
  const plainPath = join(fixtureDir, 'data.sqlite');
  try {
    const plainInfo = await stat(plainPath);
    if (plainInfo.isFile()) {
      return plainPath;
    }
    return null;
  } catch (error) {
    const missing = (error as NodeJS.ErrnoException).code === 'ENOENT';
    if (!missing) {
      throw error;
    }
  }

  const archivePath = `${plainPath}.gz`;
  try {
    const archiveInfo = await stat(archivePath);
    if (archiveInfo.isFile()) {
      // The digest only labels the temp directory; mkdtemp still appends a random suffix.
      const digest = createHash('sha256').update(fixtureDir).digest('hex').slice(0, 16);
      const tempRoot = await mkdtemp(join(tmpdir(), `klo-relationship-benchmark-${digest}-`));
      const extractedPath = join(tempRoot, 'data.sqlite');
      const archive = await readFile(archivePath);
      await writeFile(extractedPath, gunzipSync(archive));
      return extractedPath;
    }
    return null;
  } catch (error) {
    if ((error as NodeJS.ErrnoException).code !== 'ENOENT') {
      throw error;
    }
    return null;
  }
}
/**
 * Loads the optional `column-embeddings.json` of a fixture.
 *
 * Entries whose value is not an array consisting solely of numbers are dropped. Any failure,
 * whether a missing file, unreadable file or malformed JSON, yields an empty record.
 *
 * @param fixtureDir Directory containing the fixture files.
 * @returns Embedding vectors keyed by column id; empty when nothing usable was found.
 */
async function fixtureColumnEmbeddings(fixtureDir: string): Promise<Record<string, number[]>> {
  const embeddingsPath = join(fixtureDir, 'column-embeddings.json');
  try {
    const parsed = JSON.parse(await readFile(embeddingsPath, 'utf-8')) as Record<string, unknown>;
    const usable: Array<[string, number[]]> = [];
    for (const [columnId, value] of Object.entries(parsed)) {
      const isNumberVector = Array.isArray(value) && !value.some((item) => typeof item !== 'number');
      if (isNumberVector) {
        usable.push([columnId, value as number[]]);
      }
    }
    return Object.fromEntries(usable);
  } catch {
    // Embeddings are optional: every failure is treated as "no embeddings available".
    return {};
  }
}
// Validators for the YAML files of a fixture directory.
const modeSchema = z.enum(KLO_RELATIONSHIP_BENCHMARK_MODES);
const tierSchema = z.enum(KLO_RELATIONSHIP_BENCHMARK_TIERS);
const originSchema = z.enum(['synthetic', 'public', 'customer']);
// A budget is either the literal 'all' or a non-negative integer.
const validationBudgetSchema = z.union([z.literal('all'), z.number().int().nonnegative()]);
// Shape of `fixture.yaml`; `tier` falls back to 'unit' and at least one default mode is required.
const fixtureConfigSchema = z.object({
id: z.string().min(1),
name: z.string().min(1),
tier: tierSchema.default('unit'),
origin: originSchema,
thresholdEligible: z.boolean().optional(),
validationBudget: validationBudgetSchema.optional(),
defaultModes: z.array(modeSchema).min(1),
});
// Shape of `expected-links.yaml`; every key and link side needs at least one non-empty column name.
const expectedLinksSchema = z.object({
expectedPks: z.array(
z.object({
table: z.string().min(1),
columns: z.array(z.string().min(1)).min(1),
}),
),
expectedLinks: z.array(
z.object({
fromTable: z.string().min(1),
fromColumns: z.array(z.string().min(1)).min(1),
toTable: z.string().min(1),
toColumns: z.array(z.string().min(1)).min(1),
relationship: z.enum(['many_to_one', 'one_to_many', 'one_to_one']),
}),
),
});
/**
 * De-duplicates strings and sorts them with `localeCompare`.
 *
 * @param values Strings to normalize.
 * @returns A new sorted array without duplicates.
 */
function sortedUnique(values: Iterable<string>): string[] {
  const distinct = [...new Set(values)];
  distinct.sort((left, right) => left.localeCompare(right));
  return distinct;
}
/**
 * Formats a column list as a parenthesized, comma-separated tuple, e.g. `(a,b)`.
 *
 * @param columns Column names in order.
 * @returns The tuple text; `()` for an empty list.
 */
function tupleKey(columns: readonly string[]): string {
  const joined = columns.join(',');
  return '(' + joined + ')';
}
/**
 * Builds the comparison key of a primary key: `table.(col1,col2)`.
 *
 * @param pk Table name and ordered column list.
 * @returns The key used to match expected against detected primary keys.
 */
function pkKey(pk: Pick<KloRelationshipBenchmarkExpectedPk, 'table' | 'columns'>): string {
  const { table, columns } = pk;
  return [table, tupleKey(columns)].join('.');
}
/**
 * Builds the comparison key of a link: `from.(cols)->to.(cols)`.
 *
 * @param link Source and target table names with their ordered column lists.
 * @returns The key used to match expected against detected links.
 */
function fkKey(
  link: Pick<KloRelationshipBenchmarkExpectedLink, 'fromTable' | 'fromColumns' | 'toTable' | 'toColumns'>,
): string {
  const source = `${link.fromTable}.${tupleKey(link.fromColumns)}`;
  const target = `${link.toTable}.${tupleKey(link.toColumns)}`;
  return `${source}->${target}`;
}
/**
 * Builds the comparison key of a detected link; same format as `fkKey`.
 *
 * @param link Detected link.
 * @returns The link's key.
 */
function relationshipKey(link: KloRelationshipBenchmarkDetectedLink): string {
  const linkKey = fkKey(link);
  return linkKey;
}
/**
 * Converts an enriched relationship into a benchmark link with status `accepted`.
 *
 * @param candidate Relationship to convert; its confidence becomes the link score.
 * @returns The benchmark link.
 */
function relationshipToBenchmarkLink(candidate: KloEnrichedRelationship): KloRelationshipBenchmarkDetectedLink {
  const { from, to } = candidate;
  return {
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
    relationship: candidate.relationshipType,
    score: candidate.confidence,
    status: 'accepted',
    source: candidate.source,
  };
}
/**
 * Converts a discovery candidate into a benchmark link with status `review`.
 *
 * @param candidate Candidate to convert; its confidence becomes the link score.
 * @returns The benchmark link.
 */
function broadCandidateToBenchmarkLink(
  candidate: Pick<KloRelationshipDiscoveryCandidate, 'confidence' | 'from' | 'relationshipType' | 'source' | 'to'>,
): KloRelationshipBenchmarkDetectedLink {
  const { from, to, relationshipType, confidence, source } = candidate;
  return {
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
    relationship: relationshipType,
    score: confidence,
    status: 'review',
    source,
  };
}
/**
 * Converts a composite primary-key candidate into a benchmark PK, keeping its score and status.
 *
 * @param candidate Composite primary-key candidate.
 * @returns The benchmark PK.
 */
function compositePkToBenchmarkPk(candidate: KloCompositePrimaryKeyCandidate): KloRelationshipBenchmarkDetectedPk {
  const { table, columns, score, status } = candidate;
  return { table: table.name, columns, score, status };
}
/**
 * Converts a composite relationship candidate into a benchmark link, keeping its own status.
 *
 * @param candidate Composite relationship candidate; its confidence becomes the link score.
 * @returns The benchmark link.
 */
function compositeRelationshipToBenchmarkLink(
  candidate: KloCompositeRelationshipCandidate,
): KloRelationshipBenchmarkDetectedLink {
  const { from, to, relationshipType, confidence, status, source } = candidate;
  return {
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
    relationship: relationshipType,
    score: confidence,
    status,
    source,
  };
}
/**
 * Divides two counts, treating an empty denominator as a perfect score.
 *
 * @param numerator Count of matches.
 * @param denominator Count of items considered.
 * @returns `numerator / denominator`, or 1 when the denominator is 0.
 */
function ratio(numerator: number, denominator: number): number {
  if (denominator === 0) {
    return 1;
  }
  return numerator / denominator;
}
/**
 * Computes the F1 score, the harmonic mean of precision and recall.
 *
 * @param precision Precision value.
 * @param recall Recall value.
 * @returns The F1 score, or 0 when precision and recall sum to 0.
 */
function f1(precision: number, recall: number): number {
  const total = precision + recall;
  if (total === 0) {
    return 0;
  }
  return (2 * precision * recall) / total;
}
/**
 * Returns the items of `left` that do not occur in `right`, in their original order.
 *
 * @param left Items to filter; duplicates are preserved.
 * @param right Items to exclude.
 * @returns A new array of the remaining items.
 */
function difference(left: readonly string[], right: readonly string[]): string[] {
  const excluded = new Set(right);
  const remaining: string[] = [];
  for (const item of left) {
    if (!excluded.has(item)) {
      remaining.push(item);
    }
  }
  return remaining;
}
/**
 * Counts the items of `left` that also occur in `right`.
 *
 * @param left Items to test; a duplicate that matches is counted each time it appears.
 * @param right Items to match against.
 * @returns Number of matching items in `left`.
 */
function intersectionSize(left: readonly string[], right: readonly string[]): number {
  const candidates = new Set(right);
  let matches = 0;
  for (const item of left) {
    if (candidates.has(item)) {
      matches += 1;
    }
  }
  return matches;
}
/**
 * Lists the keys of expected primary keys that span more than one column.
 *
 * @param expected Ground truth of a fixture.
 * @returns Sorted, de-duplicated keys of the composite primary keys.
 */
function compositePkKeys(expected: KloRelationshipBenchmarkExpectedLinks): string[] {
  const multiColumnPks = expected.expectedPks.filter((pk) => pk.columns.length > 1);
  return sortedUnique(multiColumnPks.map(pkKey));
}
/**
 * Lists the keys of expected links that use more than one column on either side.
 *
 * @param expected Ground truth of a fixture.
 * @returns Sorted, de-duplicated keys of the composite links.
 */
function compositeFkKeys(expected: KloRelationshipBenchmarkExpectedLinks): string[] {
  const multiColumnLinks = expected.expectedLinks.filter(
    (link) => link.fromColumns.length > 1 || link.toColumns.length > 1,
  );
  return sortedUnique(multiColumnLinks.map(fkKey));
}
/**
 * Lists the keys of all expected primary keys.
 *
 * Despite the name, no filtering by column count happens here: composite keys are included.
 *
 * @param expected Ground truth of a fixture.
 * @returns Sorted, de-duplicated primary-key keys.
 */
function scalarExpectedPkKeys(expected: KloRelationshipBenchmarkExpectedLinks): string[] {
  const keys = expected.expectedPks.map(pkKey);
  return sortedUnique(keys);
}
/**
 * Lists the keys of all expected links.
 *
 * Despite the name, no filtering by column count happens here: composite links are included.
 *
 * @param expected Ground truth of a fixture.
 * @returns Sorted, de-duplicated link keys.
 */
function scalarExpectedFkKeys(expected: KloRelationshipBenchmarkExpectedLinks): string[] {
  const keys = expected.expectedLinks.map(fkKey);
  return sortedUnique(keys);
}
/**
 * Scores one detector run against the fixture's ground truth.
 *
 * - PK precision/recall compare every detected PK key with the expected PK keys.
 * - FK precision/recall count links with status `accepted` only.
 * - `reviewRecall` and `acceptedOrReviewRecall` measure expected links found among `review`
 *   links and among `accepted` or `review` links.
 * - FK false positives are accepted links that are not expected; FK false negatives are
 *   expected links that are neither accepted nor in review.
 * - `skippedComposite` lists the expected multi-column keys that were not detected.
 *
 * @param input.fixtureId Id of the fixture that was run.
 * @param input.mode Mode the fixture was run in.
 * @param input.expected Ground truth of the fixture.
 * @param input.detected Detector output to score.
 * @returns The scored case result.
 */
function scoreBenchmarkCase(input: {
  fixtureId: string;
  mode: KloRelationshipBenchmarkMode;
  expected: KloRelationshipBenchmarkExpectedLinks;
  detected: KloRelationshipBenchmarkDetectorResult;
}): KloRelationshipBenchmarkCaseResult {
  const { expected, detected } = input;
  const linkKeysWithStatus = (status: KloRelationshipBenchmarkStatus): string[] =>
    sortedUnique(detected.links.filter((link) => link.status === status).map(relationshipKey));

  const expectedPk = scalarExpectedPkKeys(expected);
  const expectedFk = scalarExpectedFkKeys(expected);
  const predictedPk = sortedUnique(detected.pks.map(pkKey));
  const predictedFk = sortedUnique(detected.links.map(relationshipKey));
  const acceptedFk = linkKeysWithStatus('accepted');
  const reviewFk = linkKeysWithStatus('review');
  const acceptedOrReviewFk = sortedUnique([...acceptedFk, ...reviewFk]);

  const pkHits = intersectionSize(predictedPk, expectedPk);
  const acceptedFkHits = intersectionSize(acceptedFk, expectedFk);
  const pkPrecision = ratio(pkHits, predictedPk.length);
  const pkRecall = ratio(pkHits, expectedPk.length);
  const fkPrecision = ratio(acceptedFkHits, acceptedFk.length);
  const fkRecall = ratio(acceptedFkHits, expectedFk.length);
  const unexpectedAcceptedFk = difference(acceptedFk, expectedFk);

  const metrics: KloRelationshipBenchmarkMetrics = {
    pkPrecision,
    pkRecall,
    pkF1: f1(pkPrecision, pkRecall),
    fkPrecision,
    fkRecall,
    fkF1: f1(fkPrecision, fkRecall),
    acceptedFalsePositiveCount: unexpectedAcceptedFk.length,
    reviewRecall: ratio(intersectionSize(reviewFk, expectedFk), expectedFk.length),
    acceptedOrReviewRecall: ratio(intersectionSize(acceptedOrReviewFk, expectedFk), expectedFk.length),
    runtimeSeconds: detected.runtimeSeconds,
    sqlQueries: detected.sqlQueries,
    llmCalls: detected.llmCalls,
  };

  return {
    fixtureId: input.fixtureId,
    mode: input.mode,
    metrics,
    expected: { pk: expectedPk, fk: expectedFk },
    predicted: { pk: predictedPk, fk: predictedFk, acceptedFk, reviewFk },
    falsePositives: {
      pk: difference(predictedPk, expectedPk),
      fk: unexpectedAcceptedFk,
    },
    falseNegatives: {
      pk: difference(expectedPk, predictedPk),
      fk: difference(expectedFk, acceptedOrReviewFk),
    },
    skippedComposite: {
      pk: difference(compositePkKeys(expected), predictedPk),
      fk: difference(compositeFkKeys(expected), acceptedOrReviewFk),
    },
    validationBlocked: detected.validationBlocked,
  };
}
/**
 * Returns a copy of the snapshot with declared keys hidden as the benchmark mode requires.
 *
 * - `declared_pks_removed` clears every column's `primaryKey` flag.
 * - `declared_fks_removed` empties every table's `foreignKeys`.
 * - `declared_pks_and_declared_fks_removed` and the four `*_disabled` modes do both.
 * - Any other mode keeps the declared keys.
 *
 * `scope`, `metadata`, tables, columns and retained foreign keys are shallow-copied. The input
 * snapshot is not modified.
 *
 * @param snapshot Snapshot to mask.
 * @param mode Benchmark mode that determines what is hidden.
 * @returns The masked copy.
 */
export function maskKloRelationshipBenchmarkSnapshot(
  snapshot: KloSchemaSnapshot,
  mode: KloRelationshipBenchmarkMode,
): KloSchemaSnapshot {
  let removePks = false;
  let removeFks = false;
  switch (mode) {
    case 'declared_pks_removed':
      removePks = true;
      break;
    case 'declared_fks_removed':
      removeFks = true;
      break;
    case 'declared_pks_and_declared_fks_removed':
    case 'llm_disabled':
    case 'profiling_disabled':
    case 'validation_disabled':
    case 'embeddings_disabled':
      removePks = true;
      removeFks = true;
      break;
    default:
      break;
  }

  const tables = snapshot.tables.map((table) => {
    const columns = table.columns.map((column) => ({
      ...column,
      primaryKey: removePks ? false : column.primaryKey,
    }));
    const foreignKeys = removeFks ? [] : table.foreignKeys.map((foreignKey) => ({ ...foreignKey }));
    return { ...table, columns, foreignKeys };
  });

  return {
    ...snapshot,
    scope: { ...snapshot.scope },
    metadata: { ...snapshot.metadata },
    tables,
  };
}
/**
 * Decides whether a benchmark case is tuning eligible.
 *
 * A case qualifies only when validation was not blocked and the mode is
 * `declared_pks_and_declared_fks_removed`. Fixtures in the `smoke` and `schema_only` tiers never
 * qualify. For the remaining tiers an explicit `thresholdEligible` wins; without one, only the
 * `unit` and `row_bearing` tiers qualify.
 *
 * @param input.fixture Tier and optional eligibility override of the fixture.
 * @param input.mode Mode the case was run in.
 * @param input.validationBlocked Whether validation was blocked for the case.
 * @returns `true` when the case is tuning eligible.
 */
export function isKloRelationshipBenchmarkTuningEligible(input: {
  fixture: Pick<KloRelationshipBenchmarkFixture, 'tier' | 'thresholdEligible'>;
  mode: KloRelationshipBenchmarkMode;
  validationBlocked: boolean;
}): boolean {
  const { tier, thresholdEligible } = input.fixture;
  const ranInEligibleMode = input.mode === 'declared_pks_and_declared_fks_removed';
  if (!ranInEligibleMode || input.validationBlocked) {
    return false;
  }
  switch (tier) {
    case 'smoke':
    case 'schema_only':
      // These tiers are excluded even when the fixture opts in explicitly.
      return false;
    case 'unit':
    case 'row_bearing':
      return thresholdEligible !== undefined ? thresholdEligible : true;
    default:
      return thresholdEligible !== undefined ? thresholdEligible : false;
  }
}
/**
 * Creates a benchmark detector that merges LLM-proposed candidates into the discovery pipeline.
 *
 * For each `detect` call the detector:
 * 1. collects relationships already accepted from formal metadata;
 * 2. profiles the schema (skipped in `profiling_disabled` mode);
 * 3. generates discovery candidates and, unless the mode is `llm_disabled`, asks the LLM
 *    provider for additional ones, then merges both sets;
 * 4. validates the merged candidates (no executor is supplied in `validation_disabled` mode);
 * 5. runs composite key discovery when the validation budget is `'all'` and a validation
 *    executor exists;
 * 6. resolves the relationship graph and reports accepted and review links plus non-rejected
 *    primary keys.
 *
 * SQL is only executed when the fixture has a data path and the snapshot driver is `sqlite`.
 * The SQLite executor is closed even when a stage rejects, so a failed run does not leak the
 * open database handle.
 *
 * @param llmProvider Provider used for the LLM candidate proposal.
 * @returns A detector whose `detect` resolves to the detected keys, links and cost counters.
 */
export function kloRelationshipBenchmarkDetectorWithLlm(
  llmProvider: KloLlmProvider,
): KloRelationshipBenchmarkDetector {
  return {
    async detect(input) {
      const startedAt = performance.now();
      const formalMetadata = collectKloFormalMetadataRelationships(input.schema);
      const formalLinks = formalMetadata.accepted.map((relationship) => relationshipToBenchmarkLink(relationship));
      // Formally declared links take precedence; discovered duplicates are filtered against these.
      const acceptedKeys = new Set(formalLinks.map(fkKey));
      const sqliteDataPath = input.dataPath && input.snapshot.driver === 'sqlite' ? input.dataPath : null;
      const profilingExecutor =
        sqliteDataPath && input.mode !== 'profiling_disabled'
          ? new KloRelationshipBenchmarkSqliteExecutor(sqliteDataPath)
          : null;
      const validationExecutor = profilingExecutor && input.mode !== 'validation_disabled' ? profilingExecutor : null;
      try {
        const profiles =
          input.mode === 'profiling_disabled'
            ? emptyKloRelationshipProfileArtifact({
                connectionId: input.snapshot.connectionId,
                driver: input.snapshot.driver,
                reason: 'relationship_benchmark_profiling_disabled',
              })
            : await profileKloRelationshipSchema({
                connectionId: input.snapshot.connectionId,
                driver: input.snapshot.driver,
                schema: input.schema,
                executor: profilingExecutor,
                ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:profile` },
              });
        const broadRelationshipCandidates = generateKloRelationshipDiscoveryCandidates(input.schema, {
          profiles,
          useEmbeddings: input.mode !== 'embeddings_disabled',
        });
        const llmProposalResult =
          input.mode === 'llm_disabled'
            ? { candidates: [], warnings: [], llmCalls: 0, summary: 'skipped' as const }
            : await proposeKloRelationshipCandidatesWithLlm({
                connectionId: input.snapshot.connectionId,
                schema: input.schema,
                profile: profiles,
                llmProvider,
              });
        const candidates = mergeKloRelationshipDiscoveryCandidates([
          ...broadRelationshipCandidates,
          ...llmProposalResult.candidates,
        ]);
        // A numeric budget covers profiling too, so queries already spent there are deducted.
        const validationBudget =
          input.validationBudget === undefined || input.validationBudget === 'all'
            ? 'all'
            : Math.max(0, input.validationBudget - profiles.queryCount);
        const validatedBroadCandidates = await validateKloRelationshipDiscoveryCandidates({
          connectionId: input.snapshot.connectionId,
          driver: input.snapshot.driver,
          candidates,
          profiles,
          executor: validationExecutor,
          ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:validate` },
          tableCount: input.schema.tables.length,
          settings: {
            validationBudget,
          },
        });
        const compositeDetection =
          validationBudget === 'all' &&
          validationExecutor &&
          input.mode !== 'profiling_disabled' &&
          input.mode !== 'validation_disabled'
            ? await discoverKloCompositeRelationships({
                connectionId: input.snapshot.connectionId,
                driver: input.snapshot.driver,
                schema: input.schema,
                profiles,
                executor: validationExecutor,
                ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:composite` },
              })
            : { primaryKeys: [], relationships: [], queryCount: 0, warnings: [] };
        const graph = resolveKloRelationshipGraph({
          schema: input.schema,
          profiles,
          candidates: validatedBroadCandidates,
        });
        const acceptedBroadCandidates = graph.relationships
          .filter((candidate) => candidate.status === 'accepted')
          .map((candidate) => ({
            ...broadCandidateToBenchmarkLink(candidate),
            score: candidate.fkScore,
            status: 'accepted' as const,
          }))
          .filter((candidate) => !acceptedKeys.has(fkKey(candidate)));
        const reviewCandidates = graph.relationships
          .filter((candidate) => candidate.status === 'review')
          .map((candidate) => ({
            ...broadCandidateToBenchmarkLink(candidate),
            score: candidate.fkScore,
            status: 'review' as const,
          }))
          .filter((candidate) => !acceptedKeys.has(fkKey(candidate)));
        const resolvedPks = graph.pks
          .filter((pk) => pk.status !== 'rejected')
          .map((pk) => ({
            table: pk.table,
            columns: pk.columns,
            score: pk.pkScore,
            status: pk.status,
          }));
        const compositePks = compositeDetection.primaryKeys.map(compositePkToBenchmarkPk);
        // When both sources report the same key, the composite entry (inserted last) wins.
        const allPksByKey = new Map(
          [...resolvedPks, ...compositePks].map((candidate) => [pkKey(candidate), candidate]),
        );
        const pks = sortedUnique(allPksByKey.keys()).flatMap((key) => {
          const candidate = allPksByKey.get(key);
          return candidate ? [candidate] : [];
        });
        return {
          pks,
          links: [
            ...formalLinks,
            ...acceptedBroadCandidates,
            ...reviewCandidates,
            ...compositeDetection.relationships
              .map(compositeRelationshipToBenchmarkLink)
              .filter((candidate) => !acceptedKeys.has(fkKey(candidate))),
          ],
          validationBlocked:
            input.mode === 'validation_disabled' ||
            input.mode === 'profiling_disabled' ||
            (input.dataPath !== null && broadRelationshipCandidates.length > 0 && !profiles.sqlAvailable),
          sqlQueries: profilingExecutor?.queryCount ?? profiles.queryCount,
          llmCalls: llmProposalResult.llmCalls,
          runtimeSeconds: Number(((performance.now() - startedAt) / 1000).toFixed(6)),
        };
      } finally {
        // Release the SQLite handle on success and on failure alike.
        profilingExecutor?.close();
      }
    },
  };
}
/**
 * Creates the default benchmark detector, which never calls an LLM.
 *
 * For each `detect` call the detector:
 * 1. collects relationships already accepted from formal metadata;
 * 2. profiles the schema (skipped in `profiling_disabled` mode);
 * 3. generates discovery candidates from the schema and profiles;
 * 4. validates those candidates (no executor is supplied in `validation_disabled` mode);
 * 5. runs composite key discovery when the validation budget is `'all'` and a validation
 *    executor exists;
 * 6. resolves the relationship graph and reports accepted and review links plus non-rejected
 *    primary keys.
 *
 * SQL is only executed when the fixture has a data path and the snapshot driver is `sqlite`.
 * The SQLite executor is closed even when a stage rejects, so a failed run does not leak the
 * open database handle.
 *
 * @returns A detector whose `detect` resolves to the detected keys, links and cost counters;
 *   `llmCalls` is always 0.
 */
export function currentKloRelationshipBenchmarkDetector(): KloRelationshipBenchmarkDetector {
  return {
    async detect(input) {
      const startedAt = performance.now();
      const formalMetadata = collectKloFormalMetadataRelationships(input.schema);
      const formalLinks = formalMetadata.accepted.map((relationship) => relationshipToBenchmarkLink(relationship));
      // Formally declared links take precedence; discovered duplicates are filtered against these.
      const acceptedKeys = new Set(formalLinks.map(fkKey));
      const sqliteDataPath = input.dataPath && input.snapshot.driver === 'sqlite' ? input.dataPath : null;
      const profilingExecutor =
        sqliteDataPath && input.mode !== 'profiling_disabled'
          ? new KloRelationshipBenchmarkSqliteExecutor(sqliteDataPath)
          : null;
      const validationExecutor = profilingExecutor && input.mode !== 'validation_disabled' ? profilingExecutor : null;
      try {
        const profiles =
          input.mode === 'profiling_disabled'
            ? emptyKloRelationshipProfileArtifact({
                connectionId: input.snapshot.connectionId,
                driver: input.snapshot.driver,
                reason: 'relationship_benchmark_profiling_disabled',
              })
            : await profileKloRelationshipSchema({
                connectionId: input.snapshot.connectionId,
                driver: input.snapshot.driver,
                schema: input.schema,
                executor: profilingExecutor,
                ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:profile` },
              });
        const broadRelationshipCandidates = generateKloRelationshipDiscoveryCandidates(input.schema, {
          profiles,
          useEmbeddings: input.mode !== 'embeddings_disabled',
        });
        // A numeric budget covers profiling too, so queries already spent there are deducted.
        const validationBudget =
          input.validationBudget === undefined || input.validationBudget === 'all'
            ? 'all'
            : Math.max(0, input.validationBudget - profiles.queryCount);
        const validatedBroadCandidates = await validateKloRelationshipDiscoveryCandidates({
          connectionId: input.snapshot.connectionId,
          driver: input.snapshot.driver,
          candidates: broadRelationshipCandidates,
          profiles,
          executor: validationExecutor,
          ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:validate` },
          tableCount: input.schema.tables.length,
          settings: {
            validationBudget,
          },
        });
        const compositeDetection =
          validationBudget === 'all' &&
          validationExecutor &&
          input.mode !== 'profiling_disabled' &&
          input.mode !== 'validation_disabled'
            ? await discoverKloCompositeRelationships({
                connectionId: input.snapshot.connectionId,
                driver: input.snapshot.driver,
                schema: input.schema,
                profiles,
                executor: validationExecutor,
                ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:composite` },
              })
            : { primaryKeys: [], relationships: [], queryCount: 0, warnings: [] };
        const graph = resolveKloRelationshipGraph({
          schema: input.schema,
          profiles,
          candidates: validatedBroadCandidates,
        });
        const acceptedBroadCandidates = graph.relationships
          .filter((candidate) => candidate.status === 'accepted')
          .map((candidate) => ({
            ...broadCandidateToBenchmarkLink(candidate),
            score: candidate.fkScore,
            status: 'accepted' as const,
          }))
          .filter((candidate) => !acceptedKeys.has(fkKey(candidate)));
        const reviewCandidates = graph.relationships
          .filter((candidate) => candidate.status === 'review')
          .map((candidate) => ({
            ...broadCandidateToBenchmarkLink(candidate),
            score: candidate.fkScore,
            status: 'review' as const,
          }))
          .filter((candidate) => !acceptedKeys.has(fkKey(candidate)));
        const resolvedPks = graph.pks
          .filter((pk) => pk.status !== 'rejected')
          .map((pk) => ({
            table: pk.table,
            columns: pk.columns,
            score: pk.pkScore,
            status: pk.status,
          }));
        const compositePks = compositeDetection.primaryKeys.map(compositePkToBenchmarkPk);
        // When both sources report the same key, the composite entry (inserted last) wins.
        const allPksByKey = new Map(
          [...resolvedPks, ...compositePks].map((candidate) => [pkKey(candidate), candidate]),
        );
        const pks = sortedUnique(allPksByKey.keys()).flatMap((key) => {
          const candidate = allPksByKey.get(key);
          return candidate ? [candidate] : [];
        });
        return {
          pks,
          links: [
            ...formalLinks,
            ...acceptedBroadCandidates,
            ...reviewCandidates,
            ...compositeDetection.relationships
              .map(compositeRelationshipToBenchmarkLink)
              .filter((candidate) => !acceptedKeys.has(fkKey(candidate))),
          ],
          validationBlocked:
            input.mode === 'validation_disabled' ||
            input.mode === 'profiling_disabled' ||
            (input.dataPath !== null && broadRelationshipCandidates.length > 0 && !profiles.sqlAvailable),
          sqlQueries: profilingExecutor?.queryCount ?? profiles.queryCount,
          llmCalls: 0,
          runtimeSeconds: Number(((performance.now() - startedAt) / 1000).toFixed(6)),
        };
      } finally {
        // Release the SQLite handle on success and on failure alike.
        profilingExecutor?.close();
      }
    },
  };
}
/**
 * Loads one relationship-benchmark fixture from `fixtureDir`.
 *
 * `fixture.yaml`, `snapshot.json` and `expected-links.yaml` are read concurrently. The two YAML
 * documents are passed through `fixtureConfigSchema.parse` / `expectedLinksSchema.parse`; the
 * fixture's data path and column embeddings are resolved afterwards, one after the other.
 *
 * @param fixtureDir - Directory that holds the fixture files.
 * @returns The parsed fixture config merged with `snapshot`, `expected`, `dataPath` and
 *   `columnEmbeddings`.
 */
export async function loadKloRelationshipBenchmarkFixture(
  fixtureDir: string,
): Promise<KloRelationshipBenchmarkFixture> {
  const pendingConfig = fixtureText(fixtureDir, 'fixture.yaml');
  const pendingSnapshot = fixtureText(fixtureDir, 'snapshot.json');
  const pendingExpected = fixtureText(fixtureDir, 'expected-links.yaml');
  const [configText, snapshotText, expectedText] = await Promise.all([
    pendingConfig,
    pendingSnapshot,
    pendingExpected,
  ]);

  const config = fixtureConfigSchema.parse(YAML.parse(configText));
  const expected = expectedLinksSchema.parse(YAML.parse(expectedText));
  // NOTE(review): the snapshot is only cast, not validated at runtime like the YAML inputs.
  const snapshot = JSON.parse(snapshotText) as KloSchemaSnapshot;

  const dataPath = await fixtureDataPath(fixtureDir);
  const columnEmbeddings = await fixtureColumnEmbeddings(fixtureDir);

  return { ...config, snapshot, expected, dataPath, columnEmbeddings };
}
/**
 * Loads every fixture found directly under `fixtureRoot`.
 *
 * Each immediate sub-directory is treated as one fixture; non-directory entries are ignored.
 * Fixture paths are ordered with `localeCompare` before loading, and the returned array follows
 * that order.
 *
 * @param fixtureRoot - Directory whose sub-directories are fixtures.
 * @returns The loaded fixtures, in sorted path order.
 */
export async function loadKloRelationshipBenchmarkFixtures(
  fixtureRoot: string,
): Promise<KloRelationshipBenchmarkFixture[]> {
  const fixtureDirs: string[] = [];
  for (const entry of await readdir(fixtureRoot, { withFileTypes: true })) {
    if (entry.isDirectory()) {
      fixtureDirs.push(join(fixtureRoot, entry.name));
    }
  }
  fixtureDirs.sort((left, right) => left.localeCompare(right));

  // All fixtures are loaded concurrently; Promise.all keeps the sorted order in its result.
  const pendingFixtures = fixtureDirs.map((dir) => loadKloRelationshipBenchmarkFixture(dir));
  return Promise.all(pendingFixtures);
}
/**
 * Runs one fixture in one benchmark mode and scores the detector output.
 *
 * The fixture snapshot is first masked for the requested mode. Column embeddings are dropped
 * (replaced by an empty map) when the mode is `'embeddings_disabled'`. When no `detector` is
 * supplied, the one returned by `currentKloRelationshipBenchmarkDetector()` is used.
 *
 * @returns The scored case result produced by `scoreBenchmarkCase`.
 */
export async function runKloRelationshipBenchmarkCase(input: {
  fixture: KloRelationshipBenchmarkFixture;
  mode: KloRelationshipBenchmarkMode;
  detector?: KloRelationshipBenchmarkDetector;
}): Promise<KloRelationshipBenchmarkCaseResult> {
  const { fixture, mode } = input;

  const snapshot = maskKloRelationshipBenchmarkSnapshot(fixture.snapshot, mode);
  const embeddingsEnabled = mode !== 'embeddings_disabled';
  const embeddings = embeddingsEnabled
    ? new Map(Object.entries(fixture.columnEmbeddings))
    : new Map<string, number[]>();
  const schema = snapshotToKloEnrichedSchema(snapshot, embeddings);

  const detector = input.detector ?? currentKloRelationshipBenchmarkDetector();
  const detected = await detector.detect({
    fixtureId: fixture.id,
    mode,
    snapshot,
    schema,
    dataPath: fixture.dataPath,
    validationBudget: fixture.validationBudget,
  });

  return scoreBenchmarkCase({
    fixtureId: fixture.id,
    mode,
    expected: fixture.expected,
    detected,
  });
}
/**
 * Runs every fixture in each of its `defaultModes` and aggregates recall metrics.
 *
 * Cases run strictly one at a time, in fixture order and then mode order. The `headline*`
 * aggregates average only the cases accepted by `isKloRelationshipBenchmarkTuningEligible`;
 * the `mean*` aggregates average every case. An average over zero cases is reported as 0.
 *
 * @returns All case results, the `fixtureId:mode` labels of validation-blocked cases, and the
 *   aggregate metrics.
 */
export async function runKloRelationshipBenchmarkSuite(input: {
  fixtures: KloRelationshipBenchmarkFixture[];
  detector?: KloRelationshipBenchmarkDetector;
}): Promise<KloRelationshipBenchmarkSuiteResult> {
  const cases: KloRelationshipBenchmarkCaseResult[] = [];
  for (const fixture of input.fixtures) {
    for (const mode of fixture.defaultModes) {
      const result = await runKloRelationshipBenchmarkCase({ fixture, mode, detector: input.detector });
      cases.push(result);
    }
  }

  // Look fixtures up by id so each case can be checked for headline eligibility.
  const fixtureById = new Map(input.fixtures.map((fixture) => [fixture.id, fixture]));
  const headlineCases = cases.filter((item) => {
    const fixture = fixtureById.get(item.fixtureId);
    if (!fixture) {
      return false;
    }
    return isKloRelationshipBenchmarkTuningEligible({
      fixture,
      mode: item.mode,
      validationBlocked: item.validationBlocked,
    });
  });

  const averageMetric = (
    items: readonly KloRelationshipBenchmarkCaseResult[],
    metric: 'pkRecall' | 'fkRecall' | 'acceptedOrReviewRecall',
  ): number => mean(items.map((item) => item.metrics[metric]));

  const validationBlockedCases = cases.flatMap((item) =>
    item.validationBlocked ? [`${item.fixtureId}:${item.mode}`] : [],
  );

  return {
    cases,
    validationBlockedCases,
    aggregate: {
      caseCount: cases.length,
      headlineCaseCount: headlineCases.length,
      headlinePkRecall: averageMetric(headlineCases, 'pkRecall'),
      headlineFkRecall: averageMetric(headlineCases, 'fkRecall'),
      headlineAcceptedOrReviewRecall: averageMetric(headlineCases, 'acceptedOrReviewRecall'),
      meanPkRecall: averageMetric(cases, 'pkRecall'),
      meanFkRecall: averageMetric(cases, 'fkRecall'),
      meanAcceptedOrReviewRecall: averageMetric(cases, 'acceptedOrReviewRecall'),
    },
  };
}
/**
 * Arithmetic mean of `values`.
 *
 * @returns The average, or 0 for an empty array (instead of the NaN that 0 / 0 would give).
 */
function mean(values: number[]): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / count;
}

View file

@ -0,0 +1,86 @@
import { describe, expect, it } from 'vitest';
import { applyKloRelationshipValidationBudget, defaultKloRelationshipValidationBudget } from './relationship-budget.js';
/** Minimal candidate shape used by these tests to exercise the generic budget helper. */
interface Candidate {
/** Label the assertions use to identify a candidate in the result. */
id: string;
/** Value the tests return from the `score` callback. */
confidence: number;
}
// Covers the default budget formula and the three budget modes of
// applyKloRelationshipValidationBudget: numeric, omitted (default) and 'all'.
describe('relationship validation budget', () => {
it('computes the default validation budget from table count', () => {
// Expected values are 2 * floor(tableCount), capped at 1000 and never below 0.
expect(defaultKloRelationshipValidationBudget(0)).toBe(0);
expect(defaultKloRelationshipValidationBudget(3)).toBe(6);
expect(defaultKloRelationshipValidationBudget(400)).toBe(800);
expect(defaultKloRelationshipValidationBudget(900)).toBe(1000);
expect(defaultKloRelationshipValidationBudget(-4)).toBe(0);
expect(defaultKloRelationshipValidationBudget(3.8)).toBe(6);
});
it('splits candidates by descending score with stable tie ordering', () => {
const result = applyKloRelationshipValidationBudget<Candidate>({
candidates: [
{ id: 'first', confidence: 0.8 },
{ id: 'second', confidence: 0.9 },
{ id: 'third', confidence: 0.9 },
{ id: 'fourth', confidence: 0.2 },
],
tableCount: 100,
budget: 2,
score: (candidate) => candidate.confidence,
});
expect(result.effectiveBudget).toBe(2);
// 'second' and 'third' tie on 0.9; the earlier input index must come first.
expect(result.toValidate.map((entry) => entry.candidate.id)).toEqual(['second', 'third']);
expect(result.deferred.map((entry) => entry.candidate.id)).toEqual(['first', 'fourth']);
expect(result.toValidate.map((entry) => entry.originalIndex)).toEqual([1, 2]);
});
it('uses the default budget when the budget is omitted', () => {
const candidates = Array.from({ length: 8 }, (_, index) => ({
id: `candidate-${index}`,
confidence: 1 - index / 10,
}));
const result = applyKloRelationshipValidationBudget<Candidate>({
candidates,
tableCount: 2,
score: (candidate) => candidate.confidence,
});
// With tableCount 2 the default budget is 2 * 2 = 4, so the 8 candidates split evenly.
expect(result.effectiveBudget).toBe(4);
expect(result.toValidate).toHaveLength(4);
expect(result.deferred).toHaveLength(4);
});
it('treats budget zero as disabling SQL validation', () => {
const result = applyKloRelationshipValidationBudget<Candidate>({
candidates: [
{ id: 'first', confidence: 1 },
{ id: 'second', confidence: 0.5 },
],
tableCount: 10,
budget: 0,
score: (candidate) => candidate.confidence,
});
// An explicit 0 is honoured as-is; it does not fall back to the table-count default.
expect(result.effectiveBudget).toBe(0);
expect(result.toValidate).toEqual([]);
expect(result.deferred.map((entry) => entry.candidate.id)).toEqual(['first', 'second']);
});
it('treats budget all as validating every candidate', () => {
const result = applyKloRelationshipValidationBudget<Candidate>({
candidates: [
{ id: 'first', confidence: 0.1 },
{ id: 'second', confidence: 0.9 },
],
tableCount: 1,
budget: 'all',
score: (candidate) => candidate.confidence,
});
expect(result.effectiveBudget).toBe('all');
// In 'all' mode candidates keep their input order rather than being ranked by score.
expect(result.toValidate.map((entry) => entry.candidate.id)).toEqual(['first', 'second']);
expect(result.deferred).toEqual([]);
});
});

View file

@ -0,0 +1,60 @@
/**
 * How many relationship candidates may be validated: a numeric cap, `'all'` to validate every
 * candidate, or `undefined` to fall back to the table-count-based default.
 */
export type KloRelationshipValidationBudget = number | 'all' | undefined;
/** A candidate together with the bookkeeping recorded while the validation budget was applied. */
export interface KloRelationshipBudgetedCandidate<TCandidate> {
/** The caller-supplied candidate, passed through unchanged. */
candidate: TCandidate;
/** Index of the candidate in the input `candidates` array. */
originalIndex: number;
/** Value returned by the input `score` callback for this candidate. */
score: number;
}
/** Outcome of splitting candidates into those to validate now and those deferred. */
export interface KloRelationshipValidationBudgetResult<TCandidate> {
/** The budget that was actually applied: a number, or `'all'` when every candidate is validated. */
effectiveBudget: number | 'all';
/** Candidates selected for validation. */
toValidate: KloRelationshipBudgetedCandidate<TCandidate>[];
/** Candidates that did not fit within the budget. */
deferred: KloRelationshipBudgetedCandidate<TCandidate>[];
}
/** Arguments accepted by `applyKloRelationshipValidationBudget`. */
export interface ApplyKloRelationshipValidationBudgetInput<TCandidate> {
/** Candidates to split; the array itself is not mutated. */
candidates: readonly TCandidate[];
/** Table count used to derive the default budget when `budget` is omitted. */
tableCount: number;
/** Explicit budget; omit (or pass `undefined`) to use the table-count-based default. */
budget?: KloRelationshipValidationBudget;
/** Ranking function; higher scores are selected for validation first. */
score: (candidate: TCandidate) => number;
}
/**
 * Default number of relationship candidates to validate for a schema of `tableCount` tables.
 *
 * The budget is two validations per whole table, capped at 1000. Non-finite, zero and negative
 * table counts all yield a budget of 0.
 *
 * @param tableCount - Number of tables; fractional values are rounded down.
 * @returns An integer budget in the range 0..1000.
 */
export function defaultKloRelationshipValidationBudget(tableCount: number): number {
  if (!Number.isFinite(tableCount)) {
    return 0;
  }
  const wholeTables = Math.floor(tableCount);
  if (wholeTables <= 0) {
    return 0;
  }
  const uncapped = wholeTables * 2;
  return uncapped > 1000 ? 1000 : uncapped;
}
/**
 * Splits relationship candidates into those to validate now and those to defer.
 *
 * - `budget: 'all'` validates every candidate and keeps the input order.
 * - A numeric budget (or, when omitted, `defaultKloRelationshipValidationBudget(tableCount)`)
 *   is floored and clamped to be non-negative; a NaN budget is treated as 0. The highest-scored
 *   candidates up to that budget are selected and the rest are deferred, both in ranked order.
 *
 * Ranking is by descending score, with ties broken by original input index. A NaN score ranks
 * below every other score, so one bad score cannot disturb the ordering of the rest.
 *
 * The `score` callback is invoked exactly once per candidate, and `input.candidates` is never
 * mutated.
 *
 * @param input - Candidates, table count, optional budget and scoring callback.
 * @returns The applied budget plus the `toValidate` / `deferred` partitions.
 */
export function applyKloRelationshipValidationBudget<TCandidate>(
  input: ApplyKloRelationshipValidationBudgetInput<TCandidate>,
): KloRelationshipValidationBudgetResult<TCandidate> {
  // `map` produces a fresh array, so sorting it below never touches the caller's input.
  const entries = input.candidates.map((candidate, originalIndex) => ({
    candidate,
    originalIndex,
    score: input.score(candidate),
  }));

  if (input.budget === 'all') {
    // No ranking is needed: every candidate is validated, in input order.
    return { effectiveBudget: 'all', toValidate: entries, deferred: [] };
  }

  const requestedBudget = input.budget ?? defaultKloRelationshipValidationBudget(input.tableCount);
  // Math.max/Math.floor propagate NaN, which would leak out as `effectiveBudget: NaN`.
  const safeBudget = Number.isNaN(requestedBudget) ? 0 : Math.max(0, Math.floor(requestedBudget));

  // A subtraction comparator returns NaN for NaN scores (and for Infinity - Infinity), which
  // makes the ordering inconsistent. Compare explicitly and push NaN scores to the end.
  const rankValue = (score: number): number => (Number.isNaN(score) ? Number.NEGATIVE_INFINITY : score);
  entries.sort((left, right) => {
    const leftRank = rankValue(left.score);
    const rightRank = rankValue(right.score);
    if (leftRank === rightRank) {
      return left.originalIndex - right.originalIndex;
    }
    return leftRank < rightRank ? 1 : -1;
  });

  return {
    effectiveBudget: safeBudget,
    toValidate: entries.slice(0, safeBudget),
    deferred: entries.slice(safeBudget),
  };
}

View file

@ -0,0 +1,881 @@
import { describe, expect, it } from 'vitest';
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
import { normalizeKloRelationshipName } from './relationship-name-similarity.js';
import {
generateKloRelationshipDiscoveryCandidates,
inferKloRelationshipTargetPks,
mergeKloRelationshipDiscoveryCandidates,
} from './relationship-candidates.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
/**
 * Test helper that builds an enriched column with integer-typed defaults.
 *
 * `id`, `tableId` and `name` always come from the positional arguments. Every other field takes
 * the value from `options` when it is neither `null` nor `undefined`, and a default otherwise.
 * The default table reference is `{ catalog: null, db: 'public', name: tableId }`.
 */
function column(
  tableId: string,
  id: string,
  name: string,
  options: Partial<KloEnrichedColumn> = {},
): KloEnrichedColumn {
  // Type information.
  const nativeType = options.nativeType ?? 'INTEGER';
  const normalizedType = options.normalizedType ?? 'integer';
  const dimensionType = options.dimensionType ?? 'number';
  // Key / nullability flags.
  const nullable = options.nullable ?? true;
  const primaryKey = options.primaryKey ?? false;

  const built: KloEnrichedColumn = {
    id,
    tableId,
    tableRef: options.tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType,
    normalizedType,
    dimensionType,
    nullable,
    primaryKey,
    parentColumnId: options.parentColumnId ?? null,
    descriptions: options.descriptions ?? {},
    embedding: options.embedding ?? null,
    sampleValues: options.sampleValues ?? null,
    cardinality: options.cardinality ?? null,
  };
  return built;
}
/**
 * Test helper that builds an enabled enriched table in the `public` db.
 *
 * Each column is shallow-copied and re-pointed at this table: its `tableId` becomes `id` and its
 * `tableRef` becomes the table's own `ref` object (shared by reference across all columns).
 * The column objects passed in are not modified.
 */
function table(id: string, name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const adopt = (source: KloEnrichedColumn): KloEnrichedColumn => ({
    ...source,
    tableId: id,
    tableRef: ref,
  });
  const built: KloEnrichedTable = {
    id,
    ref,
    enabled: true,
    descriptions: {},
    columns: columns.map((source) => adopt(source)),
  };
  return built;
}
/**
 * Test helper that wraps `tables` in an enriched schema for the `warehouse` connection with no
 * declared relationships. The `tables` array is used as-is, not copied.
 */
function schema(tables: KloEnrichedTable[]): KloEnrichedSchema {
  const wrapped: KloEnrichedSchema = {
    connectionId: 'warehouse',
    tables,
    relationships: [],
  };
  return wrapped;
}
/**
 * Test fixture: a profile artifact for the plan-code tables (`stg_plans`,
 * `mart_account_segments`, `stg_plan_segment_mapping`).
 *
 * Every profiled column is a fully unique, non-null TEXT column with 4 rows. The `*plan_code`
 * columns share the same four sample values; `created_at`, `email` and `is_deleted` on
 * `stg_plans` carry their own samples. Each call returns freshly built objects and arrays.
 */
function planCodeProfiles(): KloRelationshipProfileArtifact {
  const tableRef = (name: string) => ({ catalog: null, db: 'public', name });

  // Builds one unique, non-null TEXT column profile over 4 rows.
  const textProfile = (
    tableName: string,
    columnName: string,
    sampleValues: string[],
    minTextLength: number,
    maxTextLength: number,
  ): KloRelationshipProfileArtifact['columns'][string] => ({
    table: tableRef(tableName),
    column: columnName,
    nativeType: 'TEXT',
    normalizedType: 'text',
    rowCount: 4,
    nullCount: 0,
    distinctCount: 4,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues,
    minTextLength,
    maxTextLength,
  });

  // Plan-code columns all look alike; a new sample array is created for each of them.
  const planCodeProfile = (tableName: string, columnName: string) =>
    textProfile(tableName, columnName, ['basic', 'enterprise', 'free', 'pro'], 4, 10);

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [
      { table: tableRef('stg_plans'), rowCount: 4 },
      { table: tableRef('mart_account_segments'), rowCount: 4 },
      { table: tableRef('stg_plan_segment_mapping'), rowCount: 4 },
    ],
    warnings: [],
    columns: {
      'stg_plans.plan_code': planCodeProfile('stg_plans', 'plan_code'),
      'stg_plans.created_at': textProfile(
        'stg_plans',
        'created_at',
        ['2026-05-01', '2026-05-02', '2026-05-03', '2026-05-04'],
        10,
        10,
      ),
      'stg_plans.email': textProfile(
        'stg_plans',
        'email',
        ['a@example.test', 'b@example.test', 'c@example.test', 'd@example.test'],
        14,
        14,
      ),
      'stg_plans.is_deleted': textProfile(
        'stg_plans',
        'is_deleted',
        ['deleted-a', 'deleted-b', 'deleted-c', 'deleted-d'],
        9,
        9,
      ),
      'mart_account_segments.current_plan_code': planCodeProfile('mart_account_segments', 'current_plan_code'),
      'mart_account_segments.normalized_plan_code': planCodeProfile('mart_account_segments', 'normalized_plan_code'),
      'stg_plan_segment_mapping.canonical_plan_code': planCodeProfile(
        'stg_plan_segment_mapping',
        'canonical_plan_code',
      ),
      'stg_plans.canonical_plan_code': planCodeProfile('stg_plans', 'canonical_plan_code'),
    },
  };
}
describe('relationship discovery candidates', () => {
it('normalizes warehouse prefixes and emits review candidates without declared primary keys', () => {
const accounts = table('accounts-id', 'dim_accounts', [
column('accounts-id', 'accounts-id-col', 'id', { primaryKey: false }),
column('accounts-id', 'accounts-name-col', 'account_name', { nativeType: 'TEXT', normalizedType: 'text' }),
]);
const invoices = table('invoices-id', 'fct_invoices', [
column('invoices-id', 'invoice-id-col', 'id', { primaryKey: false }),
column('invoices-id', 'account-id-col', 'account_id', { primaryKey: false }),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([accounts, invoices]));
expect(candidates).toHaveLength(1);
expect(candidates[0]).toMatchObject({
from: { tableId: 'invoices-id', columnIds: ['account-id-col'], columns: ['account_id'] },
to: { tableId: 'accounts-id', columnIds: ['accounts-id-col'], columns: ['id'] },
relationshipType: 'many_to_one',
status: 'review',
source: 'normalized_table_match',
evidence: {
sourceColumnBase: 'account',
targetTableBase: 'account',
targetKeyScore: 0.92,
},
});
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.8);
expect(candidates[0]?.evidence.signalVector).toMatchObject({
nameSimilarity: 0.92,
typeCompatibility: 1,
valueOverlap: 0,
embeddingSimilarity: 0,
profileUniqueness: 0.92,
});
expect(candidates[0]?.evidence.scoreBreakdown?.score).toBe(candidates[0]?.confidence);
expect(candidates[0]?.evidence.scoreBreakdown?.contributions.nameSimilarity).toBeGreaterThan(0);
expect(candidates[0]?.evidence.reasons).toEqual(
expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like']),
);
});
it('generates candidates for PascalCase ID columns without declared keys', () => {
const artists = table('artist-id', 'Artist', [
column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
]);
const albums = table('album-id', 'Album', [
column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([artists, albums]));
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).toEqual(['Album.ArtistId->Artist.ArtistId']);
expect(candidates[0]).toMatchObject({
source: 'normalized_table_match',
evidence: {
sourceColumnBase: 'artist',
targetTableBase: 'artist',
targetColumnBase: 'artist_id',
targetKeyScore: 0.9,
reasons: expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like']),
},
});
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.9);
});
it('uses the locality cap before scanning parent tables', () => {
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
const invoices = table('invoices-id', 'invoices', [
column('invoices-id', 'invoice-id-col', 'id'),
column('invoices-id', 'account-id-col', 'account_id'),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([accounts, invoices]), {
maxCandidateParentTables: 0,
});
expect(candidates).toEqual([]);
});
it('keeps the nearest parent when the locality cap is one', () => {
const artists = table('artist-id', 'Artist', [
column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
]);
const albums = table('album-id', 'Album', [
column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
]);
const fillerTables = Array.from({ length: 25 }, (_, index) =>
table(`filler-${index}`, `WarehouseFiller${index}`, [
column(`filler-${index}`, `filler-${index}-id`, 'WarehouseFillerId', { primaryKey: false }),
]),
);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([albums, ...fillerTables, artists]), {
maxCandidateParentTables: 1,
});
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).toEqual(['Album.ArtistId->Artist.ArtistId']);
});
it('uses final table tokens from dotted parent table names', () => {
const customers = table('customer-id', 'SalesLT.Customer', [
column('customer-id', 'customer-id-col', 'CustomerID', { primaryKey: false }),
column('customer-id', 'customer-name-col', 'CustomerName', { nativeType: 'TEXT', normalizedType: 'text' }),
]);
const orders = table('order-id', 'SalesLT.SalesOrderHeader', [
column('order-id', 'order-id-col', 'SalesOrderID', { primaryKey: false }),
column('order-id', 'customer-id-fk-col', 'CustomerID', { primaryKey: false }),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customers, orders]));
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).toEqual(['SalesLT.SalesOrderHeader.CustomerID->SalesLT.Customer.CustomerID']);
expect(candidates[0]).toMatchObject({
evidence: {
sourceColumnBase: 'customer',
targetTableBase: 'sales_lt_customer',
targetColumnBase: 'customer_id',
targetKeyScore: 0.9,
reasons: expect.arrayContaining(['foreign_key_suffix', 'inflection', 'target_key_like']),
},
});
});
it('emits lower-confidence parent-table-name candidates when the target key name differs from the table name', () => {
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', { primaryKey: true }),
column('customer-account-id', 'account-name-col', 'AccountName', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const subscriptions = table('subscriptions-id', 'fct_subscriptions', [
column('subscriptions-id', 'subscription-id-col', 'SubscriptionID', { primaryKey: false }),
column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customerAccounts, subscriptions]));
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).toEqual(['fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID']);
expect(candidates[0]).toMatchObject({
source: 'parent_table_name_match',
relationshipType: 'many_to_one',
status: 'review',
evidence: {
sourceColumnBase: 'customer_account',
targetTableBase: 'crm_customer_account',
targetColumnBase: 'business_entity_id',
targetKeyScore: 1,
nameScore: 0.82,
reasons: expect.arrayContaining(['foreign_key_suffix', 'parent_table_name_match', 'target_key_like']),
},
});
expect(candidates[0]?.evidence.signalVector).toMatchObject({
nameSimilarity: 0.82,
typeCompatibility: 1,
});
expect(candidates[0]?.evidence.scoreBreakdown?.score).toBe(candidates[0]?.confidence);
});
it('does not emit parent-table-name candidates when the target key type is incompatible', () => {
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', {
primaryKey: true,
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const subscriptions = table('subscriptions-id', 'fct_subscriptions', [
column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', {
primaryKey: false,
nativeType: 'INTEGER',
normalizedType: 'integer',
dimensionType: 'number',
}),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customerAccounts, subscriptions]));
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).not.toContain('fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID');
});
it('does not use parent-table-name matching to create same-table same-column self-links', () => {
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
column('customer-account-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
column('customer-account-id', 'account-name-col', 'AccountName', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customerAccounts]));
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).not.toContain('crm.CustomerAccount.CustomerAccountID->crm.CustomerAccount.CustomerAccountID');
});
it('uses profile evidence to generate natural-key candidates without id-like target names', () => {
const countries = table('countries-id', 'dim_countries', [
column('countries-id', 'countries-code-col', 'iso_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('countries-id', 'countries-name-col', 'name', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const accounts = table('accounts-id', 'fct_accounts', [
column('accounts-id', 'account-id-col', 'id', { primaryKey: false }),
column('accounts-id', 'country-code-col', 'country_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const profiles = {
connectionId: 'warehouse',
driver: 'sqlite',
sqlAvailable: true,
queryCount: 0,
tables: [],
warnings: [],
columns: {
'dim_countries.iso_code': {
table: { catalog: null, db: 'public', name: 'dim_countries' },
column: 'iso_code',
nativeType: 'TEXT',
normalizedType: 'text',
rowCount: 3,
nullCount: 0,
distinctCount: 3,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['DE', 'FR', 'US'],
minTextLength: 2,
maxTextLength: 2,
},
'fct_accounts.country_code': {
table: { catalog: null, db: 'public', name: 'fct_accounts' },
column: 'country_code',
nativeType: 'TEXT',
normalizedType: 'text',
rowCount: 4,
nullCount: 0,
distinctCount: 3,
uniquenessRatio: 0.75,
nullRate: 0,
sampleValues: ['FR', 'US'],
minTextLength: 2,
maxTextLength: 2,
},
},
} satisfies KloRelationshipProfileArtifact;
const candidates = generateKloRelationshipDiscoveryCandidates(schema([countries, accounts]), { profiles });
expect(candidates).toHaveLength(1);
expect(candidates[0]).toMatchObject({
source: 'profile_match',
from: { tableId: 'accounts-id', columnIds: ['country-code-col'], columns: ['country_code'] },
to: { tableId: 'countries-id', columnIds: ['countries-code-col'], columns: ['iso_code'] },
evidence: {
sourceColumnBase: 'country',
targetTableBase: 'country',
targetColumnBase: 'iso_code',
targetKeyScore: 0.86,
},
});
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.78);
expect(candidates[0]?.evidence.reasons).toEqual(
expect.arrayContaining([
'foreign_key_code_suffix',
'normalized_table_name',
'profile_unique_target',
'profile_sample_overlap',
]),
);
});
it('drops same-table same-column self-links using ordered endpoint equality', () => {
const accounts = table('accounts-id', 'stg_accounts', [
column('accounts-id', 'accounts-account-id-col', 'account_id', { primaryKey: false }),
column('accounts-id', 'accounts-name-col', 'account_name', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([accounts]));
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).not.toContain('stg_accounts.account_id->stg_accounts.account_id');
});
it('keeps legitimate same-table different-column self-references', () => {
const employees = table('employees-id', 'employees', [
column('employees-id', 'employees-id-col', 'id', { primaryKey: false }),
column('employees-id', 'employees-parent-id-col', 'parent_id', { primaryKey: false }),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([employees]));
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).toContain('employees.parent_id->employees.id');
expect(candidates[0]).toMatchObject({
source: 'self_reference',
evidence: {
reasons: expect.arrayContaining(['self_reference']),
},
});
});
it('emits column_suffix_match candidates for relationship-key-shaped trailing target columns', () => {
const plans = table('plans-id', 'stg_plans', [
column('plans-id', 'plans-plan-code-col', 'plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('plans-id', 'plans-canonical-plan-code-col', 'canonical_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('plans-id', 'plans-created-at-col', 'created_at', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('plans-id', 'plans-email-col', 'email', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('plans-id', 'plans-is-deleted-col', 'is_deleted', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const accountSegments = table('account-segments-id', 'mart_account_segments', [
column('account-segments-id', 'current-plan-code-col', 'current_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('account-segments-id', 'normalized-plan-code-col', 'normalized_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('account-segments-id', 'source-created-at-col', 'source_created_at', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('account-segments-id', 'billing-email-col', 'billing_email', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
column('account-segments-id', 'source-is-deleted-col', 'source_is_deleted', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const mapping = table('mapping-id', 'stg_plan_segment_mapping', [
column('mapping-id', 'mapping-canonical-plan-code-col', 'canonical_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([plans, accountSegments, mapping]), {
profiles: planCodeProfiles(),
});
const candidateKeys = candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
);
expect(candidateKeys).toEqual([
'mart_account_segments.current_plan_code->stg_plans.plan_code',
'mart_account_segments.normalized_plan_code->stg_plans.plan_code',
'stg_plan_segment_mapping.canonical_plan_code->stg_plans.plan_code',
'stg_plans.canonical_plan_code->stg_plans.plan_code',
]);
expect(candidates).toEqual(
expect.arrayContaining([
expect.objectContaining({
source: 'column_suffix_match',
confidence: expect.any(Number),
evidence: expect.objectContaining({
nameScore: 0.78,
targetKeyScore: 0.86,
reasons: expect.arrayContaining(['column_suffix_match', 'profile_unique_target']),
}),
}),
]),
);
expect(candidateKeys).not.toContain('mart_account_segments.source_created_at->stg_plans.created_at');
expect(candidateKeys).not.toContain('mart_account_segments.billing_email->stg_plans.email');
expect(candidateKeys).not.toContain('mart_account_segments.source_is_deleted->stg_plans.is_deleted');
const suffixCandidate = candidates.find(
(candidate) => candidate.from.table.name === 'mart_account_segments' && candidate.from.columns[0] === 'current_plan_code',
);
expect(suffixCandidate?.confidence).toBe(suffixCandidate?.evidence.scoreBreakdown?.score);
expect(suffixCandidate?.evidence.signalVector).toMatchObject({
nameSimilarity: 0.78,
typeCompatibility: 1,
valueOverlap: 1,
profileUniqueness: 1,
profileNullRate: 1,
});
});
it('does not suffix-match bare single-token targets or incompatible target types', () => {
const users = table('users-id', 'users', [
column('users-id', 'users-id-col', 'id', { primaryKey: false }),
column('users-id', 'users-account-id-col', 'account_id', { primaryKey: false }),
]);
const plans = table('plans-id', 'plans', [
column('plans-id', 'plans-plan-code-col', 'plan_code', {
nativeType: 'INTEGER',
normalizedType: 'integer',
dimensionType: 'number',
}),
]);
const accounts = table('accounts-id', 'accounts', [
column('accounts-id', 'current-plan-code-col', 'current_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const profiles = {
...planCodeProfiles(),
columns: {
...planCodeProfiles().columns,
'users.id': {
table: { catalog: null, db: 'public', name: 'users' },
column: 'id',
nativeType: 'INTEGER',
normalizedType: 'integer',
rowCount: 2,
nullCount: 0,
distinctCount: 2,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['1', '2'],
minTextLength: 1,
maxTextLength: 1,
},
'plans.plan_code': {
table: { catalog: null, db: 'public', name: 'plans' },
column: 'plan_code',
nativeType: 'INTEGER',
normalizedType: 'integer',
rowCount: 2,
nullCount: 0,
distinctCount: 2,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['1', '2'],
minTextLength: 1,
maxTextLength: 1,
},
},
} satisfies KloRelationshipProfileArtifact;
const candidates = generateKloRelationshipDiscoveryCandidates(schema([users, plans, accounts]), { profiles });
const candidateKeys = candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
);
expect(candidateKeys).not.toContain('users.account_id->users.id');
expect(candidateKeys).not.toContain('accounts.current_plan_code->plans.plan_code');
});
// Embedding recall path: `buyer_ref` ends in none of the recognised reference suffixes
// (`_id`, `_key`, `_code`, `_uuid`), so only the embedding matcher can link it to a target.
it('uses column embeddings as a recall source for non-standard source names', () => {
const customers = table('customers-id', 'customers', [
column('customers-id', 'customers-id-col', 'id', {
primaryKey: false,
embedding: [1, 0, 0],
}),
column('customers-id', 'customers-name-col', 'name', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
embedding: [0, 1, 0],
}),
]);
const orders = table('orders-id', 'orders', [
column('orders-id', 'orders-id-col', 'id', {
primaryKey: false,
embedding: [0, 0, 1],
}),
// Nearly parallel to the customers.id embedding and nearly orthogonal to every other one.
column('orders-id', 'buyer-ref-col', 'buyer_ref', {
primaryKey: false,
embedding: [0.995, 0.005, 0],
}),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customers, orders]), {
embeddingSimilarityThreshold: 0.95,
});
expect(candidates).toHaveLength(1);
expect(candidates[0]).toMatchObject({
source: 'embedding_similarity',
from: { tableId: 'orders-id', columnIds: ['buyer-ref-col'], columns: ['buyer_ref'] },
to: { tableId: 'customers-id', columnIds: ['customers-id-col'], columns: ['id'] },
relationshipType: 'many_to_one',
status: 'review',
evidence: {
sourceColumnBase: 'buyer_ref',
targetTableBase: 'customer',
targetColumnBase: 'id',
// 0.92 is the key score given to a non-primary-key column literally named `id`.
targetKeyScore: 0.92,
embeddingSimilarity: expect.any(Number),
},
});
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.9);
expect(candidates[0]?.evidence.reasons).toEqual(
expect.arrayContaining(['embedding_similarity', 'target_key_like']),
);
});
// With the per-column cap set to 1, `product_events.account_id` must resolve to `accounts.id`
// only; the competing `accounts_archive` table must not appear in the result.
it('singularizes names and caps candidates per source column deterministically', () => {
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
const archivedAccounts = table('archived-accounts-id', 'accounts_archive', [
column('archived-accounts-id', 'archived-accounts-id-col', 'id'),
]);
const events = table('events-id', 'product_events', [
column('events-id', 'event-id-col', 'id'),
column('events-id', 'account-id-col', 'account_id'),
]);
// The archive table is listed before `accounts` so the outcome cannot depend on schema order.
const candidates = generateKloRelationshipDiscoveryCandidates(schema([events, archivedAccounts, accounts]), {
maxCandidatesPerColumn: 1,
});
expect(
candidates.map(
(candidate) =>
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
),
).toEqual(['product_events.account_id->accounts.id']);
});
// Each of `accounts.id` and `users.id` receives exactly one incoming candidate from
// `product_events`, so each should be reported once as an inferred primary key.
it('infers target primary-key candidates from incoming review links', () => {
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
const users = table('users-id', 'users', [column('users-id', 'users-id-col', 'id')]);
const events = table('events-id', 'product_events', [
column('events-id', 'event-id-col', 'id'),
column('events-id', 'account-id-col', 'account_id'),
column('events-id', 'user-id-col', 'user_id'),
]);
const candidates = generateKloRelationshipDiscoveryCandidates(schema([accounts, users, events]));
const inferredPks = inferKloRelationshipTargetPks(candidates);
// Results are expected in table-name order.
expect(inferredPks).toEqual([
{
table: 'accounts',
columns: ['id'],
score: expect.any(Number),
status: 'review',
incomingCandidateCount: 1,
},
{
table: 'users',
columns: ['id'],
score: expect.any(Number),
status: 'review',
incomingCandidateCount: 1,
},
]);
expect(inferredPks.every((pk) => pk.score >= 0.8)).toBe(true);
});
// `invoices.id` is a declared primary key and is therefore never used as a source column;
// `invoices.account_id` is INTEGER while `accounts.id` is TEXT, so that pair is type-incompatible.
it('does not generate candidates from primary-key source columns or incompatible target types', () => {
const accounts = table('accounts-id', 'accounts', [
column('accounts-id', 'accounts-id-col', 'id', { nativeType: 'TEXT', normalizedType: 'text' }),
]);
const invoices = table('invoices-id', 'invoices', [
column('invoices-id', 'invoice-id-col', 'id', { primaryKey: true }),
column('invoices-id', 'account-id-col', 'account_id', { nativeType: 'INTEGER', normalizedType: 'integer' }),
]);
expect(generateKloRelationshipDiscoveryCandidates(schema([accounts, invoices]))).toEqual([]);
});
// Pins the output shape of `normalizeKloRelationshipName` for three representative inputs.
it('normalizes layer prefixes, punctuation, plural forms, and non-plural trailing s words', () => {
// Layer prefix `mart__` is dropped and the name is lower-cased.
expect(normalizeKloRelationshipName('mart__Sales_Accounts')).toMatchObject({
normalized: 'sales_accounts',
singular: 'sales_account',
tokens: ['sales', 'accounts'],
});
// Layer prefix `dim_` is dropped.
expect(normalizeKloRelationshipName('dim_users')).toMatchObject({
normalized: 'users',
singular: 'user',
tokens: ['users'],
});
// A word that merely ends in "s" must not be singularized.
expect(normalizeKloRelationshipName('Address')).toMatchObject({
normalized: 'address',
singular: 'address',
plural: 'addresses',
tokens: ['address'],
});
});
// Merging a deterministic candidate with an LLM proposal that shares its id should keep the
// deterministic identity and source, take the higher confidence, and carry over the LLM fields.
it('merges duplicate deterministic and LLM proposal candidates without losing LLM rationale', () => {
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
const invoices = table('invoices-id', 'invoices', [column('invoices-id', 'account-id-col', 'account_id')]);
const [deterministic] = generateKloRelationshipDiscoveryCandidates(schema([accounts, invoices]));
if (!deterministic) {
throw new Error('Expected deterministic relationship candidate');
}
// Same endpoints (hence same id) as the deterministic candidate, but attributed to the LLM.
const llmCandidate = {
...deterministic,
confidence: 0.99,
source: 'llm_proposal' as const,
evidence: {
...deterministic.evidence,
reasons: ['llm_proposal', 'llm_pk_proposal'],
llmConfidence: 0.89,
llmRationale: 'Invoices point at the owning account dimension.',
},
};
const merged = mergeKloRelationshipDiscoveryCandidates([deterministic, llmCandidate]);
expect(merged).toHaveLength(1);
expect(merged[0]).toMatchObject({
id: deterministic.id,
source: 'normalized_table_match',
confidence: 0.99,
evidence: {
llmConfidence: 0.89,
llmRationale: 'Invoices point at the owning account dimension.',
},
});
// Reasons from both candidates are expected in the merged evidence.
expect(merged[0]?.evidence.reasons).toEqual(
expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like', 'llm_proposal']),
);
});
});

View file

@ -0,0 +1,756 @@
import type {
KloEnrichedColumn,
KloEnrichedSchema,
KloEnrichedTable,
KloRelationshipEndpoint,
KloRelationshipType,
} from './enrichment-types.js';
import { localCandidateTables } from './relationship-locality.js';
import {
normalizeKloRelationshipName,
pluralizeKloRelationshipToken,
singularizeKloRelationshipToken,
} from './relationship-name-similarity.js';
export type { KloRelationshipNormalizedName } from './relationship-name-similarity.js';
export { normalizeKloRelationshipName } from './relationship-name-similarity.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import {
scoreKloRelationshipCandidate,
type KloRelationshipScoreBreakdown,
type KloRelationshipSignalVector,
} from './relationship-scoring.js';
/**
 * Label for the matcher that produced a discovery candidate.
 *
 * `sourceForEvidence` derives every value except `'llm_proposal'` from a candidate's evidence
 * reasons; `'llm_proposal'` is only read in this module, when duplicate candidates are merged.
 */
export type KloRelationshipDiscoveryCandidateSource =
| 'exact_column_match'
| 'normalized_table_match'
| 'parent_table_name_match'
| 'inflection'
| 'self_reference'
| 'profile_match'
| 'column_suffix_match'
| 'embedding_similarity'
| 'llm_proposal';
/** Status of a discovery candidate; `'review'` is the only value the type admits. */
export type KloRelationshipDiscoveryCandidateStatus = 'review';
/** Evidence recorded alongside a discovery candidate to explain how it was found and scored. */
export interface KloRelationshipDiscoveryCandidateEvidence {
/** Base name extracted from the source column (for embedding candidates: the normalized column name). */
sourceColumnBase: string;
/** Singular normalized name of the target table. */
targetTableBase: string;
/** Normalized name of the target column. */
targetColumnBase: string;
/** How key-like the target column is; 0 means "not key-like" and such targets are skipped. */
targetKeyScore: number;
/** Name-similarity score fed into the signal vector. */
nameScore: number;
/** Reason tags collected while matching; the candidate's `source` is derived from these. */
reasons: string[];
/** Signal vector passed to `scoreKloRelationshipCandidate`. */
signalVector?: KloRelationshipSignalVector;
/** Result of `scoreKloRelationshipCandidate`; its `score` becomes the candidate confidence. */
scoreBreakdown?: KloRelationshipScoreBreakdown;
/** Cosine similarity rounded to three decimals; only set for embedding candidates. */
embeddingSimilarity?: number;
/** LLM-provided confidence; not set by the generators in this module, but preserved when merging. */
llmConfidence?: number;
/** LLM-provided rationale; not set by the generators in this module, but preserved when merging. */
llmRationale?: string;
}
/** A proposed relationship between a source (`from`) endpoint and a target (`to`) endpoint. */
export interface KloRelationshipDiscoveryCandidate {
/** Identifier built from both endpoints' table and column ids; used to de-duplicate and merge. */
id: string;
/** Referencing side of the relationship. */
from: KloRelationshipEndpoint;
/** Referenced side of the relationship. */
to: KloRelationshipEndpoint;
/** Cardinality; candidates created in this module are always `'many_to_one'`. */
relationshipType: KloRelationshipType;
/** Score used for filtering against `minConfidence` and for ranking. */
confidence: number;
/** Matcher that produced the candidate. */
source: KloRelationshipDiscoveryCandidateSource;
status: KloRelationshipDiscoveryCandidateStatus;
evidence: KloRelationshipDiscoveryCandidateEvidence;
}
/** Tuning options for candidate generation; every field is optional. */
export interface KloRelationshipDiscoveryCandidateOptions {
/** Maximum name-based candidates kept per source column. Defaults to 25. */
maxCandidatesPerColumn?: number;
/** Maximum parent tables considered per source column. Defaults to 20; values <= 0 disable matching. */
maxCandidateParentTables?: number;
/** Maximum embedding candidates kept per source column. Falls back to `maxCandidatesPerColumn`, then 25. */
maxEmbeddingCandidatesPerColumn?: number;
/** Candidates scoring below this confidence are dropped. Defaults to 0.72. */
minConfidence?: number;
/** Minimum cosine similarity for an embedding candidate. Defaults to 0.92. */
embeddingSimilarityThreshold?: number;
/** Embedding matching runs unless this is explicitly `false`. */
useEmbeddings?: boolean;
/** Optional column/table profiles used for sample overlap, uniqueness and row-count signals. */
profiles?: KloRelationshipProfileArtifact;
}
/** A target column inferred to be a primary key because discovery candidates point at it. */
export interface KloRelationshipInferredTargetPk {
/** Name of the target table. */
table: string;
/** Inferred key columns; `inferKloRelationshipTargetPks` always emits exactly one. */
columns: string[];
/** Highest incoming candidate confidence, capped at 0.95 and rounded to three decimals. */
score: number;
status: 'review';
/** Number of candidates that target this table/column pair. */
incomingCandidateCount: number;
}
/** Result of recognising a source column name as a reference to another entity. */
interface KloRelationshipSourceColumnReference {
/** Entity name left after removing the reference suffix. */
base: string;
/** Reason tag describing which suffix rule matched. */
reason: string;
}
/** Key-likeness verdict for a target column. */
interface KloRelationshipTargetKeyEvidence {
/** 0 when the column does not look like a key; otherwise a positive score up to 1. */
score: number;
/** Reason tags explaining the score; empty when `score` is 0. */
reasons: string[];
}
// Type families consulted by `typesCompatible`: two columns are compatible when their
// normalized types are identical or both belong to the same family below.
const INTEGER_TYPES = new Set(['integer', 'int', 'bigint', 'smallint', 'tinyint', 'int4', 'int8', 'number']);
const STRING_TYPES = new Set(['text', 'varchar', 'character varying', 'char', 'character', 'string']);
const UUID_TYPES = new Set(['uuid', 'uniqueidentifier']);
// Normalized column names that are treated as references back to their own table.
const SELF_REFERENCE_NAMES = new Set(['parent_id', 'manager_id', 'reported_to_id', 'supervisor_id', 'reports_to_id']);
// Source-column suffixes that mark a column as a reference, with the reason tag recorded for each.
const REFERENCE_SUFFIXES: Array<{ suffix: string; reason: string }> = [
{ suffix: '_id', reason: 'foreign_key_suffix' },
{ suffix: '_key', reason: 'foreign_key_key_suffix' },
{ suffix: '_code', reason: 'foreign_key_code_suffix' },
{ suffix: '_uuid', reason: 'foreign_key_uuid_suffix' },
];
// Suffixes a target column must carry to qualify for the column-suffix matcher.
const RELATIONSHIP_KEY_TARGET_SUFFIXES = ['_id', '_key', '_code', '_uuid'] as const;
/**
 * Reports whether a target column's normalized name has at least two tokens and ends in one
 * of the key suffixes (`_id`, `_key`, `_code`, `_uuid`), e.g. `plan_code` but not `id`.
 */
function isRelationshipKeyShapedTarget(column: KloEnrichedColumn): boolean {
  const name = normalizeKloRelationshipName(column.name);
  if (name.tokens.length < 2) {
    return false;
  }
  for (const suffix of RELATIONSHIP_KEY_TARGET_SUFFIXES) {
    if (name.normalized.endsWith(suffix)) {
      return true;
    }
  }
  return false;
}
/**
 * Reports whether the source column name ends with `_<target column name>` after
 * normalization. Identical names and empty target names never match.
 */
function columnSuffixMatchesTarget(input: { fromColumn: KloEnrichedColumn; toColumn: KloEnrichedColumn }): boolean {
  const sourceName = normalizeKloRelationshipName(input.fromColumn.name).normalized;
  const targetName = normalizeKloRelationshipName(input.toColumn.name).normalized;
  if (targetName.length === 0 || sourceName === targetName) {
    return false;
  }
  return sourceName.endsWith(`_${targetName}`);
}
/**
 * Returns the column's type as a lower-cased base name with any parenthesised
 * length/precision removed, e.g. `VARCHAR(255)` -> `varchar`.
 *
 * `normalizedType` is preferred; `nativeType` is used when it is empty or missing, and an
 * empty string is returned when neither is available.
 */
function normalizeType(column: KloEnrichedColumn): string {
  const rawType = (column.normalizedType || column.nativeType || '').toLowerCase();
  const parenIndex = rawType.indexOf('(');
  const baseType = parenIndex === -1 ? rawType : rawType.slice(0, parenIndex);
  // Trim after stripping the parameters so that "varchar (255)" yields "varchar", not "varchar ".
  return baseType.trim();
}
/**
 * Reports whether two columns could hold the same values: their normalized types are
 * identical (two unknown/empty types also count), or both fall in the same integer,
 * string, or UUID family.
 */
function typesCompatible(left: KloEnrichedColumn, right: KloEnrichedColumn): boolean {
  const leftType = normalizeType(left);
  const rightType = normalizeType(right);
  if (leftType === rightType) {
    return true;
  }
  const families = [INTEGER_TYPES, STRING_TYPES, UUID_TYPES];
  return families.some((family) => family.has(leftType) && family.has(rightType));
}
/**
 * Computes the cosine similarity of two vectors.
 *
 * @param left - First vector, or `null` when no embedding is available.
 * @param right - Second vector, or `null` when no embedding is available.
 * @returns The similarity in [-1, 1], or 0 when either vector is missing, empty, of a
 *   different length, has zero magnitude, or when the result is not a finite number.
 */
function cosineSimilarity(left: readonly number[] | null, right: readonly number[] | null): number {
  if (!left || !right || left.length === 0 || left.length !== right.length) {
    return 0;
  }
  let dot = 0;
  let leftMagnitude = 0;
  let rightMagnitude = 0;
  for (let index = 0; index < left.length; index += 1) {
    const leftValue = left[index] ?? 0;
    const rightValue = right[index] ?? 0;
    dot += leftValue * rightValue;
    leftMagnitude += leftValue * leftValue;
    rightMagnitude += rightValue * rightValue;
  }
  if (leftMagnitude === 0 || rightMagnitude === 0) {
    return 0;
  }
  const similarity = dot / (Math.sqrt(leftMagnitude) * Math.sqrt(rightMagnitude));
  // NaN components or overflow would otherwise yield NaN, which passes a `< threshold` filter.
  return Number.isFinite(similarity) ? similarity : 0;
}
/** Reports whether the column carries a non-empty embedding array. */
function hasUsableEmbedding(column: KloEnrichedColumn): boolean {
  const { embedding } = column;
  if (!Array.isArray(embedding)) {
    return false;
  }
  return embedding.length > 0;
}
/**
 * Interprets a source column name as a reference to another entity.
 *
 * Known self-reference names (e.g. `parent_id`) yield their name without the trailing `_id`.
 * Otherwise the first reference suffix that leaves a base longer than one character wins and
 * the base is singularized. Returns `null` when the name does not look like a reference.
 */
function sourceColumnReference(column: KloEnrichedColumn): KloRelationshipSourceColumnReference | null {
  const name = normalizeKloRelationshipName(column.name).normalized;
  if (SELF_REFERENCE_NAMES.has(name)) {
    return { base: name.replace(/_id$/u, ''), reason: 'foreign_key_suffix' };
  }
  for (const { suffix, reason } of REFERENCE_SUFFIXES) {
    if (name.endsWith(suffix)) {
      const stem = name.slice(0, name.length - suffix.length);
      // A one-character stem is too short to name an entity; keep checking other suffixes.
      if (stem.length > 1) {
        return { base: singularizeKloRelationshipToken(stem), reason };
      }
    }
  }
  return null;
}
/**
 * Adds the normalized, singular and plural forms of `name` to `aliases`, skipping any form
 * that is empty. Mutates `aliases` in place.
 */
function addNormalizedTableAlias(aliases: Set<string>, name: string): void {
  const { normalized, singular, plural } = normalizeKloRelationshipName(name);
  for (const alias of [normalized, singular, plural]) {
    if (alias.length > 0) {
      aliases.add(alias);
    }
  }
}
/**
 * Builds the set of names under which a table may be referenced: its normalized, singular
 * and plural forms, plus, for multi-token names, the last token in as-is, singular and
 * plural form.
 */
function tableAliases(table: KloEnrichedTable): Set<string> {
  const name = normalizeKloRelationshipName(table.ref.name);
  const aliases = new Set<string>([name.normalized, name.singular, name.plural]);
  const lastToken = name.tokens.length > 1 ? name.tokens[name.tokens.length - 1] : undefined;
  if (lastToken) {
    const singularLastToken = singularizeKloRelationshipToken(lastToken);
    aliases.add(lastToken);
    aliases.add(singularLastToken);
    aliases.add(pluralizeKloRelationshipToken(singularLastToken));
  }
  return aliases;
}
/**
 * Returns the last run of letters/digits in the raw table name (splitting on every other
 * character), or the raw name itself when it contains no letters or digits.
 */
function finalTableNamePart(table: KloEnrichedTable): string {
  const rawName = table.ref.name;
  const segments = rawName.split(/[^\p{L}\p{N}]+/u).filter((segment) => segment.length > 0);
  return segments.pop() ?? rawName;
}
/**
 * Returns the table's aliases extended with the normalized forms of the last
 * letter/digit segment of its raw name.
 */
function parentTableNameAliases(table: KloEnrichedTable): Set<string> {
  const combined = tableAliases(table);
  const lastPart = finalTableNamePart(table);
  addNormalizedTableAlias(combined, lastPart);
  return combined;
}
/**
 * Scores how key-like a target column is from its metadata and name alone.
 *
 * Declared primary key: 1. Named `id`: 0.92. Named `<table alias>_id`: 0.9.
 * Named `<table alias>_key`: 0.82. Named `key` or `uuid`: 0.74. Anything else: 0.
 */
function targetKeyScore(table: KloEnrichedTable, column: KloEnrichedColumn): number {
  const columnName = normalizeKloRelationshipName(column.name).normalized;
  const aliases = parentTableNameAliases(table);
  const isAliasWithSuffix = (suffix: string): boolean => {
    for (const alias of aliases) {
      if (columnName === `${alias}${suffix}`) {
        return true;
      }
    }
    return false;
  };
  if (column.primaryKey) {
    return 1;
  }
  if (columnName === 'id') {
    return 0.92;
  }
  if (isAliasWithSuffix('_id')) {
    return 0.9;
  }
  if (isAliasWithSuffix('_key')) {
    return 0.82;
  }
  return columnName === 'key' || columnName === 'uuid' ? 0.74 : 0;
}
/**
 * Looks up a column profile under the key `<tableName>.<columnName>`.
 * Returns `null` when no profiles were supplied or the key is absent.
 */
function profileColumn(
  profiles: KloRelationshipProfileArtifact | undefined,
  tableName: string,
  columnName: string,
) {
  if (!profiles) {
    return null;
  }
  const key = `${tableName}.${columnName}`;
  return profiles.columns[key] ?? null;
}
/**
 * Returns the fraction of the source column's sample values that also occur, compared
 * case-insensitively, among the target column's sample values. Returns 0 when either
 * profile is missing or has no samples.
 */
function profileSampleOverlap(input: {
  profiles: KloRelationshipProfileArtifact | undefined;
  fromTable: KloEnrichedTable;
  fromColumn: KloEnrichedColumn;
  toTable: KloEnrichedTable;
  toColumn: KloEnrichedColumn;
}): number {
  const sourceProfile = profileColumn(input.profiles, input.fromTable.ref.name, input.fromColumn.name);
  const targetProfile = profileColumn(input.profiles, input.toTable.ref.name, input.toColumn.name);
  if (!sourceProfile || !targetProfile) {
    return 0;
  }
  const sourceSamples = sourceProfile.sampleValues;
  const targetSamples = targetProfile.sampleValues;
  if (sourceSamples.length === 0 || targetSamples.length === 0) {
    return 0;
  }
  const targetLookup = new Set<string>();
  for (const sample of targetSamples) {
    targetLookup.add(sample.toLowerCase());
  }
  let matched = 0;
  for (const sample of sourceSamples) {
    if (targetLookup.has(sample.toLowerCase())) {
      matched += 1;
    }
  }
  return matched / sourceSamples.length;
}
/**
 * Returns the profiled row count of the first table profile whose table name equals
 * `tableName`, or `null` when profiles, the table, or its row count are unavailable.
 */
function tableProfileRowCount(profiles: KloRelationshipProfileArtifact | undefined, tableName: string): number | null {
  if (!profiles) {
    return null;
  }
  const match = profiles.tables.find((entry) => entry.table.name === tableName);
  return match?.rowCount ?? null;
}
/**
 * Structural prior for a table pair.
 *
 * Same table: 0.72. Row counts unknown or non-positive: 0.5. Target-to-source row ratio
 * within [0.05, 20]: 0.7. Any other ratio: 0.4.
 */
function structuralPriorScore(input: {
  profiles: KloRelationshipProfileArtifact | undefined;
  fromTable: KloEnrichedTable;
  toTable: KloEnrichedTable;
}): number {
  const { profiles, fromTable, toTable } = input;
  if (fromTable.id === toTable.id) {
    return 0.72;
  }
  const sourceRows = tableProfileRowCount(profiles, fromTable.ref.name);
  const targetRows = tableProfileRowCount(profiles, toTable.ref.name);
  if (sourceRows === null || targetRows === null) {
    return 0.5;
  }
  if (sourceRows <= 0 || targetRows <= 0) {
    return 0.5;
  }
  const ratio = targetRows / sourceRows;
  return ratio >= 0.05 && ratio <= 20 ? 0.7 : 0.4;
}
/**
 * Assembles the signal vector that is scored for a candidate.
 *
 * `profileUniqueness` is the target profile's uniqueness ratio, falling back to the target
 * key score when no profile exists. `profileNullRate` carries the source column's non-null
 * share (1 - null rate), or 0.5 when the source has no profile.
 */
function candidateSignalVector(input: {
  profiles: KloRelationshipProfileArtifact | undefined;
  fromTable: KloEnrichedTable;
  fromColumn: KloEnrichedColumn;
  toTable: KloEnrichedTable;
  toColumn: KloEnrichedColumn;
  targetKeyScore: number;
  nameScore: number;
  valueOverlap: number;
  embeddingSimilarity?: number;
}): KloRelationshipSignalVector {
  const { profiles, fromTable, fromColumn, toTable, toColumn } = input;
  const sourceProfile = profileColumn(profiles, fromTable.ref.name, fromColumn.name);
  const targetProfile = profileColumn(profiles, toTable.ref.name, toColumn.name);
  const typeCompatibility = typesCompatible(fromColumn, toColumn) ? 1 : 0;
  const structuralPrior = structuralPriorScore({ profiles, fromTable, toTable });
  return {
    nameSimilarity: input.nameScore,
    typeCompatibility,
    valueOverlap: input.valueOverlap,
    embeddingSimilarity: input.embeddingSimilarity ?? 0,
    profileUniqueness: targetProfile?.uniquenessRatio ?? input.targetKeyScore,
    profileNullRate: sourceProfile ? 1 - sourceProfile.nullRate : 0.5,
    structuralPrior,
  };
}
/**
 * Selects the parent tables to consider for a source column.
 *
 * Delegates ranking to `localCandidateTables`, limited to `maxCandidateParentTables`
 * (default 20; a limit <= 0 yields no tables). When the column is a known self-reference
 * name and its own table was not ranked, that table is placed first and the ranked list is
 * shortened so the total stays within the limit.
 */
function candidateParentTables(input: {
  tables: readonly KloEnrichedTable[];
  fromTable: KloEnrichedTable;
  fromColumn: KloEnrichedColumn;
  options: KloRelationshipDiscoveryCandidateOptions;
}): KloEnrichedTable[] {
  const { tables, fromTable, fromColumn, options } = input;
  const limit = options.maxCandidateParentTables ?? 20;
  if (limit <= 0) {
    return [];
  }
  const rankedTables = localCandidateTables({
    childTable: fromTable,
    childColumn: fromColumn,
    parentTables: tables,
    maxParentTables: limit,
  }).map((entry) => entry.table);
  const columnName = normalizeKloRelationshipName(fromColumn.name).normalized;
  const isSelfReferenceColumn = SELF_REFERENCE_NAMES.has(columnName);
  const ownTableRanked = rankedTables.some((table) => table.id === fromTable.id);
  if (isSelfReferenceColumn && !ownTableRanked) {
    const others = rankedTables
      .filter((table) => table.id !== fromTable.id)
      .slice(0, Math.max(0, limit - 1));
    return [fromTable, ...others];
  }
  return rankedTables;
}
/**
 * Decides whether a target column is key-like and explains why.
 *
 * A positive name/metadata score from `targetKeyScore` wins with reason `target_key_like`.
 * Otherwise a profile with uniqueness ratio of at least 0.98 and null rate of at most 0.05
 * qualifies the column with reason `profile_unique_target`: 0.86 when the name is or ends in
 * `code`/`key`, 0.78 otherwise. Everything else scores 0 with no reasons.
 */
function targetKeyEvidence(
  table: KloEnrichedTable,
  column: KloEnrichedColumn,
  profiles: KloRelationshipProfileArtifact | undefined,
): KloRelationshipTargetKeyEvidence {
  const nameBasedScore = targetKeyScore(table, column);
  if (nameBasedScore > 0) {
    return { score: nameBasedScore, reasons: ['target_key_like'] };
  }
  const profile = profileColumn(profiles, table.ref.name, column.name);
  if (!profile) {
    return { score: 0, reasons: [] };
  }
  if (profile.uniquenessRatio < 0.98 || profile.nullRate > 0.05) {
    return { score: 0, reasons: [] };
  }
  const columnName = normalizeKloRelationshipName(column.name).normalized;
  const keyLikeName = ['code', 'key'].some((word) => columnName === word || columnName.endsWith(`_${word}`));
  return { score: keyLikeName ? 0.86 : 0.78, reasons: ['profile_unique_target'] };
}
/** Builds a single-column relationship endpoint for the given table and column. */
function endpoint(table: KloEnrichedTable, column: KloEnrichedColumn): KloRelationshipEndpoint {
  const { id: tableId, ref } = table;
  const { id: columnId, name: columnName } = column;
  return { tableId, columnIds: [columnId], table: ref, columns: [columnName] };
}
/**
 * Builds the candidate id `<fromTableId>:(<columnIds>)-><toTableId>:(<columnIds>)`, with
 * column ids comma-joined in order.
 */
function relationshipId(from: KloRelationshipEndpoint, to: KloRelationshipEndpoint): string {
  const side = (value: KloRelationshipEndpoint): string => `${value.tableId}:(${value.columnIds.join(',')})`;
  return `${side(from)}->${side(to)}`;
}
/**
 * Reports whether two endpoints list the same column ids and the same column names, in the
 * same order.
 */
function endpointsHaveSameOrderedColumns(left: KloRelationshipEndpoint, right: KloRelationshipEndpoint): boolean {
  if (left.columnIds.length !== right.columnIds.length) {
    return false;
  }
  if (left.columns.length !== right.columns.length) {
    return false;
  }
  for (let index = 0; index < left.columnIds.length; index += 1) {
    const sameId = left.columnIds[index] === right.columnIds[index];
    const sameName = left.columns[index] === right.columns[index];
    if (!sameId || !sameName) {
      return false;
    }
  }
  return true;
}
/** Reports whether a candidate links a table's columns to exactly the same columns of the same table. */
function isDegenerateSameColumnSelfLink(candidate: Pick<KloRelationshipDiscoveryCandidate, 'from' | 'to'>): boolean {
  const { from, to } = candidate;
  if (from.tableId !== to.tableId) {
    return false;
  }
  return endpointsHaveSameOrderedColumns(from, to);
}
/**
 * Returns the first column name of an endpoint.
 *
 * @throws Error when the endpoint has no first column (or it is an empty string).
 */
function singleRelationshipColumn(endpointValue: KloRelationshipEndpoint): string {
  const [firstColumn] = endpointValue.columns;
  if (firstColumn) {
    return firstColumn;
  }
  throw new Error(`Expected relationship endpoint ${endpointValue.table.name} to contain one column`);
}
/**
 * Builds the tie-break sort key `<fromTable>.<fromColumn>-><toTable>.<toColumn>` from the
 * first column of each endpoint.
 */
function candidateSortKey(candidate: KloRelationshipDiscoveryCandidate): string {
  const { from, to } = candidate;
  const fromPart = `${from.table.name}.${singleRelationshipColumn(from)}`;
  const toPart = `${to.table.name}.${singleRelationshipColumn(to)}`;
  return `${fromPart}->${toPart}`;
}
/** Returns the distinct non-blank reasons in first-seen order. */
function uniqueReasons(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value.trim().length > 0) {
      seen.add(value);
    }
  }
  return Array.from(seen);
}
/**
 * Merges two candidates that share an id.
 *
 * `left` provides the base fields unless it is an LLM proposal and `right` has strictly
 * higher confidence, in which case `right` does. The result takes the higher confidence,
 * the base's LLM confidence/rationale (falling back to the other candidate's), and the
 * de-duplicated union of both reason lists with the base's reasons first.
 */
function mergeCandidateEvidence(
  left: KloRelationshipDiscoveryCandidate,
  right: KloRelationshipDiscoveryCandidate,
): KloRelationshipDiscoveryCandidate {
  const rightReplacesLlmLeft = left.source === 'llm_proposal' && right.confidence > left.confidence;
  const preferred = rightReplacesLlmLeft ? right : left;
  const supplement = rightReplacesLlmLeft ? left : right;
  const base = preferred.evidence;
  const extra = supplement.evidence;
  const evidence = {
    ...base,
    llmConfidence: base.llmConfidence ?? extra.llmConfidence,
    llmRationale: base.llmRationale ?? extra.llmRationale,
    reasons: uniqueReasons(base.reasons.concat(extra.reasons)),
  };
  return {
    ...preferred,
    confidence: Math.max(left.confidence, right.confidence),
    evidence,
  };
}
/**
 * Reason tags in priority order, each paired with the candidate source it selects.
 * The first entry with any tag present in a candidate's reasons decides the source.
 */
const SOURCE_BY_REASON_PRIORITY: ReadonlyArray<{
  tags: readonly string[];
  source: KloRelationshipDiscoveryCandidateSource;
}> = [
  { tags: ['self_reference'], source: 'self_reference' },
  { tags: ['embedding_similarity'], source: 'embedding_similarity' },
  { tags: ['column_suffix_match'], source: 'column_suffix_match' },
  { tags: ['parent_table_name_match'], source: 'parent_table_name_match' },
  { tags: ['profile_sample_overlap', 'profile_unique_target'], source: 'profile_match' },
  { tags: ['normalized_table_name'], source: 'normalized_table_match' },
  { tags: ['exact_column_name'], source: 'exact_column_match' },
  { tags: ['inflection'], source: 'inflection' },
];

/**
 * Derives a candidate's source label from its evidence reasons, using a fixed priority
 * order. Falls back to `'normalized_table_match'` when no known reason is present.
 */
function sourceForEvidence(reasons: string[]): KloRelationshipDiscoveryCandidateSource {
  for (const entry of SOURCE_BY_REASON_PRIORITY) {
    if (entry.tags.some((tag) => reasons.includes(tag))) {
      return entry.source;
    }
  }
  return 'normalized_table_match';
}
/**
 * Builds a `many_to_one`, review-status candidate between two columns.
 *
 * The signal vector is scored with `scoreKloRelationshipCandidate`; the resulting score is
 * the candidate's confidence. The source label is derived from `reasons`, and the embedding
 * similarity is recorded (rounded to three decimals) only when one was supplied.
 */
function createCandidate(input: {
  fromTable: KloEnrichedTable;
  fromColumn: KloEnrichedColumn;
  toTable: KloEnrichedTable;
  toColumn: KloEnrichedColumn;
  sourceBase: string;
  targetBase: string;
  targetKeyScore: number;
  nameScore: number;
  reasons: string[];
  profiles: KloRelationshipProfileArtifact | undefined;
  valueOverlap: number;
  embeddingSimilarity?: number;
}): KloRelationshipDiscoveryCandidate {
  const { fromTable, fromColumn, toTable, toColumn, profiles, reasons } = input;
  const from = endpoint(fromTable, fromColumn);
  const to = endpoint(toTable, toColumn);
  const signalVector = candidateSignalVector({
    profiles,
    fromTable,
    fromColumn,
    toTable,
    toColumn,
    targetKeyScore: input.targetKeyScore,
    nameScore: input.nameScore,
    valueOverlap: input.valueOverlap,
    embeddingSimilarity: input.embeddingSimilarity,
  });
  const scoreBreakdown = scoreKloRelationshipCandidate(signalVector);
  const evidence: KloRelationshipDiscoveryCandidateEvidence = {
    sourceColumnBase: input.sourceBase,
    targetTableBase: input.targetBase,
    targetColumnBase: normalizeKloRelationshipName(toColumn.name).normalized,
    targetKeyScore: input.targetKeyScore,
    nameScore: input.nameScore,
    reasons,
    signalVector,
    scoreBreakdown,
  };
  if (input.embeddingSimilarity !== undefined) {
    evidence.embeddingSimilarity = Number(input.embeddingSimilarity.toFixed(3));
  }
  return {
    id: relationshipId(from, to),
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: scoreBreakdown.score,
    source: sourceForEvidence(reasons),
    status: 'review',
    evidence,
  };
}
/**
 * Generates candidates by comparing column embeddings across enabled tables.
 *
 * A non-primary-key source column with an embedding is paired with every type-compatible,
 * key-like column (with an embedding) of each candidate parent table other than its own,
 * provided their cosine similarity reaches `embeddingSimilarityThreshold` (default 0.92).
 * Candidates below `minConfidence` (default 0.72) are dropped; the rest are ranked per source
 * column by confidence, then sort key, and cut to `maxEmbeddingCandidatesPerColumn`
 * (falling back to `maxCandidatesPerColumn`, then 25).
 *
 * @returns An empty array when `options.useEmbeddings` is `false`.
 */
function generateKloEmbeddingRelationshipCandidates(
  schema: KloEnrichedSchema,
  options: KloRelationshipDiscoveryCandidateOptions,
): KloRelationshipDiscoveryCandidate[] {
  if (options.useEmbeddings === false) {
    return [];
  }
  const threshold = options.embeddingSimilarityThreshold ?? 0.92;
  const minConfidence = options.minConfidence ?? 0.72;
  const maxCandidatesPerColumn = options.maxEmbeddingCandidatesPerColumn ?? options.maxCandidatesPerColumn ?? 25;
  const tables = schema.tables.filter((table) => table.enabled);
  const candidates: KloRelationshipDiscoveryCandidate[] = [];
  for (const fromTable of tables) {
    for (const fromColumn of fromTable.columns) {
      if (fromColumn.primaryKey || !hasUsableEmbedding(fromColumn)) {
        continue;
      }
      // Depends only on the source column, so it is normalized once rather than per target column.
      const sourceBase = normalizeKloRelationshipName(fromColumn.name).normalized;
      const columnCandidates: KloRelationshipDiscoveryCandidate[] = [];
      for (const toTable of candidateParentTables({ tables, fromTable, fromColumn, options })) {
        if (fromTable.id === toTable.id) {
          continue;
        }
        // Depends only on the target table, so it is normalized once per table.
        const targetBase = normalizeKloRelationshipName(toTable.ref.name).singular;
        for (const toColumn of toTable.columns) {
          if (!hasUsableEmbedding(toColumn) || !typesCompatible(fromColumn, toColumn)) {
            continue;
          }
          const keyEvidence = targetKeyEvidence(toTable, toColumn, options.profiles);
          if (keyEvidence.score === 0) {
            continue;
          }
          const similarity = cosineSimilarity(fromColumn.embedding, toColumn.embedding);
          if (similarity < threshold) {
            continue;
          }
          const reasons = ['embedding_similarity', ...keyEvidence.reasons];
          const candidate = createCandidate({
            fromTable,
            fromColumn,
            toTable,
            toColumn,
            sourceBase,
            targetBase,
            targetKeyScore: keyEvidence.score,
            // The embedding similarity doubles as the name score for this matcher.
            nameScore: similarity,
            reasons,
            profiles: options.profiles,
            valueOverlap: profileSampleOverlap({
              profiles: options.profiles,
              fromTable,
              fromColumn,
              toTable,
              toColumn,
            }),
            embeddingSimilarity: similarity,
          });
          if (candidate.confidence >= minConfidence && !isDegenerateSameColumnSelfLink(candidate)) {
            columnCandidates.push(candidate);
          }
        }
      }
      columnCandidates.sort(
        (left, right) => right.confidence - left.confidence || candidateSortKey(left).localeCompare(candidateSortKey(right)),
      );
      candidates.push(...columnCandidates.slice(0, maxCandidatesPerColumn));
    }
  }
  return candidates;
}
/**
 * Generates review-status relationship candidates for the enabled tables of `schema`.
 *
 * For every non-primary-key column whose name looks like a reference, each candidate parent
 * table is tested with three matchers:
 * - strict: the reference base is one of the table's aliases (other tables only), or the
 *   column is a known self-reference name on its own table;
 * - parent table name: the base matches an alias derived from the last part of the table name;
 * - column suffix: the source name ends in `_<target column>` and the target is key-shaped.
 * Target columns must be key-like and type-compatible. Profile-only key targets reached via
 * a table-level matcher are dropped when no sample values overlap. Candidates below
 * `minConfidence` (default 0.72) are discarded, the rest are ranked per source column and cut
 * to `maxCandidatesPerColumn` (default 25).
 *
 * Embedding candidates are then appended, duplicates by id are resolved in favour of the
 * higher confidence, and the result is sorted by confidence (descending), then sort key.
 *
 * @param schema - Enriched schema; tables with `enabled` false are ignored.
 * @param options - Optional tuning; see `KloRelationshipDiscoveryCandidateOptions`.
 */
export function generateKloRelationshipDiscoveryCandidates(
  schema: KloEnrichedSchema,
  options: KloRelationshipDiscoveryCandidateOptions = {},
): KloRelationshipDiscoveryCandidate[] {
  const maxCandidatesPerColumn = options.maxCandidatesPerColumn ?? 25;
  const minConfidence = options.minConfidence ?? 0.72;
  const tables = schema.tables.filter((table) => table.enabled);
  const candidates: KloRelationshipDiscoveryCandidate[] = [];
  for (const fromTable of tables) {
    for (const fromColumn of fromTable.columns) {
      if (fromColumn.primaryKey) {
        continue;
      }
      const sourceReference = sourceColumnReference(fromColumn);
      if (!sourceReference) {
        continue;
      }
      const sourceBase = sourceReference.base;
      // Both values depend only on the source column, so they are computed once per column.
      const normalizedFromColumn = normalizeKloRelationshipName(fromColumn.name).normalized;
      const selfReferenceColumn = SELF_REFERENCE_NAMES.has(normalizedFromColumn);
      const columnCandidates: KloRelationshipDiscoveryCandidate[] = [];
      for (const toTable of candidateParentTables({ tables, fromTable, fromColumn, options })) {
        const strictAliases = tableAliases(toTable);
        const targetBase = normalizeKloRelationshipName(toTable.ref.name).singular;
        const sameTable = fromTable.id === toTable.id;
        const nameMatchesTarget = strictAliases.has(sourceBase);
        // The wider parent alias set is only built when the strict aliases did not match.
        const parentTableNameMatcher =
          !sameTable && !nameMatchesTarget && parentTableNameAliases(toTable).has(sourceBase);
        const selfReference = sameTable && selfReferenceColumn;
        const strictTableMatcher = (!sameTable && nameMatchesTarget) || selfReference;
        const tableLevelMatch = strictTableMatcher || parentTableNameMatcher;
        for (const toColumn of toTable.columns) {
          const keyEvidence = targetKeyEvidence(toTable, toColumn, options.profiles);
          if (keyEvidence.score === 0 || !typesCompatible(fromColumn, toColumn)) {
            continue;
          }
          // The suffix matcher is a fallback: it only applies when no table-level matcher fired.
          const suffixMatcher =
            !tableLevelMatch &&
            columnSuffixMatchesTarget({ fromColumn, toColumn }) &&
            isRelationshipKeyShapedTarget(toColumn);
          if (!tableLevelMatch && !suffixMatcher) {
            continue;
          }
          const overlap = profileSampleOverlap({
            profiles: options.profiles,
            fromTable,
            fromColumn,
            toTable,
            toColumn,
          });
          // A target that is key-like only by profile uniqueness needs sample overlap as support.
          if (tableLevelMatch && keyEvidence.reasons.includes('profile_unique_target') && overlap === 0) {
            continue;
          }
          const reasons = suffixMatcher
            ? ['column_suffix_match', ...keyEvidence.reasons]
            : [sourceReference.reason, ...keyEvidence.reasons];
          if (overlap > 0) {
            reasons.push('profile_sample_overlap');
          }
          let nameScore = suffixMatcher ? 0.78 : 0.88;
          if (parentTableNameMatcher) {
            reasons.push('parent_table_name_match');
            nameScore = 0.82;
          } else if (selfReference) {
            reasons.push('self_reference');
            nameScore = 0.82;
          } else if (!suffixMatcher && targetBase === sourceBase) {
            reasons.push('normalized_table_name');
            nameScore = 0.92;
          } else if (!suffixMatcher && nameMatchesTarget) {
            reasons.push('inflection');
            nameScore = 0.88;
          }
          if (
            !suffixMatcher &&
            !parentTableNameMatcher &&
            normalizedFromColumn === normalizeKloRelationshipName(toColumn.name).normalized
          ) {
            reasons.push('exact_column_name');
            nameScore = Math.max(nameScore, 0.9);
          }
          const candidate = createCandidate({
            fromTable,
            fromColumn,
            toTable,
            toColumn,
            sourceBase,
            targetBase,
            targetKeyScore: keyEvidence.score,
            nameScore,
            reasons,
            profiles: options.profiles,
            valueOverlap: overlap,
          });
          if (candidate.confidence >= minConfidence && !isDegenerateSameColumnSelfLink(candidate)) {
            columnCandidates.push(candidate);
          }
        }
      }
      columnCandidates.sort(
        (left, right) => right.confidence - left.confidence || candidateSortKey(left).localeCompare(candidateSortKey(right)),
      );
      candidates.push(...columnCandidates.slice(0, maxCandidatesPerColumn));
    }
  }
  candidates.push(...generateKloEmbeddingRelationshipCandidates(schema, options));
  // Name-based and embedding matchers can propose the same link; keep the more confident one.
  const byId = new Map<string, KloRelationshipDiscoveryCandidate>();
  for (const candidate of candidates) {
    const existing = byId.get(candidate.id);
    if (!existing || candidate.confidence > existing.confidence) {
      byId.set(candidate.id, candidate);
    }
  }
  return Array.from(byId.values()).sort(
    (left, right) => right.confidence - left.confidence || candidateSortKey(left).localeCompare(candidateSortKey(right)),
  );
}
/**
 * Collapses candidates that share an id into one, merging their evidence in input order,
 * and returns the result ordered by sort key (`<fromTable>.<column>-><toTable>.<column>`).
 */
export function mergeKloRelationshipDiscoveryCandidates(
  candidates: readonly KloRelationshipDiscoveryCandidate[],
): KloRelationshipDiscoveryCandidate[] {
  const mergedById = new Map<string, KloRelationshipDiscoveryCandidate>();
  for (const candidate of candidates) {
    const previous = mergedById.get(candidate.id);
    if (previous) {
      mergedById.set(candidate.id, mergeCandidateEvidence(previous, candidate));
    } else {
      mergedById.set(candidate.id, candidate);
    }
  }
  const merged = Array.from(mergedById.values());
  merged.sort((left, right) => candidateSortKey(left).localeCompare(candidateSortKey(right)));
  return merged;
}
/**
 * Infers likely primary keys from the targets of relationship candidates.
 *
 * Candidates are grouped by target table name and first target column. Each group yields
 * one entry whose score is the highest incoming confidence, capped at 0.95 and rounded to
 * three decimals, together with the number of incoming candidates. Entries are ordered by
 * table name, then column name.
 *
 * @throws Error when a candidate's target endpoint has no column.
 */
export function inferKloRelationshipTargetPks(
  candidates: readonly KloRelationshipDiscoveryCandidate[],
): KloRelationshipInferredTargetPk[] {
  // A running maximum and count are kept per target, so no per-target score array has to be
  // spread into `Math.max`, which can exceed the engine's argument limit for large inputs.
  const incoming = new Map<string, { table: string; column: string; maxScore: number; count: number }>();
  for (const candidate of candidates) {
    const toColumn = singleRelationshipColumn(candidate.to);
    const tableName = candidate.to.table.name;
    const key = `${tableName}.${toColumn}`;
    const item = incoming.get(key);
    if (item) {
      item.maxScore = Math.max(item.maxScore, candidate.confidence);
      item.count += 1;
    } else {
      incoming.set(key, { table: tableName, column: toColumn, maxScore: candidate.confidence, count: 1 });
    }
  }
  return Array.from(incoming.values())
    .sort((left, right) => left.table.localeCompare(right.table) || left.column.localeCompare(right.column))
    .map((item) => ({
      table: item.table,
      columns: [item.column],
      score: Number(Math.min(0.95, item.maxScore).toFixed(3)),
      status: 'review' as const,
      incomingCandidateCount: item.count,
    }));
}

View file

@ -0,0 +1,84 @@
import Database from 'better-sqlite3';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { describe, expect, it } from 'vitest';
import { snapshotToKloEnrichedSchema } from './local-enrichment.js';
import { loadKloRelationshipBenchmarkFixture, maskKloRelationshipBenchmarkSnapshot } from './relationship-benchmarks.js';
import { discoverKloCompositeRelationships } from './relationship-composite-candidates.js';
import { profileKloRelationshipSchema, type KloRelationshipReadOnlyExecutor } from './relationship-profiling.js';
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanContext } from './types.js';
/**
 * Test-only read-only executor backed by a SQLite file opened with better-sqlite3.
 * The database is opened read-only and must already exist.
 */
class TestSqliteExecutor implements KloRelationshipReadOnlyExecutor {
  private readonly db: Database.Database;

  constructor(dataPath: string) {
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /**
   * Runs `input.sql` and returns the rows as positional arrays. Headers are taken from the
   * keys of the first row, so an empty result has no headers.
   */
  async executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const firstRecord = records[0];
    const headers = Object.keys(firstRecord ?? {});
    const rows = records.map((record) => headers.map((header) => record[header]));
    return {
      headers,
      rows,
      totalRows: records.length,
      rowCount: records.length,
    };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
// End-to-end check of composite key discovery against a SQLite benchmark fixture whose
// declared primary and foreign keys have been masked out of the snapshot.
describe('composite relationship discovery detector', () => {
  it('infers composite primary keys and validates composite foreign keys from row evidence', async () => {
    // fileURLToPath decodes percent-escapes and yields a valid path on Windows, unlike `URL.pathname`.
    const fixtureRoot = fileURLToPath(new URL('../../test/fixtures/relationship-benchmarks/', import.meta.url));
    const fixture = await loadKloRelationshipBenchmarkFixture(
      join(fixtureRoot, 'composite_keys_no_declared_constraints'),
    );
    const snapshot = maskKloRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
    const schema = snapshotToKloEnrichedSchema(snapshot, new Map());
    const executor = new TestSqliteExecutor(fixture.dataPath ?? '');
    // The handle is released in `finally` so a failing step or assertion cannot leak it.
    try {
      const profiles = await profileKloRelationshipSchema({
        connectionId: snapshot.connectionId,
        driver: snapshot.driver,
        schema,
        executor,
        ctx: { runId: 'test:composite-profile' },
      });
      const result = await discoverKloCompositeRelationships({
        connectionId: snapshot.connectionId,
        driver: snapshot.driver,
        schema,
        profiles,
        executor,
        ctx: { runId: 'test:composite-detect' },
      });
      expect(result.primaryKeys.map((item) => `${item.table.name}.(${item.columns.join(',')})`)).toEqual([
        'order_line_allocations.(order_id,line_number,warehouse_code)',
        'order_lines.(order_id,line_number)',
      ]);
      expect(
        result.relationships.map(
          (item) =>
            `${item.from.table.name}.(${item.from.columns.join(',')})->${item.to.table.name}.(${item.to.columns.join(',')})`,
        ),
      ).toEqual(['order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)']);
      expect(result.relationships[0]).toMatchObject({
        relationshipType: 'many_to_one',
        status: 'accepted',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          reasons: ['composite_validation_passed'],
        },
      });
      expect(result.queryCount).toBeGreaterThan(0);
    } finally {
      executor.close();
    }
  });
});

View file

@ -0,0 +1,622 @@
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable, KloRelationshipType } from './enrichment-types.js';
import {
formatKloRelationshipTableRef,
quoteKloRelationshipIdentifier,
type KloRelationshipProfileArtifact,
type KloRelationshipReadOnlyExecutor,
} from './relationship-profiling.js';
import type { KloConnectionDriver, KloQueryResult, KloScanContext, KloTableRef } from './types.js';
/** Verdict attached to a composite key or composite relationship candidate. */
export type KloCompositeRelationshipStatus = 'accepted' | 'review' | 'rejected';
/** One side of a composite relationship: a table plus the column tuple that takes part. */
export interface KloCompositeRelationshipTupleEndpoint {
tableId: string;
// `columnIds` and `columns` are produced from the same column list, so they line up by position.
columnIds: string[];
table: KloTableRef;
columns: string[];
}
/** A column tuple that discovery measured as (nearly) unique within a single table. */
export interface KloCompositePrimaryKeyCandidate {
// Formatted as `table.(col1,col2,...)`.
id: string;
tableId: string;
table: KloTableRef;
columns: string[];
columnIds: string[];
// Heuristic score built from the uniqueness ratio and null rate; capped at 0.99.
score: number;
status: KloCompositeRelationshipStatus;
evidence: {
// Row count of the table as recorded in the relationship profile.
rowCount: number;
// Distinct tuples counted by SQL over rows where every tuple column is non-null.
distinctCount: number;
// distinctCount / rowCount.
uniquenessRatio: number;
// Highest profiled null rate among the tuple's columns.
nullRate: number;
reasons: string[];
};
}
/** Measurements that back the verdict of a composite relationship candidate. */
export interface KloCompositeRelationshipValidationEvidence {
// Uniqueness ratio copied from the referenced key candidate.
targetUniqueness: number;
// overlap / childDistinct (0 when there are no child tuples).
sourceCoverage: number;
// Distinct child tuples that found no matching parent tuple.
violationCount: number;
// violationCount / childDistinct (1 when there are no child tuples).
violationRatio: number;
// Distinct non-null tuples on the child side; the coverage query limits how many are checked.
childDistinct: number;
// Distinct non-null tuples on the parent side.
parentDistinct: number;
// Distinct child tuples that matched a parent tuple.
overlap: number;
reasons: string[];
}
/** A validated tuple-to-tuple relationship pointing at a composite key candidate. */
export interface KloCompositeRelationshipCandidate {
// Formatted as `<from tuple key>-><to tuple key>`.
id: string;
from: KloCompositeRelationshipTupleEndpoint;
to: KloCompositeRelationshipTupleEndpoint;
relationshipType: KloRelationshipType;
confidence: number;
status: KloCompositeRelationshipStatus;
source: 'composite_profile_match';
validation: KloCompositeRelationshipValidationEvidence;
}
/** Arguments accepted by `discoverKloCompositeRelationships`. */
export interface DiscoverKloCompositeRelationshipsInput {
connectionId: string;
driver: KloConnectionDriver;
schema: KloEnrichedSchema;
profiles: KloRelationshipProfileArtifact;
// When null (or when `profiles.sqlAvailable` is falsy) discovery returns empty results plus a warning.
executor: KloRelationshipReadOnlyExecutor | null;
ctx: KloScanContext;
// The optional settings below fall back to the module-level DEFAULT_* constants when omitted.
maxCompositeWidth?: number;
maxColumnsPerTable?: number;
minPrimaryKeyUniqueness?: number;
minSourceCoverage?: number;
maxViolationRatio?: number;
}
/** Outcome of composite discovery: key candidates, relationships, and bookkeeping. */
export interface DiscoverKloCompositeRelationshipsResult {
primaryKeys: KloCompositePrimaryKeyCandidate[];
// Only candidates whose status is not 'rejected' are included.
relationships: KloCompositeRelationshipCandidate[];
// Number of SQL statements issued through the executor.
queryCount: number;
warnings: string[];
}
// Name tokens that mark a column (or a part of a table name) as key-like.
const KEY_NAME_PARTS = new Set(['id', 'key', 'code', 'number', 'num', 'line', 'warehouse', 'account', 'order']);
// Fallbacks for the optional settings on DiscoverKloCompositeRelationshipsInput.
// Largest tuple width tried when searching for composite keys (search starts at width 2).
const DEFAULT_MAX_COMPOSITE_WIDTH = 3;
// Upper bound on candidate key columns considered per table.
const DEFAULT_MAX_COLUMNS_PER_TABLE = 8;
// Minimum distinct/row ratio for a tuple to count as a key; also used directly by relationshipStatus.
const DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS = 0.98;
// Minimum share of child tuples that must match a parent tuple for acceptance.
const DEFAULT_MIN_SOURCE_COVERAGE = 0.9;
// Maximum share of child tuples allowed to have no parent for acceptance.
const DEFAULT_MAX_VIOLATION_RATIO = 0.01;
/** Returns the schema's tables whose `enabled` flag is truthy, in schema order. */
function enabledTables(schema: KloEnrichedSchema): KloEnrichedTable[] {
  const active: KloEnrichedTable[] = [];
  for (const candidate of schema.tables) {
    if (candidate.enabled) {
      active.push(candidate);
    }
  }
  return active;
}
/**
 * Looks up the profiled row count of the first table profile whose table name
 * equals `tableName`; yields 0 when there is no such profile or no count.
 */
function tableRowCount(profiles: KloRelationshipProfileArtifact, tableName: string): number {
  const tableProfile = profiles.tables.find((entry) => entry.table.name === tableName);
  return tableProfile?.rowCount ?? 0;
}
/** Builds the `table.column` key used to index column profiles. */
function profileKey(tableName: string, columnName: string): string {
  return [tableName, columnName].join('.');
}
/**
 * Returns the profiled null rate of a column, or 1 when the column has no
 * profile (or no null rate).
 */
function profileNullRate(profiles: KloRelationshipProfileArtifact, tableName: string, columnName: string): number {
  const columnProfile = profiles.columns[profileKey(tableName, columnName)];
  return columnProfile?.nullRate ?? 1;
}
/**
 * Lower-cases `name` and reduces it to its `[a-z0-9]` runs joined by single
 * underscores, with no leading or trailing underscore.
 */
function normalizedColumnName(name: string): string {
  const alphanumericRuns = name
    .toLowerCase()
    .split(/[^a-z0-9]+/u)
    .filter((run) => run.length > 0);
  return alphanumericRuns.join('_');
}
/**
 * Scores a column name: 1 when any underscore-separated token of the
 * normalized name is in KEY_NAME_PARTS, otherwise 0.
 */
function columnNameScore(column: KloEnrichedColumn): number {
  const tokens = normalizedColumnName(column.name).split('_');
  const looksLikeKey = tokens.some((token) => token.length > 0 && KEY_NAME_PARTS.has(token));
  return looksLikeKey ? 1 : 0;
}
/** Splits a name into its normalized, non-empty underscore-separated tokens. */
function nameParts(name: string): string[] {
  const normalized = normalizedColumnName(name);
  return normalized.split('_').filter((token) => token.length > 0);
}
/** Collects the tokens of a table name that are listed in KEY_NAME_PARTS. */
function keyLikeTableNameParts(tableName: string): Set<string> {
  const keyLike = new Set<string>();
  nameParts(tableName).forEach((token) => {
    if (KEY_NAME_PARTS.has(token)) {
      keyLike.add(token);
    }
  });
  return keyLike;
}
/**
 * Checks that every key-like token found in the table name also appears as a
 * token of at least one column in the tuple. A table name without key-like
 * tokens imposes no requirement.
 */
function tupleCoversTableNameKeyParts(tableName: string, columns: readonly KloEnrichedColumn[]): boolean {
  const requiredTokens = keyLikeTableNameParts(tableName);
  if (requiredTokens.size === 0) {
    return true;
  }
  const availableTokens = new Set<string>();
  columns.forEach((column) => {
    nameParts(column.name).forEach((token) => availableTokens.add(token));
  });
  const hasMissingToken = Array.from(requiredTokens).some((token) => !availableTokens.has(token));
  return !hasMissingToken;
}
/**
 * Selects the columns of a table that may take part in a composite key.
 *
 * A column qualifies when it is neither a time nor a boolean dimension, has a
 * profile with a null rate of at most 0.02, and has a key-like name. Qualifying
 * columns are ordered by name score (descending), then by their position in the
 * table, and truncated to `maxColumnsPerTable`.
 */
function candidateKeyColumns(input: {
  table: KloEnrichedTable;
  profiles: KloRelationshipProfileArtifact;
  maxColumnsPerTable: number;
}): KloEnrichedColumn[] {
  const { table, profiles, maxColumnsPerTable } = input;
  const eligible: Array<{ column: KloEnrichedColumn; position: number; score: number }> = [];
  table.columns.forEach((column, position) => {
    if (column.dimensionType === 'time' || column.dimensionType === 'boolean') {
      return;
    }
    const profile = profiles.columns[profileKey(table.ref.name, column.name)];
    if (!profile || !(profile.nullRate <= 0.02)) {
      return;
    }
    const score = columnNameScore(column);
    if (!(score > 0)) {
      return;
    }
    eligible.push({ column, position, score });
  });
  eligible.sort((left, right) => right.score - left.score || left.position - right.position);
  return eligible.slice(0, maxColumnsPerTable).map((entry) => entry.column);
}
/**
 * Reports whether the table already has a single key-like column (not a time or
 * boolean dimension) whose profile shows a null rate of at most 0.02 and a
 * uniqueness ratio of at least `minPrimaryKeyUniqueness`.
 */
function hasStrongSingleColumnKey(input: {
  table: KloEnrichedTable;
  profiles: KloRelationshipProfileArtifact;
  minPrimaryKeyUniqueness: number;
}): boolean {
  const tableName = input.table.ref.name;
  for (const column of input.table.columns) {
    const excludedType = column.dimensionType === 'time' || column.dimensionType === 'boolean';
    if (excludedType || columnNameScore(column) === 0) {
      continue;
    }
    const profile = input.profiles.columns[profileKey(tableName, column.name)];
    if (profile && profile.nullRate <= 0.02 && profile.uniquenessRatio >= input.minPrimaryKeyUniqueness) {
      return true;
    }
  }
  return false;
}
/**
 * Enumerates all `width`-element combinations of `values`, preserving the
 * input order inside each combination and across the result.
 *
 * A non-positive width yields a single empty combination; a width larger than
 * the input yields no combinations.
 */
function combinations<T>(values: readonly T[], width: number): T[][] {
  if (width <= 0) {
    return [[]];
  }
  if (values.length < width) {
    return [];
  }
  const collected: T[][] = [];
  for (let start = 0; start < values.length; start += 1) {
    const head = values[start] as T;
    const remainder = values.slice(start + 1);
    for (const rest of combinations(remainder, width - 1)) {
      collected.push([head, ...rest]);
    }
  }
  return collected;
}
/** Formats a column tuple as `table.(col1,col2,...)`. */
function tupleKey(tableName: string, columns: readonly string[]): string {
  const columnList = columns.join(',');
  return tableName + '.(' + columnList + ')';
}
/** Formats a relationship id as `<source tuple key>-><target tuple key>`. */
function relationshipKey(input: {
  fromTable: string;
  fromColumns: readonly string[];
  toTable: string;
  toColumns: readonly string[];
}): string {
  const { fromTable, fromColumns, toTable, toColumns } = input;
  const sourceSide = tupleKey(fromTable, fromColumns);
  const targetSide = tupleKey(toTable, toColumns);
  return [sourceSide, targetSide].join('->');
}
/**
 * Builds a relationship endpoint from a table and an ordered column tuple.
 * Column ids and column names are emitted in the same order as `columns`.
 */
function tupleEndpoint(table: KloEnrichedTable, columns: readonly KloEnrichedColumn[]): KloCompositeRelationshipTupleEndpoint {
  const columnIds: string[] = [];
  const columnNames: string[] = [];
  columns.forEach((column) => {
    columnIds.push(column.id);
    columnNames.push(column.name);
  });
  return { tableId: table.id, columnIds, table: table.ref, columns: columnNames };
}
/** Returns the first row of a query result, or an empty row when there is none. */
function row(result: KloQueryResult): unknown[] {
  return result.rows[0] ?? [];
}
/**
 * Reads a numeric cell from the first row of `result`.
 *
 * The column is located by case-insensitive header match. Numbers, bigints and
 * non-blank numeric strings are converted to a number. Anything else (missing
 * header, missing row, null, blank or non-numeric text, NaN, +/-Infinity)
 * yields 0, so callers never receive a non-finite value that would slip past
 * threshold comparisons.
 *
 * @param result Query result to read from.
 * @param header Header name to look up (case-insensitive).
 * @returns The finite numeric value of the cell, or 0.
 */
function numberAt(result: KloQueryResult, header: string): number {
  const wanted = header.toLowerCase();
  const index = result.headers.findIndex((candidate) => candidate.toLowerCase() === wanted);
  if (index < 0) {
    return 0;
  }
  const value = row(result)[index];
  let parsed: number;
  if (typeof value === 'number') {
    parsed = value;
  } else if (typeof value === 'bigint') {
    parsed = Number(value);
  } else if (typeof value === 'string' && value.trim() !== '') {
    parsed = Number(value);
  } else {
    return 0;
  }
  // Number('abc') is NaN and NaN fails every `<` / `>` check, so it must not leak out.
  return Number.isFinite(parsed) ? parsed : 0;
}
/**
 * Produces the ` TOP (n)` fragment for SQL Server and an empty string for all
 * other drivers. The limit is floored and raised to at least 1.
 */
function topSql(driver: KloConnectionDriver, limit: number): string {
  const usesTop = driver === 'sqlserver';
  return usesTop ? ` TOP (${Math.max(1, Math.floor(limit))})` : '';
}
/**
 * Produces the ` LIMIT n` fragment for every driver except SQL Server, which
 * gets an empty string. The limit is floored and raised to at least 1.
 */
function limitSql(driver: KloConnectionDriver, limit: number): string {
  const usesLimit = driver !== 'sqlserver';
  return usesLimit ? ` LIMIT ${Math.max(1, Math.floor(limit))}` : '';
}
/**
 * Builds a select list that aliases each quoted column positionally as
 * `c0`, `c1`, ... so tuples from different tables can be compared by position.
 */
function aliasedTupleSelect(driver: KloConnectionDriver, columns: readonly string[]): string {
  const projections = columns.map((column, position) => {
    const quoted = quoteKloRelationshipIdentifier(driver, column);
    return `${quoted} AS c${position}`;
  });
  return projections.join(', ');
}
/** Builds an `AND`-joined predicate requiring every listed column to be non-null. */
function nonNullPredicate(driver: KloConnectionDriver, columns: readonly string[]): string {
  const checks = columns.map((column) => {
    const quoted = quoteKloRelationshipIdentifier(driver, column);
    return `${quoted} IS NOT NULL`;
  });
  return checks.join(' AND ');
}
/**
 * Builds the join condition that equates `child_values.cN` with
 * `parent_values.cN` for each of the first `columns` positional aliases.
 */
function tupleEquality(columns: number): string {
  const positions = Array.from({ length: columns });
  const comparisons = positions.map((_, position) => `child_values.c${position} = parent_values.c${position}`);
  return comparisons.join(' AND ');
}
/**
 * Builds a query that counts the distinct tuples of `columns` in `table`,
 * ignoring rows where any tuple column is null. The count is returned in a
 * column named `distinct_count`.
 */
function buildTupleDistinctSql(input: {
  driver: KloConnectionDriver;
  table: KloTableRef;
  columns: readonly string[];
}): string {
  const { driver, table, columns } = input;
  const tableSql = formatKloRelationshipTableRef(driver, table);
  const projection = aliasedTupleSelect(driver, columns);
  const filter = nonNullPredicate(driver, columns);
  const cte = `WITH tuple_values AS ( SELECT DISTINCT ${projection} FROM ${tableSql} WHERE ${filter} )`;
  return `${cte} SELECT COUNT(*) AS distinct_count FROM tuple_values`;
}
/**
 * Builds the coverage query for a candidate composite foreign key.
 *
 * The query left-joins the distinct non-null child tuples (limited to
 * `maxDistinctSourceValues` via TOP or LIMIT, depending on the driver) to the
 * distinct non-null parent tuples and returns one row with `child_distinct`,
 * `parent_distinct`, `overlap` and `violation_count`.
 */
function buildCompositeCoverageSql(input: {
  driver: KloConnectionDriver;
  childTable: KloTableRef;
  childColumns: readonly string[];
  parentTable: KloTableRef;
  parentColumns: readonly string[];
  maxDistinctSourceValues: number;
}): string {
  const { driver, childColumns, parentColumns } = input;
  const childTableSql = formatKloRelationshipTableRef(driver, input.childTable);
  const parentTableSql = formatKloRelationshipTableRef(driver, input.parentTable);
  const top = topSql(driver, input.maxDistinctSourceValues);
  const limit = limitSql(driver, input.maxDistinctSourceValues);
  const childSelect = aliasedTupleSelect(driver, childColumns);
  const childFilter = nonNullPredicate(driver, childColumns);
  const parentSelect = aliasedTupleSelect(driver, parentColumns);
  const parentFilter = nonNullPredicate(driver, parentColumns);
  const joinCondition = tupleEquality(childColumns.length);

  const childCte = `WITH child_values AS ( SELECT DISTINCT${top} ${childSelect} FROM ${childTableSql} WHERE ${childFilter}${limit}`;
  const parentCte = `), parent_values AS ( SELECT DISTINCT ${parentSelect} FROM ${parentTableSql} WHERE ${parentFilter} )`;
  // A child tuple without a parent shows up with NULL parent columns after the LEFT JOIN.
  const aggregates = [
    '(SELECT COUNT(*) FROM child_values) AS child_distinct,',
    '(SELECT COUNT(*) FROM parent_values) AS parent_distinct,',
    'SUM(CASE WHEN parent_values.c0 IS NOT NULL THEN 1 ELSE 0 END) AS overlap,',
    'SUM(CASE WHEN parent_values.c0 IS NULL THEN 1 ELSE 0 END) AS violation_count',
  ].join(' ');
  return `${childCte} ${parentCte} SELECT ${aggregates} FROM child_values LEFT JOIN parent_values ON ${joinCondition}`;
}
/**
 * Classifies a composite relationship from its validation measurements.
 *
 * - `accepted`: target uniqueness, source coverage and violation ratio all meet
 *   their thresholds.
 * - `review`: not accepted, but source coverage is at least 0.55.
 * - `rejected`: everything else.
 *
 * @param input.minTargetUniqueness Optional minimum uniqueness of the referenced
 *   key. Defaults to DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS, which keeps existing
 *   callers unchanged while letting a caller that relaxed or tightened key
 *   detection apply the same threshold here.
 */
function relationshipStatus(input: {
  targetUniqueness: number;
  sourceCoverage: number;
  violationRatio: number;
  minSourceCoverage: number;
  maxViolationRatio: number;
  minTargetUniqueness?: number;
}): KloCompositeRelationshipStatus {
  const minTargetUniqueness = input.minTargetUniqueness ?? DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS;
  // Coverage floor below which a candidate is not even worth manual review.
  const reviewSourceCoverage = 0.55;
  if (
    input.targetUniqueness >= minTargetUniqueness &&
    input.sourceCoverage >= input.minSourceCoverage &&
    input.violationRatio <= input.maxViolationRatio
  ) {
    return 'accepted';
  }
  if (input.sourceCoverage >= reviewSourceCoverage) {
    return 'review';
  }
  return 'rejected';
}
/**
 * Reports whether an already-accepted key on the same table is a strict subset
 * of `columns` (fewer columns, all of them contained in `columns`). Such a
 * tuple is a superkey and does not need to be tested again.
 */
function hasAcceptedSubset(
  accepted: readonly KloCompositePrimaryKeyCandidate[],
  tableName: string,
  columns: readonly string[],
): boolean {
  const wanted = new Set(columns);
  for (const key of accepted) {
    if (key.table.name !== tableName || key.columns.length >= columns.length) {
      continue;
    }
    if (key.columns.every((column) => wanted.has(column))) {
      return true;
    }
  }
  return false;
}
/**
 * Searches one table for column tuples that behave like a composite primary key.
 *
 * Nothing is searched when the profiled row count is 0 or when the table
 * already has a strong single-column key. Otherwise every combination of
 * candidate key columns, from width 2 up to `maxCompositeWidth`, is checked
 * with one distinct-count query, except tuples that do not cover the key-like
 * parts of the table name and tuples that contain an already accepted
 * narrower key.
 *
 * @returns The accepted key candidates, sorted by tuple key, and the number of
 *   queries issued.
 */
async function detectCompositePrimaryKeys(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  table: KloEnrichedTable;
  profiles: KloRelationshipProfileArtifact;
  executor: KloRelationshipReadOnlyExecutor;
  ctx: KloScanContext;
  maxCompositeWidth: number;
  maxColumnsPerTable: number;
  minPrimaryKeyUniqueness: number;
}): Promise<{ primaryKeys: KloCompositePrimaryKeyCandidate[]; queryCount: number }> {
  const rowCount = tableRowCount(input.profiles, input.table.ref.name);
  if (rowCount === 0) {
    return { primaryKeys: [], queryCount: 0 };
  }
  if (
    hasStrongSingleColumnKey({
      table: input.table,
      profiles: input.profiles,
      minPrimaryKeyUniqueness: input.minPrimaryKeyUniqueness,
    })
  ) {
    return { primaryKeys: [], queryCount: 0 };
  }
  const columns = candidateKeyColumns({
    table: input.table,
    profiles: input.profiles,
    maxColumnsPerTable: input.maxColumnsPerTable,
  });
  const primaryKeys: KloCompositePrimaryKeyCandidate[] = [];
  let queryCount = 0;
  // Narrow tuples are tried first so wider supersets of an accepted key can be skipped.
  for (let width = 2; width <= input.maxCompositeWidth; width += 1) {
    for (const columnTuple of combinations(columns, width)) {
      const columnNames = columnTuple.map((column) => column.name);
      if (!tupleCoversTableNameKeyParts(input.table.ref.name, columnTuple)) {
        continue;
      }
      if (hasAcceptedSubset(primaryKeys, input.table.ref.name, columnNames)) {
        continue;
      }
      const result = await input.executor.executeReadOnly(
        {
          connectionId: input.connectionId,
          sql: buildTupleDistinctSql({
            driver: input.driver,
            table: input.table.ref,
            columns: columnNames,
          }),
          maxRows: 1,
        },
        input.ctx,
      );
      queryCount += 1;
      const distinctCount = numberAt(result, 'distinct_count');
      // rowCount is non-zero here (guarded above), so the division is safe.
      const uniquenessRatio = distinctCount / rowCount;
      // Written as a negated `>=` so a NaN ratio is rejected instead of slipping
      // past a `<` comparison and being accepted as a key.
      if (!(uniquenessRatio >= input.minPrimaryKeyUniqueness)) {
        continue;
      }
      const nullRate = Math.max(
        ...columnNames.map((columnName) => profileNullRate(input.profiles, input.table.ref.name, columnName)),
      );
      primaryKeys.push({
        id: tupleKey(input.table.ref.name, columnNames),
        tableId: input.table.id,
        table: input.table.ref,
        columns: columnNames,
        columnIds: columnTuple.map((column) => column.id),
        score: Number(Math.min(0.99, 0.72 + uniquenessRatio * 0.22 + (1 - nullRate) * 0.06).toFixed(3)),
        status: 'accepted',
        evidence: {
          rowCount,
          distinctCount,
          uniquenessRatio,
          nullRate,
          reasons: ['composite_unique_tuple', 'not_null_profile'],
        },
      });
    }
  }
  return {
    primaryKeys: primaryKeys.sort((left, right) =>
      tupleKey(left.table.name, left.columns).localeCompare(tupleKey(right.table.name, right.columns)),
    ),
    queryCount,
  };
}
/**
 * Indexes a table's columns by name. When two columns share a name, the later
 * one wins.
 */
function columnsByName(table: KloEnrichedTable): Map<string, KloEnrichedColumn> {
  const index = new Map<string, KloEnrichedColumn>();
  table.columns.forEach((column) => {
    index.set(column.name, column);
  });
  return index;
}
/**
 * Reports whether two column tuples can be compared position by position:
 * same length, and each pair of columns shares the same `dimensionType`.
 */
function compatibleTuple(sourceColumns: readonly KloEnrichedColumn[], targetColumns: readonly KloEnrichedColumn[]): boolean {
  if (sourceColumns.length !== targetColumns.length) {
    return false;
  }
  const hasMismatch = sourceColumns.some((source, position) => {
    const target = targetColumns[position];
    return !target || source.dimensionType !== target.dimensionType;
  });
  return !hasMismatch;
}
/**
 * Validates one candidate composite foreign key with a single coverage query.
 *
 * The source tuple is compared against the target key's tuple, the resulting
 * coverage and violation ratios are classified by `relationshipStatus`, and
 * the measurements are attached as validation evidence. Accepted candidates
 * get confidence 0.95, all others 0.62.
 *
 * @returns The relationship candidate (whatever its status) and a query count of 1.
 */
async function validateCompositeRelationship(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  sourceTable: KloEnrichedTable;
  sourceColumns: readonly KloEnrichedColumn[];
  targetKey: KloCompositePrimaryKeyCandidate;
  targetTable: KloEnrichedTable;
  targetColumns: readonly KloEnrichedColumn[];
  executor: KloRelationshipReadOnlyExecutor;
  ctx: KloScanContext;
  minSourceCoverage: number;
  maxViolationRatio: number;
}): Promise<{ relationship: KloCompositeRelationshipCandidate; queryCount: number }> {
  const { targetKey, minSourceCoverage, maxViolationRatio } = input;
  const coverageSql = buildCompositeCoverageSql({
    driver: input.driver,
    childTable: input.sourceTable.ref,
    childColumns: input.sourceColumns.map((column) => column.name),
    parentTable: input.targetTable.ref,
    parentColumns: input.targetColumns.map((column) => column.name),
    maxDistinctSourceValues: 10000,
  });
  const result = await input.executor.executeReadOnly(
    { connectionId: input.connectionId, sql: coverageSql, maxRows: 1 },
    input.ctx,
  );

  const childDistinct = numberAt(result, 'child_distinct');
  const parentDistinct = numberAt(result, 'parent_distinct');
  const overlap = numberAt(result, 'overlap');
  const violationCount = numberAt(result, 'violation_count');

  // With no child tuples there is nothing to cover: treat it as zero coverage, full violation.
  const hasChildTuples = childDistinct !== 0;
  const sourceCoverage = hasChildTuples ? overlap / childDistinct : 0;
  const violationRatio = hasChildTuples ? violationCount / childDistinct : 1;
  const targetUniqueness = targetKey.evidence.uniquenessRatio;

  const status = relationshipStatus({
    targetUniqueness,
    sourceCoverage,
    violationRatio,
    minSourceCoverage,
    maxViolationRatio,
  });

  const reasons: string[] = [];
  if (status === 'accepted') {
    reasons.push('composite_validation_passed');
  } else {
    reasons.push('composite_validation_failed');
    if (sourceCoverage < minSourceCoverage) {
      reasons.push('low_source_coverage');
    }
    if (violationRatio > maxViolationRatio) {
      reasons.push('excessive_violations');
    }
  }

  const from = tupleEndpoint(input.sourceTable, input.sourceColumns);
  const to: KloCompositeRelationshipTupleEndpoint = {
    tableId: targetKey.tableId,
    columnIds: targetKey.columnIds,
    table: targetKey.table,
    columns: targetKey.columns,
  };
  const relationship: KloCompositeRelationshipCandidate = {
    id: relationshipKey({
      fromTable: from.table.name,
      fromColumns: from.columns,
      toTable: to.table.name,
      toColumns: to.columns,
    }),
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: status === 'accepted' ? 0.95 : 0.62,
    status,
    source: 'composite_profile_match',
    validation: {
      targetUniqueness,
      sourceCoverage,
      violationCount,
      violationRatio,
      childDistinct,
      parentDistinct,
      overlap,
      reasons,
    },
  };
  return { queryCount: 1, relationship };
}
/**
 * Discovers composite primary keys and the composite foreign keys that point
 * at them.
 *
 * Phase 1 scans every enabled table for composite key candidates. Phase 2
 * takes each key and looks for other enabled tables that carry columns with
 * the same names and compatible dimension types, then validates each such
 * pair with a coverage query. Rejected relationships are dropped; accepted and
 * review ones are returned. Both result lists are sorted by id.
 *
 * When no executor is supplied or the profile reports SQL as unavailable, the
 * function returns empty results with the
 * `composite_relationship_validation_unavailable` warning and runs no queries.
 */
export async function discoverKloCompositeRelationships(
  input: DiscoverKloCompositeRelationshipsInput,
): Promise<DiscoverKloCompositeRelationshipsResult> {
  const executor = input.executor;
  if (!executor || !input.profiles.sqlAvailable) {
    return {
      primaryKeys: [],
      relationships: [],
      queryCount: 0,
      warnings: ['composite_relationship_validation_unavailable'],
    };
  }

  const maxCompositeWidth = input.maxCompositeWidth ?? DEFAULT_MAX_COMPOSITE_WIDTH;
  const maxColumnsPerTable = input.maxColumnsPerTable ?? DEFAULT_MAX_COLUMNS_PER_TABLE;
  const minPrimaryKeyUniqueness = input.minPrimaryKeyUniqueness ?? DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS;
  const minSourceCoverage = input.minSourceCoverage ?? DEFAULT_MIN_SOURCE_COVERAGE;
  const maxViolationRatio = input.maxViolationRatio ?? DEFAULT_MAX_VIOLATION_RATIO;

  // Resolves column names against a table; null when any name is missing.
  const resolveColumns = (table: KloEnrichedTable, names: readonly string[]): KloEnrichedColumn[] | null => {
    const byName = columnsByName(table);
    const resolved: KloEnrichedColumn[] = [];
    names.forEach((name) => {
      const column = byName.get(name);
      if (column) {
        resolved.push(column);
      }
    });
    return resolved.length === names.length ? resolved : null;
  };
  const byId = (left: { id: string }, right: { id: string }): number => left.id.localeCompare(right.id);

  const tables = enabledTables(input.schema);
  const tableByName = new Map(tables.map((table) => [table.ref.name, table]));
  let queryCount = 0;

  // Phase 1: composite key candidates per table.
  const primaryKeys: KloCompositePrimaryKeyCandidate[] = [];
  for (const table of tables) {
    const detected = await detectCompositePrimaryKeys({
      connectionId: input.connectionId,
      driver: input.driver,
      table,
      profiles: input.profiles,
      executor,
      ctx: input.ctx,
      maxCompositeWidth,
      maxColumnsPerTable,
      minPrimaryKeyUniqueness,
    });
    primaryKeys.push(...detected.primaryKeys);
    queryCount += detected.queryCount;
  }

  // Phase 2: tables that carry the same-named columns as a key are tested as referencing it.
  const relationships: KloCompositeRelationshipCandidate[] = [];
  for (const targetKey of primaryKeys) {
    const targetTable = tableByName.get(targetKey.table.name);
    if (!targetTable) {
      continue;
    }
    const targetColumns = resolveColumns(targetTable, targetKey.columns);
    if (!targetColumns) {
      continue;
    }
    for (const sourceTable of tables) {
      if (sourceTable.id === targetTable.id) {
        continue;
      }
      const sourceColumns = resolveColumns(sourceTable, targetKey.columns);
      if (!sourceColumns || !compatibleTuple(sourceColumns, targetColumns)) {
        continue;
      }
      const validated = await validateCompositeRelationship({
        connectionId: input.connectionId,
        driver: input.driver,
        sourceTable,
        sourceColumns,
        targetKey,
        targetTable,
        targetColumns,
        executor,
        ctx: input.ctx,
        minSourceCoverage,
        maxViolationRatio,
      });
      queryCount += validated.queryCount;
      if (validated.relationship.status !== 'rejected') {
        relationships.push(validated.relationship);
      }
    }
  }

  primaryKeys.sort(byId);
  relationships.sort(byId);
  return { primaryKeys, relationships, queryCount, warnings: [] };
}

View file

@ -0,0 +1,373 @@
import { describe, expect, it } from 'vitest';
import type { KloEnrichedRelationship, KloRelationshipEndpoint } from './enrichment-types.js';
import type { KloResolvedRelationshipDiscoveryCandidate } from './relationship-graph-resolver.js';
import {
buildKloRelationshipArtifacts,
buildKloRelationshipDiagnostics,
emptyKloRelationshipProfileArtifact,
} from './relationship-diagnostics.js';
/**
 * Test helper: builds a single-column relationship endpoint whose table id is
 * the table name and whose column id is `table.column`.
 */
function endpoint(table: string, column: string): KloRelationshipEndpoint {
  const tableRef = { catalog: null, db: null, name: table };
  const columnId = `${table}.${column}`;
  return { tableId: table, columnIds: [columnId], table: tableRef, columns: [column] };
}
/**
 * Test helper: builds an inferred many-to-one relationship between two
 * single-column endpoints. Confidence defaults to 0.92.
 */
function enrichedRelationship(input: {
  id: string;
  fromTable: string;
  fromColumn: string;
  toTable: string;
  toColumn: string;
  confidence?: number;
}): KloEnrichedRelationship {
  const { id, fromTable, fromColumn, toTable, toColumn } = input;
  const from = endpoint(fromTable, fromColumn);
  const to = endpoint(toTable, toColumn);
  const confidence = input.confidence ?? 0.92;
  return {
    id,
    source: 'inferred',
    from,
    to,
    relationshipType: 'many_to_one',
    confidence,
    isPrimaryKeyReference: true,
  };
}
/**
 * Test helper: builds a graph-resolved `orders.customer_id -> customers.id`
 * candidate with the given status.
 *
 * The evidence payload switches to an LLM-proposal shape when `source` is
 * `'llm_proposal'`. Validation numbers describe a failing edge when the status
 * is `'rejected'` and a clean one otherwise. Scores and reason lists fall back
 * to passing defaults.
 */
function resolvedRelationship(input: {
  id: string;
  status: 'accepted' | 'review' | 'rejected';
  source?: 'normalized_table_match' | 'exact_column_match' | 'inflection' | 'self_reference' | 'llm_proposal';
  fkScore?: number;
  pkScore?: number;
  validationReasons?: string[];
  graphReasons?: string[];
}): KloResolvedRelationshipDiscoveryCandidate {
  const isRejected = input.status === 'rejected';
  const pkScore = input.pkScore ?? 0.97;
  const fkScore = input.fkScore ?? 0.94;

  const llmEvidence = {
    sourceColumnBase: 'buyer',
    targetTableBase: 'customer',
    targetColumnBase: 'id',
    targetKeyScore: 0.88,
    nameScore: 0.45,
    reasons: ['llm_proposal', 'llm_pk_proposal'],
    llmConfidence: 0.89,
    llmRationale: 'Buyer reference values align with customer identifiers.',
  };
  const nameMatchEvidence = {
    sourceColumnBase: 'customer',
    targetTableBase: 'customer',
    targetColumnBase: 'id',
    targetKeyScore: 0.9,
    nameScore: 0.85,
    reasons: ['table_name_matches_source_column'],
  };

  return {
    id: input.id,
    from: endpoint('orders', 'customer_id'),
    to: endpoint('customers', 'id'),
    relationshipType: 'many_to_one',
    confidence: 0.88,
    source: input.source ?? 'normalized_table_match',
    status: input.status,
    evidence: input.source === 'llm_proposal' ? llmEvidence : nameMatchEvidence,
    score: 0.91,
    validation: {
      targetUniqueness: 1,
      sourceCoverage: isRejected ? 0.2 : 1,
      violationCount: isRejected ? 8 : 0,
      violationRatio: isRejected ? 0.8 : 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 10,
      parentDistinct: 10,
      overlap: isRejected ? 2 : 10,
      checkedValues: 10,
      reasons: input.validationReasons ?? ['validation_passed'],
    },
    pkScore,
    fkScore,
    graph: {
      targetPkScore: pkScore,
      incomingCandidateCount: 1,
      conflictRank: 1,
      reasons: input.graphReasons ?? ['target_pk_score_passed', 'fk_score_passed'],
    },
  };
}
describe('relationship diagnostics artifacts', () => {
// One resolved candidate per status goes in; each must land in the bucket that
// matches its status, with evidence, validation and graph reasons merged.
it('groups graph-resolved relationships and preserves evidence reasons', () => {
const artifacts = buildKloRelationshipArtifacts({
connectionId: 'warehouse',
resolvedRelationships: [
resolvedRelationship({ id: 'accepted-edge', status: 'accepted', source: 'llm_proposal' }),
resolvedRelationship({
id: 'review-edge',
status: 'review',
validationReasons: ['validation_unavailable'],
graphReasons: ['validation_unavailable_review_only', 'fk_score_review'],
}),
resolvedRelationship({
id: 'rejected-edge',
status: 'rejected',
validationReasons: ['low_source_coverage'],
graphReasons: ['fk_score_rejected'],
}),
],
});
expect(artifacts.accepted).toHaveLength(1);
// The LLM-specific evidence fields must survive the conversion.
expect(artifacts.accepted[0]).toMatchObject({
source: 'llm_proposal',
evidence: {
llmConfidence: 0.89,
llmRationale: 'Buyer reference values align with customer identifiers.',
},
reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
});
expect(artifacts.review).toHaveLength(1);
expect(artifacts.rejected).toHaveLength(1);
expect(artifacts.review[0]).toMatchObject({
id: 'review-edge',
status: 'review',
source: 'normalized_table_match',
fkScore: 0.94,
reasons: expect.arrayContaining(['validation_unavailable', 'validation_unavailable_review_only']),
});
// Rejected edges carry reasons from all three sources: evidence, validation, graph.
expect(artifacts.rejected[0]?.reasons).toEqual(
expect.arrayContaining(['table_name_matches_source_column', 'low_source_coverage', 'fk_score_rejected']),
);
});
// Only a relationship update (no resolved candidates) is supplied; its
// accepted/rejected/skipped lists must map onto the artifact buckets.
it('adapts legacy relationship updates into the richer artifact shape', () => {
const artifacts = buildKloRelationshipArtifacts({
connectionId: 'warehouse',
relationshipUpdate: {
connectionId: 'warehouse',
accepted: [
enrichedRelationship({
id: 'orders-customer',
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
}),
],
rejected: [
enrichedRelationship({
id: 'orders-account',
fromTable: 'orders',
fromColumn: 'account_id',
toTable: 'accounts',
toColumn: 'id',
confidence: 0.4,
}),
],
skipped: [{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }],
},
});
expect(artifacts.accepted[0]).toMatchObject({
id: 'orders-customer',
status: 'accepted',
source: 'inferred',
reasons: ['accepted_relationship_update'],
});
expect(artifacts.rejected[0]).toMatchObject({
id: 'orders-account',
status: 'rejected',
reasons: ['rejected_relationship_update'],
});
// Skipped entries are expected to pass through unchanged.
expect(artifacts.skipped).toEqual([{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }]);
});
// The same edge id is supplied twice: once as a resolved candidate and once as
// a formal relationship update. Exactly one artifact must come out, and it must
// be the resolved one with its richer reasons.
it('deduplicates resolved and formal relationship update artifacts by edge id', () => {
const artifacts = buildKloRelationshipArtifacts({
connectionId: 'warehouse',
resolvedRelationships: [
{
id: 'orders:orders.account_id->accounts:accounts.id',
from: endpoint('orders', 'account_id'),
to: endpoint('accounts', 'id'),
relationshipType: 'many_to_one',
source: 'normalized_table_match',
status: 'accepted',
confidence: 0.92,
score: 0.9,
pkScore: 0.92,
fkScore: 0.9,
evidence: {
sourceColumnBase: 'account',
targetTableBase: 'account',
targetColumnBase: 'id',
targetKeyScore: 0.92,
nameScore: 0.92,
reasons: ['foreign_key_suffix'],
},
validation: {
targetUniqueness: 1,
sourceCoverage: 1,
violationCount: 0,
violationRatio: 0,
sourceNullRate: 0,
targetNullRate: 0,
childDistinct: 2,
parentDistinct: 2,
overlap: 2,
checkedValues: 2,
reasons: ['validation_passed'],
},
graph: {
targetPkScore: 0.92,
incomingCandidateCount: 1,
conflictRank: 1,
reasons: ['fk_score_passed'],
},
},
],
relationshipUpdate: {
connectionId: 'warehouse',
accepted: [
{
// Same id as the resolved candidate above.
id: 'orders:orders.account_id->accounts:accounts.id',
source: 'formal',
from: endpoint('orders', 'account_id'),
to: endpoint('accounts', 'id'),
relationshipType: 'many_to_one',
confidence: 1,
isPrimaryKeyReference: true,
},
],
rejected: [],
skipped: [],
},
});
expect(artifacts.accepted).toHaveLength(1);
expect(artifacts.accepted[0]).toMatchObject({
id: 'orders:orders.account_id->accounts:accounts.id',
source: 'normalized_table_match',
reasons: expect.arrayContaining(['foreign_key_suffix', 'validation_passed', 'fk_score_passed']),
});
});
// A single review candidate plus an empty profile (SQL unavailable): the
// diagnostics must report that validation was unavailable and explain why
// nothing was accepted.
it('explains validation-unavailable review candidates', () => {
const artifacts = buildKloRelationshipArtifacts({
connectionId: 'warehouse',
resolvedRelationships: [
resolvedRelationship({
id: 'review-edge',
status: 'review',
validationReasons: ['validation_unavailable'],
graphReasons: ['validation_unavailable_review_only'],
}),
],
});
const profile = emptyKloRelationshipProfileArtifact({
connectionId: 'warehouse',
driver: 'sqlite',
reason: 'read_only_sql_unavailable',
});
const diagnostics = buildKloRelationshipDiagnostics({
connectionId: 'warehouse',
// Fixed timestamp keeps the output deterministic.
generatedAt: '2026-05-07T12:00:00.000Z',
artifacts,
profile,
warnings: [
{
code: 'connector_capability_missing',
message: 'KLO scan connector cannot run standalone statistical relationship validation',
recoverable: true,
metadata: { capability: 'readOnlySql' },
},
],
thresholds: { acceptThreshold: 0.85, reviewThreshold: 0.55 },
});
expect(diagnostics.summary).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
expect(diagnostics.noAcceptedReason).toBe('validation unavailable; review candidates written');
expect(diagnostics.candidateCountsBySource).toEqual({ normalized_table_match: 1 });
expect(diagnostics.validation).toEqual({
available: false,
sqlAvailable: false,
queryCount: 0,
});
// The reason given to the empty profile is expected to surface as a profile warning.
expect(diagnostics.profileWarnings).toEqual(['read_only_sql_unavailable']);
expect(diagnostics.warnings[0]).toMatchObject({ code: 'connector_capability_missing' });
});
// No candidates of any kind are supplied: all counts must be zero and the
// diagnostics must state that no candidate pairs existed.
it('explains empty relationship output as a no-candidate outcome', () => {
const artifacts = buildKloRelationshipArtifacts({ connectionId: 'warehouse' });
const diagnostics = buildKloRelationshipDiagnostics({
connectionId: 'warehouse',
generatedAt: '2026-05-07T12:00:00.000Z',
artifacts,
profile: emptyKloRelationshipProfileArtifact({
connectionId: 'warehouse',
driver: 'sqlite',
reason: 'relationship_profiling_not_run',
}),
});
expect(diagnostics.summary).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
expect(diagnostics.noAcceptedReason).toBe('no candidate pairs passed type compatibility');
expect(diagnostics.candidateCountsBySource).toEqual({});
});
// A composite (two-column) relationship is supplied; the artifact must keep
// both multi-column endpoints, the validation reasons and the validation payload.
it('records composite relationship endpoints in relationship artifacts', () => {
const artifacts = buildKloRelationshipArtifacts({
connectionId: 'warehouse',
compositeRelationships: [
{
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
source: 'composite_profile_match',
status: 'accepted',
from: {
tableId: 'order_line_allocations',
columnIds: ['order_line_allocations.order_id', 'order_line_allocations.line_number'],
table: { catalog: null, db: null, name: 'order_line_allocations' },
columns: ['order_id', 'line_number'],
},
to: {
tableId: 'order_lines',
columnIds: ['order_lines.order_id', 'order_lines.line_number'],
table: { catalog: null, db: null, name: 'order_lines' },
columns: ['order_id', 'line_number'],
},
relationshipType: 'many_to_one',
confidence: 0.95,
validation: {
targetUniqueness: 1,
sourceCoverage: 1,
violationCount: 0,
violationRatio: 0,
childDistinct: 2,
parentDistinct: 2,
overlap: 2,
reasons: ['composite_validation_passed'],
},
},
],
});
expect(artifacts.accepted).toEqual([
expect.objectContaining({
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
source: 'composite_profile_match',
from: expect.objectContaining({
columnIds: ['order_line_allocations.order_id', 'order_line_allocations.line_number'],
columns: ['order_id', 'line_number'],
}),
to: expect.objectContaining({
columnIds: ['order_lines.order_id', 'order_lines.line_number'],
columns: ['order_id', 'line_number'],
}),
reasons: ['composite_validation_passed'],
validation: expect.objectContaining({ sourceCoverage: 1 }),
}),
]);
});
});

View file

@ -0,0 +1,364 @@
import type {
KloEnrichedRelationship,
KloRelationshipEndpoint,
KloRelationshipType,
KloRelationshipUpdate,
} from './enrichment-types.js';
import type {
KloResolvedRelationshipDiscoveryCandidate,
KloResolvedRelationshipStatus,
} from './relationship-graph-resolver.js';
import type { KloCompositeRelationshipCandidate } from './relationship-composite-candidates.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import type { KloConnectionDriver, KloScanWarning } from './types.js';
/** One side of a relationship edge as written to the artifact: a table reference plus one or more columns. */
export interface KloRelationshipArtifactEndpoint {
tableId: string;
columnIds: string[];
table: {
catalog: string | null;
db: string | null;
name: string;
};
columns: string[];
}
/** A single relationship edge in the artifact, together with its status and supporting payloads. */
export interface KloRelationshipArtifactEdge {
id: string;
status: KloResolvedRelationshipStatus;
// Free-form origin label, e.g. a detector name or 'inferred' / 'formal'.
source: string;
from: KloRelationshipArtifactEndpoint;
to: KloRelationshipArtifactEndpoint;
relationshipType: KloRelationshipType;
confidence: number;
// Scores are nullable; presumably null when the originating candidate type does not carry them (TODO confirm in the builder).
pkScore: number | null;
fkScore: number | null;
score: number | null;
// Source-specific payloads, kept opaque (`unknown`) at this level.
evidence: unknown | null;
validation: unknown | null;
graph: unknown | null;
reasons: string[];
}
/** Relationship edges of one connection grouped by status, plus the skipped entries of a relationship update. */
export interface KloRelationshipArtifact {
connectionId: string;
accepted: KloRelationshipArtifactEdge[];
review: KloRelationshipArtifactEdge[];
rejected: KloRelationshipArtifactEdge[];
skipped: KloRelationshipUpdate['skipped'];
}
export interface KloRelationshipDiagnosticsSummary {
accepted: number;
review: number;
rejected: number;
skipped: number;
}
export interface KloRelationshipDiagnosticsValidation {
available: boolean;
sqlAvailable: boolean;
queryCount: number;
}
export interface KloRelationshipDiagnosticsThresholds {
acceptThreshold: number;
reviewThreshold: number;
}
/** Relationship-discovery policy settings echoed into the diagnostics artifact. */
export interface KloRelationshipDiagnosticsPolicy {
validationRequiredForManifest: boolean;
maxCandidatesPerColumn: number;
profileSampleRows: number;
validationConcurrency: number;
}
/**
 * Diagnostics describing the outcome of relationship discovery for one
 * connection: bucket counts, effective thresholds/policy and warnings.
 */
export interface KloRelationshipDiagnosticsArtifact {
connectionId: string;
/** Timestamp string; defaults to the current time in ISO-8601 form when the caller supplies none. */
generatedAt: string;
summary: KloRelationshipDiagnosticsSummary;
/** Human-readable explanation when nothing was accepted; null when at least one edge was accepted. */
noAcceptedReason: string | null;
/** Edge counts across accepted, review and rejected buckets, keyed by edge source. */
candidateCountsBySource: Record<string, number>;
validation: KloRelationshipDiagnosticsValidation;
thresholds: KloRelationshipDiagnosticsThresholds;
policy: KloRelationshipDiagnosticsPolicy;
/** Scan warnings supplied by the caller. */
warnings: KloScanWarning[];
/** Warnings copied from the relationship profile. */
profileWarnings: string[];
}
/**
 * Input for `buildKloRelationshipArtifacts`. All relationship sources are
 * optional; missing ones simply contribute no edges.
 */
export interface BuildKloRelationshipArtifactsInput {
connectionId: string;
relationshipUpdate?: KloRelationshipUpdate | null;
resolvedRelationships?: readonly KloResolvedRelationshipDiscoveryCandidate[];
compositeRelationships?: readonly KloCompositeRelationshipCandidate[];
}
/**
 * Input for `buildKloRelationshipDiagnostics`. `thresholds` and `policy` are
 * partial overrides layered over the module defaults.
 */
export interface BuildKloRelationshipDiagnosticsInput {
connectionId: string;
artifacts: KloRelationshipArtifact;
profile: KloRelationshipProfileArtifact;
warnings?: readonly KloScanWarning[];
thresholds?: Partial<KloRelationshipDiagnosticsThresholds>;
policy?: Partial<KloRelationshipDiagnosticsPolicy>;
/** Overrides the generation timestamp (useful for deterministic output). */
generatedAt?: string;
}
/** Input for `emptyKloRelationshipProfileArtifact`. */
export interface EmptyKloRelationshipProfileArtifactInput {
connectionId: string;
driver: KloConnectionDriver;
/** Recorded as the single warning of the empty profile. */
reason: string;
}
/** Threshold values reported in diagnostics when the caller supplies no override. */
const DEFAULT_THRESHOLDS: KloRelationshipDiagnosticsThresholds = {
acceptThreshold: 0.85,
reviewThreshold: 0.55,
};
/** Policy values reported in diagnostics when the caller supplies no override. */
const DEFAULT_POLICY: KloRelationshipDiagnosticsPolicy = {
validationRequiredForManifest: true,
maxCandidatesPerColumn: 25,
profileSampleRows: 10000,
validationConcurrency: 4,
};
/**
 * Projects a relationship endpoint onto the artifact endpoint shape, keeping
 * only the table identity (catalog, db, name) and the column ids/names.
 *
 * @param endpoint - Endpoint taken from an enriched or resolved relationship.
 * @returns The artifact representation of the endpoint.
 */
function endpointArtifact(endpoint: KloRelationshipEndpoint): KloRelationshipArtifactEndpoint {
  const { tableId, columnIds, table, columns } = endpoint;
  const { catalog, db, name } = table;
  return { tableId, columnIds, table: { catalog, db, name }, columns };
}
/**
 * Removes blank (empty or whitespace-only) entries and duplicates from a list
 * of reason codes, preserving first-seen order. Entries are not trimmed.
 */
function uniqueReasons(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const reason of values) {
    if (reason.trim() !== '') {
      seen.add(reason);
    }
  }
  return [...seen];
}
/**
 * Converts a relationship from a relationship update into an artifact edge.
 *
 * Formal-metadata relationships get synthetic evidence/validation payloads and
 * a dedicated accepted reason; all other relationships carry null payloads.
 * The edge score mirrors the relationship confidence.
 *
 * @param relationship - Relationship taken from the update.
 * @param status - Bucket the relationship came from.
 */
function relationshipUpdateEdge(
  relationship: KloEnrichedRelationship,
  status: 'accepted' | 'rejected',
): KloRelationshipArtifactEdge {
  const isFormal = relationship.source === 'formal';
  let reason = 'rejected_relationship_update';
  if (status === 'accepted') {
    reason = isFormal ? 'formal_metadata_accepted' : 'accepted_relationship_update';
  }
  const { id, source, relationshipType, confidence } = relationship;
  return {
    id,
    status,
    source,
    from: endpointArtifact(relationship.from),
    to: endpointArtifact(relationship.to),
    relationshipType,
    confidence,
    pkScore: null,
    fkScore: null,
    score: confidence,
    evidence: isFormal ? { source: 'formal_metadata' } : null,
    validation: isFormal ? { status: 'formal_metadata' } : null,
    graph: null,
    reasons: [reason],
  };
}
/**
 * Converts a graph-resolved discovery candidate into an artifact edge. The
 * edge reasons are the de-duplicated union of the candidate's evidence,
 * validation and graph reasons, in that order.
 */
function resolvedEdge(candidate: KloResolvedRelationshipDiscoveryCandidate): KloRelationshipArtifactEdge {
  const { evidence, validation, graph } = candidate;
  const reasons = uniqueReasons([...evidence.reasons, ...validation.reasons, ...graph.reasons]);
  const from = endpointArtifact(candidate.from);
  const to = endpointArtifact(candidate.to);
  return {
    id: candidate.id,
    status: candidate.status,
    source: candidate.source,
    from,
    to,
    relationshipType: candidate.relationshipType,
    confidence: candidate.confidence,
    pkScore: candidate.pkScore,
    fkScore: candidate.fkScore,
    score: candidate.score,
    evidence,
    validation,
    graph,
    reasons,
  };
}
/**
 * Projects one side of a composite relationship candidate onto the artifact
 * endpoint shape (table identity plus column ids/names).
 */
function compositeEndpointArtifact(endpoint: KloCompositeRelationshipCandidate['from']): KloRelationshipArtifactEndpoint {
  const { tableId, columnIds, table, columns } = endpoint;
  const { catalog, db, name } = table;
  return { tableId, columnIds, table: { catalog, db, name }, columns };
}
/**
 * Converts a composite relationship candidate into an artifact edge. The
 * candidate's confidence is reported as confidence, FK score and overall
 * score; there is no PK score or graph payload for composite edges.
 */
function compositeEdge(candidate: KloCompositeRelationshipCandidate): KloRelationshipArtifactEdge {
  const { id, status, source, relationshipType, confidence, validation } = candidate;
  const from = compositeEndpointArtifact(candidate.from);
  const to = compositeEndpointArtifact(candidate.to);
  return {
    id,
    status,
    source,
    from,
    to,
    relationshipType,
    confidence,
    pkScore: null,
    fkScore: confidence,
    score: confidence,
    evidence: { source },
    validation,
    graph: null,
    reasons: uniqueReasons(validation.reasons),
  };
}
/** Creates a relationship artifact for `connectionId` with every bucket empty. */
function emptyArtifacts(connectionId: string): KloRelationshipArtifact {
  const artifact: KloRelationshipArtifact = { connectionId, accepted: [], review: [], rejected: [], skipped: [] };
  return artifact;
}
/**
 * Appends `edge` to `edges` unless an edge with the same id is already
 * present; the first edge recorded for an id wins.
 */
function pushUniqueEdge(edges: KloRelationshipArtifactEdge[], edge: KloRelationshipArtifactEdge): void {
  for (const existing of edges) {
    if (existing.id === edge.id) {
      return;
    }
  }
  edges.push(edge);
}
/**
 * Assembles the relationship artifact for a connection.
 *
 * Edges are collected in this order: graph-resolved candidates, composite
 * candidates, then the accepted/rejected relationships of the relationship
 * update. Within a bucket the first edge recorded for an id wins, so resolved
 * and composite edges take precedence over the plainer update edges with the
 * same id. Every bucket is returned sorted by id (skipped entries by
 * `relationshipId`).
 *
 * @param input - Connection id plus the optional relationship sources.
 * @returns The bucketed, sorted relationship artifact.
 */
export function buildKloRelationshipArtifacts(input: BuildKloRelationshipArtifactsInput): KloRelationshipArtifact {
  const artifacts = emptyArtifacts(input.connectionId);

  // Route an edge into the bucket matching its status; anything that is neither
  // accepted nor review is treated as rejected.
  const route = (edge: KloRelationshipArtifactEdge): void => {
    let bucket = artifacts.rejected;
    if (edge.status === 'accepted') {
      bucket = artifacts.accepted;
    } else if (edge.status === 'review') {
      bucket = artifacts.review;
    }
    pushUniqueEdge(bucket, edge);
  };

  for (const candidate of input.resolvedRelationships ?? []) {
    route(resolvedEdge(candidate));
  }
  for (const candidate of input.compositeRelationships ?? []) {
    route(compositeEdge(candidate));
  }

  const update = input.relationshipUpdate;
  if (update) {
    for (const relationship of update.accepted) {
      pushUniqueEdge(artifacts.accepted, relationshipUpdateEdge(relationship, 'accepted'));
    }
    for (const relationship of update.rejected) {
      pushUniqueEdge(artifacts.rejected, relationshipUpdateEdge(relationship, 'rejected'));
    }
    artifacts.skipped.push(...update.skipped);
  }

  const byId = (left: KloRelationshipArtifactEdge, right: KloRelationshipArtifactEdge): number =>
    left.id.localeCompare(right.id);
  const { connectionId, accepted, review, rejected, skipped } = artifacts;
  return {
    connectionId,
    accepted: accepted.sort(byId),
    review: review.sort(byId),
    rejected: rejected.sort(byId),
    skipped: [...skipped].sort((left, right) => left.relationshipId.localeCompare(right.relationshipId)),
  };
}
/** Returns a new array holding the accepted, review and rejected edges, in that order. */
function allEdges(artifacts: KloRelationshipArtifact): KloRelationshipArtifactEdge[] {
  const { accepted, review, rejected } = artifacts;
  return accepted.concat(review, rejected);
}
/**
 * Counts accepted, review and rejected edges per `source`.
 *
 * The tally is kept in a `Map` rather than a plain object so that source names
 * which collide with `Object.prototype` members (`constructor`, `toString`,
 * `__proto__`, ...) are counted like any other key instead of picking up the
 * inherited value.
 *
 * @param artifacts - Relationship artifact whose edges are counted.
 * @returns Counts keyed by source, with keys inserted in `localeCompare` order.
 */
function candidateCountsBySource(artifacts: KloRelationshipArtifact): Record<string, number> {
  const tally = new Map<string, number>();
  for (const bucket of [artifacts.accepted, artifacts.review, artifacts.rejected]) {
    for (const edge of bucket) {
      tally.set(edge.source, (tally.get(edge.source) ?? 0) + 1);
    }
  }
  const sorted = [...tally.entries()].sort(([left], [right]) => left.localeCompare(right));
  return Object.fromEntries(sorted);
}
/** Reports whether any accepted, review or rejected edge lists `reason` among its reasons. */
function hasReason(artifacts: KloRelationshipArtifact, reason: string): boolean {
  for (const edge of allEdges(artifacts)) {
    if (edge.reasons.includes(reason)) {
      return true;
    }
  }
  return false;
}
/**
 * Explains why no relationship was accepted.
 *
 * @returns `null` when at least one edge was accepted; otherwise a message
 * chosen by precedence: review candidates without usable validation, review
 * candidates awaiting review, everything rejected, or no candidates at all.
 */
function noAcceptedReason(input: {
  artifacts: KloRelationshipArtifact;
  profile: KloRelationshipProfileArtifact;
}): string | null {
  const { artifacts, profile } = input;
  if (artifacts.accepted.length > 0) {
    return null;
  }

  if (artifacts.review.length > 0) {
    // Validation counts as unavailable when SQL could not run at all or when any
    // edge carries one of the "validation unavailable" reason codes.
    const validationUnavailable =
      !profile.sqlAvailable ||
      ['validation_unavailable', 'validation_unavailable_review_only'].some((reason) => hasReason(artifacts, reason));
    return validationUnavailable
      ? 'validation unavailable; review candidates written'
      : 'relationship candidates require review before manifest writes';
  }

  return artifacts.rejected.length > 0
    ? 'all candidate pairs were rejected'
    : 'no candidate pairs passed type compatibility';
}
/**
 * Builds a placeholder relationship profile for a connection: SQL marked
 * unavailable, zero queries, no tables or columns, and `reason` recorded as
 * the only warning.
 */
export function emptyKloRelationshipProfileArtifact(
  input: EmptyKloRelationshipProfileArtifactInput,
): KloRelationshipProfileArtifact {
  const { connectionId, driver, reason } = input;
  return { connectionId, driver, sqlAvailable: false, queryCount: 0, tables: [], columns: {}, warnings: [reason] };
}
/**
 * Builds the diagnostics artifact that accompanies a relationship artifact.
 *
 * Threshold and policy overrides are applied field by field with `??`, so an
 * override that is present but `undefined` (which `Partial<>` permits) falls
 * back to the default instead of overwriting it; an object spread would have
 * copied the `undefined` into the artifact.
 *
 * @param input - Artifacts, profile and optional overrides/warnings.
 * @returns Diagnostics with bucket counts, the no-accepted explanation, counts
 * per source, validation availability, effective thresholds/policy and copies
 * of the supplied warnings. `generatedAt` defaults to the current time in
 * ISO-8601 form.
 */
export function buildKloRelationshipDiagnostics(
  input: BuildKloRelationshipDiagnosticsInput,
): KloRelationshipDiagnosticsArtifact {
  const thresholds: KloRelationshipDiagnosticsThresholds = {
    acceptThreshold: input.thresholds?.acceptThreshold ?? DEFAULT_THRESHOLDS.acceptThreshold,
    reviewThreshold: input.thresholds?.reviewThreshold ?? DEFAULT_THRESHOLDS.reviewThreshold,
  };
  const policy: KloRelationshipDiagnosticsPolicy = {
    validationRequiredForManifest:
      input.policy?.validationRequiredForManifest ?? DEFAULT_POLICY.validationRequiredForManifest,
    maxCandidatesPerColumn: input.policy?.maxCandidatesPerColumn ?? DEFAULT_POLICY.maxCandidatesPerColumn,
    profileSampleRows: input.policy?.profileSampleRows ?? DEFAULT_POLICY.profileSampleRows,
    validationConcurrency: input.policy?.validationConcurrency ?? DEFAULT_POLICY.validationConcurrency,
  };
  const summary: KloRelationshipDiagnosticsSummary = {
    accepted: input.artifacts.accepted.length,
    review: input.artifacts.review.length,
    rejected: input.artifacts.rejected.length,
    skipped: input.artifacts.skipped.length,
  };
  return {
    connectionId: input.connectionId,
    generatedAt: input.generatedAt ?? new Date().toISOString(),
    summary,
    noAcceptedReason: noAcceptedReason({ artifacts: input.artifacts, profile: input.profile }),
    candidateCountsBySource: candidateCountsBySource(input.artifacts),
    validation: {
      // Both flags currently mirror the profile's SQL availability.
      available: input.profile.sqlAvailable,
      sqlAvailable: input.profile.sqlAvailable,
      queryCount: input.profile.queryCount,
    },
    thresholds,
    policy,
    warnings: [...(input.warnings ?? [])],
    profileWarnings: [...input.profile.warnings],
  };
}

View file

@ -0,0 +1,699 @@
import { fileURLToPath } from 'node:url';
import type { KloLlmProvider } from '@klo/llm';
import Database from 'better-sqlite3';
import { afterEach, describe, expect, it, vi } from 'vitest';
import { buildDefaultKloProjectConfig } from '../project/config.js';
import { snapshotToKloEnrichedSchema } from './local-enrichment.js';
import {
loadKloRelationshipBenchmarkFixture,
maskKloRelationshipBenchmarkSnapshot,
} from './relationship-benchmarks.js';
import { discoverKloRelationships } from './relationship-discovery.js';
import { createKloConnectorCapabilities } from './types.js';
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanConnector, KloScanContext, KloSchemaSnapshot } from './types.js';
/**
 * Test double that runs read-only queries against an in-memory SQLite
 * database and counts how many queries were executed.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /**
   * Executes `input.sql` and returns the rows in header/row-array form.
   *
   * Declared `async` so that a failing `prepare`/`all` surfaces as a rejected
   * promise rather than a synchronous throw from a Promise-returning method.
   * Headers are taken from the first row, so an empty result has no headers.
   */
  async executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;
    const rows = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const headers = Object.keys(rows[0] ?? {});
    return {
      headers,
      rows: rows.map((row) => headers.map((header) => row[header])),
      totalRows: rows.length,
      rowCount: rows.length,
    };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
/**
 * Builds a fresh SQLite schema snapshot with two tables, `accounts` and
 * `orders`, that declare no primary keys and no foreign keys. A new object
 * graph is created on every call, so callers may mutate the result.
 */
function snapshot(): KloSchemaSnapshot {
  type Table = KloSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const textColumn = (name: string): Column => ({
    name,
    nativeType: 'TEXT',
    normalizedType: 'text',
    dimensionType: 'string',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const table = (name: string, estimatedRows: number, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      table('accounts', 2, [integerColumn('id'), textColumn('name')]),
      table('orders', 3, [integerColumn('id'), integerColumn('account_id')]),
    ],
  };
}
/**
 * Variant of `snapshot()` with formal metadata: `accounts.id` is marked as a
 * primary key and `orders` declares a foreign key from `account_id` to
 * `accounts.id`.
 */
function declaredForeignKeySnapshot(): KloSchemaSnapshot {
  const base = snapshot();
  const tables = base.tables.map((table): KloSchemaSnapshot['tables'][number] => {
    if (table.name === 'accounts') {
      const columns = table.columns.map((column) => (column.name === 'id' ? { ...column, primaryKey: true } : column));
      return { ...table, columns };
    }
    if (table.name === 'orders') {
      return {
        ...table,
        foreignKeys: [
          {
            fromColumn: 'account_id',
            toCatalog: null,
            toDb: null,
            toTable: 'accounts',
            toColumn: 'id',
            constraintName: 'orders_account_id_fkey',
          },
        ],
      };
    }
    return table;
  });
  return { ...base, tables };
}
/**
 * Builds a fresh SQLite schema snapshot for the natural-key scenario:
 * `dim_countries(iso_code, name)` and `fct_accounts(id, country_code)`, with
 * no declared primary keys or foreign keys.
 */
function naturalKeySnapshot(): KloSchemaSnapshot {
  type Table = KloSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const textColumn = (name: string): Column => ({
    name,
    nativeType: 'TEXT',
    normalizedType: 'text',
    dimensionType: 'string',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const table = (name: string, estimatedRows: number, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      table('dim_countries', 3, [textColumn('iso_code'), textColumn('name')]),
      table('fct_accounts', 4, [integerColumn('id'), textColumn('country_code')]),
    ],
  };
}
/**
 * Creates a SQLite scan connector for tests. Read-only SQL and column stats
 * are advertised only when an executor is supplied; without one the connector
 * has no `executeReadOnly`. Introspection always returns `snapshot()`.
 */
function connector(executor: InMemorySqliteExecutor | null): KloScanConnector {
  const sqlEnabled = executor !== null;
  const capabilities = createKloConnectorCapabilities({
    readOnlySql: sqlEnabled,
    columnStats: sqlEnabled,
    tableSampling: false,
    columnSampling: false,
  });
  const executeReadOnly = executor ? executor.executeReadOnly.bind(executor) : undefined;
  return {
    id: 'sqlite:test',
    driver: 'sqlite',
    capabilities,
    introspect: async () => snapshot(),
    executeReadOnly,
  };
}
/**
 * Creates a stub LLM provider whose members are all `vi.fn` mocks: model
 * lookups return a fixed stub model, prompt caching is reported as disabled,
 * and the active backend is `'anthropic'`.
 */
function llmProvider(): KloLlmProvider {
  const stubModel = { modelId: 'claude-sonnet-4-6', provider: 'anthropic' };
  // Builds a new config object per call, like the inline literal it replaces.
  const disabledPromptCaching = () =>
    ({
      enabled: false,
      systemTtl: '1h',
      toolsTtl: '1h',
      historyTtl: '5m',
      cacheSystem: true,
      cacheTools: true,
      cacheHistory: true,
      vertexFallbackTo5m: false,
    }) as ReturnType<KloLlmProvider['promptCachingConfig']>;

  return {
    getModel: vi.fn(() => stubModel as ReturnType<KloLlmProvider['getModel']>),
    getModelByName: vi.fn(() => stubModel as ReturnType<KloLlmProvider['getModelByName']>),
    cacheMarker: vi.fn(),
    repairToolCallHandler: vi.fn(),
    thinkingProviderOptions: vi.fn(() => ({})),
    telemetryConfig: vi.fn(() => undefined),
    promptCachingConfig: vi.fn(() => disabledPromptCaching()),
    activeBackend: vi.fn(() => 'anthropic' as ReturnType<KloLlmProvider['activeBackend']>),
  };
}
/** Returns the relationship-scan settings of the default project config for the `warehouse` connection. */
function relationshipSettings() {
  const { scan } = buildDefaultKloProjectConfig('warehouse');
  return scan.relationships;
}
/**
 * Builds a fresh SQLite schema snapshot with `customers(id)` and
 * `orders(id, buyer_ref)`, all integer columns, with no declared primary keys
 * or foreign keys.
 */
function llmOnlyRelationshipSnapshot(): KloSchemaSnapshot {
  type Table = KloSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });
  const table = (name: string, estimatedRows: number, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      table('customers', 2, [integerColumn('id')]),
      table('orders', 2, [integerColumn('id'), integerColumn('buyer_ref')]),
    ],
  };
}
describe('production relationship discovery', () => {
let executor: InMemorySqliteExecutor | null = null;
afterEach(() => {
executor?.close();
executor = null;
});
// Scenario: neither table declares a PK or FK, but every orders.account_id value
// exists in accounts.id, so SQL validation should promote the candidate to accepted.
it('accepts a validated relationship without declared PK or FK metadata', async () => {
// Uses the suite-level executor so afterEach closes the database.
executor = new InMemorySqliteExecutor();
executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
`);
const result = await discoverKloRelationships({
connectionId: 'warehouse',
driver: 'sqlite',
connector: connector(executor),
schema: snapshotToKloEnrichedSchema(snapshot()),
context: { runId: 'relationship-run-1' },
settings: relationshipSettings(),
});
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
expect(result.statisticalValidation).toBe('completed');
expect(result.profile.sqlAvailable).toBe(true);
expect(result.profile.queryCount).toBeGreaterThan(0);
expect(result.relationshipUpdate.accepted).toEqual([
expect.objectContaining({
from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
relationshipType: 'many_to_one',
source: 'inferred',
isPrimaryKeyReference: true,
}),
]);
expect(result.resolvedRelationships[0]).toMatchObject({
status: 'accepted',
validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
});
});
// Scenario: a text natural key (country_code -> iso_code) whose column names do not
// match; the test expects the accepted candidate to come from the 'profile_match' source.
it('accepts a profile-driven natural-key relationship without declared metadata', async () => {
executor = new InMemorySqliteExecutor();
executor.db.exec(`
CREATE TABLE dim_countries (iso_code TEXT NOT NULL, name TEXT NOT NULL);
CREATE TABLE fct_accounts (id INTEGER NOT NULL, country_code TEXT NOT NULL);
INSERT INTO dim_countries (iso_code, name) VALUES ('US', 'United States'), ('FR', 'France'), ('DE', 'Germany');
INSERT INTO fct_accounts (id, country_code) VALUES (1, 'US'), (2, 'FR'), (3, 'US'), (4, 'DE');
`);
const schema = naturalKeySnapshot();
const result = await discoverKloRelationships({
connectionId: 'warehouse',
driver: 'sqlite',
connector: {
...connector(executor),
// Override introspection so the connector reports the natural-key schema.
introspect: async () => schema,
},
schema: snapshotToKloEnrichedSchema(schema),
context: { runId: 'natural-key-relationship-run' },
settings: relationshipSettings(),
});
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
expect(result.relationshipUpdate.accepted).toEqual([
expect.objectContaining({
from: expect.objectContaining({ table: expect.objectContaining({ name: 'fct_accounts' }), columns: ['country_code'] }),
to: expect.objectContaining({ table: expect.objectContaining({ name: 'dim_countries' }), columns: ['iso_code'] }),
relationshipType: 'many_to_one',
source: 'inferred',
isPrimaryKeyReference: true,
}),
]);
expect(result.resolvedRelationships[0]).toMatchObject({
source: 'profile_match',
status: 'accepted',
validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
});
});
// Scenario: LLM proposals are disabled and the column names do not line up
// (buyer_ref vs id); hand-written embeddings place orders.buyer_ref next to customers.id.
it('accepts an embedding-driven relationship without declared metadata or LLM proposals', async () => {
executor = new InMemorySqliteExecutor();
executor.db.exec(`
CREATE TABLE customers (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, buyer_ref INTEGER NOT NULL);
INSERT INTO customers (id, name) VALUES (1, 'Acme'), (2, 'Orbit'), (3, 'Globex');
INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2), (12, 2), (13, 3);
`);
const sourceSnapshot = llmOnlyRelationshipSnapshot();
const schema = snapshotToKloEnrichedSchema(
sourceSnapshot,
// Per-column embedding vectors; buyer_ref is nearly parallel to customers.id.
new Map([
['customers.id', [1, 0, 0]],
['customers.name', [0, 1, 0]],
['orders.id', [0, 0, 1]],
['orders.buyer_ref', [0.995, 0.005, 0]],
]),
);
const result = await discoverKloRelationships({
connectionId: 'warehouse',
driver: 'sqlite',
connector: {
...connector(executor),
introspect: async () => sourceSnapshot,
},
schema,
context: { runId: 'embedding-relationship-run' },
settings: {
...relationshipSettings(),
llmProposals: false,
},
});
expect(result.llmRelationshipValidation).toBe('skipped');
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
expect(result.relationshipUpdate.accepted[0]).toMatchObject({
from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
to: { table: { name: 'customers' }, columns: ['id'] },
});
expect(result.resolvedRelationships[0]).toMatchObject({
source: 'embedding_similarity',
status: 'accepted',
validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
evidence: expect.objectContaining({
reasons: expect.arrayContaining(['embedding_similarity', 'target_key_like']),
embeddingSimilarity: expect.any(Number),
}),
});
});
// Scenario: the connector has no executor, so nothing can be SQL-validated; the
// candidate must stay in review and a capability warning must be reported.
it('keeps candidates review-only when read-only SQL is unavailable', async () => {
const result = await discoverKloRelationships({
connectionId: 'warehouse',
driver: 'sqlite',
connector: connector(null),
schema: snapshotToKloEnrichedSchema(snapshot()),
context: { runId: 'relationship-run-no-sql' },
settings: relationshipSettings(),
});
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
expect(result.statisticalValidation).toBe('skipped');
expect(result.relationshipUpdate.accepted).toEqual([]);
expect(result.resolvedRelationships[0]).toMatchObject({
status: 'review',
validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_unavailable']) }),
});
expect(result.warnings).toContainEqual({
code: 'connector_capability_missing',
message: 'KLO scan connector cannot run read-only SQL relationship validation',
recoverable: true,
metadata: { capability: 'readOnlySql' },
});
});
// Scenario: a declared foreign key is accepted from metadata alone (source 'formal',
// confidence 1) even though no SQL validation can run and nothing is graph-resolved.
it('accepts formal metadata relationships when read-only SQL is unavailable', async () => {
const sourceSnapshot = declaredForeignKeySnapshot();
const result = await discoverKloRelationships({
connectionId: 'warehouse',
driver: 'sqlite',
connector: connector(null),
schema: snapshotToKloEnrichedSchema(sourceSnapshot),
context: { runId: 'formal-metadata-no-sql' },
settings: relationshipSettings(),
});
expect(result.statisticalValidation).toBe('skipped');
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
expect(result.resolvedRelationships).toEqual([]);
expect(result.relationshipUpdate.accepted).toEqual([
expect.objectContaining({
id: 'orders:(orders.account_id)->accounts:(accounts.id)',
source: 'formal',
confidence: 1,
from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
}),
]);
expect(result.relationshipUpdate.rejected).toEqual([]);
expect(result.relationshipUpdate.skipped).toEqual([]);
});
// Scenario: the relationship is proposed by a stubbed LLM call; the test expects it to
// be accepted with source 'llm_proposal' and the LLM rationale kept in the evidence.
it('accepts LLM-only relationship proposals only after SQL validation and graph resolution pass', async () => {
executor = new InMemorySqliteExecutor();
executor.db.exec(`
CREATE TABLE customers (id INTEGER);
CREATE TABLE orders (id INTEGER, buyer_ref INTEGER);
INSERT INTO customers (id) VALUES (1), (2);
INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2);
`);
// Stubbed text generation returning one PK candidate and one FK candidate.
const generateText = vi.fn(async () => ({
output: {
pkCandidates: [{ table: 'customers', column: 'id', confidence: 0.91, rationale: 'Unique customer key.' }],
fkCandidates: [
{
fromTable: 'orders',
fromColumn: 'buyer_ref',
toTable: 'customers',
toColumn: 'id',
confidence: 0.89,
rationale: 'Buyer reference values align with customer identifiers.',
},
],
},
}));
const result = await discoverKloRelationships({
connectionId: 'warehouse',
driver: 'sqlite',
connector: connector(executor),
schema: snapshotToKloEnrichedSchema(llmOnlyRelationshipSnapshot()),
context: { runId: 'llm-relationship-orchestrator' },
settings: relationshipSettings(),
llmProvider: llmProvider(),
generateText,
});
expect(result.llmRelationshipValidation).toBe('completed');
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
expect(result.resolvedRelationships[0]).toMatchObject({
source: 'llm_proposal',
status: 'accepted',
evidence: {
llmRationale: 'Buyer reference values align with customer identifiers.',
},
});
expect(result.relationshipUpdate.accepted[0]).toMatchObject({
from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
to: { table: { name: 'customers' }, columns: ['id'] },
});
});
// Scenario: with the accept threshold raised to 0.99, the otherwise-accepted
// orders.account_id -> accounts.id candidate must be downgraded to review.
it('uses configured acceptance thresholds when resolving graph relationships', async () => {
  // Assign the suite-level executor (instead of a shadowing local) so that
  // afterEach closes the database even when an assertion below fails.
  executor = new InMemorySqliteExecutor();
  executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
`);
  const settings = {
    ...relationshipSettings(),
    acceptThreshold: 0.99,
    reviewThreshold: 0.55,
  };
  const result = await discoverKloRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: connector(executor),
    schema: snapshotToKloEnrichedSchema(snapshot()),
    context: { runId: 'configured-thresholds' },
    settings,
  });
  expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  expect(result.relationshipUpdate.accepted).toEqual([]);
  expect(result.resolvedRelationships[0]).toMatchObject({
    status: 'review',
    graph: { reasons: expect.arrayContaining(['fk_score_review']) },
  });
});
// Scenario: two tables expose a plausible `id` target for orders.account_id; with
// maxCandidatesPerColumn = 1 only accounts.id may survive candidate generation.
it('passes maxCandidatesPerColumn into broad deterministic candidate generation', async () => {
  // Assign the suite-level executor (instead of a shadowing local) so that
  // afterEach closes the database even when an assertion below fails.
  executor = new InMemorySqliteExecutor();
  executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE account_archive (id INTEGER NOT NULL, name TEXT NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
INSERT INTO account_archive VALUES (99, 'Archive');
INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
`);
  // snapshot() returns a fresh object, so inserting the extra table here does
  // not leak into other tests.
  const richSnapshot = snapshot();
  richSnapshot.tables.splice(1, 0, {
    catalog: null,
    db: null,
    name: 'account_archive',
    kind: 'table',
    comment: null,
    estimatedRows: 1,
    foreignKeys: [],
    columns: [
      {
        name: 'id',
        nativeType: 'INTEGER',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey: false,
        comment: null,
      },
      {
        name: 'name',
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
        nullable: false,
        primaryKey: false,
        comment: null,
      },
    ],
  });
  const result = await discoverKloRelationships({
    connectionId: 'warehouse',
    driver: 'sqlite',
    connector: {
      ...connector(executor),
      introspect: async () => richSnapshot,
    },
    schema: snapshotToKloEnrichedSchema(richSnapshot),
    context: { runId: 'candidate-cap' },
    settings: {
      ...relationshipSettings(),
      maxCandidatesPerColumn: 1,
    },
  });
  const sourceTargets = result.resolvedRelationships
    .filter((relationship) => relationship.from.columns[0] === 'account_id')
    .map((relationship) => `${relationship.to.table.name}.${relationship.to.columns[0]}`);
  expect(sourceTargets).toHaveLength(1);
  expect(sourceTargets).toEqual(['accounts.id']);
});
// Scenario: a benchmark fixture with composite keys and all declared PK/FK metadata
// masked out; the two-column relationship must still be discovered and accepted.
it('accepts SQL-validated composite relationships in production relationship-discovery detection', async () => {
  // fileURLToPath decodes percent-escapes and produces a valid path on Windows,
  // which URL#pathname does not.
  const fixtureRoot = fileURLToPath(
    new URL('../../test/fixtures/relationship-benchmarks/composite_keys_no_declared_constraints', import.meta.url),
  );
  const fixture = await loadKloRelationshipBenchmarkFixture(fixtureRoot);
  const maskedSnapshot = maskKloRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
  const database = new Database(fixture.dataPath ?? '', { readonly: true, fileMustExist: true });
  try {
    const testConnector: KloScanConnector = {
      id: 'sqlite:composite',
      driver: 'sqlite',
      capabilities: createKloConnectorCapabilities({
        readOnlySql: true,
        columnStats: true,
        tableSampling: false,
        columnSampling: false,
      }),
      introspect: async () => maskedSnapshot,
      executeReadOnly: async (input) => {
        const rows = database.prepare(input.sql).all() as Record<string, unknown>[];
        const headers = Object.keys(rows[0] ?? {});
        return {
          headers,
          rows: rows.map((row) => headers.map((header) => row[header])),
          totalRows: rows.length,
          rowCount: rows.length,
        };
      },
    };
    const result = await discoverKloRelationships({
      connectionId: maskedSnapshot.connectionId,
      driver: maskedSnapshot.driver,
      connector: testConnector,
      schema: snapshotToKloEnrichedSchema(maskedSnapshot, new Map()),
      context: { runId: 'test:production-composite' },
      settings: relationshipSettings(),
    });
    expect(
      result.relationshipUpdate.accepted.map(
        (relationship) =>
          `${relationship.from.table.name}.(${relationship.from.columns.join(',')})->${relationship.to.table.name}.(${relationship.to.columns.join(',')})`,
      ),
    ).toContain('order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)');
    expect(result.relationships.accepted).toBeGreaterThanOrEqual(1);
    expect(result.compositeRelationships.map((relationship) => relationship.status)).toContain('accepted');
  } finally {
    // Release the fixture database even when discovery or an assertion throws.
    database.close();
  }
});
});

View file

@ -0,0 +1,338 @@
import type { KloLlmProvider } from '@klo/llm';
import type { KloScanRelationshipConfig } from '../project/config.js';
import type { KloEnrichedRelationship, KloEnrichedSchema, KloRelationshipUpdate } from './enrichment-types.js';
import {
generateKloRelationshipDiscoveryCandidates,
type KloRelationshipDiscoveryCandidate,
mergeKloRelationshipDiscoveryCandidates,
} from './relationship-candidates.js';
import {
discoverKloCompositeRelationships,
type KloCompositeRelationshipCandidate,
} from './relationship-composite-candidates.js';
import { collectKloFormalMetadataRelationships } from './relationship-formal-metadata.js';
import {
type KloResolvedRelationshipDiscoveryCandidate,
resolveKloRelationshipGraph,
} from './relationship-graph-resolver.js';
import {
type KloRelationshipLlmProposalGenerateText,
proposeKloRelationshipCandidatesWithLlm,
} from './relationship-llm-proposal.js';
import {
createKloRelationshipProfileCache,
type KloRelationshipProfileArtifact,
type KloRelationshipReadOnlyExecutor,
profileKloRelationshipSchema,
} from './relationship-profiling.js';
import { validateKloRelationshipDiscoveryCandidates } from './relationship-validation.js';
import type {
KloConnectionDriver,
KloScanConnector,
KloScanContext,
KloScanEnrichmentSummary,
KloScanRelationshipSummary,
KloScanWarning,
} from './types.js';
/**
 * Input for relationship discovery: the connection, its connector and
 * enriched schema, the scan context and relationship settings, plus optional
 * LLM hooks.
 */
export interface DiscoverKloRelationshipsInput {
connectionId: string;
driver: KloConnectionDriver;
connector: KloScanConnector;
schema: KloEnrichedSchema;
context: KloScanContext;
settings: KloScanRelationshipConfig;
/** Optional LLM provider; may be omitted or null. */
llmProvider?: KloLlmProvider | null;
/** Optional text-generation function for LLM relationship proposals. */
generateText?: KloRelationshipLlmProposalGenerateText;
}
/**
 * Result of relationship discovery: the relationship update, summary counts,
 * the profile, the resolved and composite candidates, validation states and
 * any warnings.
 */
export interface DiscoverKloRelationshipsResult {
relationshipUpdate: KloRelationshipUpdate;
relationships: KloScanRelationshipSummary;
profile: KloRelationshipProfileArtifact;
resolvedRelationships: KloResolvedRelationshipDiscoveryCandidate[];
compositeRelationships: KloCompositeRelationshipCandidate[];
statisticalValidation: KloScanEnrichmentSummary['statisticalValidation'];
llmRelationshipValidation: KloScanEnrichmentSummary['llmRelationshipValidation'];
warnings: KloScanWarning[];
}
/**
 * Converts a graph-resolved candidate into an inferred enriched relationship.
 * The FK score becomes the confidence, and the target is flagged as a
 * primary-key reference when the PK score is at least 0.78.
 */
function relationshipFromResolved(candidate: KloResolvedRelationshipDiscoveryCandidate): KloEnrichedRelationship {
  const { id, from, to, relationshipType, fkScore, pkScore } = candidate;
  // NOTE(review): the 0.78 cut-off is hard-coded here; its rationale is not visible in this file.
  const isPrimaryKeyReference = pkScore >= 0.78;
  return { id, source: 'inferred', from, to, relationshipType, confidence: fkScore, isPrimaryKeyReference };
}
/**
 * Converts a composite relationship candidate into an inferred enriched
 * relationship. Endpoints are copied field by field, and the relationship is
 * flagged as a primary-key reference only when the candidate was accepted.
 */
function relationshipFromComposite(candidate: KloCompositeRelationshipCandidate): KloEnrichedRelationship {
  const copyEndpoint = (side: KloCompositeRelationshipCandidate['from']) => {
    const { tableId, columnIds, table, columns } = side;
    return { tableId, columnIds, table, columns };
  };
  return {
    id: candidate.id,
    source: 'inferred',
    from: copyEndpoint(candidate.from),
    to: copyEndpoint(candidate.to),
    relationshipType: candidate.relationshipType,
    confidence: candidate.confidence,
    isPrimaryKeyReference: candidate.status === 'accepted',
  };
}
/**
 * Formats a relationship identifier as
 * `<fromTableId>:(<fromColumnIds>)-><toTableId>:(<toColumnIds>)`, with column
 * ids joined by commas.
 */
function relationshipId(input: Pick<KloEnrichedRelationship, 'from' | 'to'>): string {
  const describe = (side: { tableId: string; columnIds: readonly string[] }): string =>
    `${side.tableId}:(${side.columnIds.join(',')})`;
  return `${describe(input.from)}->${describe(input.to)}`;
}
/**
 * Returns enriched relationships for every accepted resolved candidate whose
 * id is not in the formal-metadata id set, preserving candidate order.
 */
function nonFormalAcceptedRelationships(input: {
  formalIds: ReadonlySet<string>;
  resolvedRelationships: readonly KloResolvedRelationshipDiscoveryCandidate[];
}): KloEnrichedRelationship[] {
  const { formalIds, resolvedRelationships } = input;
  const relationships: KloEnrichedRelationship[] = [];
  for (const candidate of resolvedRelationships) {
    if (candidate.status !== 'accepted' || formalIds.has(candidate.id)) {
      continue;
    }
    relationships.push(relationshipFromResolved(candidate));
  }
  return relationships;
}
/**
 * Counts resolved candidates per status (`accepted`, `review`, `rejected`).
 * `skipped` is always reported as 0 here.
 */
function relationshipSummary(
  resolvedRelationships: readonly KloResolvedRelationshipDiscoveryCandidate[],
): KloScanRelationshipSummary {
  const counts = { accepted: 0, review: 0, rejected: 0 };
  for (const { status } of resolvedRelationships) {
    if (status === 'accepted' || status === 'review' || status === 'rejected') {
      counts[status] += 1;
    }
  }
  return { ...counts, skipped: 0 };
}
/**
 * Counts composite candidates per status (`accepted`, `review`, `rejected`).
 * `skipped` is always reported as 0 here.
 */
function compositeSummary(relationships: readonly KloCompositeRelationshipCandidate[]): KloScanRelationshipSummary {
  const countWithStatus = (status: KloCompositeRelationshipCandidate['status']): number =>
    relationships.reduce((count, candidate) => (candidate.status === status ? count + 1 : count), 0);
  return {
    accepted: countWithStatus('accepted'),
    review: countWithStatus('review'),
    rejected: countWithStatus('rejected'),
    skipped: 0,
  };
}
/**
 * Runs composite relationship detection on a best-effort basis.
 *
 * Returns an empty list without doing any work when there is no executor or
 * the profile reports SQL as unavailable. Detector warnings and a thrown
 * failure are both appended to `input.warnings` as recoverable
 * `relationship_validation_failed` entries; a failure yields an empty list
 * instead of propagating.
 *
 * @param input Detection inputs; `input.warnings` is mutated in place.
 * @returns The detected composite candidates, or `[]` when skipped or failed.
 */
async function detectCompositeRelationships(input: {
  connectionId: string;
  driver: DiscoverKloRelationshipsInput['driver'];
  schema: KloEnrichedSchema;
  profile: KloRelationshipProfileArtifact;
  executor: KloRelationshipReadOnlyExecutor | null;
  context: DiscoverKloRelationshipsInput['context'];
  warnings: KloScanWarning[];
}): Promise<KloCompositeRelationshipCandidate[]> {
  const { executor, profile, warnings } = input;
  if (!executor) {
    return [];
  }
  if (!profile.sqlAvailable) {
    return [];
  }

  // Every warning from this stage shares the same code, flag and metadata shape.
  const recoverableWarning = (message: KloScanWarning['message']): KloScanWarning => ({
    code: 'relationship_validation_failed',
    message,
    recoverable: true,
    metadata: { source: 'composite_relationship_detection' },
  });

  try {
    const detection = await discoverKloCompositeRelationships({
      connectionId: input.connectionId,
      driver: input.driver,
      schema: input.schema,
      profiles: profile,
      executor,
      ctx: input.context,
    });
    for (const detectorMessage of detection.warnings) {
      warnings.push(recoverableWarning(detectorMessage));
    }
    return detection.relationships;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    warnings.push(recoverableWarning(`KLO composite relationship detection failed: ${reason}`));
    return [];
  }
}
/**
 * Merges formal-metadata counts with the resolved-candidate summary.
 *
 * Formal relationships add to `accepted` and supply `skipped`; `review` and
 * `rejected` come from the resolved candidates alone.
 */
function combinedRelationshipSummary(input: {
  formalAccepted: number;
  formalSkipped: number;
  resolvedRelationships: readonly KloResolvedRelationshipDiscoveryCandidate[];
}): KloScanRelationshipSummary {
  const { accepted, review, rejected } = relationshipSummary(input.resolvedRelationships);
  return {
    accepted: input.formalAccepted + accepted,
    review,
    rejected,
    skipped: input.formalSkipped,
  };
}
/**
 * Derives the read-only SQL executor for relationship validation from the
 * scan connector.
 *
 * Yields `executor: null` plus one recoverable warning when the connector
 * either does not advertise `readOnlySql` or advertises it without exposing
 * `executeReadOnly`. Otherwise the executor wraps `executeReadOnly` bound to
 * the connector, with no warnings.
 */
function sqlExecutor(input: DiscoverKloRelationshipsInput): {
  executor: KloRelationshipReadOnlyExecutor | null;
  warnings: KloScanWarning[];
} {
  const { connector } = input;

  // Both failure modes differ only in warning code and message.
  const withoutExecutor = (code: KloScanWarning['code'], message: string): ReturnType<typeof sqlExecutor> => ({
    executor: null,
    warnings: [
      {
        code,
        message,
        recoverable: true,
        metadata: { capability: 'readOnlySql' },
      },
    ],
  });

  if (!connector.capabilities.readOnlySql) {
    return withoutExecutor(
      'connector_capability_missing',
      'KLO scan connector cannot run read-only SQL relationship validation',
    );
  }
  if (!connector.executeReadOnly) {
    return withoutExecutor(
      'relationship_validation_failed',
      'KLO scan connector advertises readOnlySql but does not expose executeReadOnly',
    );
  }
  return {
    executor: {
      // Bind so the connector keeps its own `this` when called through the executor.
      executeReadOnly: connector.executeReadOnly.bind(connector),
    },
    warnings: [],
  };
}
/**
 * Runs the relationship discovery pipeline for one connection.
 *
 * Stages, in order: resolve the read-only SQL executor, collect formal
 * metadata relationships, profile the schema, generate deterministic (and
 * optionally LLM-proposed) candidates, validate them, resolve the relationship
 * graph, and detect composite relationships. The results are then assembled
 * into the accepted/rejected update and the summary counts.
 *
 * @param input Connection, schema, connector, settings and optional LLM hooks.
 * @returns The relationship update, summary counts, intermediate artifacts and
 *   all warnings gathered along the way.
 */
export async function discoverKloRelationships(
input: DiscoverKloRelationshipsInput,
): Promise<DiscoverKloRelationshipsResult> {
// `warnings` starts with any connector-capability warning and is appended to by later stages.
const { executor, warnings } = sqlExecutor(input);
const formalMetadata = collectKloFormalMetadataRelationships(input.schema);
const profileCache = createKloRelationshipProfileCache();
const profile = await profileKloRelationshipSchema({
connectionId: input.connectionId,
driver: input.driver,
schema: input.schema,
executor,
ctx: input.context,
profileSampleRows: input.settings.profileSampleRows,
cache: profileCache,
});
const deterministicCandidates: KloRelationshipDiscoveryCandidate[] = generateKloRelationshipDiscoveryCandidates(
input.schema,
{
maxCandidatesPerColumn: input.settings.maxCandidatesPerColumn,
profiles: profile,
},
);
// LLM proposals are requested only when enabled in settings; otherwise an empty 'skipped' result stands in.
const llmProposalResult = input.settings.llmProposals
? await proposeKloRelationshipCandidatesWithLlm({
connectionId: input.connectionId,
schema: input.schema,
profile,
llmProvider: input.llmProvider ?? null,
settings: {
maxTablesPerBatch: input.settings.maxLlmTablesPerBatch,
},
generateText: input.generateText,
})
: { candidates: [], warnings: [], llmCalls: 0, summary: 'skipped' as const };
// Merge both candidate sources, then drop any candidate whose id formal metadata already accepted.
const candidates = mergeKloRelationshipDiscoveryCandidates([
...deterministicCandidates,
...llmProposalResult.candidates,
]).filter((candidate) => !formalMetadata.acceptedIds.has(candidate.id));
warnings.push(...llmProposalResult.warnings);
const validated = await validateKloRelationshipDiscoveryCandidates({
connectionId: input.connectionId,
driver: input.driver,
candidates,
profiles: profile,
executor,
ctx: input.context,
tableCount: input.schema.tables.length,
settings: {
acceptThreshold: input.settings.acceptThreshold,
reviewThreshold: input.settings.reviewThreshold,
// NOTE(review): the distinct-source-value cap reuses profileSampleRows — confirm this is intentional.
maxDistinctSourceValues: input.settings.profileSampleRows,
concurrency: input.settings.validationConcurrency,
validationBudget: input.settings.validationBudget,
},
});
const graph = resolveKloRelationshipGraph({
schema: input.schema,
profiles: profile,
candidates: validated,
settings: {
acceptThreshold: input.settings.acceptThreshold,
reviewThreshold: input.settings.reviewThreshold,
validationRequiredForManifest: input.settings.validationRequiredForManifest,
},
});
// Composite detection is best-effort: failures are appended to `warnings` rather than thrown.
const compositeRelationships = await detectCompositeRelationships({
connectionId: input.connectionId,
driver: input.driver,
schema: input.schema,
profile,
executor,
context: input.context,
warnings,
});
const inferredAccepted = nonFormalAcceptedRelationships({
formalIds: formalMetadata.acceptedIds,
resolvedRelationships: graph.relationships,
});
const compositeAccepted = compositeRelationships
.filter((candidate) => candidate.status === 'accepted')
.map(relationshipFromComposite);
// De-duplicate by id: on a clash the later entry (inferred, then composite) replaces the earlier one.
const relationshipsForAcceptance = formalMetadata.accepted.concat(inferredAccepted, compositeAccepted);
const acceptedById = new Map(relationshipsForAcceptance.map((relationship) => [relationship.id, relationship]));
// Sort on the derived table/column key so the output order is deterministic.
const accepted = Array.from(acceptedById.values()).sort((left, right) =>
relationshipId(left).localeCompare(relationshipId(right)),
);
const rejected = graph.relationships
.filter((candidate) => candidate.status === 'rejected')
.map(relationshipFromResolved);
const combined = combinedRelationshipSummary({
formalAccepted: formalMetadata.accepted.length,
formalSkipped: formalMetadata.skipped.length,
resolvedRelationships: graph.relationships,
});
const compositeCounts = compositeSummary(compositeRelationships);
return {
relationshipUpdate: {
connectionId: input.connectionId,
accepted,
rejected,
skipped: formalMetadata.skipped,
},
// Counts come from the per-stage summaries, not from the de-duplicated `accepted` list above.
relationships: {
accepted: combined.accepted + compositeCounts.accepted,
review: combined.review + compositeCounts.review,
rejected: combined.rejected + compositeCounts.rejected,
skipped: combined.skipped,
},
profile,
resolvedRelationships: graph.relationships,
compositeRelationships,
statisticalValidation: profile.sqlAvailable ? 'completed' : 'skipped',
llmRelationshipValidation: llmProposalResult.summary,
warnings,
};
}

View file

@ -0,0 +1,211 @@
import type { KloLocalProject } from '../project/index.js';
import { describe, expect, it, vi } from 'vitest';
import {
buildKloRelationshipFeedbackCalibrationReport,
calibrateLocalRelationshipFeedbackLabels,
formatKloRelationshipFeedbackCalibrationMarkdown,
} from './relationship-feedback-calibration.js';
import type {
ExportLocalRelationshipFeedbackLabelsResult,
KloRelationshipFeedbackLabel,
} from './relationship-feedback-export.js';
/**
 * Test fixture: builds a complete feedback label from the three required
 * fields plus optional overrides.
 *
 * Defaults describe an `orders.customer_id -> customers.id` review decision;
 * `confidence` and `fkScore` fall back to the given score. Anything passed in
 * `input` wins over the defaults.
 */
function label(
  input: Partial<KloRelationshipFeedbackLabel> &
    Pick<KloRelationshipFeedbackLabel, 'candidateId' | 'decision' | 'score'>,
): KloRelationshipFeedbackLabel {
  const defaults: Omit<KloRelationshipFeedbackLabel, 'candidateId' | 'decision' | 'score'> = {
    schemaVersion: 1,
    previousStatus: 'review',
    connectionId: 'warehouse',
    runId: 'scan-run-a',
    syncId: 'sync-a',
    decidedAt: '2026-05-07T12:00:00.000Z',
    reviewer: 'Andrey',
    note: null,
    relationshipType: 'many_to_one',
    source: 'deterministic_name',
    confidence: input.score ?? 0,
    pkScore: input.pkScore ?? null,
    fkScore: input.fkScore ?? input.score,
    fromTable: 'public.orders',
    fromColumns: ['customer_id'],
    toTable: 'public.customers',
    toColumns: ['id'],
    reasons: [],
    artifactPath: 'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json',
  };
  return { ...defaults, ...input };
}
/**
 * Test fixture: wraps labels in an exporter result with a fixed timestamp,
 * no filters, no warnings, and a summary computed from the labels.
 */
function feedback(labels: KloRelationshipFeedbackLabel[]): ExportLocalRelationshipFeedbackLabelsResult {
  const connectionIds = new Set<string>();
  const runKeys = new Set<string>();
  let accepted = 0;
  let rejected = 0;
  for (const item of labels) {
    connectionIds.add(item.connectionId);
    // Runs are only unique within a connection, so key on both.
    runKeys.add(`${item.connectionId}:${item.runId}`);
    if (item.decision === 'accepted') {
      accepted += 1;
    } else if (item.decision === 'rejected') {
      rejected += 1;
    }
  }
  return {
    generatedAt: '2026-05-07T13:00:00.000Z',
    filters: { connectionId: null, decision: 'all' },
    summary: {
      total: labels.length,
      accepted,
      rejected,
      connections: connectionIds.size,
      runs: runKeys.size,
    },
    labels,
    warnings: [],
  };
}
// Exercises report building, unscored-label handling, markdown formatting and
// the exporter-wrapping entry point of the calibration module.
describe('relationship feedback calibration', () => {
// Four labels against thresholds 0.85 / 0.55: two land in the accepted band
// (one of them a human rejection), one in review and one in rejected.
it('builds score buckets and threshold-band summary from feedback labels', () => {
const report = buildKloRelationshipFeedbackCalibrationReport(
feedback([
label({
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'accepted',
score: 0.91,
pkScore: 0.97,
fkScore: 0.91,
}),
label({
candidateId: 'orders:orders.account_id->accounts:accounts.id',
decision: 'accepted',
score: 0.61,
pkScore: 0.88,
fkScore: 0.61,
}),
label({
candidateId: 'orders:orders.note_id->notes:notes.id',
decision: 'rejected',
score: 0.21,
pkScore: 0.4,
fkScore: 0.21,
}),
label({
candidateId: 'orders:orders.region_id->regions:regions.id',
decision: 'rejected',
score: 0.88,
pkScore: 0.9,
fkScore: 0.88,
}),
]),
{
acceptThreshold: 0.85,
reviewThreshold: 0.55,
},
);
expect(report.thresholds).toEqual({ accept: 0.85, review: 0.55 });
expect(report.summary).toEqual({
total: 4,
scored: 4,
unscored: 0,
acceptedLabels: 2,
rejectedLabels: 2,
predictedAccepted: 2,
predictedReview: 1,
predictedRejected: 1,
acceptedBandPrecision: 0.5,
rejectedBandPrecision: 1,
reviewBandAcceptedRate: 1,
meanAcceptedScore: 0.76,
meanRejectedScore: 0.545,
});
expect(report.buckets.map((bucket) => [bucket.label, bucket.total, bucket.accepted, bucket.rejected, bucket.acceptanceRate])).toEqual([
['0.00-0.24', 1, 0, 1, 0],
['0.25-0.49', 0, 0, 0, null],
['0.50-0.74', 1, 1, 0, 1],
['0.75-1.00', 2, 1, 1, 0.5],
]);
// Labels come back sorted by connection, run, then candidate id.
expect(report.labels.map((item) => [item.candidateId, item.predictedStatus, item.bucket])).toEqual([
['orders:orders.account_id->accounts:accounts.id', 'review', '0.50-0.74'],
['orders:orders.customer_id->customers:customers.id', 'accepted', '0.75-1.00'],
['orders:orders.note_id->notes:notes.id', 'rejected', '0.00-0.24'],
['orders:orders.region_id->regions:regions.id', 'accepted', '0.75-1.00'],
]);
});
// A null score must not be replaced by `confidence`: the label stays unscored
// and every band metric is null.
it('keeps unscored labels visible without treating them as threshold predictions', () => {
const report = buildKloRelationshipFeedbackCalibrationReport(
feedback([
label({
candidateId: 'orders:orders.note_id->notes:notes.id',
decision: 'rejected',
score: null,
confidence: 0.2,
fkScore: null,
}),
]),
{
acceptThreshold: 0.85,
reviewThreshold: 0.55,
},
);
expect(report.summary).toMatchObject({
total: 1,
scored: 0,
unscored: 1,
predictedAccepted: 0,
predictedReview: 0,
predictedRejected: 0,
acceptedBandPrecision: null,
rejectedBandPrecision: null,
reviewBandAcceptedRate: null,
meanAcceptedScore: null,
meanRejectedScore: null,
});
expect(report.labels[0]).toMatchObject({
candidateId: 'orders:orders.note_id->notes:notes.id',
predictedStatus: 'unscored',
bucket: 'unscored',
});
});
// Spot-checks the heading, a count line, a three-decimal metric and one bucket line.
it('formats a stable markdown summary for human CLI output', () => {
const report = buildKloRelationshipFeedbackCalibrationReport(
feedback([
label({ candidateId: 'orders:orders.customer_id->customers:customers.id', decision: 'accepted', score: 0.91 }),
label({ candidateId: 'orders:orders.note_id->notes:notes.id', decision: 'rejected', score: 0.21 }),
]),
{
acceptThreshold: 0.85,
reviewThreshold: 0.55,
},
);
expect(formatKloRelationshipFeedbackCalibrationMarkdown(report)).toContain(
'KLO relationship feedback calibration',
);
expect(formatKloRelationshipFeedbackCalibrationMarkdown(report)).toContain('Total labels: 2');
expect(formatKloRelationshipFeedbackCalibrationMarkdown(report)).toContain('Accepted-band precision: 1.000');
expect(formatKloRelationshipFeedbackCalibrationMarkdown(report)).toContain(
'0.75-1.00: total=1 accepted=1 rejected=0 acceptanceRate=1.000',
);
});
// The exporter is injected as a mock: only the filter fields are forwarded to
// it, and its warnings must surface unchanged on the report.
it('wraps the feedback exporter and preserves exporter warnings', async () => {
const project = { projectDir: '/tmp/klo-project' } as KloLocalProject;
const exportLocalRelationshipFeedbackLabels = vi.fn(async () => ({
...feedback([
label({ candidateId: 'orders:orders.customer_id->customers:customers.id', decision: 'accepted', score: 0.91 }),
]),
warnings: [{ path: 'raw-sources/broken/live-database/sync/enrichment/relationship-review-decisions.json', message: 'Unexpected token' }],
}));
const report = await calibrateLocalRelationshipFeedbackLabels(project, {
connectionId: 'warehouse',
decision: 'all',
acceptThreshold: 0.9,
reviewThreshold: 0.5,
exportLocalRelationshipFeedbackLabels,
});
expect(exportLocalRelationshipFeedbackLabels).toHaveBeenCalledWith(project, {
connectionId: 'warehouse',
decision: 'all',
});
expect(report.thresholds).toEqual({ accept: 0.9, review: 0.5 });
expect(report.warnings).toEqual([
{ path: 'raw-sources/broken/live-database/sync/enrichment/relationship-review-decisions.json', message: 'Unexpected token' },
]);
});
});

View file

@ -0,0 +1,300 @@
import type { KloLocalProject } from '../project/index.js';
import {
exportLocalRelationshipFeedbackLabels,
type ExportLocalRelationshipFeedbackLabelsInput,
type ExportLocalRelationshipFeedbackLabelsResult,
type KloRelationshipFeedbackExportWarning,
type KloRelationshipFeedbackLabel,
} from './relationship-feedback-export.js';
import type { KloResolvedRelationshipStatus } from './relationship-graph-resolver.js';
import type { KloRelationshipReviewDecisionValue } from './relationship-review-decisions.js';
// Score at or above which a label is predicted 'accepted' when the caller gives no threshold.
const DEFAULT_ACCEPT_THRESHOLD = 0.85;
// Score at or above which a label is predicted 'review' (below it: 'rejected') when the caller gives no threshold.
const DEFAULT_REVIEW_THRESHOLD = 0.55;
// Resolver statuses plus 'unscored' for labels that carry no score.
type CalibrationPredictedStatus = KloResolvedRelationshipStatus | 'unscored';
/** Effective score thresholds used to predict a status for each label. */
interface Thresholds {
/** Minimum score predicted as 'accepted'. */
accept: number;
/** Minimum score predicted as 'review'; lower scores are predicted 'rejected'. */
review: number;
}
/** Optional threshold overrides for building a calibration report. */
export interface BuildKloRelationshipFeedbackCalibrationReportInput {
/** Overrides the default accept threshold (0.85). */
acceptThreshold?: number;
/** Overrides the default review threshold (0.55). */
reviewThreshold?: number;
}
/**
 * Input for `calibrateLocalRelationshipFeedbackLabels`: exporter filters plus
 * threshold overrides, with an optional exporter replacement.
 */
export interface CalibrateLocalRelationshipFeedbackLabelsInput
extends ExportLocalRelationshipFeedbackLabelsInput,
BuildKloRelationshipFeedbackCalibrationReportInput {
/** Replacement exporter (the tests inject a mock); defaults to the real exporter. */
exportLocalRelationshipFeedbackLabels?: typeof exportLocalRelationshipFeedbackLabels;
}
/** Decision counts for the labels whose score falls in one score bucket. */
export interface KloRelationshipFeedbackCalibrationBucket {
/** Display label such as '0.50-0.74'. */
label: string;
/** Lower bound of the bucket. */
minInclusive: number;
/** Upper bound as reported to consumers (two decimals, e.g. 0.74). */
maxInclusive: number;
/** Number of labels in the bucket. */
total: number;
/** Labels in the bucket whose decision is 'accepted'. */
accepted: number;
/** Labels in the bucket whose decision is 'rejected'. */
rejected: number;
/** accepted / total, rounded to three decimals; null when the bucket is empty. */
acceptanceRate: number | null;
}
/**
 * One feedback label annotated with the status the thresholds predict for it
 * and the score bucket it falls into. The remaining fields are copied from the
 * exported feedback label.
 */
export interface KloRelationshipFeedbackCalibrationLabel {
candidateId: string;
/** The decision recorded for this candidate. */
decision: KloRelationshipReviewDecisionValue;
previousStatus: KloRelationshipFeedbackLabel['previousStatus'];
/** Status implied by the score and thresholds; 'unscored' when score is null. */
predictedStatus: CalibrationPredictedStatus;
/** Score bucket label, or 'unscored'. */
bucket: string;
score: number | null;
pkScore: number | null;
fkScore: number | null;
connectionId: string;
runId: string;
fromTable: string;
fromColumns: string[];
toTable: string;
toColumns: string[];
source: string;
reasons: string[];
}
/** Calibration report comparing threshold predictions with recorded decisions. */
export interface KloRelationshipFeedbackCalibrationReport {
/** Timestamp taken from the exporter result. */
generatedAt: string;
/** Filters the exporter applied. */
filters: ExportLocalRelationshipFeedbackLabelsResult['filters'];
/** Thresholds used for the predictions in this report. */
thresholds: Thresholds;
summary: {
total: number;
/** Labels with a non-null score. */
scored: number;
unscored: number;
/** Labels whose decision is 'accepted', scored or not. */
acceptedLabels: number;
/** Labels whose decision is 'rejected', scored or not. */
rejectedLabels: number;
predictedAccepted: number;
predictedReview: number;
predictedRejected: number;
/** Share of predicted-accepted labels that were accepted; null when that band is empty. */
acceptedBandPrecision: number | null;
/** Share of predicted-rejected labels that were rejected; null when that band is empty. */
rejectedBandPrecision: number | null;
/** Share of predicted-review labels that were accepted; null when that band is empty. */
reviewBandAcceptedRate: number | null;
/** Mean score of accepted labels that have a score; null when there are none. */
meanAcceptedScore: number | null;
/** Mean score of rejected labels that have a score; null when there are none. */
meanRejectedScore: number | null;
};
/** One entry per configured score bucket, in bucket order. */
buckets: KloRelationshipFeedbackCalibrationBucket[];
/** Annotated labels sorted by connection, run, then candidate id. */
labels: KloRelationshipFeedbackCalibrationLabel[];
/** Warnings copied from the exporter result. */
warnings: KloRelationshipFeedbackExportWarning[];
}
// Score buckets in ascending order. Each upper bound except the last sits just
// below the next bucket's lower bound (e.g. 0.249999 before 0.25).
// NOTE(review): scores strictly between an upper bound and the next lower
// bound (e.g. 0.2499995) fall inside no [minInclusive, maxInclusive] range.
const BUCKETS = [
{ label: '0.00-0.24', minInclusive: 0, maxInclusive: 0.249999 },
{ label: '0.25-0.49', minInclusive: 0.25, maxInclusive: 0.499999 },
{ label: '0.50-0.74', minInclusive: 0.5, maxInclusive: 0.749999 },
{ label: '0.75-1.00', minInclusive: 0.75, maxInclusive: 1 },
] as const;
/** Resolves the effective thresholds, using the module defaults for any that are missing. */
function thresholds(input: BuildKloRelationshipFeedbackCalibrationReportInput): Thresholds {
  const accept = input.acceptThreshold ?? DEFAULT_ACCEPT_THRESHOLD;
  const review = input.reviewThreshold ?? DEFAULT_REVIEW_THRESHOLD;
  return { accept, review };
}
/** Rounds a metric to three decimal places. */
function roundMetric(value: number): number {
  const scale = 1000;
  const scaled = Math.round(value * scale);
  return scaled / scale;
}
/** Returns numerator / denominator rounded to three decimals, or null for a zero denominator. */
function ratio(numerator: number, denominator: number): number | null {
  if (denominator === 0) {
    return null;
  }
  return roundMetric(numerator / denominator);
}
/** Returns the arithmetic mean rounded to three decimals, or null for an empty list. */
function mean(values: readonly number[]): number | null {
  const count = values.length;
  if (count === 0) {
    return null;
  }
  let total = 0;
  for (const value of values) {
    total += value;
  }
  return roundMetric(total / count);
}
/**
 * Maps a score to the label of the bucket that contains it.
 *
 * Buckets are treated as contiguous: a score belongs to the last bucket whose
 * lower bound it reaches. The earlier closed-interval check left gaps between
 * one bucket's upper bound (e.g. 0.249999) and the next lower bound (0.25), so
 * a score such as 0.2499995 was reported as 'unscored' even though it had a
 * score and a predicted status.
 *
 * @param score Candidate score, or null when the label has none.
 * @returns The bucket label, or 'unscored' for null, NaN, or a score outside
 *   the overall bucket range.
 */
function scoreBucket(score: number | null): string {
  if (score === null) {
    return 'unscored';
  }
  let matched: string = 'unscored';
  let upperBound = Number.NEGATIVE_INFINITY;
  // BUCKETS is declared in ascending order, so the last lower bound reached wins.
  for (const bucket of BUCKETS) {
    if (score >= bucket.minInclusive) {
      matched = bucket.label;
    }
    upperBound = Math.max(upperBound, bucket.maxInclusive);
  }
  // NaN fails every comparison, so it stays 'unscored' here as well.
  return score <= upperBound ? matched : 'unscored';
}
/**
 * Predicts the status a score would receive under the given thresholds:
 * 'accepted' at or above `accept`, 'review' at or above `review`, otherwise
 * 'rejected'. A null score is 'unscored'.
 */
function predictedStatus(score: number | null, currentThresholds: Thresholds): CalibrationPredictedStatus {
  if (score === null) {
    return 'unscored';
  }
  const { accept, review } = currentThresholds;
  return score >= accept ? 'accepted' : score >= review ? 'review' : 'rejected';
}
/**
 * Projects an exported feedback label onto the calibration shape, adding the
 * predicted status and score bucket. Array fields are shallow-copied so the
 * report does not share them with the exporter result.
 */
function calibrationLabel(
  label: KloRelationshipFeedbackLabel,
  currentThresholds: Thresholds,
): KloRelationshipFeedbackCalibrationLabel {
  const { score } = label;
  const predicted = predictedStatus(score, currentThresholds);
  const bucket = scoreBucket(score);
  const fromColumns = [...label.fromColumns];
  const toColumns = [...label.toColumns];
  const reasons = [...label.reasons];
  return {
    candidateId: label.candidateId,
    decision: label.decision,
    previousStatus: label.previousStatus,
    predictedStatus: predicted,
    bucket,
    score,
    pkScore: label.pkScore,
    fkScore: label.fkScore,
    connectionId: label.connectionId,
    runId: label.runId,
    fromTable: label.fromTable,
    fromColumns,
    toTable: label.toTable,
    toColumns,
    source: label.source,
    reasons,
  };
}
/**
 * Aggregates calibration labels into the report summary.
 *
 * Band metrics consider only labels that have a score. Decision counts cover
 * every label, and the mean scores use whichever accepted/rejected labels
 * have a score.
 */
function summarize(
  labels: readonly KloRelationshipFeedbackCalibrationLabel[],
): KloRelationshipFeedbackCalibrationReport['summary'] {
  // `size` is the number of scored labels predicted into a band; `hits` counts
  // accepted decisions for the accepted and review bands, rejected decisions
  // for the rejected band.
  const bands = {
    accepted: { size: 0, hits: 0 },
    review: { size: 0, hits: 0 },
    rejected: { size: 0, hits: 0 },
  };
  const acceptedScores: number[] = [];
  const rejectedScores: number[] = [];
  let acceptedCount = 0;
  let rejectedCount = 0;
  let scoredCount = 0;

  for (const item of labels) {
    const { decision, score, predictedStatus: band } = item;
    if (decision === 'accepted') {
      acceptedCount += 1;
      if (score !== null) {
        acceptedScores.push(score);
      }
    } else if (decision === 'rejected') {
      rejectedCount += 1;
      if (score !== null) {
        rejectedScores.push(score);
      }
    }
    if (score === null) {
      continue;
    }
    scoredCount += 1;
    if (band === 'accepted' || band === 'review' || band === 'rejected') {
      bands[band].size += 1;
      const agreeingDecision = band === 'rejected' ? 'rejected' : 'accepted';
      if (decision === agreeingDecision) {
        bands[band].hits += 1;
      }
    }
  }

  return {
    total: labels.length,
    scored: scoredCount,
    unscored: labels.length - scoredCount,
    acceptedLabels: acceptedCount,
    rejectedLabels: rejectedCount,
    predictedAccepted: bands.accepted.size,
    predictedReview: bands.review.size,
    predictedRejected: bands.rejected.size,
    acceptedBandPrecision: ratio(bands.accepted.hits, bands.accepted.size),
    rejectedBandPrecision: ratio(bands.rejected.hits, bands.rejected.size),
    reviewBandAcceptedRate: ratio(bands.review.hits, bands.review.size),
    meanAcceptedScore: mean(acceptedScores),
    meanRejectedScore: mean(rejectedScores),
  };
}
/**
 * Produces one report bucket per configured score bucket, counting the labels
 * assigned to it and how they were decided. The internal just-below-boundary
 * upper bounds are reported as their two-decimal display values.
 */
function buildBuckets(
  labels: readonly KloRelationshipFeedbackCalibrationLabel[],
): KloRelationshipFeedbackCalibrationBucket[] {
  return BUCKETS.map(({ label: bucketLabel, minInclusive, maxInclusive }) => {
    let total = 0;
    let accepted = 0;
    let rejected = 0;
    for (const item of labels) {
      if (item.bucket !== bucketLabel) {
        continue;
      }
      total += 1;
      if (item.decision === 'accepted') {
        accepted += 1;
      } else if (item.decision === 'rejected') {
        rejected += 1;
      }
    }

    // Translate the internal bound into the value shown in the bucket label.
    let reportedMax: number;
    switch (maxInclusive) {
      case 0.249999:
        reportedMax = 0.24;
        break;
      case 0.499999:
        reportedMax = 0.49;
        break;
      case 0.749999:
        reportedMax = 0.74;
        break;
      default:
        reportedMax = 1;
    }

    return {
      label: bucketLabel,
      minInclusive,
      maxInclusive: reportedMax,
      total,
      accepted,
      rejected,
      acceptanceRate: ratio(accepted, total),
    };
  });
}
/**
 * Builds a calibration report from exported feedback labels.
 *
 * Each label is annotated with its predicted status and score bucket, the
 * labels are ordered by connection, run and candidate id, and the summary and
 * bucket breakdown are computed from that ordered list.
 *
 * @param feedback Exporter result providing labels, filters, timestamp and warnings.
 * @param input Optional threshold overrides.
 * @returns The calibration report; `warnings` is a copy of the exporter's list.
 */
export function buildKloRelationshipFeedbackCalibrationReport(
  feedback: ExportLocalRelationshipFeedbackLabelsResult,
  input: BuildKloRelationshipFeedbackCalibrationReportInput = {},
): KloRelationshipFeedbackCalibrationReport {
  const currentThresholds = thresholds(input);

  const byConnectionRunCandidate = (
    left: KloRelationshipFeedbackCalibrationLabel,
    right: KloRelationshipFeedbackCalibrationLabel,
  ): number => {
    const byConnection = left.connectionId.localeCompare(right.connectionId);
    if (byConnection !== 0) {
      return byConnection;
    }
    const byRun = left.runId.localeCompare(right.runId);
    if (byRun !== 0) {
      return byRun;
    }
    return left.candidateId.localeCompare(right.candidateId);
  };

  // `map` returns a fresh array, so sorting in place leaves `feedback.labels` untouched.
  const labels = feedback.labels.map((label) => calibrationLabel(label, currentThresholds));
  labels.sort(byConnectionRunCandidate);

  const summary = summarize(labels);
  const buckets = buildBuckets(labels);
  return {
    generatedAt: feedback.generatedAt,
    filters: feedback.filters,
    thresholds: currentThresholds,
    summary,
    buckets,
    labels,
    warnings: [...feedback.warnings],
  };
}
/**
 * Exports the project's relationship feedback labels and builds a calibration
 * report from them.
 *
 * The exporter receives the `connectionId` and `decision` filters. The `now`
 * clock override, which the input type inherits from the exporter input, is
 * forwarded too when the caller supplies it; it used to be dropped silently.
 * It is only added to the exporter input when defined, so calls that omit it
 * pass exactly the same argument as before.
 *
 * @param project Local project whose decision artifacts are exported.
 * @param input Exporter filters, optional threshold overrides and an optional
 *   exporter replacement (used by tests).
 * @returns The calibration report, including any exporter warnings.
 */
export async function calibrateLocalRelationshipFeedbackLabels(
  project: KloLocalProject,
  input: CalibrateLocalRelationshipFeedbackLabelsInput = {},
): Promise<KloRelationshipFeedbackCalibrationReport> {
  const exporter = input.exportLocalRelationshipFeedbackLabels ?? exportLocalRelationshipFeedbackLabels;
  const exportInput: ExportLocalRelationshipFeedbackLabelsInput = {
    connectionId: input.connectionId,
    decision: input.decision,
  };
  if (input.now !== undefined) {
    exportInput.now = input.now;
  }
  const feedback = await exporter(project, exportInput);
  return buildKloRelationshipFeedbackCalibrationReport(feedback, input);
}
/** Formats a metric with three decimals, or 'n/a' when it is null. */
function formatMetric(value: number | null): string {
  if (value === null) {
    return 'n/a';
  }
  return value.toFixed(3);
}
/**
 * Renders a calibration report as line-oriented text for CLI output.
 *
 * The output lists the filters, thresholds and summary metrics, then one line
 * per score bucket, then a "Warnings" section when the report has warnings.
 * At most five warnings are printed. When more exist, a final line states how
 * many were left out; previously the extra warnings were dropped with no
 * indication. Output for reports with five or fewer warnings is unchanged.
 *
 * @param report The calibration report to render.
 * @returns The rendered text, terminated by a newline.
 */
export function formatKloRelationshipFeedbackCalibrationMarkdown(
  report: KloRelationshipFeedbackCalibrationReport,
): string {
  const maxWarningsShown = 5;
  const { summary } = report;
  const lines = [
    'KLO relationship feedback calibration',
    `Generated: ${report.generatedAt}`,
    `Filter connection: ${report.filters.connectionId ?? 'all'}`,
    `Filter decision: ${report.filters.decision}`,
    `Thresholds: accept=${report.thresholds.accept.toFixed(2)} review=${report.thresholds.review.toFixed(2)}`,
    `Total labels: ${summary.total}`,
    `Scored labels: ${summary.scored}`,
    `Unscored labels: ${summary.unscored}`,
    `Accepted labels: ${summary.acceptedLabels}`,
    `Rejected labels: ${summary.rejectedLabels}`,
    `Predicted accepted: ${summary.predictedAccepted}`,
    `Predicted review: ${summary.predictedReview}`,
    `Predicted rejected: ${summary.predictedRejected}`,
    `Accepted-band precision: ${formatMetric(summary.acceptedBandPrecision)}`,
    `Rejected-band precision: ${formatMetric(summary.rejectedBandPrecision)}`,
    `Review-band accepted rate: ${formatMetric(summary.reviewBandAcceptedRate)}`,
    `Mean accepted score: ${formatMetric(summary.meanAcceptedScore)}`,
    `Mean rejected score: ${formatMetric(summary.meanRejectedScore)}`,
    '',
    'Score buckets',
    ...report.buckets.map(
      (bucket) =>
        ` - ${bucket.label}: total=${bucket.total} accepted=${bucket.accepted} rejected=${bucket.rejected} acceptanceRate=${formatMetric(bucket.acceptanceRate)}`,
    ),
  ];
  if (report.warnings.length > 0) {
    lines.push('', 'Warnings');
    for (const warning of report.warnings.slice(0, maxWarningsShown)) {
      lines.push(` - ${warning.path}: ${warning.message}`);
    }
    // Say how many warnings were left out instead of truncating silently.
    const hidden = report.warnings.length - maxWarningsShown;
    if (hidden > 0) {
      lines.push(` - ... ${hidden} more warning${hidden === 1 ? '' : 's'} not shown`);
    }
  }
  return `${lines.join('\n')}\n`;
}

View file

@ -0,0 +1,270 @@
import type { KloLocalProject } from '../project/index.js';
import { describe, expect, it, vi } from 'vitest';
import {
exportLocalRelationshipFeedbackLabels,
formatKloRelationshipFeedbackLabelsJsonl,
} from './relationship-feedback-export.js';
import type { KloRelationshipReviewDecisionArtifact } from './relationship-review-decisions.js';
/**
 * Test fixture: builds a project whose file store is backed by an in-memory
 * map of path to content.
 *
 * String values are stored verbatim; any other value is stored as pretty-printed
 * JSON with a trailing newline. `listFiles` returns the sorted paths under the
 * given directory prefix, and `readFile` rejects for paths not in the map. The
 * remaining file-store methods are bare mocks.
 *
 * A file is missing only when the map has no entry for it. The earlier
 * truthiness check also rejected files whose content is the empty string.
 *
 * @param files Map of project-relative path to file content or JSON value.
 * @returns A project stub exposing `projectDir` and `fileStore` only.
 */
function projectWithFiles(files: Record<string, unknown>): KloLocalProject {
  const contentByPath = new Map(
    Object.entries(files).map(([path, value]) => [
      path,
      typeof value === 'string' ? value : `${JSON.stringify(value, null, 2)}\n`,
    ]),
  );
  return {
    projectDir: '/tmp/klo-project',
    fileStore: {
      async listFiles(path: string) {
        return {
          files: [...contentByPath.keys()].filter((file) => file.startsWith(`${path}/`)).sort(),
        };
      },
      async readFile(path: string) {
        const content = contentByPath.get(path);
        // Compare against undefined so an empty file is still readable.
        if (content === undefined) {
          throw new Error(`missing file ${path}`);
        }
        return { content };
      },
      writeFile: vi.fn(),
      deleteFile: vi.fn(),
      getFileHistory: vi.fn(),
      forWorktree: vi.fn(),
    },
    // Only the members the exporter is exercised with are stubbed.
  } as unknown as KloLocalProject;
}
/**
 * Test fixture: wraps decisions in a review-decision artifact for the given
 * connection, run and sync, with a fixed `generatedAt` timestamp.
 */
function decisionsArtifact(input: {
  connectionId: string;
  runId: string;
  syncId: string;
  decisions: KloRelationshipReviewDecisionArtifact['decisions'];
}): KloRelationshipReviewDecisionArtifact {
  const { connectionId, runId, syncId, decisions } = input;
  return {
    connectionId,
    runId,
    syncId,
    generatedAt: '2026-05-07T12:00:00.000Z',
    decisions,
  };
}
// Fixture: a 'review' candidate (orders.customer_id -> customers.id) in the
// 'warehouse' connection that a reviewer accepted, with a note attached.
const acceptedOrderCustomer = {
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'accepted' as const,
previousStatus: 'review' as const,
connectionId: 'warehouse',
runId: 'scan-run-a',
syncId: 'sync-a',
decidedAt: '2026-05-07T12:00:00.000Z',
reviewer: 'Andrey',
note: 'Confirmed in warehouse docs',
from: {
tableId: 'orders',
columnIds: ['orders.customer_id'],
table: { catalog: null, db: 'public', name: 'orders' },
columns: ['customer_id'],
},
to: {
tableId: 'customers',
columnIds: ['customers.id'],
table: { catalog: null, db: 'public', name: 'customers' },
columns: ['id'],
},
relationshipType: 'many_to_one' as const,
source: 'deterministic_name',
score: 0.62,
confidence: 0.62,
pkScore: 0.91,
fkScore: 0.62,
reasons: ['fk_score_review'],
};
// Fixture: a low-scoring candidate (orders.note_id -> notes.id) in the
// 'warehouse' connection that was already 'rejected' and stayed rejected.
const rejectedOrderNote = {
candidateId: 'orders:orders.note_id->notes:notes.id',
decision: 'rejected' as const,
previousStatus: 'rejected' as const,
connectionId: 'warehouse',
runId: 'scan-run-a',
syncId: 'sync-a',
decidedAt: '2026-05-07T12:05:00.000Z',
reviewer: 'Andrey',
note: null,
from: {
tableId: 'orders',
columnIds: ['orders.note_id'],
table: { catalog: null, db: 'public', name: 'orders' },
columns: ['note_id'],
},
to: {
tableId: 'notes',
columnIds: ['notes.id'],
table: { catalog: null, db: 'public', name: 'notes' },
columns: ['id'],
},
relationshipType: 'many_to_one' as const,
source: 'deterministic_name',
score: 0.2,
confidence: 0.2,
pkScore: 0.4,
fkScore: 0.2,
reasons: ['low_source_coverage'],
};
// Fixture: a formal-metadata relationship (invoices.account_id -> accounts.id)
// in a second connection ('billing') and run, accepted with full scores.
const acceptedInvoiceAccount = {
candidateId: 'invoices:invoices.account_id->accounts:accounts.id',
decision: 'accepted' as const,
previousStatus: 'accepted' as const,
connectionId: 'billing',
runId: 'scan-run-b',
syncId: 'sync-b',
decidedAt: '2026-05-07T12:10:00.000Z',
reviewer: 'klo',
note: null,
from: {
tableId: 'invoices',
columnIds: ['invoices.account_id'],
table: { catalog: null, db: 'billing', name: 'invoices' },
columns: ['account_id'],
},
to: {
tableId: 'accounts',
columnIds: ['accounts.id'],
table: { catalog: null, db: 'billing', name: 'accounts' },
columns: ['id'],
},
relationshipType: 'many_to_one' as const,
source: 'formal_metadata',
score: 1,
confidence: 1,
pkScore: 1,
fkScore: 1,
reasons: ['formal_metadata_relationship'],
};
// Exercises the exporter against an in-memory file store: discovery of decision
// artifacts, filtering, JSONL formatting and tolerance of unreadable artifacts.
describe('relationship feedback export', () => {
// Two decision artifacts across two connections, plus an unrelated
// relationships.json in the same directory that must not contribute labels.
it('exports stable labels from all relationship review decision artifacts', async () => {
const project = projectWithFiles({
'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json': decisionsArtifact({
connectionId: 'warehouse',
runId: 'scan-run-a',
syncId: 'sync-a',
decisions: [rejectedOrderNote, acceptedOrderCustomer],
}),
'raw-sources/billing/live-database/sync-b/enrichment/relationship-review-decisions.json': decisionsArtifact({
connectionId: 'billing',
runId: 'scan-run-b',
syncId: 'sync-b',
decisions: [acceptedInvoiceAccount],
}),
'raw-sources/warehouse/live-database/sync-a/enrichment/relationships.json': { accepted: [], review: [], rejected: [] },
});
const result = await exportLocalRelationshipFeedbackLabels(project, {
now: () => new Date('2026-05-07T13:00:00.000Z'),
});
expect(result.summary).toEqual({
total: 3,
accepted: 2,
rejected: 1,
connections: 2,
runs: 2,
});
// The expected order differs from the order the decisions were written in,
// so the exporter is expected to sort its labels.
expect(result.labels.map((label) => label.candidateId)).toEqual([
'invoices:invoices.account_id->accounts:accounts.id',
'orders:orders.customer_id->customers:customers.id',
'orders:orders.note_id->notes:notes.id',
]);
expect(result.labels[0]).toMatchObject({
schemaVersion: 1,
decision: 'accepted',
connectionId: 'billing',
source: 'formal_metadata',
fromTable: 'billing.invoices',
fromColumns: ['account_id'],
toTable: 'billing.accounts',
toColumns: ['id'],
artifactPath: 'raw-sources/billing/live-database/sync-b/enrichment/relationship-review-decisions.json',
});
expect(result.warnings).toEqual([]);
});
// Combining both filters leaves only the rejected warehouse decision.
it('filters labels by connection and decision', async () => {
const project = projectWithFiles({
'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json': decisionsArtifact({
connectionId: 'warehouse',
runId: 'scan-run-a',
syncId: 'sync-a',
decisions: [rejectedOrderNote, acceptedOrderCustomer],
}),
'raw-sources/billing/live-database/sync-b/enrichment/relationship-review-decisions.json': decisionsArtifact({
connectionId: 'billing',
runId: 'scan-run-b',
syncId: 'sync-b',
decisions: [acceptedInvoiceAccount],
}),
});
const result = await exportLocalRelationshipFeedbackLabels(project, {
connectionId: 'warehouse',
decision: 'rejected',
now: () => new Date('2026-05-07T13:00:00.000Z'),
});
expect(result.summary).toMatchObject({ total: 1, accepted: 0, rejected: 1 });
expect(result.labels).toHaveLength(1);
expect(result.labels[0]?.candidateId).toBe('orders:orders.note_id->notes:notes.id');
});
// Every JSONL line must parse on its own as one label object.
it('formats JSONL with one stable label object per line', async () => {
const project = projectWithFiles({
'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json': decisionsArtifact({
connectionId: 'warehouse',
runId: 'scan-run-a',
syncId: 'sync-a',
decisions: [acceptedOrderCustomer],
}),
});
const result = await exportLocalRelationshipFeedbackLabels(project, {
now: () => new Date('2026-05-07T13:00:00.000Z'),
});
const lines = formatKloRelationshipFeedbackLabelsJsonl(result).trim().split('\n').map((line) => JSON.parse(line));
expect(lines).toHaveLength(1);
expect(lines[0]).toMatchObject({
schemaVersion: 1,
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'accepted',
relationshipType: 'many_to_one',
});
});
// A malformed artifact produces a warning for its path and does not stop
// the readable artifact from being exported.
it('records parse warnings and continues exporting readable decision artifacts', async () => {
const project = projectWithFiles({
'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json': decisionsArtifact({
connectionId: 'warehouse',
runId: 'scan-run-a',
syncId: 'sync-a',
decisions: [acceptedOrderCustomer],
}),
'raw-sources/broken/live-database/sync-b/enrichment/relationship-review-decisions.json': '{not-json',
});
const result = await exportLocalRelationshipFeedbackLabels(project, {
now: () => new Date('2026-05-07T13:00:00.000Z'),
});
expect(result.summary.total).toBe(1);
expect(result.warnings).toEqual([
{
path: 'raw-sources/broken/live-database/sync-b/enrichment/relationship-review-decisions.json',
message: expect.any(String),
},
]);
expect(result.warnings[0]?.message.length).toBeGreaterThan(0);
});
});

View file

@ -0,0 +1,179 @@
import type { KloLocalProject } from '../project/index.js';
import type {
KloRelationshipReviewDecisionArtifact,
KloRelationshipReviewDecisionEntry,
KloRelationshipReviewDecisionValue,
} from './relationship-review-decisions.js';
// Path suffix of a relationship review decision artifact.
// NOTE(review): presumably used to pick decision artifacts out of the file listing — the usage is outside this view.
const DECISION_ARTIFACT_SUFFIX = '/enrichment/relationship-review-decisions.json';
// Version stamped on every exported label as `schemaVersion`.
const FEEDBACK_SCHEMA_VERSION = 1;
// Decision filter accepted by the exporter: a specific decision value, or 'all'.
export type KloRelationshipFeedbackDecisionFilter = KloRelationshipReviewDecisionValue | 'all';
/** Optional filters and clock override for exporting feedback labels. */
export interface ExportLocalRelationshipFeedbackLabelsInput {
/** Restrict the export to one connection. */
connectionId?: string | null;
/** Restrict the export to one decision value, or 'all'. */
decision?: KloRelationshipFeedbackDecisionFilter;
/** Clock override; the tests pass a fixed date. NOTE(review): presumably the source of `generatedAt` — confirm in the exporter body. */
now?: () => Date;
}
/**
 * Flat record describing one relationship review decision, as produced from a
 * decision artifact entry and serialized one-per-line by the JSONL formatter.
 */
export interface KloRelationshipFeedbackLabel {
/** Label schema version; always 1. */
schemaVersion: 1;
candidateId: string;
decision: KloRelationshipReviewDecisionValue;
previousStatus: KloRelationshipReviewDecisionEntry['previousStatus'];
connectionId: string;
runId: string;
syncId: string;
/** Copied verbatim from the decision entry; also used as the last sort key of the export. */
decidedAt: string;
reviewer: string;
note: string | null;
relationshipType: KloRelationshipReviewDecisionEntry['relationshipType'];
source: string;
score: number | null;
confidence: number;
pkScore: number | null;
fkScore: number | null;
/** Dot-joined catalog, db and table name of the source side, omitting empty parts. */
fromTable: string;
fromColumns: string[];
/** Dot-joined catalog, db and table name of the target side, omitting empty parts. */
toTable: string;
toColumns: string[];
reasons: string[];
/** Path of the decision artifact this label was read from. */
artifactPath: string;
}
/** Describes a decision artifact that could not be read or parsed during an export. */
export interface KloRelationshipFeedbackExportWarning {
/** Path of the artifact that failed. */
path: string;
/** Message taken from the error raised while reading or parsing the artifact. */
message: string;
}
/** Outcome of `exportLocalRelationshipFeedbackLabels`. */
export interface ExportLocalRelationshipFeedbackLabelsResult {
/** ISO-8601 timestamp taken from the injected clock (or the current time). */
generatedAt: string;
/** Filters that were actually applied, after defaults were filled in. */
filters: {
connectionId: string | null;
decision: KloRelationshipFeedbackDecisionFilter;
};
/** Counts computed over the filtered labels only. */
summary: {
total: number;
accepted: number;
rejected: number;
/** Number of distinct connection ids. */
connections: number;
/** Number of distinct (connection id, run id) pairs. */
runs: number;
};
/** Filtered labels, sorted by connection id, run id, candidate id and decision time. */
labels: KloRelationshipFeedbackLabel[];
/** One entry per artifact that failed to read or parse. */
warnings: KloRelationshipFeedbackExportWarning[];
}
/**
 * Builds the dotted table name for one side of a decision entry.
 *
 * Catalog, db and name are joined with `.` in that order; empty or missing
 * parts are left out.
 */
function qualifiedTableName(entry: KloRelationshipReviewDecisionEntry, side: 'from' | 'to'): string {
  const { catalog, db, name } = entry[side].table;
  const segments: string[] = [];
  for (const segment of [catalog, db, name]) {
    if (segment) {
      segments.push(segment);
    }
  }
  return segments.join('.');
}
/**
 * Flattens a decision artifact entry into an export label.
 *
 * Scalar fields are copied as-is, array fields are copied into fresh arrays,
 * and both table references are collapsed into dotted names. Property order
 * is kept stable because it determines the serialized JSON key order.
 *
 * @param entry Decision entry read from an artifact.
 * @param artifactPath Path of the artifact the entry came from.
 */
function labelFromDecision(entry: KloRelationshipReviewDecisionEntry, artifactPath: string): KloRelationshipFeedbackLabel {
  const fromTable = qualifiedTableName(entry, 'from');
  const toTable = qualifiedTableName(entry, 'to');
  const label: KloRelationshipFeedbackLabel = {
    schemaVersion: FEEDBACK_SCHEMA_VERSION,
    candidateId: entry.candidateId,
    decision: entry.decision,
    previousStatus: entry.previousStatus,
    connectionId: entry.connectionId,
    runId: entry.runId,
    syncId: entry.syncId,
    decidedAt: entry.decidedAt,
    reviewer: entry.reviewer,
    note: entry.note,
    relationshipType: entry.relationshipType,
    source: entry.source,
    score: entry.score,
    confidence: entry.confidence,
    pkScore: entry.pkScore,
    fkScore: entry.fkScore,
    fromTable,
    fromColumns: Array.from(entry.from.columns),
    toTable,
    toColumns: Array.from(entry.to.columns),
    reasons: Array.from(entry.reasons),
    artifactPath,
  };
  return label;
}
/**
 * Returns a sorted copy of the labels without touching the input array.
 *
 * Ordering compares connection id, then run id, then candidate id, then
 * decision time, each with `localeCompare`.
 */
function sortLabels(labels: KloRelationshipFeedbackLabel[]): KloRelationshipFeedbackLabel[] {
  const sortKeys = ['connectionId', 'runId', 'candidateId', 'decidedAt'] as const;
  const ordered = labels.slice();
  ordered.sort((a, b) => {
    for (const key of sortKeys) {
      const difference = a[key].localeCompare(b[key]);
      if (difference !== 0) {
        return difference;
      }
    }
    return 0;
  });
  return ordered;
}
/**
 * Decides whether a label survives the export filters.
 *
 * A missing or empty connection filter matches every connection, and the
 * `'all'` decision filter matches every decision.
 */
function passesFilters(
  label: KloRelationshipFeedbackLabel,
  filters: { connectionId: string | null; decision: KloRelationshipFeedbackDecisionFilter },
): boolean {
  const { connectionId, decision } = filters;
  const connectionMatches = !connectionId || label.connectionId === connectionId;
  if (!connectionMatches) {
    return false;
  }
  if (decision === 'all') {
    return true;
  }
  return label.decision === decision;
}
/**
 * Converts an arbitrary thrown value into a non-empty, human-readable message.
 *
 * - `Error` instances yield their `message`; when that is blank (for example
 *   `new Error()`), the error's `name` is used instead.
 * - Any other value is stringified with `String`.
 * - If nothing usable results, `'Unknown error'` is returned so that callers
 *   never record a blank warning.
 *
 * @param error Value caught by a `catch` clause.
 * @returns A message that is never the empty string.
 */
function messageFromUnknownError(error: unknown): string {
  const fallback = 'Unknown error';
  if (error instanceof Error) {
    return error.message || error.name || fallback;
  }
  let text: string;
  try {
    text = String(error);
  } catch {
    // String() throws for values without a usable primitive conversion
    // (e.g. null-prototype objects); this helper runs inside error handling
    // and must not throw itself.
    text = '';
  }
  return text.length > 0 ? text : fallback;
}
/**
 * Reads one decision artifact from the project's file store and converts its
 * decisions into feedback labels.
 *
 * A `decisions` value that is not an array is treated as an empty list. Any
 * read, JSON-parse or conversion failure propagates to the caller.
 *
 * NOTE(review): the parsed JSON is trusted to match the artifact type; no
 * per-entry validation happens here.
 *
 * @param project Project whose file store holds the artifact.
 * @param artifactPath Path of the artifact to read.
 */
async function readDecisionLabels(
  project: KloLocalProject,
  artifactPath: string,
): Promise<KloRelationshipFeedbackLabel[]> {
  const file = await project.fileStore.readFile(artifactPath);
  const artifact = JSON.parse(file.content) as KloRelationshipReviewDecisionArtifact;
  if (!Array.isArray(artifact.decisions)) {
    return [];
  }
  const labels: KloRelationshipFeedbackLabel[] = artifact.decisions.map((decision) =>
    labelFromDecision(decision, artifactPath),
  );
  return labels;
}
/**
 * Computes the export summary for a set of labels: total count, accepted and
 * rejected counts, distinct connections, and distinct connection/run pairs.
 */
function summarize(labels: KloRelationshipFeedbackLabel[]): ExportLocalRelationshipFeedbackLabelsResult['summary'] {
  let accepted = 0;
  let rejected = 0;
  const connectionIds = new Set<string>();
  const runKeys = new Set<string>();
  for (const label of labels) {
    if (label.decision === 'accepted') {
      accepted += 1;
    }
    if (label.decision === 'rejected') {
      rejected += 1;
    }
    connectionIds.add(label.connectionId);
    runKeys.add(`${label.connectionId}:${label.runId}`);
  }
  return {
    total: labels.length,
    accepted,
    rejected,
    connections: connectionIds.size,
    runs: runKeys.size,
  };
}
/**
 * Collects relationship review decisions from every decision artifact under
 * `raw-sources` and returns them as filtered, sorted feedback labels.
 *
 * Artifacts are read one at a time in sorted path order. An artifact that
 * fails to read or parse contributes a warning instead of aborting the export.
 *
 * @param project Project whose file store is scanned.
 * @param input Optional connection/decision filters and clock override.
 * @returns Labels, summary counts over those labels, the effective filters,
 *   a generation timestamp, and per-artifact warnings.
 */
export async function exportLocalRelationshipFeedbackLabels(
  project: KloLocalProject,
  input: ExportLocalRelationshipFeedbackLabelsInput = {},
): Promise<ExportLocalRelationshipFeedbackLabelsResult> {
  const filters = {
    connectionId: input.connectionId ?? null,
    decision: input.decision ?? 'all',
  };

  const { files } = await project.fileStore.listFiles('raw-sources');
  const artifactPaths = files.filter((candidatePath) => candidatePath.endsWith(DECISION_ARTIFACT_SUFFIX));
  artifactPaths.sort();

  const collected: KloRelationshipFeedbackLabel[] = [];
  const warnings: KloRelationshipFeedbackExportWarning[] = [];
  for (const artifactPath of artifactPaths) {
    try {
      const artifactLabels = await readDecisionLabels(project, artifactPath);
      for (const artifactLabel of artifactLabels) {
        collected.push(artifactLabel);
      }
    } catch (error) {
      warnings.push({ path: artifactPath, message: messageFromUnknownError(error) });
    }
  }

  const matching = collected.filter((label) => passesFilters(label, filters));
  const labels = sortLabels(matching);
  // The clock is read only after all artifacts have been processed.
  const generatedAt = (input.now?.() ?? new Date()).toISOString();
  return {
    generatedAt,
    filters,
    summary: summarize(labels),
    labels,
    warnings,
  };
}
/**
 * Serializes the exported labels as JSON Lines.
 *
 * @returns One JSON document per label, each terminated by a newline, or the
 *   empty string when there are no labels.
 */
export function formatKloRelationshipFeedbackLabelsJsonl(result: ExportLocalRelationshipFeedbackLabelsResult): string {
  const { labels } = result;
  if (labels.length === 0) {
    return '';
  }
  const lines = labels.map((label) => JSON.stringify(label));
  return `${lines.join('\n')}\n`;
}

View file

@ -0,0 +1,134 @@
import { describe, expect, it } from 'vitest';
import type { KloEnrichedRelationship, KloEnrichedSchema } from './enrichment-types.js';
import { collectKloFormalMetadataRelationships } from './relationship-formal-metadata.js';
/**
 * Builds a two-table fixture schema (`accounts.id` as declared primary key and
 * `orders.account_id`) carrying the supplied relationships.
 */
function schema(relationships: KloEnrichedRelationship[]): KloEnrichedSchema {
  // Both fixture tables have exactly one non-nullable INTEGER column.
  const singleColumnTable = (
    tableName: string,
    columnName: string,
    primaryKey: boolean,
  ): KloEnrichedSchema['tables'][number] => ({
    id: tableName,
    ref: { catalog: null, db: null, name: tableName },
    enabled: true,
    descriptions: {},
    columns: [
      {
        id: `${tableName}.${columnName}`,
        tableId: tableName,
        tableRef: { catalog: null, db: null, name: tableName },
        name: columnName,
        nativeType: 'INTEGER',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey,
        parentColumnId: null,
        descriptions: {},
        embedding: null,
        sampleValues: null,
        cardinality: null,
      },
    ],
  });

  return {
    connectionId: 'warehouse',
    tables: [singleColumnTable('accounts', 'id', true), singleColumnTable('orders', 'account_id', false)],
    relationships,
  };
}
/**
 * Builds a formal `orders.account_id -> accounts.id` relationship fixture.
 * Any top-level field can be replaced through `overrides`.
 */
function formalRelationship(overrides: Partial<KloEnrichedRelationship> = {}): KloEnrichedRelationship {
  const base: KloEnrichedRelationship = {
    id: 'orders:orders.account_id->accounts:accounts.id',
    source: 'formal',
    from: {
      tableId: 'orders',
      columnIds: ['orders.account_id'],
      table: { catalog: null, db: null, name: 'orders' },
      columns: ['account_id'],
    },
    to: {
      tableId: 'accounts',
      columnIds: ['accounts.id'],
      table: { catalog: null, db: null, name: 'accounts' },
      columns: ['id'],
    },
    relationshipType: 'many_to_one',
    confidence: 0.6,
    isPrimaryKeyReference: false,
  };
  return { ...base, ...overrides };
}
describe('formal metadata relationship collection', () => {
// A valid formal relationship is accepted and rewritten with confidence 1 and
// isPrimaryKeyReference true, even though the fixture starts at 0.6 / false.
it('accepts valid formal relationships with ground-truth confidence', () => {
const result = collectKloFormalMetadataRelationships(schema([formalRelationship()]));
expect(result.accepted).toEqual([
expect.objectContaining({
id: 'orders:orders.account_id->accounts:accounts.id',
source: 'formal',
confidence: 1,
isPrimaryKeyReference: true,
}),
]);
expect(result.skipped).toEqual([]);
expect(result.acceptedIds).toEqual(new Set(['orders:orders.account_id->accounts:accounts.id']));
});
// Feeds one valid relationship, an exact duplicate, one whose source column is
// missing from the schema, and one non-formal relationship.
it('skips duplicate and invalid formal relationships with reasons', () => {
const result = collectKloFormalMetadataRelationships(
schema([
formalRelationship(),
formalRelationship(),
formalRelationship({
id: 'orders:orders.missing_account_id->accounts:accounts.id',
from: {
tableId: 'orders',
columnIds: ['orders.missing_account_id'],
table: { catalog: null, db: null, name: 'orders' },
columns: ['missing_account_id'],
},
}),
formalRelationship({
id: 'manual-edge',
source: 'manual',
}),
]),
);
expect(result.accepted).toHaveLength(1);
// The 'manual' relationship is ignored entirely: it is neither accepted nor skipped.
expect(result.skipped).toEqual([
{
relationshipId: 'orders:orders.account_id->accounts:accounts.id',
reason: 'formal_metadata_duplicate',
},
{
relationshipId: 'orders:orders.missing_account_id->accounts:accounts.id',
reason: 'formal_metadata_endpoint_not_found',
},
]);
});
});

View file

@ -0,0 +1,61 @@
import type { KloEnrichedRelationship, KloEnrichedSchema, KloSkippedRelationship } from './enrichment-types.js';
/** Result of collecting formal-metadata relationships from an enriched schema. */
export interface KloFormalMetadataRelationshipCollection {
/** Accepted formal relationships, sorted by id. */
accepted: KloEnrichedRelationship[];
/** Formal relationships that were rejected, each with the reason, in encounter order. */
skipped: KloSkippedRelationship[];
/** Ids of the accepted relationships. */
acceptedIds: Set<string>;
}
/**
 * Reports whether one endpoint of a relationship is fully backed by a table.
 *
 * The endpoint must name at least one column id, every listed column id must
 * belong to a column of the table whose name is also listed in the endpoint,
 * and every listed column name must be matched the same way.
 *
 * @param table Enabled table resolved for the endpoint, or `undefined` when not found.
 * @param endpoint The `from` or `to` side of a relationship.
 */
function relationshipEndpointColumnsExist(
  table: KloEnrichedSchema['tables'][number] | undefined,
  endpoint: KloEnrichedRelationship['from' | 'to'],
): boolean {
  if (!table || endpoint.columnIds.length === 0) {
    return false;
  }
  const matched = table.columns.filter(
    (column) => endpoint.columnIds.includes(column.id) && endpoint.columns.includes(column.name),
  );
  const matchedIds = new Set(matched.map((column) => column.id));
  const matchedNames = new Set(matched.map((column) => column.name));
  return (
    endpoint.columnIds.every((columnId) => matchedIds.has(columnId)) &&
    endpoint.columns.every((columnName) => matchedNames.has(columnName))
  );
}

/**
 * Checks that both endpoints of a relationship resolve to enabled tables and
 * to existing columns in the schema.
 *
 * Every column referenced by an endpoint has to exist. A multi-column endpoint
 * with only some of its columns present is reported as missing, rather than
 * passing because a single column happened to match. Single-column endpoints
 * behave as before.
 *
 * @param schema Enriched schema to look the endpoints up in.
 * @param relationship Relationship whose `from` and `to` sides are checked.
 * @returns `true` only when both sides are fully resolvable.
 */
function relationshipEndpointExists(schema: KloEnrichedSchema, relationship: KloEnrichedRelationship): boolean {
  const fromTable = schema.tables.find((table) => table.id === relationship.from.tableId && table.enabled);
  const toTable = schema.tables.find((table) => table.id === relationship.to.tableId && table.enabled);
  return (
    relationshipEndpointColumnsExist(fromTable, relationship.from) &&
    relationshipEndpointColumnsExist(toTable, relationship.to)
  );
}
/**
 * Collects the schema's formal relationships, separating usable ones from
 * duplicates and from those whose endpoints cannot be found.
 *
 * Relationships with a source other than `'formal'` are ignored. A formal
 * relationship whose id was already accepted is skipped as a duplicate; one
 * whose endpoints do not resolve is skipped as not found. Accepted
 * relationships are emitted with confidence 1 and flagged as primary-key
 * references.
 *
 * @param schema Enriched schema whose `relationships` are inspected.
 * @returns Accepted relationships sorted by id, skipped relationships in
 *   encounter order, and the set of accepted ids.
 */
export function collectKloFormalMetadataRelationships(
  schema: KloEnrichedSchema,
): KloFormalMetadataRelationshipCollection {
  const accepted: KloEnrichedRelationship[] = [];
  const skipped: KloSkippedRelationship[] = [];
  const acceptedIds = new Set<string>();

  const formalCandidates = schema.relationships.filter((candidate) => candidate.source === 'formal');
  for (const candidate of formalCandidates) {
    let skipReason: KloSkippedRelationship['reason'] | null = null;
    if (acceptedIds.has(candidate.id)) {
      skipReason = 'formal_metadata_duplicate';
    } else if (!relationshipEndpointExists(schema, candidate)) {
      skipReason = 'formal_metadata_endpoint_not_found';
    }

    if (skipReason !== null) {
      skipped.push({ relationshipId: candidate.id, reason: skipReason });
      continue;
    }

    const promoted: KloEnrichedRelationship = {
      ...candidate,
      source: 'formal',
      confidence: 1,
      isPrimaryKeyReference: true,
    };
    acceptedIds.add(candidate.id);
    accepted.push(promoted);
  }

  accepted.sort((a, b) => a.id.localeCompare(b.id));
  return { accepted, skipped, acceptedIds };
}

View file

@ -0,0 +1,649 @@
import { describe, expect, it } from 'vitest';
import type {
KloEnrichedColumn,
KloEnrichedSchema,
KloEnrichedTable,
KloRelationshipEndpoint,
} from './enrichment-types.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import type { KloValidatedRelationshipDiscoveryCandidate } from './relationship-validation.js';
import { resolveKloRelationshipGraph } from './relationship-graph-resolver.js';
/**
 * Builds an enriched-column fixture: a nullable, non-primary-key INTEGER
 * column by default, with every field replaceable through `overrides`.
 */
function column(tableId: string, name: string, overrides: Partial<KloEnrichedColumn> = {}): KloEnrichedColumn {
  const defaults: KloEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  // Overrides are applied last, so any key present in `overrides` wins.
  return { ...defaults, ...overrides };
}
/**
 * Builds an enabled table fixture and re-homes the given columns onto it:
 * each column is copied with `tableId` and `tableRef` pointing at this table.
 */
function table(name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KloEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
/**
 * Builds the shared fixture schema with `accounts`, `account_archive` and
 * `users` tables and no relationships.
 *
 * @param overrides `accountsPrimaryKey` marks `accounts.id` as a declared
 *   primary key (defaults to false).
 */
function schema(overrides: { accountsPrimaryKey?: boolean } = {}): KloEnrichedSchema {
  const accountsIdIsPrimaryKey = overrides.accountsPrimaryKey ?? false;
  const accounts = table('accounts', [
    column('accounts', 'id', { nullable: false, primaryKey: accountsIdIsPrimaryKey }),
    column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const accountArchive = table('account_archive', [column('account_archive', 'id', { nullable: false })]);
  const users = table('users', [
    column('users', 'id', { nullable: false }),
    column('users', 'account_id', { nullable: false }),
  ]);
  return {
    connectionId: 'warehouse',
    tables: [accounts, accountArchive, users],
    relationships: [],
  };
}
/** Builds a single-column relationship endpoint fixture for `tableName.columnName`. */
function endpoint(tableName: string, columnName: string): KloRelationshipEndpoint {
  const columnId = `${tableName}.${columnName}`;
  const tableRef = { catalog: null, db: null, name: tableName };
  return {
    tableId: tableName,
    columnIds: [columnId],
    table: tableRef,
    columns: [columnName],
  };
}
/**
 * Builds a fresh profile artifact fixture for the shared schema: three tables
 * of three rows each, and fully unique, non-null INTEGER profiles for
 * `accounts.id`, `account_archive.id` and `users.account_id`.
 *
 * A new object graph is created on every call, so tests may mutate the result.
 */
function profiles(): KloRelationshipProfileArtifact {
  const tableRef = (name: string) => ({ catalog: null, db: null, name });
  const uniqueIntegerProfile = (
    tableName: string,
    columnName: string,
  ): KloRelationshipProfileArtifact['columns'][string] => ({
    table: tableRef(tableName),
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 3,
    nullCount: 0,
    distinctCount: 3,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2', '3'],
    minTextLength: 1,
    maxTextLength: 1,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [
      { table: tableRef('accounts'), rowCount: 3 },
      { table: tableRef('account_archive'), rowCount: 3 },
      { table: tableRef('users'), rowCount: 3 },
    ],
    columns: {
      'accounts.id': uniqueIntegerProfile('accounts', 'id'),
      'account_archive.id': uniqueIntegerProfile('account_archive', 'id'),
      'users.account_id': uniqueIntegerProfile('users', 'account_id'),
    },
    warnings: [],
  };
}
/**
 * Builds a validated relationship candidate fixture, by default an accepted
 * `users.account_id -> accounts.id` reference with passing validation.
 *
 * The default id and evidence are derived from the effective `from`/`to`
 * endpoints. `overrides` is applied last and shallowly, so a supplied
 * `evidence` or `validation` object replaces the default one wholesale.
 */
function validatedCandidate(
  overrides: Partial<KloValidatedRelationshipDiscoveryCandidate> = {},
): KloValidatedRelationshipDiscoveryCandidate {
  const from = overrides.from ?? endpoint('users', 'account_id');
  const to = overrides.to ?? endpoint('accounts', 'id');
  const fromIds = from.columnIds.join(',');
  const toIds = to.columnIds.join(',');

  const defaults: KloValidatedRelationshipDiscoveryCandidate = {
    id: `${from.tableId}:(${fromIds})->${to.tableId}:(${toIds})`,
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: 0.95,
    source: 'normalized_table_match',
    status: 'accepted',
    score: 0.96,
    evidence: {
      sourceColumnBase: 'account',
      targetTableBase: to.table.name,
      targetColumnBase: to.columns[0] ?? '',
      targetKeyScore: 0.92,
      nameScore: 0.92,
      reasons: ['foreign_key_suffix', 'normalized_table_name', 'target_key_like'],
    },
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 3,
      parentDistinct: 3,
      overlap: 3,
      checkedValues: 3,
      reasons: ['validation_passed'],
    },
  };
  return { ...defaults, ...overrides };
}
describe('relationship graph resolver', () => {
// With no declared primary key on accounts.id, one validated incoming
// reference should yield an accepted inferred PK and an accepted relationship.
it('promotes validated relationship discovery references to accepted relationships and inferred PKs', () => {
const result = resolveKloRelationshipGraph({
schema: schema(),
profiles: profiles(),
candidates: [validatedCandidate()],
});
expect(result.pks).toContainEqual({
table: 'accounts',
columns: ['id'],
pkScore: expect.any(Number),
status: 'accepted',
incomingCandidateCount: 1,
evidence: {
declaredPrimaryKey: false,
targetUniqueness: 1,
incomingAcceptedCount: 1,
incomingReviewCount: 0,
reasons: expect.arrayContaining(['unique_target_column', 'incoming_validated_reference']),
},
});
// 0.85 matches the resolver's default accept threshold.
expect(result.pks.find((pk) => pk.table === 'accounts')?.pkScore).toBeGreaterThanOrEqual(0.85);
expect(result.relationships).toHaveLength(1);
expect(result.relationships[0]).toMatchObject({
from: { table: { name: 'users' }, columns: ['account_id'] },
to: { table: { name: 'accounts' }, columns: ['id'] },
status: 'accepted',
pkScore: expect.any(Number),
fkScore: expect.any(Number),
graph: {
reasons: expect.arrayContaining(['target_pk_score_passed', 'fk_score_passed']),
},
});
expect(result.relationships[0]?.fkScore).toBeGreaterThanOrEqual(0.85);
});
// Simulates a run where SQL profiling was unavailable: no column profiles and
// a candidate whose validation reports 'validation_unavailable'.
it('keeps validation-unavailable candidates in review even when name evidence is strong', () => {
const result = resolveKloRelationshipGraph({
schema: schema(),
profiles: { ...profiles(), sqlAvailable: false, columns: {}, warnings: ['read_only_sql_unavailable'] },
candidates: [
validatedCandidate({
status: 'review',
score: 0.57,
validation: {
targetUniqueness: 0,
sourceCoverage: 0,
violationCount: 0,
violationRatio: 1,
sourceNullRate: 0,
targetNullRate: 0,
childDistinct: 0,
parentDistinct: 0,
overlap: 0,
checkedValues: 0,
reasons: ['validation_unavailable'],
},
}),
],
});
expect(result.relationships).toHaveLength(1);
expect(result.relationships[0]).toMatchObject({
status: 'review',
graph: {
reasons: expect.arrayContaining(['validation_unavailable_review_only']),
},
});
// 0.55 matches the resolver's default review threshold.
expect(result.relationships[0]?.fkScore).toBeGreaterThanOrEqual(0.55);
});
// Two candidates share the source column users.account_id; the higher-scored
// one (-> accounts.id) must win and the other (-> account_archive.id) must lose.
it('accepts at most one target per source column and rejects the lower-scored conflict loser', () => {
const winner = validatedCandidate({ confidence: 0.95, score: 0.96 });
const loser = validatedCandidate({
from: endpoint('users', 'account_id'),
to: endpoint('account_archive', 'id'),
confidence: 0.85,
score: 0.9,
evidence: {
sourceColumnBase: 'account',
targetTableBase: 'account_archive',
targetColumnBase: 'id',
targetKeyScore: 0.92,
nameScore: 0.78,
reasons: ['foreign_key_suffix', 'inflection', 'target_key_like'],
},
});
const result = resolveKloRelationshipGraph({
schema: schema(),
profiles: profiles(),
// The loser is passed first, so the asserted output order does not follow input order.
candidates: [loser, winner],
});
expect(result.relationships.map((relationship) => relationship.status)).toEqual(['accepted', 'rejected']);
expect(result.relationships[0]?.to.table.name).toBe('accounts');
expect(result.relationships[1]).toMatchObject({
to: { table: { name: 'account_archive' }, columns: ['id'] },
status: 'rejected',
graph: {
reasons: expect.arrayContaining(['conflict_lost']),
},
});
});
// A primary key declared in the schema is reported as accepted with score 1
// even when no relationship candidate points at it.
it('preserves declared primary keys as accepted even without incoming candidates', () => {
const result = resolveKloRelationshipGraph({
schema: schema({ accountsPrimaryKey: true }),
profiles: profiles(),
candidates: [],
});
expect(result.relationships).toEqual([]);
expect(result.pks).toContainEqual({
table: 'accounts',
columns: ['id'],
pkScore: 1,
status: 'accepted',
incomingCandidateCount: 0,
evidence: {
declaredPrimaryKey: true,
targetUniqueness: 1,
incomingAcceptedCount: 0,
incomingReviewCount: 0,
reasons: ['declared_primary_key'],
},
});
});
// Adds an `invoices` table with three profiled columns and no candidates:
// `id` (unique, key-named), `invoice_number` (unique, not key-named) and
// `amount` (not unique, measure-like name).
it('infers profile-only key-like columns without incoming relationship candidates', () => {
const baseSchema = schema();
const invoices = table('invoices', [
column('invoices', 'id', { nullable: false }),
column('invoices', 'invoice_number', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
nullable: false,
}),
column('invoices', 'amount', {
nativeType: 'INTEGER',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
}),
]);
const baseProfiles = profiles();
const result = resolveKloRelationshipGraph({
schema: { ...baseSchema, tables: [...baseSchema.tables, invoices] },
profiles: {
...baseProfiles,
tables: [...baseProfiles.tables, { table: invoices.ref, rowCount: 3 }],
columns: {
...baseProfiles.columns,
'invoices.id': {
table: invoices.ref,
column: 'id',
nativeType: 'INTEGER',
normalizedType: 'integer',
rowCount: 3,
nullCount: 0,
distinctCount: 3,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['1', '2', '3'],
minTextLength: 1,
maxTextLength: 1,
},
'invoices.invoice_number': {
table: invoices.ref,
column: 'invoice_number',
nativeType: 'TEXT',
normalizedType: 'text',
rowCount: 3,
nullCount: 0,
distinctCount: 3,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['INV-1', 'INV-2', 'INV-3'],
minTextLength: 5,
maxTextLength: 5,
},
'invoices.amount': {
table: invoices.ref,
column: 'amount',
nativeType: 'INTEGER',
normalizedType: 'integer',
rowCount: 3,
nullCount: 0,
distinctCount: 2,
uniquenessRatio: 2 / 3,
nullRate: 0,
sampleValues: ['100', '200'],
minTextLength: 3,
maxTextLength: 3,
},
},
},
candidates: [],
});
expect(result.relationships).toEqual([]);
// The key-named, unique, non-null column is accepted outright.
expect(result.pks).toContainEqual({
table: 'invoices',
columns: ['id'],
pkScore: 1,
status: 'accepted',
incomingCandidateCount: 0,
evidence: {
declaredPrimaryKey: false,
targetUniqueness: 1,
incomingAcceptedCount: 0,
incomingReviewCount: 0,
reasons: expect.arrayContaining([
'unique_target_column',
'profile_key_name',
'not_null_profile',
'profile_only_primary_key',
'no_incoming_references',
]),
},
});
// Unique but not key-named: kept for review with the weak-name reason.
expect(result.pks).toContainEqual(
expect.objectContaining({
table: 'invoices',
columns: ['invoice_number'],
status: 'review',
evidence: expect.objectContaining({
reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
}),
}),
);
// The non-unique measure column must not be reported as a key at all.
expect(result.pks.some((pk) => pk.table === 'invoices' && pk.columns[0] === 'amount')).toBe(false);
});
// Regression pin: exact pkScore/fkScore for a single column_suffix_match
// candidate. Table and column ids deliberately differ from their names, and
// only the target column has a profile.
it('pins single-incoming column_suffix_match resolver scores', () => {
// Local fixtures; these shadow the module-level schema()/profiles() helpers.
const schema = {
connectionId: 'warehouse',
relationships: [],
tables: [
{
id: 'plans-id',
ref: { catalog: null, db: null, name: 'stg_plans' },
enabled: true,
descriptions: {},
columns: [
{
id: 'plan-code-col',
tableId: 'plans-id',
tableRef: { catalog: null, db: null, name: 'stg_plans' },
name: 'plan_code',
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
nullable: false,
primaryKey: false,
parentColumnId: null,
descriptions: {},
embedding: null,
sampleValues: null,
cardinality: null,
},
],
},
{
id: 'segments-id',
ref: { catalog: null, db: null, name: 'mart_account_segments' },
enabled: true,
descriptions: {},
columns: [
{
id: 'current-plan-code-col',
tableId: 'segments-id',
tableRef: { catalog: null, db: null, name: 'mart_account_segments' },
name: 'current_plan_code',
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
nullable: false,
primaryKey: false,
parentColumnId: null,
descriptions: {},
embedding: null,
sampleValues: null,
cardinality: null,
},
],
},
],
} satisfies KloEnrichedSchema;
const profiles = {
connectionId: 'warehouse',
driver: 'sqlite' as const,
sqlAvailable: true,
queryCount: 0,
tables: [],
warnings: [],
columns: {
// Profile keys use the table name, not the table id.
'stg_plans.plan_code': {
table: { catalog: null, db: null, name: 'stg_plans' },
column: 'plan_code',
nativeType: 'TEXT',
normalizedType: 'text',
rowCount: 4,
nullCount: 0,
distinctCount: 4,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['basic', 'enterprise', 'free', 'pro'],
minTextLength: 4,
maxTextLength: 10,
},
},
};
const result = resolveKloRelationshipGraph({
schema,
profiles,
candidates: [
{
id: 'segments:(current_plan_code)->plans:(plan_code)',
from: {
tableId: 'segments-id',
columnIds: ['current-plan-code-col'],
table: { catalog: null, db: null, name: 'mart_account_segments' },
columns: ['current_plan_code'],
},
to: {
tableId: 'plans-id',
columnIds: ['plan-code-col'],
table: { catalog: null, db: null, name: 'stg_plans' },
columns: ['plan_code'],
},
relationshipType: 'many_to_one',
confidence: 0.902,
source: 'column_suffix_match',
evidence: {
sourceColumnBase: 'current_plan',
targetTableBase: 'plan',
targetColumnBase: 'plan_code',
targetKeyScore: 0.86,
nameScore: 0.78,
reasons: ['column_suffix_match', 'profile_unique_target'],
},
status: 'accepted',
score: 0.98,
validation: {
targetUniqueness: 1,
sourceCoverage: 1,
violationCount: 0,
violationRatio: 0,
sourceNullRate: 0,
targetNullRate: 0,
childDistinct: 4,
parentDistinct: 4,
overlap: 4,
checkedValues: 4,
reasons: ['validation_passed'],
},
},
],
});
// Exact values are intentional: any scoring change must update these pins.
expect(result.pks).toEqual([
expect.objectContaining({
table: 'stg_plans',
columns: ['plan_code'],
pkScore: 0.922,
status: 'accepted',
}),
]);
expect(result.relationships).toEqual([
expect.objectContaining({
source: 'column_suffix_match',
status: 'accepted',
pkScore: 0.922,
fkScore: 0.953,
}),
]);
});
// `events.warehouse_key` is unique and non-null, but its name matches none of
// the key-name patterns for table `events`; it should surface for review.
it('keeps strong profile-only primary key evidence when name evidence is weak', () => {
const baseSchema = schema();
// Mutating is safe here: schema() and profiles() build fresh fixtures per call.
baseSchema.tables.push(
table('events', [
column('events', 'warehouse_key', {
nullable: false,
primaryKey: false,
nativeType: 'INTEGER',
normalizedType: 'integer',
}),
]),
);
const baseProfiles = profiles();
baseProfiles.tables.push({ table: { catalog: null, db: null, name: 'events' }, rowCount: 3 });
baseProfiles.columns['events.warehouse_key'] = {
table: { catalog: null, db: null, name: 'events' },
column: 'warehouse_key',
nativeType: 'INTEGER',
normalizedType: 'integer',
rowCount: 3,
nullCount: 0,
distinctCount: 3,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['100', '101', '102'],
minTextLength: 3,
maxTextLength: 3,
};
const result = resolveKloRelationshipGraph({
schema: baseSchema,
profiles: baseProfiles,
candidates: [],
});
expect(result.pks).toEqual(
expect.arrayContaining([
expect.objectContaining({
table: 'events',
columns: ['warehouse_key'],
status: 'review',
evidence: expect.objectContaining({
reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
}),
}),
]),
);
});
// Same setup as the weak-name case, but with a column name
// (`opaque_reference`) that has no key-like token at all.
it('keeps strong profile-only primary key evidence when the column is not key-shaped', () => {
const baseSchema = schema();
baseSchema.tables.push(
table('events', [
column('events', 'opaque_reference', {
nullable: false,
primaryKey: false,
nativeType: 'INTEGER',
normalizedType: 'integer',
}),
]),
);
const baseProfiles = profiles();
baseProfiles.tables.push({ table: { catalog: null, db: null, name: 'events' }, rowCount: 3 });
baseProfiles.columns['events.opaque_reference'] = {
table: { catalog: null, db: null, name: 'events' },
column: 'opaque_reference',
nativeType: 'INTEGER',
normalizedType: 'integer',
rowCount: 3,
nullCount: 0,
distinctCount: 3,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['100', '101', '102'],
minTextLength: 3,
maxTextLength: 3,
};
const result = resolveKloRelationshipGraph({
schema: baseSchema,
profiles: baseProfiles,
candidates: [],
});
const inferredPk = result.pks.find((candidate) => candidate.table === 'events');
expect(inferredPk).toMatchObject({
table: 'events',
columns: ['opaque_reference'],
status: 'review',
evidence: expect.objectContaining({
reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
}),
});
// 0.55 matches the resolver's default review threshold.
expect(inferredPk?.pkScore).toBeGreaterThanOrEqual(0.55);
});
});

View file

@ -0,0 +1,508 @@
import type {
KloEnrichedColumn,
KloEnrichedSchema,
KloEnrichedTable,
KloRelationshipEndpoint,
} from './enrichment-types.js';
import { normalizeKloRelationshipName } from './relationship-candidates.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import { scoreKloRelationshipCandidate } from './relationship-scoring.js';
import type { KloValidatedRelationshipDiscoveryCandidate } from './relationship-validation.js';
/** Outcome the graph resolver assigns to a primary-key or relationship candidate. */
export type KloResolvedRelationshipStatus = 'accepted' | 'review' | 'rejected';
/** Tunable thresholds for the relationship graph resolver. */
export interface KloRelationshipGraphResolverSettings {
/** Minimum score for a candidate to be 'accepted' (when acceptance is allowed for it). */
acceptThreshold: number;
/** Minimum score for a candidate to be kept for 'review'; lower scores are 'rejected'. */
reviewThreshold: number;
/** NOTE(review): presumably the lowest target PK score at which a relationship may still be accepted — confirm against the resolver body. */
minTargetPkScoreForAcceptance: number;
/** NOTE(review): presumably requires passed validation before a relationship is promoted — confirm against the resolver body. */
validationRequiredForManifest: boolean;
}
/** Evidence recorded alongside a resolved primary-key candidate. */
export interface KloResolvedRelationshipPkEvidence {
/** True when the column is flagged as a primary key in the schema. */
declaredPrimaryKey: boolean;
/** Uniqueness attributed to the column; recorded as 1 for declared primary keys. */
targetUniqueness: number;
incomingAcceptedCount: number;
incomingReviewCount: number;
/** Machine-readable reason codes, e.g. 'declared_primary_key'. */
reasons: string[];
}
/** A primary-key candidate produced by the graph resolver. */
export interface KloResolvedRelationshipPk {
/** Table name (the `name` of the table reference, without catalog or db). */
table: string;
/** Column names forming the key. */
columns: string[];
/** Key score; declared primary keys are given 1. */
pkScore: number;
status: KloResolvedRelationshipStatus;
incomingCandidateCount: number;
evidence: KloResolvedRelationshipPkEvidence;
}
/** Graph-level evidence attached to a resolved relationship candidate. */
export interface KloResolvedRelationshipGraphEvidence {
targetPkScore: number;
incomingCandidateCount: number;
/** NOTE(review): presumably the candidate's rank among candidates competing for the same source column — confirm against the resolver body. */
conflictRank: number;
/** Machine-readable reason codes explaining the resolver's decision. */
reasons: string[];
}
/**
 * A validated discovery candidate after graph resolution: the validation-stage
 * `status` is replaced by the resolver's status, and the PK/FK scores and
 * graph evidence are added.
 */
export interface KloResolvedRelationshipDiscoveryCandidate
extends Omit<KloValidatedRelationshipDiscoveryCandidate, 'status'> {
status: KloResolvedRelationshipStatus;
pkScore: number;
fkScore: number;
graph: KloResolvedRelationshipGraphEvidence;
}
/** Output of the graph resolver: resolved primary keys and resolved relationships. */
export interface KloRelationshipGraphResolutionResult {
pks: KloResolvedRelationshipPk[];
relationships: KloResolvedRelationshipDiscoveryCandidate[];
}
/** Input to the graph resolver. */
export interface ResolveKloRelationshipGraphInput {
/** Enriched schema supplying the tables and columns. */
schema: KloEnrichedSchema;
/** Column profiles, looked up by `<table name>.<column name>`. */
profiles: KloRelationshipProfileArtifact;
/** Validated candidates to resolve; not mutated through this type. */
candidates: readonly KloValidatedRelationshipDiscoveryCandidate[];
/** Optional overrides for the default resolver settings. */
settings?: Partial<KloRelationshipGraphResolverSettings>;
}
/** Resolver settings used for any value the caller does not override. */
const DEFAULT_SETTINGS: KloRelationshipGraphResolverSettings = {
acceptThreshold: 0.85,
reviewThreshold: 0.55,
minTargetPkScoreForAcceptance: 0.78,
validationRequiredForManifest: true,
};
/**
 * Column-name tokens that mark a column as a measure. A column whose
 * normalized name contains one of these gets a type-compatibility of 0 when
 * scored as a profile-only primary key.
 */
const PROFILE_ONLY_PK_MEASURE_NAME_TOKENS = new Set(['amount', 'count', 'price', 'quantity', 'subtotal', 'total']);
/**
 * Resolves the effective resolver settings from optional caller overrides.
 *
 * Each setting falls back to its default when the override is missing or
 * explicitly `undefined`. A plain object spread would copy an explicit
 * `undefined` over the default, and every later `score >= threshold`
 * comparison would then be false.
 *
 * @param settings Partial overrides, or `undefined` for all defaults.
 * @returns A complete settings object.
 */
function mergeSettings(
  settings: Partial<KloRelationshipGraphResolverSettings> | undefined,
): KloRelationshipGraphResolverSettings {
  const overrides: Partial<KloRelationshipGraphResolverSettings> = settings ?? {};
  return {
    acceptThreshold: overrides.acceptThreshold ?? DEFAULT_SETTINGS.acceptThreshold,
    reviewThreshold: overrides.reviewThreshold ?? DEFAULT_SETTINGS.reviewThreshold,
    minTargetPkScoreForAcceptance:
      overrides.minTargetPkScoreForAcceptance ?? DEFAULT_SETTINGS.minTargetPkScoreForAcceptance,
    validationRequiredForManifest:
      overrides.validationRequiredForManifest ?? DEFAULT_SETTINGS.validationRequiredForManifest,
  };
}
/** Clamps a score into the range [0, 1] and rounds it to three decimal places. */
function roundScore(value: number): number {
  const capped = Math.min(1, value);
  const clamped = Math.max(0, capped);
  const rounded = clamped.toFixed(3);
  return Number(rounded);
}
/** Returns the `<table name>.<first column>` key for a relationship endpoint. */
function endpointKey(endpoint: KloRelationshipEndpoint): string {
  const tableName = endpoint.table.name;
  const columnName = singleRelationshipColumn(endpoint);
  return `${tableName}.${columnName}`;
}
/** Returns an id-based key for an endpoint: the table id, a colon, then the comma-joined column ids. */
function sourceKey(endpoint: KloRelationshipEndpoint): string {
  const joinedColumnIds = endpoint.columnIds.join(',');
  return `${endpoint.tableId}:${joinedColumnIds}`;
}
/**
 * Returns the first column name of an endpoint.
 *
 * @throws Error when the endpoint has no (non-empty) first column.
 */
function singleRelationshipColumn(endpoint: KloRelationshipEndpoint): string {
  const firstColumn = endpoint.columns[0];
  if (firstColumn) {
    return firstColumn;
  }
  throw new Error(`Expected relationship endpoint ${endpoint.table.name} to contain one column`);
}
/** Returns the `<table>.(<col>,<col>…)` key identifying a primary-key candidate. */
function pkKey(pk: Pick<KloResolvedRelationshipPk, 'table' | 'columns'>): string {
  const joinedColumns = pk.columns.join(',');
  return `${pk.table}.(${joinedColumns})`;
}
/** Returns the `<from table>.<from column>-><to table>.<to column>` key for a candidate, built from table and column names. */
function candidateSortKey(candidate: Pick<KloValidatedRelationshipDiscoveryCandidate, 'from' | 'to'>): string {
  const { from, to } = candidate;
  const fromPart = `${from.table.name}.${singleRelationshipColumn(from)}`;
  const toPart = `${to.table.name}.${singleRelationshipColumn(to)}`;
  return `${fromPart}->${toPart}`;
}
/**
 * Maps a score onto a resolver status.
 *
 * @param score Score to classify.
 * @param settings Thresholds to classify against.
 * @param acceptedAllowed When false, the result is capped at 'review'.
 * @returns 'accepted' when allowed and the score reaches the accept
 *   threshold, otherwise 'review' when it reaches the review threshold,
 *   otherwise 'rejected'.
 */
function statusForScore(
  score: number,
  settings: KloRelationshipGraphResolverSettings,
  acceptedAllowed: boolean,
): KloResolvedRelationshipStatus {
  const reachesAcceptance = acceptedAllowed && score >= settings.acceptThreshold;
  if (reachesAcceptance) {
    return 'accepted';
  }
  return score >= settings.reviewThreshold ? 'review' : 'rejected';
}
/** True when the candidate's validation reasons contain 'validation_passed'. */
function candidateHasValidationPassed(candidate: KloValidatedRelationshipDiscoveryCandidate): boolean {
  const { reasons } = candidate.validation;
  return reasons.some((reason) => reason === 'validation_passed');
}
/** True when validation could not be performed: the reasons contain 'validation_unavailable' or 'profile_unavailable'. */
function candidateIsValidationUnavailable(candidate: KloValidatedRelationshipDiscoveryCandidate): boolean {
  const { reasons } = candidate.validation;
  return reasons.some((reason) => reason === 'validation_unavailable' || reason === 'profile_unavailable');
}
/**
 * Lists every declared primary-key column of the enabled tables as an accepted
 * PK with score 1.
 *
 * Each flagged column yields its own single-column entry, in table then
 * column order.
 */
function declaredPrimaryKeys(schema: KloEnrichedSchema): KloResolvedRelationshipPk[] {
  const enabledTables = schema.tables.filter((candidate) => candidate.enabled);
  return enabledTables.flatMap((table) => {
    const keyColumns = table.columns.filter((candidate) => candidate.primaryKey);
    return keyColumns.map(
      (column): KloResolvedRelationshipPk => ({
        table: table.ref.name,
        columns: [column.name],
        pkScore: 1,
        status: 'accepted',
        incomingCandidateCount: 0,
        evidence: {
          declaredPrimaryKey: true,
          targetUniqueness: 1,
          incomingAcceptedCount: 0,
          incomingReviewCount: 0,
          reasons: ['declared_primary_key'],
        },
      }),
    );
  });
}
/** Flattens the enabled tables into (table, column) pairs, in table then column order. */
function schemaTargetColumns(schema: KloEnrichedSchema): Array<{ table: KloEnrichedTable; column: KloEnrichedColumn }> {
  const pairs: Array<{ table: KloEnrichedTable; column: KloEnrichedColumn }> = [];
  for (const table of schema.tables) {
    if (!table.enabled) {
      continue;
    }
    for (const column of table.columns) {
      pairs.push({ table, column });
    }
  }
  return pairs;
}
/** Looks up the profiled uniqueness ratio of a column; 0 when the column has no profile. */
function profileUniqueness(profiles: KloRelationshipProfileArtifact, tableName: string, columnName: string): number {
  const columnProfile = profiles.columns[`${tableName}.${columnName}`];
  return columnProfile?.uniquenessRatio ?? 0;
}
/** Looks up the profiled null rate of a column; 1 (all null) when the column has no profile. */
function profileNullRate(profiles: KloRelationshipProfileArtifact, tableName: string, columnName: string): number {
  const columnProfile = profiles.columns[`${tableName}.${columnName}`];
  return columnProfile?.nullRate ?? 1;
}
/** True when the profile artifact has an entry for `<tableName>.<columnName>`. */
function profileColumnExists(profiles: KloRelationshipProfileArtifact, tableName: string, columnName: string): boolean {
  const profileKey = `${tableName}.${columnName}`;
  return Boolean(profiles.columns[profileKey]);
}
/**
 * Scores how strongly a column name looks like a primary key for its table.
 *
 * Using the normalized column name and the singular normalized table name:
 * `id` scores 1, `<table>_id` 0.96, `<table>_key` 0.88, `key` or `uuid` 0.76,
 * and anything else 0. The first matching rule wins.
 */
function profileOnlyPkNameScore(tableName: string, columnName: string): number {
  const singularTable = normalizeKloRelationshipName(tableName).singular;
  const normalizedColumn = normalizeKloRelationshipName(columnName).normalized;
  switch (normalizedColumn) {
    case 'id':
      return 1;
    case `${singularTable}_id`:
      return 0.96;
    case `${singularTable}_key`:
      return 0.88;
    case 'key':
    case 'uuid':
      return 0.76;
    default:
      return 0;
  }
}
/**
 * Penalizes column names that look like measures rather than keys.
 *
 * The normalized column name is split on `_`; if any non-empty token is in
 * `PROFILE_ONLY_PK_MEASURE_NAME_TOKENS` the column scores 0, otherwise 1.
 */
function profileOnlyPkTypeCompatibility(columnName: string): number {
  const normalizedName = normalizeKloRelationshipName(columnName).normalized;
  for (const token of normalizedName.split('_')) {
    if (token && PROFILE_ONLY_PK_MEASURE_NAME_TOKENS.has(token)) {
      return 0;
    }
  }
  return 1;
}
/**
 * Builds primary-key evidence for a column from profile statistics alone.
 *
 * Returns `null` when the column has no profile entry, when its uniqueness is
 * below 0.98 or its null rate is above 0.05, or when the weighted score from
 * `scoreKloRelationshipCandidate` is below `DEFAULT_SETTINGS.reviewThreshold`.
 *
 * @param input - Profile artifact plus the table/column to evaluate.
 * @returns Name score, null rate, uniqueness, the weighted `pkScore`, and
 *   `weakName` (name score below 0.74), or `null` if the column does not qualify.
 */
function profileOnlyPkEvidence(input: {
  profiles: KloRelationshipProfileArtifact;
  tableName: string;
  columnName: string;
}): { nameScore: number; nullRate: number; uniqueness: number; pkScore: number; weakName: boolean } | null {
  const { profiles, tableName, columnName } = input;
  if (!profileColumnExists(profiles, tableName, columnName)) {
    return null;
  }

  const uniqueness = profileUniqueness(profiles, tableName, columnName);
  const nullRate = profileNullRate(profiles, tableName, columnName);
  const nameScore = profileOnlyPkNameScore(tableName, columnName);

  // Hard gates: the column must be (almost) unique and (almost) never null.
  const tooManyDuplicates = uniqueness < 0.98;
  const tooManyNulls = nullRate > 0.05;
  if (tooManyDuplicates || tooManyNulls) {
    return null;
  }

  // No source column exists for a profile-only key, so overlap and embedding
  // signals are zero and carry zero weight.
  const signals = {
    nameSimilarity: nameScore,
    typeCompatibility: profileOnlyPkTypeCompatibility(columnName),
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: uniqueness,
    profileNullRate: 1 - nullRate,
    structuralPrior: 0.65,
  };
  const weights = {
    nameSimilarity: 0.2,
    typeCompatibility: 0.08,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.48,
    profileNullRate: 0.2,
    structuralPrior: 0.04,
  };
  const { score } = scoreKloRelationshipCandidate(signals, weights);
  if (score < DEFAULT_SETTINGS.reviewThreshold) {
    return null;
  }

  return { nameScore, nullRate, uniqueness, pkScore: score, weakName: nameScore < 0.74 };
}
/**
 * Resolves the primary-key entry for one target column.
 *
 * Resolution order:
 * 1. A declared primary key is returned unchanged.
 * 2. With profile-only evidence, its precomputed `pkScore` is used; acceptance
 *    is only allowed when the name is not weak, and all incoming counters are
 *    reported as zero.
 * 3. Otherwise the score blends profiled uniqueness (0.52), the best incoming
 *    candidate score (0.28), the best incoming target-key score (0.12) and a
 *    capped incoming-volume term (0.08).
 *
 * @param input - Target column, optional declared/profile-only evidence, the
 *   incoming candidates pointing at the column, and resolver settings.
 * @returns The resolved primary-key entry with its evidence and reasons.
 */
function resolveTargetPk(input: {
  table: string;
  column: string;
  declared: KloResolvedRelationshipPk | undefined;
  profiles: KloRelationshipProfileArtifact;
  incoming: readonly KloValidatedRelationshipDiscoveryCandidate[];
  settings: KloRelationshipGraphResolverSettings;
  profileOnly?: { nameScore: number; nullRate: number; uniqueness: number; pkScore: number; weakName: boolean } | null;
}): KloResolvedRelationshipPk {
  const { declared, incoming, profileOnly, settings } = input;
  if (declared) {
    return declared;
  }

  const targetUniqueness = profileUniqueness(input.profiles, input.table, input.column);
  const acceptedCount = incoming.filter((candidate) => candidate.status === 'accepted').length;
  const reviewCount = incoming.filter((candidate) => candidate.status === 'review').length;
  const keyEvidence = Math.max(0, ...incoming.map((candidate) => candidate.evidence.targetKeyScore));

  // Reasons are appended in a fixed order; consumers may rely on it.
  const reasons: string[] = [];
  if (targetUniqueness >= 0.9) {
    reasons.push('unique_target_column');
  }
  if (acceptedCount > 0) {
    reasons.push('incoming_validated_reference');
  }
  if (reviewCount > 0) {
    reasons.push('incoming_review_reference');
  }
  if (keyEvidence >= 0.8) {
    reasons.push('target_key_like');
  }
  if (incoming.length === 0) {
    reasons.push('no_incoming_references');
  }

  if (profileOnly) {
    reasons.push(
      'not_null_profile',
      'profile_only_primary_key',
      profileOnly.weakName ? 'weak_name_profile_key' : 'profile_key_name',
    );
    return {
      table: input.table,
      columns: [input.column],
      pkScore: profileOnly.pkScore,
      status: statusForScore(profileOnly.pkScore, settings, !profileOnly.weakName),
      incomingCandidateCount: 0,
      evidence: {
        declaredPrimaryKey: false,
        targetUniqueness,
        incomingAcceptedCount: 0,
        incomingReviewCount: 0,
        reasons,
      },
    };
  }

  const incomingQuality = Math.max(0, ...incoming.map((candidate) => candidate.score));
  const incomingVolume = Math.min(1, acceptedCount * 0.3 + reviewCount * 0.15);
  const pkScore = roundScore(0.52 * targetUniqueness + 0.28 * incomingQuality + 0.12 * keyEvidence + 0.08 * incomingVolume);

  let status: KloResolvedRelationshipPk['status'];
  if (reviewCount > 0 && pkScore < settings.reviewThreshold) {
    // A pending review reference keeps the key in review even when its own score is low.
    status = 'review';
  } else {
    const acceptedAllowed = acceptedCount > 0 && targetUniqueness >= 0.9;
    status = statusForScore(pkScore, settings, acceptedAllowed);
  }

  return {
    table: input.table,
    columns: [input.column],
    pkScore,
    status,
    incomingCandidateCount: incoming.length,
    evidence: {
      declaredPrimaryKey: false,
      targetUniqueness,
      incomingAcceptedCount: acceptedCount,
      incomingReviewCount: reviewCount,
      reasons,
    },
  };
}
/**
 * Produces the pre-conflict resolution of one validated candidate against its
 * resolved target primary key.
 *
 * The foreign-key score blends the candidate score (0.48), the target PK score
 * (0.30), the candidate confidence (0.14) and a validation-pass bonus (0.08).
 * Status rules:
 * - an upstream-rejected candidate stays `rejected`;
 * - a candidate whose validation was unavailable is forced to `review`, and
 *   its score is raised to at least `settings.reviewThreshold`;
 * - otherwise `statusForScore` decides, with acceptance allowed only for an
 *   upstream-accepted candidate with a sufficient target PK score and, if
 *   `settings.validationRequiredForManifest` is set, a passed validation.
 *
 * @returns The candidate extended with `status`, `pkScore`, `fkScore` and a
 *   `graph` section (`conflictRank` starts at 1).
 */
function baseRelationshipResolution(input: {
  candidate: KloValidatedRelationshipDiscoveryCandidate;
  pk: KloResolvedRelationshipPk;
  settings: KloRelationshipGraphResolverSettings;
}): KloResolvedRelationshipDiscoveryCandidate {
  const { candidate, pk, settings } = input;
  const rejectedUpstream = candidate.status === 'rejected';
  const validationUnavailable = candidateIsValidationUnavailable(candidate);
  const validationPassed = candidateHasValidationPassed(candidate);
  const targetPkStrongEnough = pk.pkScore >= settings.minTargetPkScoreForAcceptance;

  const reasons: string[] = [];
  if (rejectedUpstream) {
    reasons.push('candidate_validation_rejected');
  }
  if (validationUnavailable) {
    reasons.push('validation_unavailable_review_only');
  }
  reasons.push(targetPkStrongEnough ? 'target_pk_score_passed' : 'target_pk_score_low');
  if (validationPassed) {
    reasons.push('validation_passed');
  }

  let fkScore = roundScore(
    0.48 * candidate.score + 0.3 * pk.pkScore + 0.14 * candidate.confidence + 0.08 * (validationPassed ? 1 : 0),
  );

  let status: KloResolvedRelationshipStatus;
  if (rejectedUpstream) {
    status = 'rejected';
  } else if (validationUnavailable) {
    status = 'review';
    fkScore = Math.max(fkScore, settings.reviewThreshold);
  } else {
    const acceptedAllowed =
      candidate.status === 'accepted' &&
      targetPkStrongEnough &&
      (!settings.validationRequiredForManifest || validationPassed);
    status = statusForScore(fkScore, settings, acceptedAllowed);
  }

  reasons.push(
    status === 'accepted' ? 'fk_score_passed' : status === 'review' ? 'fk_score_review' : 'fk_score_rejected',
  );

  return {
    ...candidate,
    status,
    pkScore: pk.pkScore,
    fkScore,
    graph: {
      targetPkScore: pk.pkScore,
      incomingCandidateCount: pk.incomingCandidateCount,
      conflictRank: 1,
      reasons,
    },
  };
}
/**
 * Sort comparator that puts the strongest relationship first.
 *
 * Tie-break chain: higher `fkScore`, then higher `validation.sourceCoverage`,
 * then higher `pkScore`, then ascending `candidateSortKey` for a stable order.
 */
function relationshipRank(
  left: KloResolvedRelationshipDiscoveryCandidate,
  right: KloResolvedRelationshipDiscoveryCandidate,
): number {
  const byFkScore = right.fkScore - left.fkScore;
  if (byFkScore) {
    return byFkScore;
  }
  const byCoverage = right.validation.sourceCoverage - left.validation.sourceCoverage;
  if (byCoverage) {
    return byCoverage;
  }
  const byPkScore = right.pkScore - left.pkScore;
  if (byPkScore) {
    return byPkScore;
  }
  return candidateSortKey(left).localeCompare(candidateSortKey(right));
}
/**
 * Ensures at most one accepted relationship per source endpoint.
 *
 * Relationships are grouped by `sourceKey(relationship.from)` and ranked
 * within each group by `relationshipRank`. The best-ranked accepted
 * relationship keeps its status; every later accepted one in the same group
 * is demoted to `rejected`, with `fk_score_passed` removed from its reasons
 * and `conflict_lost` appended. Every relationship gets its 1-based
 * `graph.conflictRank` within its group. Input objects are not mutated.
 *
 * @param relationships - Resolved relationships before conflict handling.
 * @returns New relationship objects, sorted globally by `relationshipRank`.
 */
function applySourceConflicts(
  relationships: readonly KloResolvedRelationshipDiscoveryCandidate[],
): KloResolvedRelationshipDiscoveryCandidate[] {
  // Append to the existing group in place; re-spreading the group on every
  // insert made grouping quadratic in the group size.
  const bySource = new Map<string, KloResolvedRelationshipDiscoveryCandidate[]>();
  for (const relationship of relationships) {
    const key = sourceKey(relationship.from);
    const group = bySource.get(key);
    if (group) {
      group.push(relationship);
    } else {
      bySource.set(key, [relationship]);
    }
  }

  const resolved: KloResolvedRelationshipDiscoveryCandidate[] = [];
  for (const group of bySource.values()) {
    // The group arrays are local to this function, so sorting in place is safe.
    group.sort(relationshipRank);
    let acceptedSeen = false;
    group.forEach((relationship, index) => {
      const conflictRank = index + 1;
      if (relationship.status === 'accepted' && acceptedSeen) {
        // A better-ranked accepted relationship already claimed this source.
        resolved.push({
          ...relationship,
          status: 'rejected',
          graph: {
            ...relationship.graph,
            conflictRank,
            reasons: [...relationship.graph.reasons.filter((reason) => reason !== 'fk_score_passed'), 'conflict_lost'],
          },
        });
        return;
      }
      if (relationship.status === 'accepted') {
        acceptedSeen = true;
      }
      resolved.push({
        ...relationship,
        graph: {
          ...relationship.graph,
          conflictRank,
        },
      });
    });
  }
  return resolved.sort(relationshipRank);
}
/**
 * Resolves primary keys and foreign-key relationships for a scanned schema.
 *
 * Steps:
 * 1. Collect declared primary keys and group the validated candidates by the
 *    endpoint they point at.
 * 2. For every column of every enabled table that is a declared primary key,
 *    has incoming candidates, or qualifies as a profile-only key, resolve a
 *    primary-key entry.
 * 3. Resolve each candidate against its target's primary-key entry, creating
 *    an entry on the fly when the target column was not covered in step 2.
 * 4. Apply per-source conflict resolution to the relationships.
 *
 * @param input - Schema, profile artifact, validated candidates and optional settings.
 * @returns Primary keys sorted by descending score (ties by key), and the
 *   conflict-resolved relationships.
 */
export function resolveKloRelationshipGraph(
  input: ResolveKloRelationshipGraphInput,
): KloRelationshipGraphResolutionResult {
  const settings = mergeSettings(input.settings);
  const declared = declaredPrimaryKeys(input.schema);
  const declaredByKey = new Map(declared.map((pk) => [pkKey(pk), pk]));

  // Append to the existing bucket in place; re-spreading the bucket on every
  // insert made grouping quadratic for heavily referenced targets.
  const incomingByTarget = new Map<string, KloValidatedRelationshipDiscoveryCandidate[]>();
  for (const candidate of input.candidates) {
    const key = endpointKey(candidate.to);
    const bucket = incomingByTarget.get(key);
    if (bucket) {
      bucket.push(candidate);
    } else {
      incomingByTarget.set(key, [candidate]);
    }
  }

  const pkCandidates = new Map<string, KloResolvedRelationshipPk>();
  for (const item of schemaTargetColumns(input.schema)) {
    const key = `${item.table.ref.name}.(${item.column.name})`;
    // NOTE(review): this lookup builds "<table>.<column>" by hand while the map
    // is keyed by endpointKey(candidate.to) — confirm both produce the same string.
    const incoming = incomingByTarget.get(`${item.table.ref.name}.${item.column.name}`) ?? [];
    // Profile-only evidence is only considered for undeclared columns nobody references.
    const profileOnly =
      incoming.length === 0 && !item.column.primaryKey
        ? profileOnlyPkEvidence({
            profiles: input.profiles,
            tableName: item.table.ref.name,
            columnName: item.column.name,
          })
        : null;
    if (incoming.length === 0 && !item.column.primaryKey && !profileOnly) {
      continue;
    }
    const pk = resolveTargetPk({
      table: item.table.ref.name,
      column: item.column.name,
      declared: declaredByKey.get(key),
      profiles: input.profiles,
      incoming,
      settings,
      profileOnly,
    });
    pkCandidates.set(key, pk);
  }

  const relationships = input.candidates.map((candidate) => {
    const toColumn = singleRelationshipColumn(candidate.to);
    const key = `${candidate.to.table.name}.(${toColumn})`;
    // Targets not seen in the schema pass above get a primary-key entry resolved
    // here from their incoming candidates, and it is cached for later candidates.
    const pk =
      pkCandidates.get(key) ??
      resolveTargetPk({
        table: candidate.to.table.name,
        column: toColumn,
        declared: undefined,
        profiles: input.profiles,
        incoming: incomingByTarget.get(endpointKey(candidate.to)) ?? [],
        settings,
        profileOnly: null,
      });
    pkCandidates.set(key, pk);
    return baseRelationshipResolution({ candidate, pk, settings });
  });

  return {
    pks: Array.from(pkCandidates.values()).sort(
      (left, right) => right.pkScore - left.pkScore || pkKey(left).localeCompare(pkKey(right)),
    ),
    relationships: applySourceConflicts(relationships),
  };
}

View file

@ -0,0 +1,240 @@
import type { KloLlmProvider } from '@klo/llm';
import { describe, expect, it, vi } from 'vitest';
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import { proposeKloRelationshipCandidatesWithLlm } from './relationship-llm-proposal.js';
/**
 * Builds a fully mocked `KloLlmProvider` for the proposal tests.
 *
 * Every member is a `vi.fn` mock. Model lookups return a fixed model object
 * whose `provider` field is the given backend name, and `activeBackend`
 * reports that same name. Prompt caching is reported as disabled.
 *
 * @param provider - Backend name to expose; defaults to `'anthropic'`.
 */
function llmProvider(provider = 'anthropic'): KloLlmProvider {
  const model = { modelId: 'claude-sonnet-4-6', provider };

  const getModel = vi.fn(() => model as ReturnType<KloLlmProvider['getModel']>);
  const getModelByName = vi.fn(() => model as ReturnType<KloLlmProvider['getModelByName']>);
  const cacheMarker = vi.fn();
  const repairToolCallHandler = vi.fn();
  const thinkingProviderOptions = vi.fn(() => ({}));
  const telemetryConfig = vi.fn(() => undefined);
  const promptCachingConfig = vi.fn(
    () =>
      ({
        enabled: false,
        systemTtl: '1h',
        toolsTtl: '1h',
        historyTtl: '5m',
        cacheSystem: true,
        cacheTools: true,
        cacheHistory: true,
        vertexFallbackTo5m: false,
      }) as ReturnType<KloLlmProvider['promptCachingConfig']>,
  );
  const activeBackend = vi.fn(() => provider as ReturnType<KloLlmProvider['activeBackend']>);

  return {
    getModel,
    getModelByName,
    cacheMarker,
    repairToolCallHandler,
    thinkingProviderOptions,
    telemetryConfig,
    promptCachingConfig,
    activeBackend,
  };
}
/**
 * Test fixture: builds an enriched column with integer defaults.
 *
 * The id is `"<tableId>.<name>"` and the table ref defaults to a
 * catalog-less, db-less ref named after `tableId`. Any key present in
 * `overrides` replaces the corresponding default verbatim.
 */
function column(tableId: string, name: string, overrides: Partial<KloEnrichedColumn> = {}): KloEnrichedColumn {
  const defaults: KloEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
/**
 * Test fixture: builds an enabled enriched table named `name`.
 *
 * The table id equals its name. Each column is shallow-copied and re-pointed
 * at this table (`tableId` and `tableRef`), so the inputs are left untouched.
 */
function table(name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KloEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
/**
 * Test fixture: a two-table `warehouse` schema with no declared relationships.
 *
 * `customers` has `id` (not nullable) and a text `email`; `orders` has `id`
 * (not nullable) and `buyer_ref`, whose name gives no hint of its target.
 */
function schema(): KloEnrichedSchema {
  const customers = table('customers', [
    column('customers', 'id', { nullable: false }),
    column('customers', 'email', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const orders = table('orders', [column('orders', 'id', { nullable: false }), column('orders', 'buyer_ref')]);
  return {
    connectionId: 'warehouse',
    relationships: [],
    tables: [customers, orders],
  };
}
/**
 * Test fixture: a SQL-backed profile artifact for the `warehouse` connection.
 *
 * Both tables have two rows. `customers.id` and `orders.buyer_ref` are
 * profiled as fully unique, never-null integer columns with samples `1`, `2`.
 */
function profile(): KloRelationshipProfileArtifact {
  const tableRef = (name: string): KloRelationshipProfileArtifact['tables'][number]['table'] => ({
    catalog: null,
    db: null,
    name,
  });
  const twoRowIntegerColumn = (
    tableName: string,
    columnName: string,
  ): KloRelationshipProfileArtifact['columns'][string] => ({
    table: tableRef(tableName),
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 2,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2'],
    minTextLength: 1,
    maxTextLength: 1,
  });
  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 4,
    warnings: [],
    tables: [
      { table: tableRef('customers'), rowCount: 2 },
      { table: tableRef('orders'), rowCount: 2 },
    ],
    columns: {
      'customers.id': twoRowIntegerColumn('customers', 'id'),
      'orders.buyer_ref': twoRowIntegerColumn('orders', 'buyer_ref'),
    },
  };
}
// Suite for proposeKloRelationshipCandidatesWithLlm. Every test injects its own
// `generateText` stub, so no real model is ever called.
describe('relationship LLM proposals', () => {
// Happy path: the model proposes customers.id as a PK and orders.buyer_ref -> customers.id
// as an FK. The FK must come back as a single `review` candidate carrying the LLM
// confidence and rationale, with the extra `llm_pk_proposal` reason because the
// target column was also proposed as a primary key.
it('maps valid structured FK proposals into review candidates with rationale evidence', async () => {
const generateText = vi.fn(async () => ({
output: {
pkCandidates: [{ table: 'customers', column: 'id', confidence: 0.94, rationale: 'Unique customer identifier.' }],
fkCandidates: [
{
fromTable: 'orders',
fromColumn: 'buyer_ref',
toTable: 'customers',
toColumn: 'id',
confidence: 0.88,
rationale: 'Buyer reference values match customer identifiers.',
},
],
},
}));
const result = await proposeKloRelationshipCandidatesWithLlm({
connectionId: 'warehouse',
schema: schema(),
profile: profile(),
llmProvider: llmProvider(),
generateText,
});
expect(result.summary).toBe('completed');
expect(result.llmCalls).toBe(1);
expect(result.warnings).toEqual([]);
expect(result.candidates).toHaveLength(1);
expect(result.candidates[0]).toMatchObject({
from: { tableId: 'orders', columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
to: { tableId: 'customers', columnIds: ['customers.id'], columns: ['id'] },
source: 'llm_proposal',
status: 'review',
evidence: {
llmConfidence: 0.88,
llmRationale: 'Buyer reference values match customer identifiers.',
reasons: ['llm_proposal', 'llm_pk_proposal'],
},
});
// The user message must embed the serialized evidence packet (it contains a "tables" key).
expect(generateText).toHaveBeenCalledWith(
expect.objectContaining({
messages: expect.arrayContaining([
expect.objectContaining({
role: 'user',
content: expect.stringContaining('"tables"'),
}),
]),
}),
);
});
// A provider whose model reports the `deterministic` backend must short-circuit:
// no candidates, no warnings, zero LLM calls, and generateText is never invoked.
it('skips deterministic providers without calling generateText', async () => {
const generateText = vi.fn();
const result = await proposeKloRelationshipCandidatesWithLlm({
connectionId: 'warehouse',
schema: schema(),
profile: profile(),
llmProvider: llmProvider('deterministic'),
generateText,
});
expect(result).toMatchObject({ candidates: [], llmCalls: 0, summary: 'skipped' });
expect(result.warnings).toEqual([]);
expect(generateText).not.toHaveBeenCalled();
});
// Two failure modes, both reported as recoverable warnings rather than thrown errors.
it('returns recoverable warnings for invalid references and generation failures', async () => {
// Case 1: the proposal names a source column that is not in the schema. The run
// still completes, but the proposal is dropped with an invalid-reference warning.
const invalidReference = await proposeKloRelationshipCandidatesWithLlm({
connectionId: 'warehouse',
schema: schema(),
profile: profile(),
llmProvider: llmProvider(),
generateText: vi.fn(async () => ({
output: {
pkCandidates: [],
fkCandidates: [
{
fromTable: 'orders',
fromColumn: 'missing_column',
toTable: 'customers',
toColumn: 'id',
confidence: 0.7,
rationale: 'Invalid source column.',
},
],
},
})),
});
expect(invalidReference.candidates).toEqual([]);
expect(invalidReference.summary).toBe('completed');
expect(invalidReference.warnings[0]).toMatchObject({
code: 'relationship_llm_invalid_reference',
recoverable: true,
});
// Case 2: generation itself throws. The result is `failed`, the attempted call is
// still counted, and the error message is surfaced in the warning.
const failed = await proposeKloRelationshipCandidatesWithLlm({
connectionId: 'warehouse',
schema: schema(),
profile: profile(),
llmProvider: llmProvider(),
generateText: vi.fn(async () => {
throw new Error('model unavailable');
}),
});
expect(failed).toMatchObject({ candidates: [], llmCalls: 1, summary: 'failed' });
expect(failed.warnings[0]).toMatchObject({
code: 'relationship_llm_proposal_failed',
message: 'KLO relationship LLM proposal failed: model unavailable',
recoverable: true,
});
});
});

View file

@ -0,0 +1,281 @@
import type { KloLlmProvider } from '@klo/llm';
import type { generateText } from 'ai';
import { z } from 'zod';
import { generateKloObject } from '../llm/index.js';
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
import {
normalizeKloRelationshipName,
type KloRelationshipDiscoveryCandidate,
} from './relationship-candidates.js';
import type { KloRelationshipColumnProfile, KloRelationshipProfileArtifact } from './relationship-profiling.js';
import type { KloScanEnrichmentSummary, KloScanWarning, KloTableRef } from './types.js';
/**
 * Structured output expected from the model: proposed primary keys and
 * proposed foreign keys, each with a confidence and a free-text rationale.
 * Confidence is an unbounded number here; FK confidences are clamped to
 * [0, 1] when proposals are mapped to candidates.
 */
const relationshipLlmProposalSchema = z.object({
pkCandidates: z.array(
z.object({
table: z.string(),
column: z.string(),
confidence: z.number(),
rationale: z.string(),
}),
),
fkCandidates: z.array(
z.object({
fromTable: z.string(),
fromColumn: z.string(),
toTable: z.string(),
toColumn: z.string(),
confidence: z.number(),
rationale: z.string(),
}),
),
});
/** Parsed shape of the model output, derived from the zod schema above. */
type KloRelationshipLlmProposalOutput = z.infer<typeof relationshipLlmProposalSchema>;
/** First argument of the `ai` package's `generateText`. */
type GenerateTextInput = Parameters<typeof generateText>[0];
/**
 * Injectable text-generation function. It takes the same input as `generateText`
 * and resolves to an object that may carry raw `text` and/or structured `output`.
 */
export type KloRelationshipLlmProposalGenerateText = (
input: GenerateTextInput,
) => Promise<{ text?: string; output?: unknown }>;
/** Limits applied to the evidence packet, plus the FK confidence cut-off. */
export interface KloRelationshipLlmProposalSettings {
/** Maximum number of enabled tables included in the evidence packet. */
maxTablesPerBatch: number;
/** Maximum number of columns included per table. */
maxColumnsPerTable: number;
/** Maximum number of profiled sample values included per column. */
maxSampleValuesPerColumn: number;
/** FK proposals with a confidence below this value are dropped. */
minConfidence: number;
}
/** Input of `proposeKloRelationshipCandidatesWithLlm`. */
export interface ProposeKloRelationshipCandidatesWithLlmInput {
connectionId: string;
schema: KloEnrichedSchema;
profile: KloRelationshipProfileArtifact;
/** When `null`, or when its model is deterministic, the proposal step is skipped. */
llmProvider: KloLlmProvider | null;
/** Partial overrides layered on top of `DEFAULT_SETTINGS`. */
settings?: Partial<KloRelationshipLlmProposalSettings>;
/** Optional generation function forwarded to `generateKloObject` (used by tests). */
generateText?: KloRelationshipLlmProposalGenerateText;
}
/** Outcome of the proposal step. */
export interface KloRelationshipLlmProposalResult {
/** FK candidates mapped from valid proposals; they are always created with status `review`. */
candidates: KloRelationshipDiscoveryCandidate[];
/** Recoverable warnings (invalid references, generation failures). */
warnings: KloScanWarning[];
/** Number of LLM calls attempted (0 when skipped, 1 otherwise). */
llmCalls: number;
summary: KloScanEnrichmentSummary['llmRelationshipValidation'];
}
/** Settings used when the caller supplies no overrides. */
const DEFAULT_SETTINGS: KloRelationshipLlmProposalSettings = {
maxTablesPerBatch: 40,
maxColumnsPerTable: 80,
maxSampleValuesPerColumn: 5,
minConfidence: 0.55,
};
/**
 * Resolves the effective proposal settings by layering caller overrides on
 * top of `DEFAULT_SETTINGS`.
 *
 * Overrides whose value is `undefined` are ignored. A plain object spread
 * would copy an explicit `undefined` over the default, and an undefined
 * `minConfidence` silently disables the confidence filter because
 * `x < undefined` is always false.
 *
 * @param settings - Optional partial overrides.
 * @returns A fresh settings object with every field defined.
 */
function mergeSettings(
  settings: Partial<KloRelationshipLlmProposalSettings> | undefined,
): KloRelationshipLlmProposalSettings {
  const merged: KloRelationshipLlmProposalSettings = { ...DEFAULT_SETTINGS };
  if (!settings) {
    return merged;
  }
  for (const key of Object.keys(merged) as Array<keyof KloRelationshipLlmProposalSettings>) {
    const override = settings[key];
    if (override !== undefined) {
      merged[key] = override;
    }
  }
  return merged;
}
/**
 * Clamps a confidence into [0, 1] and rounds it to three decimal places.
 */
function clampConfidence(value: number): number {
  const bounded = Math.min(Math.max(value, 0), 1);
  const rounded = bounded.toFixed(3);
  return Number(rounded);
}
/**
 * Tells whether the provider's `candidateExtraction` model reports the
 * `deterministic` backend in its `provider` field.
 */
function modelIsDeterministic(llmProvider: KloLlmProvider): boolean {
  const { provider } = llmProvider.getModel('candidateExtraction') as { provider?: string };
  return provider === 'deterministic';
}
/**
 * Finds the first schema table whose name matches `name`, ignoring case.
 *
 * @returns The matching table, or `null` when none matches.
 */
function findTable(schema: KloEnrichedSchema, name: string): KloEnrichedTable | null {
  const wanted = name.toLowerCase();
  for (const table of schema.tables) {
    if (table.ref.name.toLowerCase() === wanted) {
      return table;
    }
  }
  return null;
}
/**
 * Finds the first column of `table` whose name matches `name`, ignoring case.
 *
 * @returns The matching column, or `null` when none matches.
 */
function findColumn(table: KloEnrichedTable, name: string): KloEnrichedColumn | null {
  const wanted = name.toLowerCase();
  for (const column of table.columns) {
    if (column.name.toLowerCase() === wanted) {
      return column;
    }
  }
  return null;
}
/**
 * Builds the `"<table name>.<column name>"` key used to index `profile.columns`.
 */
function profileKey(table: KloTableRef, column: KloEnrichedColumn): string {
  const tableName = table.name;
  const columnName = column.name;
  return `${tableName}.${columnName}`;
}
/**
 * Returns the profile entry stored for a column, or `null` when none exists.
 *
 * Entries are looked up by the case-sensitive `"<table name>.<column name>"` key.
 */
function profileForColumn(
  profile: KloRelationshipProfileArtifact,
  table: KloEnrichedTable,
  column: KloEnrichedColumn,
): KloRelationshipColumnProfile | null {
  const key = `${table.ref.name}.${column.name}`;
  const entry = profile.columns[key];
  return entry ?? null;
}
/**
 * Returns the profiled row count of a table, matched by name ignoring case.
 *
 * @returns The `rowCount` of the first matching profile table, or `null` when
 *   no table matches or the match has no row count.
 */
function rowCountForTable(profile: KloRelationshipProfileArtifact, table: KloEnrichedTable): number | null {
  for (const item of profile.tables) {
    if (item.table.name.toLowerCase() === table.ref.name.toLowerCase()) {
      return item.rowCount ?? null;
    }
  }
  return null;
}
/**
 * Assembles the compact schema/profile evidence that is serialized into the prompt.
 *
 * Only enabled tables are included, capped at `settings.maxTablesPerBatch`;
 * each table lists at most `settings.maxColumnsPerTable` columns, and each
 * profiled column at most `settings.maxSampleValuesPerColumn` sample values.
 * Columns without a profile entry get `profile: null`. Property order is
 * part of the output because the packet is JSON-serialized.
 */
function buildEvidencePacket(
  schema: KloEnrichedSchema,
  profile: KloRelationshipProfileArtifact,
  settings: KloRelationshipLlmProposalSettings,
): Record<string, unknown> {
  const describeColumn = (table: KloEnrichedTable, column: KloEnrichedColumn) => {
    const columnProfile = profileForColumn(profile, table, column);
    const profileSummary = columnProfile
      ? {
          rowCount: columnProfile.rowCount,
          nullCount: columnProfile.nullCount,
          distinctCount: columnProfile.distinctCount,
          uniquenessRatio: columnProfile.uniquenessRatio,
          nullRate: columnProfile.nullRate,
          sampleValues: columnProfile.sampleValues.slice(0, settings.maxSampleValuesPerColumn),
        }
      : null;
    return {
      name: column.name,
      nativeType: column.nativeType,
      normalizedType: column.normalizedType,
      dimensionType: column.dimensionType,
      nullable: column.nullable,
      declaredPrimaryKey: column.primaryKey,
      profile: profileSummary,
    };
  };

  const describeTable = (table: KloEnrichedTable) => ({
    name: table.ref.name,
    catalog: table.ref.catalog,
    db: table.ref.db,
    rowCount: rowCountForTable(profile, table),
    columns: table.columns.slice(0, settings.maxColumnsPerTable).map((column) => describeColumn(table, column)),
  });

  const includedTables = schema.tables.filter((table) => table.enabled).slice(0, settings.maxTablesPerBatch);
  return {
    connectionId: schema.connectionId,
    sqlAvailable: profile.sqlAvailable,
    tables: includedTables.map(describeTable),
  };
}
/**
 * Builds a case-insensitive `"<table>.<column>"` key for matching PK proposals.
 */
function pkProposalKey(table: string, column: string): string {
  const tablePart = table.toLowerCase();
  const columnPart = column.toLowerCase();
  return `${tablePart}.${columnPart}`;
}
/**
 * Builds a single-column relationship endpoint for the given table and column.
 *
 * The endpoint carries the table id and ref plus one-element `columnIds` and
 * `columns` arrays.
 */
function endpoint(table: KloEnrichedTable, column: KloEnrichedColumn) {
  const columnIds = [column.id];
  const columns = [column.name];
  return { tableId: table.id, columnIds, table: table.ref, columns };
}
/**
 * Builds the candidate id `"<from table id>:(<from column id>)-><to table id>:(<to column id>)"`.
 */
function relationshipId(
  fromTable: KloEnrichedTable,
  fromColumn: KloEnrichedColumn,
  toTable: KloEnrichedTable,
  toColumn: KloEnrichedColumn,
): string {
  const source = `${fromTable.id}:(${fromColumn.id})`;
  const target = `${toTable.id}:(${toColumn.id})`;
  return `${source}->${target}`;
}
/**
 * Creates a recoverable `relationship_llm_invalid_reference` warning.
 *
 * @param message - Human-readable description of the problem.
 * @param metadata - Extra context attached to the warning as-is.
 */
function invalidReferenceWarning(message: string, metadata: Record<string, unknown>): KloScanWarning {
  const warning: KloScanWarning = {
    code: 'relationship_llm_invalid_reference',
    message,
    recoverable: true,
    metadata,
  };
  return warning;
}
/**
 * Turns the model's FK proposals into `review` relationship candidates.
 *
 * - Proposals below `settings.minConfidence` are dropped silently.
 * - Proposals naming a table or column that is not in the schema
 *   (case-insensitive lookup) are dropped with an invalid-reference warning.
 * - The target-key score is 0.88 when the target column was also proposed as
 *   a primary key (adding the `llm_pk_proposal` reason), otherwise 0.68.
 *
 * @returns Candidates and warnings, each in proposal order.
 */
function mapValidProposals(
  schema: KloEnrichedSchema,
  output: KloRelationshipLlmProposalOutput,
  settings: KloRelationshipLlmProposalSettings,
): { candidates: KloRelationshipDiscoveryCandidate[]; warnings: KloScanWarning[] } {
  const proposedPkKeys = new Set<string>();
  for (const pkProposal of output.pkCandidates) {
    proposedPkKeys.add(pkProposalKey(pkProposal.table, pkProposal.column));
  }

  const candidates: KloRelationshipDiscoveryCandidate[] = [];
  const warnings: KloScanWarning[] = [];
  output.fkCandidates.forEach((proposal) => {
    if (proposal.confidence < settings.minConfidence) {
      return;
    }

    const fromTable = findTable(schema, proposal.fromTable);
    const toTable = findTable(schema, proposal.toTable);
    const fromColumn = fromTable && findColumn(fromTable, proposal.fromColumn);
    const toColumn = toTable && findColumn(toTable, proposal.toColumn);

    if (fromTable && toTable && fromColumn && toColumn) {
      const targetIsProposedPk = proposedPkKeys.has(pkProposalKey(toTable.ref.name, toColumn.name));
      const confidence = clampConfidence(proposal.confidence);
      candidates.push({
        id: relationshipId(fromTable, fromColumn, toTable, toColumn),
        from: endpoint(fromTable, fromColumn),
        to: endpoint(toTable, toColumn),
        source: 'llm_proposal',
        status: 'review',
        relationshipType: 'many_to_one',
        confidence,
        evidence: {
          sourceColumnBase: normalizeKloRelationshipName(fromColumn.name).singular,
          targetTableBase: normalizeKloRelationshipName(toTable.ref.name).singular,
          targetColumnBase: normalizeKloRelationshipName(toColumn.name).singular,
          targetKeyScore: targetIsProposedPk ? 0.88 : 0.68,
          nameScore: 0.45,
          reasons: targetIsProposedPk ? ['llm_proposal', 'llm_pk_proposal'] : ['llm_proposal'],
          llmConfidence: confidence,
          llmRationale: proposal.rationale,
        },
      });
    } else {
      warnings.push(
        invalidReferenceWarning('KLO relationship LLM proposal referenced a table or column that is not in the schema.', {
          proposal,
        }),
      );
    }
  });

  return { candidates, warnings };
}
/**
 * Creates a recoverable `relationship_llm_proposal_failed` warning from a thrown value.
 *
 * The detail is the `message` of an `Error`, or the stringified value otherwise.
 */
function generationFailureWarning(error: unknown): KloScanWarning {
  let detail: string;
  if (error instanceof Error) {
    detail = error.message;
  } else {
    detail = String(error);
  }
  return {
    code: 'relationship_llm_proposal_failed',
    message: `KLO relationship LLM proposal failed: ${detail}`,
    recoverable: true,
  };
}
/**
 * Asks the LLM to propose primary/foreign keys and maps valid foreign-key
 * proposals to `review` candidates for later SQL validation.
 *
 * - Without a provider, or with a deterministic model, nothing is called and
 *   the result is `skipped` with zero LLM calls.
 * - Any error thrown by generation, schema parsing or proposal mapping is
 *   converted into a recoverable warning and a `failed` result; the attempted
 *   call is still counted.
 * - Errors raised while merging settings or building the evidence packet are
 *   not caught and propagate to the caller.
 *
 * @param input - Schema, profile artifact, provider and optional overrides.
 * @returns Candidates, warnings, the number of LLM calls, and a summary status.
 */
export async function proposeKloRelationshipCandidatesWithLlm(
  input: ProposeKloRelationshipCandidatesWithLlmInput,
): Promise<KloRelationshipLlmProposalResult> {
  const { llmProvider } = input;
  if (!llmProvider || modelIsDeterministic(llmProvider)) {
    return { candidates: [], warnings: [], llmCalls: 0, summary: 'skipped' };
  }

  const settings = mergeSettings(input.settings);
  const evidence = buildEvidencePacket(input.schema, input.profile, settings);
  // Three instruction paragraphs followed by the serialized evidence packet.
  const promptSections = [
    'You are helping KLO review possible SQL relationships before validation.',
    'Use only the compact schema evidence. Propose likely primary keys and foreign keys for later SQL validation.',
    'Return structured output only; never assume a join is accepted.',
    JSON.stringify(evidence),
  ];
  const prompt = promptSections.join('\n\n');

  try {
    const generated = await generateKloObject<
      KloRelationshipLlmProposalOutput,
      typeof relationshipLlmProposalSchema
    >({
      llmProvider,
      role: 'candidateExtraction',
      prompt,
      schema: relationshipLlmProposalSchema,
      generateText: input.generateText,
    });
    // Re-validate the generated object before mapping it.
    const output = relationshipLlmProposalSchema.parse(generated);
    const { candidates, warnings } = mapValidProposals(input.schema, output, settings);
    return { candidates, warnings, llmCalls: 1, summary: 'completed' };
  } catch (error) {
    return {
      candidates: [],
      warnings: [generationFailureWarning(error)],
      llmCalls: 1,
      summary: 'failed',
    };
  }
}

View file

@ -0,0 +1,151 @@
import { describe, expect, it } from 'vitest';
import type { KloEnrichedColumn, KloEnrichedTable } from './enrichment-types.js';
import { localCandidateTables } from './relationship-locality.js';
/**
 * Test fixture: builds an enriched column with an explicit id.
 *
 * Missing (or null) options fall back to integer defaults, and the table ref
 * defaults to `public.<tableId>` with no catalog. `id`, `tableId` and `name`
 * always come from the positional arguments.
 */
function column(
  tableId: string,
  id: string,
  name: string,
  options: Partial<KloEnrichedColumn> = {},
): KloEnrichedColumn {
  const {
    tableRef,
    nativeType,
    normalizedType,
    dimensionType,
    nullable,
    primaryKey,
    parentColumnId,
    descriptions,
    embedding,
    sampleValues,
    cardinality,
  } = options;
  return {
    id,
    tableId,
    tableRef: tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType: nativeType ?? 'INTEGER',
    normalizedType: normalizedType ?? 'integer',
    dimensionType: dimensionType ?? 'number',
    nullable: nullable ?? true,
    primaryKey: primaryKey ?? false,
    parentColumnId: parentColumnId ?? null,
    descriptions: descriptions ?? {},
    embedding: embedding ?? null,
    sampleValues: sampleValues ?? null,
    cardinality: cardinality ?? null,
  };
}
/**
 * Test fixture: builds an enabled enriched table in the `public` db.
 *
 * Each column is shallow-copied and re-pointed at this table (`tableId` and
 * `tableRef`), so the inputs are left untouched.
 */
function table(id: string, name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const ownedColumns: KloEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: id, tableRef: ref });
  }
  return {
    id,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
// Suite for localCandidateTables: ranking of candidate parent tables for one child
// column, by name-token overlap and by embedding similarity.
describe('relationship locality', () => {
// Album.ArtistId should point at Artist, not at its own table or an unrelated one,
// even when only one parent table is allowed through.
it('ranks the referenced parent table ahead of the child table for id-like source columns', () => {
const artists = table('artist-id', 'Artist', [column('artist-id', 'artist-pk', 'ArtistId')]);
const albums = table('album-id', 'Album', [
column('album-id', 'album-pk', 'AlbumId'),
column('album-id', 'artist-fk', 'ArtistId'),
]);
const unrelated = table('invoice-id', 'Invoice', [column('invoice-id', 'invoice-pk', 'InvoiceId')]);
const ranked = localCandidateTables({
childTable: albums,
childColumn: albums.columns[1]!,
parentTables: [albums, unrelated, artists],
maxParentTables: 1,
});
expect(ranked.map((item) => item.table.ref.name)).toEqual(['Artist']);
expect(ranked[0]).toMatchObject({
score: expect.any(Number),
tokenScore: expect.any(Number),
embeddingScore: 0,
reasons: expect.arrayContaining(['column_table_token_overlap']),
});
});
// The `plan` token of current_plan_code must match the plural table name stg_plans.
it('uses singular and plural variants so plan_code can rank stg_plans', () => {
const plans = table('plans-id', 'stg_plans', [column('plans-id', 'plan-code', 'plan_code')]);
const segments = table('segments-id', 'mart_account_segments', [
column('segments-id', 'current-plan-code', 'current_plan_code', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
}),
]);
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
const ranked = localCandidateTables({
childTable: segments,
childColumn: segments.columns[0]!,
parentTables: [accounts, segments, plans],
maxParentTables: 1,
});
expect(ranked.map((item) => item.table.ref.name)).toEqual(['stg_plans']);
expect(ranked[0]?.tokenScore).toBeGreaterThan(0);
});
// Without an explicit cap, a two-table schema is returned in full (order not asserted).
it('returns all tables when the schema is smaller than the default locality cap', () => {
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
const invoices = table('invoices-id', 'invoices', [
column('invoices-id', 'invoice-id', 'id'),
column('invoices-id', 'account-id', 'account_id'),
]);
const ranked = localCandidateTables({
childTable: invoices,
childColumn: invoices.columns[1]!,
parentTables: [invoices, accounts],
});
expect(ranked.map((item) => item.table.ref.name).sort()).toEqual(['accounts', 'invoices']);
});
// A cap of 0 is honoured literally rather than being replaced by the default cap.
it('supports an explicit zero cap for deterministic tests', () => {
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
const invoices = table('invoices-id', 'invoices', [
column('invoices-id', 'invoice-id', 'id'),
column('invoices-id', 'account-id', 'account_id'),
]);
const ranked = localCandidateTables({
childTable: invoices,
childColumn: invoices.columns[1]!,
parentTables: [invoices, accounts],
maxParentTables: 0,
});
expect(ranked).toEqual([]);
});
// buyer_ref shares no name token with customers; its embedding is nearly parallel
// to customers.id, which must be enough to rank customers first.
it('uses parent-column embeddings when token locality is weak', () => {
const customers = table('customers-id', 'customers', [
column('customers-id', 'customers-id-col', 'id', { embedding: [1, 0, 0] }),
column('customers-id', 'customers-name-col', 'name', {
nativeType: 'TEXT',
normalizedType: 'text',
dimensionType: 'string',
embedding: [0, 1, 0],
}),
]);
const orders = table('orders-id', 'orders', [
column('orders-id', 'orders-id-col', 'id', { embedding: [0, 0, 1] }),
column('orders-id', 'buyer-ref-col', 'buyer_ref', { embedding: [0.995, 0.005, 0] }),
]);
const invoices = table('invoices-id', 'invoices', [column('invoices-id', 'invoice-id', 'id')]);
const ranked = localCandidateTables({
childTable: orders,
childColumn: orders.columns[1]!,
parentTables: [invoices, customers],
maxParentTables: 1,
});
expect(ranked.map((item) => item.table.ref.name)).toEqual(['customers']);
expect(ranked[0]).toMatchObject({
embeddingScore: expect.any(Number),
reasons: expect.arrayContaining(['embedding_similarity']),
});
expect(ranked[0]!.embeddingScore).toBeGreaterThan(0.99);
});
});

View file

@ -0,0 +1,164 @@
import type { KloEnrichedColumn, KloEnrichedTable } from './enrichment-types.js';
import { normalizeKloRelationshipName, tokenizeKloRelationshipName } from './relationship-name-similarity.js';
/** A candidate parent table together with its locality scores for one child column. */
export interface KloRelationshipLocalityCandidateTable {
table: KloEnrichedTable;
/** Overall locality score for this parent table. */
score: number;
/** Name-token overlap component of the score. */
tokenScore: number;
/** Embedding-similarity component of the score. */
embeddingScore: number;
/** Labels explaining which signals contributed. */
reasons: string[];
}
/** Input for ranking candidate parent tables for a single child column. */
export interface LocalKloRelationshipCandidateTablesInput {
childTable: KloEnrichedTable;
childColumn: KloEnrichedColumn;
parentTables: readonly KloEnrichedTable[];
/** Optional cap on the number of parent tables to return. */
maxParentTables?: number;
}
// NOTE(review): presumably the cap applied when `maxParentTables` is omitted — confirm at the use site.
const DEFAULT_MAX_PARENT_TABLES = 20;
// Key-like suffix tokens stripped from a child column name before matching it against table names.
const RELATIONSHIP_SUFFIX_TOKENS = new Set(['id', 'ids', 'key', 'keys', 'code', 'codes', 'uuid', 'uuids']);
/**
 * Clamps a score into [0, 1] and rounds it to three decimal places.
 */
function roundedScore(value: number): number {
  const clamped = value < 0 ? 0 : value > 1 ? 1 : value;
  return Number(clamped.toFixed(3));
}
/**
 * Returns the distinct, non-empty tokens of a name together with the tokens
 * of its singular and plural forms.
 *
 * Order is first occurrence: the name's own tokens, then the singular-form
 * tokens, then the plural-form tokens.
 */
function normalizedTokenVariants(name: string): string[] {
  const normalized = normalizeKloRelationshipName(name);
  const variants = new Set<string>(normalized.tokens);
  for (const token of tokenizeKloRelationshipName(normalized.singular)) {
    variants.add(token);
  }
  for (const token of tokenizeKloRelationshipName(normalized.plural)) {
    variants.add(token);
  }
  return [...variants].filter((token) => Boolean(token));
}
/**
 * Returns the tokens of a child column name that can identify its parent table.
 *
 * Key-like suffix tokens (`RELATIONSHIP_SUFFIX_TOKENS`) are dropped, unless
 * that would leave nothing, in which case all tokens are kept.
 */
function childColumnLocalityTokens(column: KloEnrichedColumn): string[] {
  const allTokens = normalizedTokenVariants(column.name);
  const meaningfulTokens = allTokens.filter((token) => !RELATIONSHIP_SUFFIX_TOKENS.has(token));
  if (meaningfulTokens.length === 0) {
    return allTokens;
  }
  return meaningfulTokens;
}
/**
 * Returns the distinct non-empty strings of `values`, in first-occurrence order.
 */
function uniqueTokens(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value.length > 0) {
      seen.add(value);
    }
  }
  return [...seen];
}
/**
 * Jaccard similarity of two token lists, treated as sets.
 *
 * @returns `|intersection| / |union|`, or `0` when either list is empty.
 */
function jaccard(left: readonly string[], right: readonly string[]): number {
  if (left.length === 0 || right.length === 0) {
    return 0;
  }
  const leftSet = new Set(left);
  const rightSet = new Set(right);
  let shared = 0;
  for (const token of leftSet) {
    if (rightSet.has(token)) {
      shared += 1;
    }
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = leftSet.size + rightSet.size - shared;
  return unionSize === 0 ? 0 : shared / unionSize;
}
/**
 * Cosine similarity of two vectors. Returns 0 when either vector is missing,
 * empty, of a different length, or has zero magnitude.
 */
function cosineSimilarity(left: readonly number[] | null, right: readonly number[] | null): number {
  if (!left || !right) {
    return 0;
  }
  if (left.length === 0 || left.length !== right.length) {
    return 0;
  }
  let dotProduct = 0;
  let leftSquares = 0;
  let rightSquares = 0;
  for (const [position, rawLeft] of left.entries()) {
    // Missing components are treated as zero.
    const leftValue = rawLeft ?? 0;
    const rightValue = right[position] ?? 0;
    dotProduct += leftValue * rightValue;
    leftSquares += leftValue * leftValue;
    rightSquares += rightValue * rightValue;
  }
  const hasZeroVector = leftSquares === 0 || rightSquares === 0;
  if (hasZeroVector) {
    return 0;
  }
  return dotProduct / (Math.sqrt(leftSquares) * Math.sqrt(rightSquares));
}
/**
 * Highest cosine similarity between the child column's embedding and any column
 * embedding of the parent table; 0 when the child has no usable embedding.
 * The result is never below 0 because the running maximum starts at 0.
 */
function parentEmbeddingScore(childColumn: KloEnrichedColumn, parentTable: KloEnrichedTable): number {
  const hasChildEmbedding = Array.isArray(childColumn.embedding) && childColumn.embedding.length > 0;
  if (!hasChildEmbedding) {
    return 0;
  }
  return parentTable.columns.reduce(
    (best, parentColumn) => Math.max(best, cosineSimilarity(childColumn.embedding, parentColumn.embedding)),
    0,
  );
}
/**
 * Token-overlap score between a child column and a candidate parent table name.
 *
 * The column-only overlap is always considered. For a different table, the
 * overlap of the combined child table + column tokens is also considered, but
 * weighted at 0.6. A self-referencing candidate uses the column-only score.
 */
function tableTokenScore(input: {
  childTable: KloEnrichedTable;
  childColumn: KloEnrichedColumn;
  parentTable: KloEnrichedTable;
}): number {
  const { childTable, childColumn, parentTable } = input;
  const childTableTokens = normalizedTokenVariants(childTable.ref.name);
  const childColumnTokens = childColumnLocalityTokens(childColumn);
  const parentTokens = normalizedTokenVariants(parentTable.ref.name);
  const columnOnlyScore = jaccard(childColumnTokens, parentTokens);

  const isSelfReference = parentTable.id === childTable.id;
  if (isSelfReference) {
    return columnOnlyScore;
  }

  const combinedTokens = uniqueTokens([...childTableTokens, ...childColumnTokens]);
  const columnAndTableScore = jaccard(combinedTokens, parentTokens);
  return Math.max(columnOnlyScore, columnAndTableScore * 0.6);
}
/**
 * Scores one candidate parent table for a child column.
 *
 * Without an embedding signal the score is the token score. Otherwise it is the
 * maximum of the token score, an 80/20 token/embedding blend, and 65% of the
 * embedding score. `reasons` records which signals were non-zero.
 */
function localityScore(input: {
  childTable: KloEnrichedTable;
  childColumn: KloEnrichedColumn;
  parentTable: KloEnrichedTable;
}): Omit<KloRelationshipLocalityCandidateTable, 'table'> {
  const tokenScore = roundedScore(tableTokenScore(input));
  const embeddingScore = roundedScore(parentEmbeddingScore(input.childColumn, input.parentTable));

  let score = tokenScore;
  if (embeddingScore > 0) {
    const blended = tokenScore * 0.8 + embeddingScore * 0.2;
    const embeddingOnly = embeddingScore * 0.65;
    score = roundedScore(Math.max(tokenScore, blended, embeddingOnly));
  }

  const reasons: string[] = [
    ...(tokenScore > 0 ? ['column_table_token_overlap'] : []),
    ...(embeddingScore > 0 ? ['embedding_similarity'] : []),
  ];
  if (reasons.length === 0) {
    // Neither signal fired; the candidate is only kept for deterministic ordering.
    reasons.push('locality_tie_breaker');
  }

  return { score, tokenScore, embeddingScore, reasons };
}
/**
 * Ranks candidate parent tables for a child column and returns the best ones.
 *
 * Candidates are ordered by score, then token score, then embedding score
 * (all descending), then by table name and table id for a stable result.
 *
 * @param input Child table/column, candidate parents, and an optional limit.
 * @returns At most `floor(maxParentTables)` candidates; an empty list when the
 *   limit is non-finite or not positive.
 */
export function localCandidateTables(
  input: LocalKloRelationshipCandidateTablesInput,
): KloRelationshipLocalityCandidateTable[] {
  const limit = input.maxParentTables ?? DEFAULT_MAX_PARENT_TABLES;
  if (!(Number.isFinite(limit) && limit > 0)) {
    return [];
  }

  const { childTable, childColumn } = input;
  const ranked: KloRelationshipLocalityCandidateTable[] = [];
  for (const table of input.parentTables) {
    ranked.push({ table, ...localityScore({ childTable, childColumn, parentTable: table }) });
  }

  ranked.sort((left, right) => {
    const byScores =
      right.score - left.score ||
      right.tokenScore - left.tokenScore ||
      right.embeddingScore - left.embeddingScore;
    if (byScores) {
      return byScores;
    }
    return left.table.ref.name.localeCompare(right.table.ref.name) || left.table.id.localeCompare(right.table.id);
  });

  return ranked.slice(0, Math.floor(limit));
}

View file

@ -0,0 +1,81 @@
import { describe, expect, it } from 'vitest';
import {
normalizeKloRelationshipName,
pluralizeKloRelationshipToken,
singularizeKloRelationshipToken,
tokenSimilarity,
tokenizeKloRelationshipName,
} from './relationship-name-similarity.js';
// Behavioural tests for name normalisation, singular/plural handling, and token similarity.
describe('relationship name similarity', () => {
  it('tokenizes common warehouse naming styles', () => {
    // Each entry pairs a raw identifier with the subset of fields it must normalise to.
    const cases: Array<[string, Record<string, unknown>]> = [
      ['AlbumId', { normalized: 'album_id', singular: 'album_id', plural: 'album_ids', tokens: ['album', 'id'] }],
      ['artistID', { normalized: 'artist_id', tokens: ['artist', 'id'] }],
      [
        'SalesLT.CustomerID',
        {
          normalized: 'sales_lt_customer_id',
          singular: 'sales_lt_customer_id',
          tokens: ['sales', 'lt', 'customer', 'id'],
        },
      ],
      [
        'SCREAMING_CUSTOMER_UUID',
        { normalized: 'screaming_customer_uuid', tokens: ['screaming', 'customer', 'uuid'] },
      ],
      ['billing-account-key', { normalized: 'billing_account_key', tokens: ['billing', 'account', 'key'] }],
    ];
    for (const [name, expected] of cases) {
      expect(normalizeKloRelationshipName(name)).toMatchObject(expected);
    }
  });

  it('removes only leading warehouse layer prefixes', () => {
    const cases: Array<[string, Record<string, unknown>]> = [
      [
        'mart__Sales_Accounts',
        {
          normalized: 'sales_accounts',
          singular: 'sales_account',
          plural: 'sales_accounts',
          tokens: ['sales', 'accounts'],
        },
      ],
      ['dim_users', { normalized: 'users', singular: 'user', plural: 'users', tokens: ['users'] }],
      // A layer word in the middle of the name must be kept.
      ['customer_dim_id', { normalized: 'customer_dim_id', tokens: ['customer', 'dim', 'id'] }],
    ];
    for (const [name, expected] of cases) {
      expect(normalizeKloRelationshipName(name)).toMatchObject(expected);
    }
  });

  it('folds accents and preserves non-suffix trailing s words', () => {
    expect(normalizeKloRelationshipName('KundénID')).toMatchObject({
      normalized: 'kunden_id',
      tokens: ['kunden', 'id'],
    });

    const singularCases: Array<[string, string]> = [
      ['address', 'address'],
      ['addresses', 'address'],
      ['status', 'status'],
    ];
    for (const [token, expected] of singularCases) {
      expect(singularizeKloRelationshipToken(token)).toBe(expected);
    }

    const pluralCases: Array<[string, string]> = [
      ['address', 'addresses'],
      ['company', 'companies'],
    ];
    for (const [token, expected] of pluralCases) {
      expect(pluralizeKloRelationshipToken(token)).toBe(expected);
    }
  });

  it('returns deterministic tokens for direct tokenization calls', () => {
    const cases: Array<[string, string[]]> = [
      ['HTTPResponseCode', ['http', 'response', 'code']],
      ['customer2AddressID', ['customer', '2', 'address', 'id']],
    ];
    for (const [name, expectedTokens] of cases) {
      expect(tokenizeKloRelationshipName(name)).toEqual(expectedTokens);
    }
  });

  it('scores token overlap and ordered suffix similarity', () => {
    expect(tokenSimilarity('artist_id', 'artist_id')).toBe(1);
    expect(tokenSimilarity('Album.ArtistId', 'ArtistID')).toBeGreaterThanOrEqual(0.74);

    // A shared trailing token sequence must outrank a name that only shares "id".
    const closeMatch = tokenSimilarity('customer_account_id', 'account_id');
    const weakMatch = tokenSimilarity('customer_account_id', 'invoice_id');
    expect(closeMatch).toBeGreaterThan(weakMatch);

    expect(tokenSimilarity('', 'artist')).toBe(0);
  });
});

View file

@ -0,0 +1,151 @@
/** The normalised forms of one table or column name. */
export interface KloRelationshipNormalizedName {
// The name exactly as supplied by the caller.
raw: string;
// Lower-cased tokens joined with '_'.
normalized: string;
// Same as `normalized`, with the last token singularised.
singular: string;
// Same as `singular`, with the last token pluralised.
plural: string;
// Lower-cased tokens; a leading warehouse-layer prefix is dropped.
tokens: string[];
}
/** Anything `tokenSimilarity` accepts: a raw name, a token list, or a pre-normalised name. */
export type KloRelationshipTokenInput = string | readonly string[] | KloRelationshipNormalizedName;
// Layer prefixes that are removed only when they are the first token of a name.
const WAREHOUSE_LAYER_PREFIXES = new Set(['stg', 'stage', 'staging', 'dim', 'fct', 'fact', 'int', 'mart']);
/**
 * Inserts '_' at camelCase, acronym, and letter/digit boundaries, e.g.
 * "HTTPResponseCode" becomes "HTTP_Response_Code".
 */
function splitCaseBoundaries(value: string): string {
  // Applied in order; every rule keeps both captured sides and puts '_' between them.
  const boundaryPatterns: readonly RegExp[] = [
    /([\p{Lu}]+)([\p{Lu}][\p{Ll}])/gu, // acronym followed by a capitalised word
    /([\p{Ll}\p{N}])([\p{Lu}])/gu, // lower-case letter or digit followed by upper-case
    /(\p{L})(\p{N})/gu, // letter followed by digit
    /(\p{N})(\p{L})/gu, // digit followed by letter
  ];
  let separated = value;
  for (const pattern of boundaryPatterns) {
    separated = separated.replace(pattern, '$1_$2');
  }
  return separated;
}
/**
 * Strips combining marks after NFKD decomposition and expands the ligatures
 * ß, æ and œ (matched case-insensitively) to "ss", "ae" and "oe".
 */
function foldAccents(value: string): string {
  const withoutMarks = value.normalize('NFKD').replace(/\p{Mark}+/gu, '');
  const ligatures: ReadonlyArray<[RegExp, string]> = [
    [/ß/giu, 'ss'],
    [/æ/giu, 'ae'],
    [/œ/giu, 'oe'],
  ];
  let folded = withoutMarks;
  for (const [pattern, replacement] of ligatures) {
    folded = folded.replace(pattern, replacement);
  }
  return folded;
}
/**
 * Heuristically converts an English plural token to its singular form.
 * Tokens of two characters or fewer, and words ending in "ss", "us" or "is",
 * are returned unchanged.
 *
 * @param value A single lower-case token.
 * @returns The singularised token.
 */
export function singularizeKloRelationshipToken(value: string): string {
  const length = value.length;
  if (length <= 2) {
    return value;
  }
  // companies -> company
  if (length > 3 && value.endsWith('ies')) {
    return value.slice(0, -3) + 'y';
  }
  // addresses -> address, boxes -> box
  if (/(ches|shes|sses|xes|zes)$/u.test(value)) {
    return value.slice(0, -2);
  }
  // leaves -> leaf
  if (length > 4 && value.endsWith('ves')) {
    return value.slice(0, -3) + 'f';
  }
  const keepsTrailingS = /(ss|us|is)$/u.test(value);
  return value.endsWith('s') && !keepsTrailingS ? value.slice(0, -1) : value;
}
/**
 * Heuristically converts a singular English token to its plural form.
 *
 * - consonant + "y" becomes "ies" (company -> companies)
 * - vowel + "y" just takes "s" (key -> keys, day -> days)
 * - s / x / z / ch / sh endings take "es" (address -> addresses)
 * - everything else takes "s"
 *
 * @param value A single lower-case token.
 * @returns The pluralised token; an empty token stays empty.
 */
export function pluralizeKloRelationshipToken(value: string): string {
  if (value === '') {
    return value;
  }
  // Only a consonant before the final "y" switches to "ies"; "key" must not become "keies".
  if (/[^aeiou]y$/u.test(value)) {
    return `${value.slice(0, -1)}ies`;
  }
  if (/(s|x|z|ch|sh)$/u.test(value)) {
    return `${value}es`;
  }
  return `${value}s`;
}
/** Returns a copy of the tokens with only the last (non-empty) token singularised. */
function singularizeTokens(tokens: readonly string[]): string[] {
  const lastIndex = tokens.length - 1;
  return Array.from(tokens, (token, index) =>
    index === lastIndex && token ? singularizeKloRelationshipToken(token) : token,
  );
}
/** Returns a copy of the tokens with only the last (non-empty) token pluralised. */
function pluralizeTokens(tokens: readonly string[]): string[] {
  const lastIndex = tokens.length - 1;
  return Array.from(tokens, (token, index) =>
    index === lastIndex && token ? pluralizeKloRelationshipToken(token) : token,
  );
}
/**
 * Splits a table or column name into lower-case tokens.
 *
 * The name is trimmed, accent-folded and split at case/digit boundaries; every
 * run of non-letter, non-digit characters is a separator. A warehouse-layer
 * prefix (stg, dim, fct, ...) is removed only when it is the first token.
 *
 * @param name Raw identifier in any naming style.
 * @returns The ordered token list (possibly empty).
 */
export function tokenizeKloRelationshipName(name: string): string[] {
  const separated = splitCaseBoundaries(foldAccents(name.trim())).toLowerCase();
  const pieces = separated
    .replace(/[^\p{L}\p{N}]+/gu, '_')
    .replace(/^_+|_+$/gu, '')
    .split('_')
    .filter(Boolean);

  const leading = pieces[0];
  if (leading !== undefined && WAREHOUSE_LAYER_PREFIXES.has(leading)) {
    return pieces.slice(1);
  }
  return pieces;
}
/**
 * Builds the normalised, singular and plural snake_case forms of a name.
 * The plural form is derived from the singular one, so only the last token varies.
 *
 * @param name Raw identifier in any naming style.
 * @returns The raw name, its tokens, and the three joined forms.
 */
export function normalizeKloRelationshipName(name: string): KloRelationshipNormalizedName {
  const tokens = tokenizeKloRelationshipName(name);
  const singularTokens = singularizeTokens(tokens);
  const toSnakeCase = (parts: readonly string[]): string => parts.join('_');

  const normalized = toSnakeCase(tokens);
  const singular = toSnakeCase(singularTokens);
  const plural = toSnakeCase(pluralizeTokens(singularTokens));
  return { raw: name, normalized, singular, plural, tokens };
}
/**
 * Resolves any accepted input shape to a token list: strings are tokenised,
 * pre-normalised names contribute their stored tokens, and token arrays are
 * normalised element by element with empty results dropped.
 */
function tokensFromInput(input: KloRelationshipTokenInput): string[] {
  if (typeof input === 'string') {
    return tokenizeKloRelationshipName(input);
  }
  if (!('tokens' in input)) {
    const normalizedEntries = input.map((token) => normalizeKloRelationshipName(token).normalized);
    return normalizedEntries.filter(Boolean);
  }
  return input.tokens;
}
/** Counts how many trailing tokens the two lists share, comparing from the end. */
function longestCommonSuffixLength(left: readonly string[], right: readonly string[]): number {
  const comparable = Math.min(left.length, right.length);
  let matched = 0;
  for (let offset = 1; offset <= comparable; offset += 1) {
    if (left[left.length - offset] !== right[right.length - offset]) {
      break;
    }
    matched = offset;
  }
  return matched;
}
/** Clamps a score into [0, 1] and rounds it to three decimal places. */
function roundedScore(value: number): number {
  const bounded = value < 0 ? 0 : value > 1 ? 1 : value;
  return Number(bounded.toFixed(3));
}
/**
 * Similarity of two names in [0, 1], rounded to three decimals.
 *
 * Combines the Jaccard overlap of the token sets (weight 0.75) with the share
 * of the shorter name covered by the common trailing token sequence (weight 0.25).
 *
 * @param leftInput A raw name, token list, or pre-normalised name.
 * @param rightInput A raw name, token list, or pre-normalised name.
 * @returns 0 when either side has no tokens.
 */
export function tokenSimilarity(leftInput: KloRelationshipTokenInput, rightInput: KloRelationshipTokenInput): number {
  const left = tokensFromInput(leftInput);
  const right = tokensFromInput(rightInput);
  if (left.length === 0 || right.length === 0) {
    return 0;
  }

  const leftSet = new Set(left);
  const rightSet = new Set(right);
  let sharedCount = 0;
  for (const token of leftSet) {
    if (rightSet.has(token)) {
      sharedCount += 1;
    }
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionCount = leftSet.size + rightSet.size - sharedCount;
  const overlapScore = unionCount === 0 ? 0 : sharedCount / unionCount;

  const shorterLength = Math.min(left.length, right.length);
  const suffixScore = longestCommonSuffixLength(left, right) / shorterLength;

  return roundedScore(overlapScore * 0.75 + suffixScore * 0.25);
}

View file

@ -0,0 +1,354 @@
import { readFile } from 'node:fs/promises';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';
import Database from 'better-sqlite3';
import { afterEach, describe, expect, it } from 'vitest';
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
import { snapshotToKloEnrichedSchema } from './local-enrichment.js';
import { loadKloRelationshipBenchmarkFixture, maskKloRelationshipBenchmarkSnapshot } from './relationship-benchmarks.js';
import {
createKloRelationshipProfileCache,
formatKloRelationshipTableRef,
profileKloRelationshipSchema,
quoteKloRelationshipIdentifier,
} from './relationship-profiling.js';
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanContext } from './types.js';
/** Test executor backed by an in-memory SQLite database; counts every query it runs. */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /** Runs the SQL and reshapes the row objects into the header/rows result format. */
  executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    // Headers come from the first record; an empty result has no headers.
    const headers = Object.keys(records[0] ?? {});
    const rows = records.map((record) => headers.map((header) => record[header]));
    return Promise.resolve({
      headers,
      rows,
      totalRows: records.length,
      rowCount: records.length,
    });
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
/** Test executor that opens an existing SQLite file read-only; counts every query it runs. */
class FileSqliteExecutor {
  readonly db: Database.Database;
  queryCount = 0;

  constructor(dataPath: string) {
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /** Runs the SQL and reshapes the row objects into the header/rows result format. */
  executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    // Headers come from the first record; an empty result has no headers.
    const headers = Object.keys(records[0] ?? {});
    const rows = records.map((record) => headers.map((header) => record[header]));
    return Promise.resolve({
      headers,
      rows,
      totalRows: records.length,
      rowCount: records.length,
    });
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
/**
 * Builds an enriched column fixture. Defaults describe a nullable, non-key
 * INTEGER column; any field present in `overrides` replaces its default.
 */
function column(tableId: string, name: string, overrides: Partial<KloEnrichedColumn> = {}): KloEnrichedColumn {
  const defaults: KloEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
/**
 * Builds an enabled enriched table fixture and re-homes copies of the given
 * columns onto it (their tableId and tableRef are overwritten).
 */
function table(name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KloEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: name, tableRef: ref });
  }
  return { id: name, ref, enabled: true, descriptions: {}, columns: ownedColumns };
}
/** Wraps tables in an enriched schema fixture for the 'warehouse' connection with no relationships. */
function schema(tables: KloEnrichedTable[]): KloEnrichedSchema {
  const fixture: KloEnrichedSchema = {
    connectionId: 'warehouse',
    tables,
    relationships: [],
  };
  return fixture;
}
// Integration-style tests for relationship profiling against real SQLite databases.
describe('relationship profiling', () => {
  // Shared in-memory executor; every test that assigns it gets it closed by afterEach,
  // even when an assertion fails part-way through.
  let executor: InMemorySqliteExecutor | null = null;

  afterEach(() => {
    executor?.close();
    executor = null;
  });

  it('keeps profiling on the batched table path', async () => {
    // Guards the implementation shape: one UNION ALL query per table, no per-column queries.
    const source = await readFile(new URL('relationship-profiling.ts', import.meta.url), 'utf-8');
    expect(source).not.toMatch(new RegExp('queryColumn' + 'Profile'));
    expect(source).not.toMatch(/for \(const column of table\.columns\)[\s\S]*executeReadOnly/);
    expect(source).toMatch(/queryTableProfile/);
    expect(source).toMatch(/UNION ALL/);
  });

  it('quotes identifiers and formats table refs for supported local SQL drivers', () => {
    expect(quoteKloRelationshipIdentifier('sqlite', 'odd"name')).toBe('"odd""name"');
    expect(quoteKloRelationshipIdentifier('mysql', 'odd`name')).toBe('`odd``name`');
    expect(quoteKloRelationshipIdentifier('sqlserver', 'odd]name')).toBe('[odd]]name]');
    expect(formatKloRelationshipTableRef('sqlite', { catalog: null, db: null, name: 'accounts' })).toBe('"accounts"');
    expect(formatKloRelationshipTableRef('postgres', { catalog: null, db: 'analytics', name: 'accounts' })).toBe(
      '"analytics"."accounts"',
    );
  });

  it('profiles row count, null rate, uniqueness, sample values, and text lengths', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
INSERT INTO accounts (id, code, parent_id) VALUES
(1, 'A-1', NULL),
(2, 'B-2', 1),
(3, 'C-3', 1),
(4, 'C-3', 2);
`);
    const result = await profileKloRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { primaryKey: false, nullable: false }),
          column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
          column('accounts', 'parent_id'),
        ]),
      ]),
      executor,
      ctx: { runId: 'profile-test' },
      sampleValuesPerColumn: 3,
    });
    expect(result.sqlAvailable).toBe(true);
    expect(result.queryCount).toBe(1);
    expect(executor.queryCount).toBe(1);
    expect(result.tables).toHaveLength(1);
    expect(result.tables[0]).toMatchObject({ table: { name: 'accounts' }, rowCount: 4 });
    expect(result.columns['accounts.id']).toMatchObject({
      table: { name: 'accounts' },
      column: 'id',
      rowCount: 4,
      nullCount: 0,
      distinctCount: 4,
      uniquenessRatio: 1,
      nullRate: 0,
      minTextLength: 1,
      maxTextLength: 1,
    });
    expect(result.columns['accounts.code']).toMatchObject({
      distinctCount: 3,
      uniquenessRatio: 0.75,
      // Most frequent value first, then alphabetical.
      sampleValues: ['C-3', 'A-1', 'B-2'],
      minTextLength: 3,
      maxTextLength: 3,
    });
    expect(result.columns['accounts.parent_id']).toMatchObject({
      nullCount: 1,
      distinctCount: 2,
      uniquenessRatio: 0.5,
      nullRate: 0.25,
    });
  });

  it('profiles each enabled table with one read-only SQL query', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
CREATE TABLE users (id INTEGER, account_id INTEGER);
INSERT INTO accounts (id, code, parent_id) VALUES
(1, 'A-1', NULL),
(2, 'B-2', 1),
(3, 'C-3', 1),
(4, 'C-3', 2);
INSERT INTO users (id, account_id) VALUES
(10, 1),
(11, 1),
(12, 2);
`);
    const result = await profileKloRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { nullable: false }),
          column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
          column('accounts', 'parent_id'),
        ]),
        table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id')]),
      ]),
      executor,
      ctx: { runId: 'profile-batched-query-count' },
      sampleValuesPerColumn: 3,
    });
    expect(result.sqlAvailable).toBe(true);
    expect(result.queryCount).toBe(2);
    expect(executor.queryCount).toBe(2);
    expect(result.tables).toEqual([
      { table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 },
      { table: { catalog: null, db: null, name: 'users' }, rowCount: 3 },
    ]);
    expect(result.columns['accounts.code']).toMatchObject({
      distinctCount: 3,
      uniquenessRatio: 0.75,
      sampleValues: ['C-3', 'A-1', 'B-2'],
    });
    expect(result.columns['users.account_id']).toMatchObject({
      rowCount: 3,
      nullCount: 0,
      distinctCount: 2,
      uniquenessRatio: 2 / 3,
    });
  });

  it('bounds column profile statistics with profileSampleRows', async () => {
    // Uses the shared executor so afterEach closes it even if an assertion below fails.
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a3'), (4, 'a4');
`);
    const profiles = await profileKloRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: schema([
        table('accounts', [
          column('accounts', 'id', { nullable: false }),
          column('accounts', 'account_code', {
            nativeType: 'TEXT',
            normalizedType: 'text',
            dimensionType: 'string',
            nullable: false,
          }),
        ]),
      ]),
      executor,
      ctx: { runId: 'profile-sample-rows' },
      profileSampleRows: 2,
    });
    expect(profiles.queryCount).toBe(1);
    expect(executor.queryCount).toBe(1);
    // The table row count is exact; column statistics only see the sampled rows.
    expect(profiles.tables).toEqual([{ table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 }]);
    expect(profiles.columns['accounts.id']).toMatchObject({
      rowCount: 2,
      distinctCount: 2,
      uniquenessRatio: 1,
    });
    expect(profiles.columns['accounts.account_code']?.sampleValues).toEqual(['a1', 'a2']);
  });

  it('reuses a profile cache inside one scan run but re-queries with a fresh cache', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a2');
`);
    const relationshipSchema = schema([
      table('accounts', [
        column('accounts', 'id', { nullable: false }),
        column('accounts', 'account_code', {
          nativeType: 'TEXT',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
        }),
      ]),
    ]);
    const cache = createKloRelationshipProfileCache();
    const first = await profileKloRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: relationshipSchema,
      executor,
      ctx: { runId: 'profile-cache-run' },
      cache,
    });
    const second = await profileKloRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: relationshipSchema,
      executor,
      ctx: { runId: 'profile-cache-run' },
      cache,
    });
    const third = await profileKloRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: relationshipSchema,
      executor,
      ctx: { runId: 'profile-cache-fresh-run' },
      cache: createKloRelationshipProfileCache(),
    });
    expect(first.queryCount).toBe(1);
    expect(second.queryCount).toBe(0);
    expect(third.queryCount).toBe(1);
    expect(executor.queryCount).toBe(2);
    expect(second.tables).toEqual(first.tables);
    expect(second.columns).toEqual(first.columns);
  });

  it('profiles the checked-in scale stress fixture with one query per table', async () => {
    // fileURLToPath decodes percent-escapes and yields a platform-correct path;
    // URL.pathname does neither (breaks on spaces in the checkout path and on Windows).
    const fixtureRoot = fileURLToPath(new URL('../../test/fixtures/relationship-benchmarks/', import.meta.url));
    const fixture = await loadKloRelationshipBenchmarkFixture(join(fixtureRoot, 'scale_stress_no_declared_constraints'));
    if (!fixture.dataPath) {
      throw new Error('scale_stress_no_declared_constraints is missing data.sqlite');
    }
    const maskedSnapshot = maskKloRelationshipBenchmarkSnapshot(
      fixture.snapshot,
      'declared_pks_and_declared_fks_removed',
    );
    const scaleExecutor = new FileSqliteExecutor(fixture.dataPath);
    try {
      const result = await profileKloRelationshipSchema({
        connectionId: fixture.snapshot.connectionId,
        driver: fixture.snapshot.driver,
        schema: snapshotToKloEnrichedSchema(maskedSnapshot, new Map()),
        executor: scaleExecutor,
        ctx: { runId: 'scale-stress-profile-query-count' },
        profileSampleRows: 3,
      });
      expect(fixture.snapshot.tables).toHaveLength(400);
      expect(result.queryCount).toBe(400);
      expect(result.queryCount).toBeLessThanOrEqual(2 * fixture.snapshot.tables.length);
      expect(scaleExecutor.queryCount).toBe(400);
    } finally {
      scaleExecutor.close();
    }
  });
});

View file

@ -0,0 +1,467 @@
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
import type {
KloConnectionDriver,
KloQueryResult,
KloReadOnlyQueryInput,
KloScanContext,
KloTableRef,
} from './types.js';
/** Minimal query runner the profiler needs: executes one read-only SQL statement. */
export interface KloRelationshipReadOnlyExecutor {
executeReadOnly(input: KloReadOnlyQueryInput, ctx: KloScanContext): Promise<KloQueryResult>;
}
/** Statistics gathered for one column from the (possibly row-limited) profiling sample. */
export interface KloRelationshipColumnProfile {
table: KloTableRef;
column: string;
nativeType: string;
normalizedType: string;
// Number of rows in the profiling sample (bounded by profileSampleRows), not the full table.
rowCount: number;
// Sampled rows where the column is NULL.
nullCount: number;
// COUNT(DISTINCT column) over the sample.
distinctCount: number;
// distinctCount / rowCount, or 0 when the sample is empty.
uniquenessRatio: number;
// nullCount / rowCount, or 0 when the sample is empty.
nullRate: number;
// Non-null values rendered as text, most frequent first, then ascending by value.
sampleValues: string[];
// Shortest / longest length of the value cast to text; null when the query returned no value.
minTextLength: number | null;
maxTextLength: number | null;
}
/** Table-level profile. */
export interface KloRelationshipTableProfile {
table: KloTableRef;
// COUNT(*) over the whole table (not limited by profileSampleRows).
rowCount: number;
}
/** Result of profiling every enabled table of a schema. */
export interface KloRelationshipProfileArtifact {
connectionId: string;
driver: KloConnectionDriver;
// False when no executor was supplied and nothing could be profiled.
sqlAvailable: boolean;
// Queries actually executed by this call; cache hits add nothing.
queryCount: number;
tables: KloRelationshipTableProfile[];
// Column profiles keyed by "<table name>.<column name>".
columns: Record<string, KloRelationshipColumnProfile>;
// 'read_only_sql_unavailable' or 'profile_failed:<table>:<message>' entries.
warnings: string[];
}
// One cached per-table result; failed attempts are cached with their warning.
interface KloRelationshipCachedTableProfile {
table: KloRelationshipTableProfile;
columns: Record<string, KloRelationshipColumnProfile>;
warnings: string[];
}
/** Per-table profile cache; keys include the run id, connection, driver, table ref, and sampling limits. */
export interface KloRelationshipProfileCache {
readonly tableProfiles: Map<string, KloRelationshipCachedTableProfile>;
}
/** Input for profileKloRelationshipSchema. */
export interface ProfileKloRelationshipSchemaInput {
connectionId: string;
driver: KloConnectionDriver;
schema: KloEnrichedSchema;
// Null means read-only SQL is unavailable; profiling is skipped with a warning.
executor: KloRelationshipReadOnlyExecutor | null;
ctx: KloScanContext;
// Maximum sample values collected per column; defaults to 5.
sampleValuesPerColumn?: number;
// Maximum rows read per table for column statistics; defaults to 10000.
profileSampleRows?: number;
// Optional cache for reusing table profiles across calls.
cache?: KloRelationshipProfileCache;
}
/** Creates an empty table-profile cache. */
export function createKloRelationshipProfileCache(): KloRelationshipProfileCache {
  const cache: KloRelationshipProfileCache = { tableProfiles: new Map() };
  return cache;
}
// ASCII unit separator (U+001F, code 31): the delimiter the SQL aggregates put between
// sample values and on which the aggregated text is split back into individual values.
const SAMPLE_VALUE_DELIMITER = '\u001f';
// Identifier quoting conventions used by the supported SQL dialects.
type QuoteStyle = 'double' | 'backtick' | 'bracket';

/**
 * Picks the identifier quoting convention for a driver.
 *
 * MySQL, ClickHouse, PostHog and BigQuery quote identifiers with backticks
 * (BigQuery treats a double-quoted token as a string literal, not an
 * identifier), SQL Server uses brackets, and every other driver uses
 * double quotes.
 */
function quoteStyle(driver: KloConnectionDriver): QuoteStyle {
  if (driver === 'mysql' || driver === 'clickhouse' || driver === 'posthog' || driver === 'bigquery') {
    return 'backtick';
  }
  if (driver === 'sqlserver') {
    return 'bracket';
  }
  return 'double';
}
/**
 * Quotes a single identifier for the driver's dialect, escaping an embedded
 * closing quote character by doubling it.
 *
 * @param driver Connection driver that determines the quoting style.
 * @param identifier Unquoted table, schema, or column name.
 * @returns The quoted identifier, safe to splice into generated SQL.
 */
export function quoteKloRelationshipIdentifier(driver: KloConnectionDriver, identifier: string): string {
  const style = quoteStyle(driver);
  if (style === 'backtick') {
    const escaped = identifier.replace(/`/g, '``');
    return `\`${escaped}\``;
  }
  if (style === 'bracket') {
    const escaped = identifier.replace(/\]/g, ']]');
    return `[${escaped}]`;
  }
  const escaped = identifier.replace(/"/g, '""');
  return `"${escaped}"`;
}
/**
 * Renders a table reference as a quoted, dot-separated SQL path.
 * SQLite and PostHog use the bare table name; other drivers include the
 * catalog and db parts when they are set.
 *
 * @param driver Connection driver that determines quoting and path shape.
 * @param table Table reference to render.
 * @returns The quoted path, e.g. `"analytics"."accounts"`.
 */
export function formatKloRelationshipTableRef(driver: KloConnectionDriver, table: KloTableRef): string {
  const usesBareTableName = driver === 'sqlite' || driver === 'posthog';
  const parts: string[] = [];
  if (usesBareTableName) {
    parts.push(table.name);
  } else {
    for (const part of [table.catalog, table.db, table.name]) {
      if (part) {
        parts.push(part);
      }
    }
  }
  return parts.map((part) => quoteKloRelationshipIdentifier(driver, part)).join('.');
}
/** Builds the dialect-specific SQL expression for the character length of a column cast to text. */
function textLengthExpression(driver: KloConnectionDriver, columnSql: string): string {
  switch (driver) {
    case 'mysql':
      return `CHAR_LENGTH(CAST(${columnSql} AS CHAR))`;
    case 'sqlserver':
      return `LEN(CAST(${columnSql} AS NVARCHAR(MAX)))`;
    case 'bigquery':
      return `LENGTH(CAST(${columnSql} AS STRING))`;
    case 'clickhouse':
    case 'posthog':
      return `length(toString(${columnSql}))`;
    default:
      return `LENGTH(CAST(${columnSql} AS TEXT))`;
  }
}
/**
 * Trailing ` LIMIT n` clause (n floored, at least 1). Empty for SQL Server,
 * which limits rows with TOP instead.
 */
function limitSql(driver: KloConnectionDriver, limit: number): string {
  return driver === 'sqlserver' ? '' : ` LIMIT ${Math.max(1, Math.floor(limit))}`;
}
/**
 * ` TOP (n)` modifier (n floored, at least 1) for SQL Server; empty for every
 * other driver, which uses a trailing LIMIT clause instead.
 */
function topSql(driver: KloConnectionDriver, limit: number): string {
  return driver === 'sqlserver' ? ` TOP (${Math.max(1, Math.floor(limit))})` : '';
}
/**
 * Wraps a table in a derived table that reads at most `limit` rows (floored,
 * at least 1), aliased as `relationship_profile_sample`.
 */
function sampledTableSql(driver: KloConnectionDriver, tableSql: string, limit: number): string {
  const safeLimit = Math.max(1, Math.floor(limit));
  const boundedSelect =
    driver === 'sqlserver'
      ? `SELECT TOP (${safeLimit}) * FROM ${tableSql}`
      : `SELECT * FROM ${tableSql}${limitSql(driver, safeLimit)}`;
  return `(${boundedSelect}) AS relationship_profile_sample`;
}
/** Returns the first result row, or an empty row when the result has none. */
function firstRow(result: KloQueryResult): unknown[] {
  const leadingRow = result.rows[0];
  return leadingRow ?? [];
}
/** Finds the position of a header, ignoring case; -1 when it is absent. */
function headerIndex(result: KloQueryResult, header: string): number {
  const headers = result.headers;
  for (let position = 0; position < headers.length; position += 1) {
    if (headers[position].toLowerCase() === header.toLowerCase()) {
      return position;
    }
  }
  return -1;
}
/** Reads the cell under the given header from a row; undefined when the header is absent. */
function valueAt(result: KloQueryResult, row: unknown[], header: string): unknown {
  const position = headerIndex(result, header);
  return row[position];
}
/**
 * Coerces a driver-returned cell to a finite number.
 *
 * Numbers, bigints and non-blank numeric strings are converted; everything
 * else, including values that do not parse to a finite number, becomes 0 so
 * NaN never leaks into counts and ratios.
 */
function numberFromValue(value: unknown): number {
  let parsed: number;
  if (typeof value === 'number') {
    parsed = value;
  } else if (typeof value === 'bigint') {
    parsed = Number(value);
  } else if (typeof value === 'string' && value.trim() !== '') {
    parsed = Number(value);
  } else {
    return 0;
  }
  // Number('abc') is NaN; treat unparseable or infinite input like a missing value.
  return Number.isFinite(parsed) ? parsed : 0;
}
/**
 * Coerces a driver-returned cell to a finite number, preserving "no value".
 *
 * Numbers, bigints and non-blank numeric strings are converted; null,
 * undefined, other types, and values that do not parse to a finite number
 * all yield null.
 */
function nullableNumberFromValue(value: unknown): number | null {
  if (value === null || value === undefined) {
    return null;
  }
  let parsed: number;
  if (typeof value === 'number') {
    parsed = value;
  } else if (typeof value === 'bigint') {
    parsed = Number(value);
  } else if (typeof value === 'string' && value.trim() !== '') {
    parsed = Number(value);
  } else {
    return null;
  }
  // An unparseable string would otherwise surface as NaN in the profile.
  return Number.isFinite(parsed) ? parsed : null;
}
/** Reads the named cell of the first result row as a number. */
function numberAt(result: KloQueryResult, header: string): number {
  const cell = valueAt(result, firstRow(result), header);
  return numberFromValue(cell);
}
/** Key under which a column profile is stored: "<table name>.<column name>". */
function columnKey(table: KloEnrichedTable, column: KloEnrichedColumn): string {
  const tableName = table.ref.name;
  const columnName = column.name;
  return `${tableName}.${columnName}`;
}
/**
 * Builds the cache key for one table profile. The key covers the run id,
 * connection, driver, full table reference, and both sampling limits, joined
 * with the record-separator character.
 */
function tableProfileCacheKey(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  ctx: KloScanContext;
  table: KloTableRef;
  sampleValuesPerColumn: number;
  profileSampleRows: number;
}): string {
  const { ctx, connectionId, driver, table, sampleValuesPerColumn, profileSampleRows } = input;
  const runScope = [ctx.runId, connectionId, driver];
  const tableScope = [table.catalog ?? '', table.db ?? '', table.name];
  const samplingScope = [String(sampleValuesPerColumn), String(profileSampleRows)];
  return [...runScope, ...tableScope, ...samplingScope].join('\u001e');
}
/** Renders a value as a single-quoted SQL string literal, doubling embedded single quotes. */
function sqlStringLiteral(value: string): string {
  const escaped = value.split("'").join("''");
  return "'" + escaped + "'";
}
/**
 * Wraps a sample-values query in a scalar subquery that concatenates its
 * `value` column into one text, separated by the unit separator (code 31).
 *
 * @param driver Connection driver that selects the aggregate function and casts.
 * @param innerSql Query producing one `value` column with the samples.
 * @returns A parenthesised scalar subquery usable as a select-list expression.
 */
function sampleAggregateSql(driver: KloConnectionDriver, innerSql: string): string {
  if (driver === 'postgres') {
    return `(SELECT STRING_AGG(CAST(value AS TEXT), CHR(31)) FROM (${innerSql}) AS relationship_profile_values)`;
  }
  if (driver === 'bigquery') {
    return `(SELECT STRING_AGG(CAST(value AS STRING), '\\u001F') FROM (${innerSql}) AS relationship_profile_values)`;
  }
  if (driver === 'mysql') {
    // GROUP_CONCAT's SEPARATOR accepts only a string or hex literal, not an
    // expression such as CHAR(31); 0x1F is the hex literal for the same byte.
    return `(SELECT GROUP_CONCAT(CAST(value AS CHAR) SEPARATOR 0x1F) FROM (${innerSql}) AS relationship_profile_values)`;
  }
  if (driver === 'sqlserver') {
    return `(SELECT STRING_AGG(CAST(value AS NVARCHAR(MAX)), CHAR(31)) FROM (${innerSql}) AS relationship_profile_values)`;
  }
  if (driver === 'clickhouse' || driver === 'posthog') {
    return `(SELECT arrayStringConcat(groupArray(toString(value)), '\\x1F') FROM (${innerSql}) AS relationship_profile_values)`;
  }
  // Default dialect (e.g. SQLite): two-argument GROUP_CONCAT with char(31).
  return `(SELECT GROUP_CONCAT(CAST(value AS TEXT), char(31)) FROM (${innerSql}) AS relationship_profile_values)`;
}
/**
 * Builds the query that selects up to `limit` distinct non-null values of a
 * column as `value`, most frequent first and then ascending by value.
 */
function sampleValuesSql(input: {
  driver: KloConnectionDriver;
  tableSql: string;
  columnSql: string;
  limit: number;
}): string {
  const { driver, tableSql, columnSql, limit } = input;
  const selectClause = `SELECT${topSql(driver, limit)} ${columnSql} AS value`;
  const fromClause = `FROM ${tableSql}`;
  const whereClause = `WHERE ${columnSql} IS NOT NULL`;
  const groupClause = `GROUP BY ${columnSql}`;
  const orderClause = `ORDER BY COUNT(*) DESC, ${columnSql} ASC`;
  const limitClause = limitSql(driver, limit);
  return `${selectClause} ${fromClause} ${whereClause} ${groupClause} ${orderClause} ${limitClause}`;
}
/**
 * Builds the SELECT that profiles one column. It yields a single row with the
 * column name, the full-table row count, and the sample-based row count, null
 * count, distinct count, min/max text length and aggregated sample values.
 * One such SELECT per column is combined with UNION ALL by the caller.
 */
function columnProfileSelectSql(input: {
  connectionDriver: KloConnectionDriver;
  tableSql: string;
  profileTableSql: string;
  column: KloEnrichedColumn;
  sampleValuesPerColumn: number;
}): string {
  const driver = input.connectionDriver;
  const columnSql = quoteKloRelationshipIdentifier(driver, input.column.name);
  const textLengthSql = textLengthExpression(driver, columnSql);
  const innerSamplesSql = sampleValuesSql({
    driver,
    tableSql: input.profileTableSql,
    columnSql,
    limit: input.sampleValuesPerColumn,
  });
  const samplesSql = sampleAggregateSql(driver, innerSamplesSql);

  const selectList = [
    `${sqlStringLiteral(input.column.name)} AS column_name,`,
    // The exact table size comes from the unsampled table.
    `(SELECT COUNT(*) FROM ${input.tableSql}) AS table_row_count,`,
    'COUNT(*) AS row_count,',
    `SUM(CASE WHEN ${columnSql} IS NULL THEN 1 ELSE 0 END) AS null_count,`,
    `COUNT(DISTINCT ${columnSql}) AS distinct_count,`,
    `MIN(${textLengthSql}) AS min_text_length,`,
    `MAX(${textLengthSql}) AS max_text_length,`,
    `${samplesSql} AS sample_values`,
  ];
  return ['SELECT', ...selectList, `FROM ${input.profileTableSql}`].join(' ');
}
/**
 * Splits the aggregated sample text back into individual values.
 * Null, undefined and empty input yield an empty list; empty segments are dropped.
 */
function splitSampleValues(value: unknown): string[] {
  const text = value == null ? '' : String(value);
  if (text === '') {
    return [];
  }
  return text.split(SAMPLE_VALUE_DELIMITER).filter((segment) => segment.length > 0);
}
/**
 * Counts the rows of a table with a single `SELECT COUNT(*)` query.
 *
 * @returns The row count and the number of queries executed (always 1).
 */
async function queryCount(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  table: KloTableRef;
  executor: KloRelationshipReadOnlyExecutor;
  ctx: KloScanContext;
}): Promise<{ rowCount: number; queryCount: number }> {
  const { connectionId, driver, table, executor, ctx } = input;
  const tableSql = formatKloRelationshipTableRef(driver, table);
  const sql = `SELECT COUNT(*) AS row_count FROM ${tableSql}`;
  const result = await executor.executeReadOnly({ connectionId, sql, maxRows: 1 }, ctx);
  const rowCount = numberAt(result, 'row_count');
  return { rowCount, queryCount: 1 };
}
/**
 * Profiles one table with a single query.
 *
 * A table without columns only gets a row count. Otherwise one SELECT per
 * column is combined with UNION ALL, so the result holds one row per column;
 * rows whose column name is not part of the table are ignored.
 *
 * @returns The table profile, the column profiles keyed by
 *   "<table name>.<column name>", and the number of queries executed.
 */
async function queryTableProfile(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  table: KloEnrichedTable;
  executor: KloRelationshipReadOnlyExecutor;
  ctx: KloScanContext;
  sampleValuesPerColumn: number;
  profileSampleRows: number;
}): Promise<{
  table: KloRelationshipTableProfile;
  columns: Record<string, KloRelationshipColumnProfile>;
  queryCount: number;
}> {
  const { connectionId, driver, table, executor, ctx } = input;

  if (table.columns.length === 0) {
    const counted = await queryCount({ connectionId, driver, table: table.ref, executor, ctx });
    return {
      table: { table: table.ref, rowCount: counted.rowCount },
      columns: {},
      queryCount: counted.queryCount,
    };
  }

  const tableSql = formatKloRelationshipTableRef(driver, table.ref);
  const profileTableSql = sampledTableSql(driver, tableSql, input.profileSampleRows);
  const columnSelects: string[] = [];
  for (const column of table.columns) {
    columnSelects.push(
      columnProfileSelectSql({
        connectionDriver: driver,
        tableSql,
        profileTableSql,
        column,
        sampleValuesPerColumn: input.sampleValuesPerColumn,
      }),
    );
  }
  const sql = columnSelects.join(' UNION ALL ');
  const result = await executor.executeReadOnly({ connectionId, sql, maxRows: table.columns.length }, ctx);

  const columnsByName = new Map(table.columns.map((column) => [column.name, column]));
  const readCell = (row: unknown[], header: string): unknown => valueAt(result, row, header);

  const profiles: Record<string, KloRelationshipColumnProfile> = {};
  let tableRowCount = 0;
  for (const row of result.rows) {
    const column = columnsByName.get(String(readCell(row, 'column_name')));
    if (!column) {
      continue;
    }
    const rowCount = numberFromValue(readCell(row, 'row_count'));
    const nullCount = numberFromValue(readCell(row, 'null_count'));
    const distinctCount = numberFromValue(readCell(row, 'distinct_count'));
    // Every row carries the same full-table count; keep the largest seen.
    tableRowCount = Math.max(tableRowCount, numberFromValue(readCell(row, 'table_row_count')));

    const hasRows = rowCount !== 0;
    profiles[columnKey(table, column)] = {
      table: table.ref,
      column: column.name,
      nativeType: column.nativeType,
      normalizedType: column.normalizedType,
      rowCount,
      nullCount,
      distinctCount,
      uniquenessRatio: hasRows ? distinctCount / rowCount : 0,
      nullRate: hasRows ? nullCount / rowCount : 0,
      sampleValues: splitSampleValues(readCell(row, 'sample_values')),
      minTextLength: nullableNumberFromValue(readCell(row, 'min_text_length')),
      maxTextLength: nullableNumberFromValue(readCell(row, 'max_text_length')),
    };
  }

  return {
    table: { table: table.ref, rowCount: tableRowCount },
    columns: profiles,
    queryCount: 1,
  };
}
/**
 * Profiles every enabled table of `input.schema` and merges the per-table
 * results into a single relationship profile artifact.
 *
 * Without an executor no SQL is issued: the artifact is returned with
 * `sqlAvailable: false` and the `read_only_sql_unavailable` warning.
 * Otherwise each enabled table is served from `input.cache` when an entry
 * exists, or profiled through `queryTableProfile`. A failing table never
 * aborts the run; the failure is recorded as a
 * `profile_failed:<table>:<message>` warning and cached, so a later call with
 * the same cache replays the warning instead of re-issuing the query.
 *
 * @param input Connection identity, schema, optional executor/cache, and the
 *   sampling limits (`sampleValuesPerColumn` defaults to 5,
 *   `profileSampleRows` defaults to 10000).
 * @returns The merged artifact. `queryCount` only counts queries issued by
 *   this call; cache hits contribute nothing to it.
 */
export async function profileKloRelationshipSchema(
  input: ProfileKloRelationshipSchemaInput,
): Promise<KloRelationshipProfileArtifact> {
  if (!input.executor) {
    return {
      connectionId: input.connectionId,
      driver: input.driver,
      sqlAvailable: false,
      queryCount: 0,
      tables: [],
      columns: {},
      warnings: ['read_only_sql_unavailable'],
    };
  }
  // The sampling limits are identical for every table, so resolve them once
  // instead of on every loop iteration.
  const sampleValuesPerColumn = input.sampleValuesPerColumn ?? 5;
  const profileSampleRows = input.profileSampleRows ?? 10000;
  let queryTotal = 0;
  const tables: KloRelationshipTableProfile[] = [];
  const columns: Record<string, KloRelationshipColumnProfile> = {};
  const warnings: string[] = [];
  for (const table of input.schema.tables.filter((candidate) => candidate.enabled)) {
    const cacheKey = tableProfileCacheKey({
      connectionId: input.connectionId,
      driver: input.driver,
      ctx: input.ctx,
      table: table.ref,
      sampleValuesPerColumn,
      profileSampleRows,
    });
    const cached = input.cache?.tableProfiles.get(cacheKey);
    if (cached) {
      // A cached failure stores a zero-row placeholder table. The fresh
      // failure path below reports no table entry at all, so replaying the
      // placeholder would make the result depend on whether the cache is warm
      // (and would make a failed table look like an empty one).
      const isCachedFailure = cached.warnings.some((warning) => warning.startsWith('profile_failed:'));
      if (!isCachedFailure) {
        tables.push(cached.table);
      }
      Object.assign(columns, cached.columns);
      warnings.push(...cached.warnings);
      continue;
    }
    try {
      const tableProfile = await queryTableProfile({
        connectionId: input.connectionId,
        driver: input.driver,
        table,
        executor: input.executor,
        ctx: input.ctx,
        sampleValuesPerColumn,
        profileSampleRows,
      });
      queryTotal += tableProfile.queryCount;
      tables.push(tableProfile.table);
      Object.assign(columns, tableProfile.columns);
      input.cache?.tableProfiles.set(cacheKey, {
        table: tableProfile.table,
        columns: tableProfile.columns,
        warnings: [],
      });
    } catch (error) {
      const failureWarning = `profile_failed:${table.ref.name}:${error instanceof Error ? error.message : String(error)}`;
      warnings.push(failureWarning);
      // Cache the failure so the same table is not re-queried; the table
      // entry is only a placeholder required by the cache entry shape.
      input.cache?.tableProfiles.set(cacheKey, {
        table: { table: table.ref, rowCount: 0 },
        columns: {},
        warnings: [failureWarning],
      });
    }
  }
  return {
    connectionId: input.connectionId,
    driver: input.driver,
    sqlAvailable: true,
    queryCount: queryTotal,
    tables,
    columns,
    warnings,
  };
}

View file

@ -0,0 +1,352 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { KloLocalProject } from '../project/index.js';
import { initKloProject } from '../project/index.js';
import { describe, expect, it, vi } from 'vitest';
import { applyLocalScanRelationshipReviewDecisions } from './relationship-review-apply.js';
import type { KloRelationshipReviewDecisionArtifact } from './relationship-review-decisions.js';
import type { ReadLocalScanRelationshipArtifactsResult } from './relationship-artifacts.js';
import type { WriteLocalScanManifestShardsResult } from './local-enrichment-artifacts.js';
import type { KloSchemaSnapshot } from './types.js';
// Review-decision fixture for scan run `scan-run-a` on connection `warehouse`.
// Despite the name it holds two decisions: an accepted one
// (orders.customer_id -> customers.id) and a rejected one
// (orders.note_id -> notes.id), so the suite can exercise both the apply path
// and the "is rejected, not accepted" error path.
const acceptedDecisionArtifact: KloRelationshipReviewDecisionArtifact = {
connectionId: 'warehouse',
runId: 'scan-run-a',
syncId: 'sync-a',
generatedAt: '2026-05-07T12:00:00.000Z',
decisions: [
{
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'accepted',
previousStatus: 'review',
connectionId: 'warehouse',
runId: 'scan-run-a',
syncId: 'sync-a',
decidedAt: '2026-05-07T12:01:00.000Z',
reviewer: 'Andrey',
note: 'Customer link is valid.',
from: {
tableId: 'public.orders',
columnIds: ['public.orders.customer_id'],
table: { catalog: null, db: 'public', name: 'orders' },
columns: ['customer_id'],
},
to: {
tableId: 'public.customers',
columnIds: ['public.customers.id'],
table: { catalog: null, db: 'public', name: 'customers' },
columns: ['id'],
},
relationshipType: 'many_to_one',
source: 'deterministic_name',
score: 0.81,
confidence: 0.81,
pkScore: 0.93,
fkScore: 0.81,
reasons: ['review_threshold'],
},
{
candidateId: 'orders:orders.note_id->notes:notes.id',
decision: 'rejected',
previousStatus: 'review',
connectionId: 'warehouse',
runId: 'scan-run-a',
syncId: 'sync-a',
decidedAt: '2026-05-07T12:02:00.000Z',
reviewer: 'Andrey',
note: null,
from: {
tableId: 'public.orders',
columnIds: ['public.orders.note_id'],
table: { catalog: null, db: 'public', name: 'orders' },
columns: ['note_id'],
},
to: {
tableId: 'public.notes',
columnIds: ['public.notes.id'],
table: { catalog: null, db: 'public', name: 'notes' },
columns: ['id'],
},
relationshipType: 'many_to_one',
source: 'embedding_similarity',
score: 0.7,
confidence: 0.7,
pkScore: 0.7,
fkScore: 0.7,
reasons: ['review_threshold'],
},
],
};
// Relationship artifacts returned by the stubbed artifact reader in every
// test: a non-dry-run, relationships-mode postgres scan report for
// `scan-run-a`, empty relationship buckets, and no diagnostics or profile.
// `paths.relationships` is the path the decisions file location is derived
// from, and `report.artifactPaths.rawSourcesDir` is what the apply step passes
// to the snapshot reader.
const artifacts: ReadLocalScanRelationshipArtifactsResult = {
runId: 'scan-run-a',
connectionId: 'warehouse',
syncId: 'sync-a',
report: {
connectionId: 'warehouse',
driver: 'postgres',
syncId: 'sync-a',
runId: 'scan-run-a',
trigger: 'cli',
mode: 'relationships',
dryRun: false,
artifactPaths: {
rawSourcesDir: 'raw-sources/warehouse/live-database/sync-a',
reportPath: 'raw-sources/warehouse/live-database/sync-a/scan-report.json',
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
enrichmentArtifacts: ['raw-sources/warehouse/live-database/sync-a/enrichment/relationships.json'],
},
diffSummary: {
tablesAdded: 0,
tablesModified: 0,
tablesDeleted: 0,
tablesUnchanged: 2,
columnsAdded: 0,
columnsModified: 0,
columnsDeleted: 0,
},
manifestShardsWritten: 1,
structuralSyncStats: {
tablesCreated: 0,
tablesUpdated: 0,
tablesDeleted: 0,
columnsCreated: 0,
columnsUpdated: 0,
columnsDeleted: 0,
},
enrichment: {
dataDictionary: 'skipped',
tableDescriptions: 'skipped',
columnDescriptions: 'skipped',
embeddings: 'skipped',
deterministicRelationships: 'completed',
llmRelationshipValidation: 'skipped',
statisticalValidation: 'completed',
},
capabilityGaps: [],
warnings: [],
relationships: { accepted: 0, review: 1, rejected: 1, skipped: 0 },
enrichmentState: { resumedStages: [], completedStages: ['relationships'], failedStages: [] },
createdAt: '2026-05-07T12:00:00.000Z',
},
relationships: {
connectionId: 'warehouse',
accepted: [],
review: [],
rejected: [],
skipped: [],
},
diagnostics: null,
profile: null,
paths: {
relationships: 'raw-sources/warehouse/live-database/sync-a/enrichment/relationships.json',
diagnostics: null,
profile: null,
},
};
// Structural snapshot returned by the stubbed snapshot reader: two tables in
// the `public` schema (`customers` with an `id` column and `orders` with a
// `customer_id` column), with no declared foreign keys, which are the two
// endpoints of the accepted decision in `acceptedDecisionArtifact`.
const snapshot: KloSchemaSnapshot = {
connectionId: 'warehouse',
driver: 'postgres',
extractedAt: '2026-05-07T12:00:00.000Z',
scope: { schemas: ['public'] },
metadata: {},
tables: [
{
catalog: null,
db: 'public',
name: 'customers',
kind: 'table',
comment: null,
estimatedRows: 2,
foreignKeys: [],
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
],
},
{
catalog: null,
db: 'public',
name: 'orders',
kind: 'table',
comment: null,
estimatedRows: 2,
foreignKeys: [],
columns: [
{
name: 'customer_id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
],
},
],
};
/**
 * Creates a throwaway project in a fresh temp directory and seeds it with a
 * relationship review decisions file for sync `sync-a`.
 *
 * @param decisions Artifact serialized into the decisions file; defaults to
 *   `acceptedDecisionArtifact`.
 * @returns The initialized project and the temp directory the caller must
 *   remove when the test finishes.
 * @throws Re-throws any setup failure after removing the temp directory.
 */
async function projectWithDecisions(
  decisions = acceptedDecisionArtifact,
): Promise<{ project: KloLocalProject; tempDir: string }> {
  const tempDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-apply-'));
  try {
    const project = await initKloProject({
      projectDir: join(tempDir, 'project'),
      projectName: 'warehouse',
    });
    await project.fileStore.writeFile(
      'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json',
      `${JSON.stringify(decisions)}\n`,
      'klo',
      'klo@example.com',
      'Seed relationship review decisions',
    );
    return { project, tempDir };
  } catch (error) {
    // Callers only learn `tempDir` from the return value, so when setup fails
    // nobody else can clean the directory up; remove it here before failing.
    await rm(tempDir, { recursive: true, force: true });
    throw error;
  }
}
/**
 * Canned result for the stubbed manifest writer: one shard written for the
 * `public` schema of the `warehouse` connection. A fresh object is built on
 * every call.
 */
function manifestResult(): WriteLocalScanManifestShardsResult {
  const manifestShards = ['semantic-layer/warehouse/_schema/public.yaml'];
  return { manifestShards, manifestShardsWritten: 1 };
}
// Suite for applyLocalScanRelationshipReviewDecisions. Every test seeds a real
// temp project with the decisions fixture and injects stubbed artifact,
// snapshot, and manifest functions; the temp directory is removed in `finally`.
describe('relationship review apply', () => {
// Dry run: the accepted decision is converted to a manual relationship, but
// the manifest writer must never be invoked.
it('previews all accepted decisions without writing manifest shards', async () => {
const { project, tempDir } = await projectWithDecisions();
const writeLocalScanManifestShards = vi.fn(async () => manifestResult());
try {
const result = await applyLocalScanRelationshipReviewDecisions(project, {
runId: 'scan-run-a',
applyAllAccepted: true,
dryRun: true,
readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
readLocalScanStructuralSnapshot: vi.fn(async () => snapshot),
writeLocalScanManifestShards,
});
expect(result).toMatchObject({
runId: 'scan-run-a',
connectionId: 'warehouse',
syncId: 'sync-a',
dryRun: true,
selectedDecisions: 1,
appliedRelationships: 1,
manifestShards: [],
manifestShardsWritten: 0,
});
expect(result.relationships[0]).toMatchObject({
id: 'orders:orders.customer_id->customers:customers.id',
source: 'manual',
relationshipType: 'many_to_one',
confidence: 1,
});
expect(writeLocalScanManifestShards).not.toHaveBeenCalled();
} finally {
await rm(tempDir, { recursive: true, force: true });
}
});
// Real apply with an explicit candidate id: checks the exact arguments handed
// to the snapshot reader and to the manifest writer.
it('writes selected accepted decisions as manual manifest relationships', async () => {
const { project, tempDir } = await projectWithDecisions();
const readLocalScanStructuralSnapshot = vi.fn(async () => snapshot);
const writeLocalScanManifestShards = vi.fn(async () => manifestResult());
try {
const result = await applyLocalScanRelationshipReviewDecisions(project, {
runId: 'scan-run-a',
candidateIds: ['orders:orders.customer_id->customers:customers.id'],
readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
readLocalScanStructuralSnapshot,
writeLocalScanManifestShards,
});
expect(readLocalScanStructuralSnapshot).toHaveBeenCalledWith({
project: expect.any(Object),
connectionId: 'warehouse',
driver: 'postgres',
rawSourcesDir: 'raw-sources/warehouse/live-database/sync-a',
extractedAtFallback: '2026-05-07T12:00:00.000Z',
});
expect(writeLocalScanManifestShards).toHaveBeenCalledWith({
project: expect.any(Object),
connectionId: 'warehouse',
syncId: 'sync-a',
driver: 'postgres',
snapshot,
dryRun: false,
relationshipUpdate: {
connectionId: 'warehouse',
accepted: [
expect.objectContaining({
id: 'orders:orders.customer_id->customers:customers.id',
source: 'manual',
from: expect.objectContaining({ columns: ['customer_id'] }),
to: expect.objectContaining({ columns: ['id'] }),
}),
],
rejected: [],
skipped: [],
},
});
expect(result.manifestShardsWritten).toBe(1);
} finally {
await rm(tempDir, { recursive: true, force: true });
}
});
// Selection validation: neither selector given, then both selectors given.
it('rejects ambiguous apply selection input', async () => {
const { project, tempDir } = await projectWithDecisions();
try {
await expect(
applyLocalScanRelationshipReviewDecisions(project, {
runId: 'scan-run-a',
readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
}),
).rejects.toThrow('Pass --all-accepted or at least one --candidate to choose review decisions to apply');
await expect(
applyLocalScanRelationshipReviewDecisions(project, {
runId: 'scan-run-a',
applyAllAccepted: true,
candidateIds: ['orders:orders.customer_id->customers:customers.id'],
readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
}),
).rejects.toThrow('Use either --all-accepted or --candidate, not both');
} finally {
await rm(tempDir, { recursive: true, force: true });
}
});
// Explicitly selected ids must exist in the decisions file and be accepted.
it('refuses rejected decisions and missing candidate ids', async () => {
const { project, tempDir } = await projectWithDecisions();
try {
await expect(
applyLocalScanRelationshipReviewDecisions(project, {
runId: 'scan-run-a',
candidateIds: ['orders:orders.note_id->notes:notes.id'],
readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
}),
).rejects.toThrow('Relationship review decision "orders:orders.note_id->notes:notes.id" is rejected, not accepted');
await expect(
applyLocalScanRelationshipReviewDecisions(project, {
runId: 'scan-run-a',
candidateIds: ['missing'],
readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
}),
).rejects.toThrow('Relationship review decision "missing" was not found for scan run "scan-run-a"');
} finally {
await rm(tempDir, { recursive: true, force: true });
}
});
});

View file

@ -0,0 +1,231 @@
import type { KloLocalProject } from '../project/index.js';
import {
readLocalScanRelationshipArtifacts,
type ReadLocalScanRelationshipArtifactsResult,
} from './relationship-artifacts.js';
import {
readLocalScanStructuralSnapshot,
type ReadLocalScanStructuralSnapshotInput,
} from './local-structural-artifacts.js';
import {
writeLocalScanManifestShards,
type WriteLocalScanManifestShardsInput,
type WriteLocalScanManifestShardsResult,
} from './local-enrichment-artifacts.js';
import type { KloEnrichedRelationship, KloRelationshipUpdate } from './enrichment-types.js';
import type {
KloRelationshipReviewDecisionArtifact,
KloRelationshipReviewDecisionEntry,
} from './relationship-review-decisions.js';
// File name of the review decisions artifact; it replaces the trailing
// `relationships.json` of the relationships artifact path.
const DECISIONS_FILE = 'relationship-review-decisions.json';
/**
 * Options for applying stored relationship review decisions to the manifest.
 * Exactly one of `applyAllAccepted` or a non-empty `candidateIds` must be
 * given; passing both or neither is rejected.
 */
export interface ApplyLocalScanRelationshipReviewDecisionsInput {
/** Scan run whose relationship artifacts and review decisions are read. */
runId: string;
/** Apply every stored decision whose value is `accepted`. */
applyAllAccepted?: boolean;
/** Explicit candidate ids to apply; each must exist and be `accepted`. */
candidateIds?: readonly string[];
/** When `true`, relationships are computed and returned but nothing is written. */
dryRun?: boolean;
/** Optional replacement for the artifact reader (the test suite injects a stub). */
readLocalScanRelationshipArtifacts?: typeof readLocalScanRelationshipArtifacts;
/** Optional replacement for the structural snapshot reader. */
readLocalScanStructuralSnapshot?: (
input: ReadLocalScanStructuralSnapshotInput,
) => Promise<WriteLocalScanManifestShardsInput['snapshot']>;
/** Optional replacement for the manifest shard writer. */
writeLocalScanManifestShards?: (
input: WriteLocalScanManifestShardsInput,
) => Promise<WriteLocalScanManifestShardsResult>;
}
/**
 * Pairs the audit fields of a review decision with the relationship built
 * from it.
 * NOTE(review): not referenced by any other code visible in this module —
 * confirm it is consumed elsewhere before relying on it.
 */
export interface AppliedRelationshipReviewDecision {
/** Id of the reviewed relationship candidate. */
candidateId: string;
/** Timestamp string recorded when the decision was made. */
decidedAt: string;
/** Name of the reviewer who made the decision. */
reviewer: string;
/** Optional free-text note left by the reviewer. */
note: string | null;
/** Relationship derived from the decision. */
relationship: KloEnrichedRelationship;
}
/** Outcome of applying (or previewing) relationship review decisions. */
export interface ApplyLocalScanRelationshipReviewDecisionsResult {
/** Run id reported by the loaded scan artifacts. */
runId: string;
/** Connection the scan run belongs to. */
connectionId: string;
/** Sync id reported by the loaded scan artifacts. */
syncId: string;
/** `true` when the call was a preview and nothing was written. */
dryRun: boolean;
/** Project-relative path the decisions were read from. */
decisionsPath: string;
/** Number of decisions selected by the input. */
selectedDecisions: number;
/** Number of relationships built from the selected decisions. */
appliedRelationships: number;
/** The relationships built from the selected decisions. */
relationships: KloEnrichedRelationship[];
/** Manifest shards reported by the writer; empty for dry runs or empty selections. */
manifestShards: string[];
/** Shard count reported by the writer; `0` for dry runs or empty selections. */
manifestShardsWritten: number;
}
/**
 * Derives the review decisions path from the relationships artifact path by
 * swapping a trailing `relationships.json` for the decisions file name.
 *
 * NOTE(review): a path that does not end in `relationships.json` is returned
 * unchanged, i.e. it points at the input file itself.
 */
function decisionsPathFromRelationshipsPath(relationshipsPath: string): string {
  const relationshipsFileSuffix = /relationships\.json$/u;
  const decisionsPath = relationshipsPath.replace(relationshipsFileSuffix, DECISIONS_FILE);
  return decisionsPath;
}
/**
 * Loads and normalizes the review decisions artifact of a scan run.
 *
 * @param project Project whose file store holds the artifact.
 * @param path Project-relative path of the decisions file.
 * @param runId Scan run id, used only in error messages.
 * @returns The artifact's identity fields plus its decisions; a `decisions`
 *   value that is not an array is normalized to `[]`.
 * @throws Error when the file cannot be read, is not valid JSON, or does not
 *   contain a JSON object.
 */
async function readDecisionArtifact(
  project: KloLocalProject,
  path: string,
  runId: string,
): Promise<KloRelationshipReviewDecisionArtifact> {
  let raw: { content: string };
  try {
    raw = await project.fileStore.readFile(path);
  } catch {
    throw new Error(`Relationship review decisions were not found for scan run "${runId}"`);
  }
  // Parse into `unknown` first: the file is editable on disk, so its shape
  // must be checked before it is treated as an artifact.
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw.content);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Relationship review decisions for scan run "${runId}" are not valid JSON (${path}): ${reason}`);
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new Error(`Relationship review decisions for scan run "${runId}" must be a JSON object (${path})`);
  }
  const artifact = parsed as KloRelationshipReviewDecisionArtifact;
  return {
    connectionId: artifact.connectionId,
    runId: artifact.runId,
    syncId: artifact.syncId,
    generatedAt: artifact.generatedAt,
    decisions: Array.isArray(artifact.decisions) ? artifact.decisions : [],
  };
}
/**
 * Ensures the caller picked exactly one selection mode.
 *
 * @throws Error when both `applyAllAccepted` and candidate ids are given, or
 *   when neither is.
 */
function assertSelection(input: ApplyLocalScanRelationshipReviewDecisionsInput): void {
  const wantsAllAccepted = input.applyAllAccepted === true;
  const hasCandidateIds = (input.candidateIds ?? []).length > 0;
  if (wantsAllAccepted && hasCandidateIds) {
    throw new Error('Use either --all-accepted or --candidate, not both');
  }
  if (!wantsAllAccepted && !hasCandidateIds) {
    throw new Error('Pass --all-accepted or at least one --candidate to choose review decisions to apply');
  }
}
/**
 * Picks the accepted decisions the caller asked to apply.
 *
 * With `applyAllAccepted` every decision whose value is `accepted` is
 * returned. Otherwise each requested candidate id is looked up; repeated ids
 * are applied once, in first-occurrence order, so a duplicated `--candidate`
 * cannot produce duplicate relationships.
 *
 * @param artifact Stored review decisions for the scan run.
 * @param input Apply options carrying the selection.
 * @returns The selected decisions, all with value `accepted`.
 * @throws Error when the selection is invalid, a requested id has no stored
 *   decision, or the stored decision is not `accepted`.
 */
function selectAcceptedDecisions(
  artifact: KloRelationshipReviewDecisionArtifact,
  input: ApplyLocalScanRelationshipReviewDecisionsInput,
): KloRelationshipReviewDecisionEntry[] {
  assertSelection(input);
  if (input.applyAllAccepted === true) {
    return artifact.decisions.filter((decision) => decision.decision === 'accepted');
  }
  const decisionsById = new Map(artifact.decisions.map((decision) => [decision.candidateId, decision]));
  const selected: KloRelationshipReviewDecisionEntry[] = [];
  // A Set keeps insertion order, so de-duplicating does not reorder the ids.
  for (const candidateId of new Set(input.candidateIds ?? [])) {
    const decision = decisionsById.get(candidateId);
    if (!decision) {
      throw new Error(`Relationship review decision "${candidateId}" was not found for scan run "${input.runId}"`);
    }
    if (decision.decision !== 'accepted') {
      throw new Error(`Relationship review decision "${candidateId}" is ${decision.decision}, not accepted`);
    }
    selected.push(decision);
  }
  return selected;
}
/**
 * Builds a dotted table id from catalog, schema, and table name, leaving out
 * any part that is null or empty.
 */
function tableId(table: KloRelationshipReviewDecisionEntry['from']['table']): string {
  const segments: string[] = [];
  for (const segment of [table.catalog, table.db, table.name]) {
    if (segment) {
      segments.push(segment);
    }
  }
  return segments.join('.');
}
/**
 * Qualifies each column name with the dotted id of its table
 * (`<table id>.<column>`), preserving the input order.
 */
function columnIds(table: KloRelationshipReviewDecisionEntry['from']['table'], columns: readonly string[]): string[] {
  const qualifiedTable = tableId(table);
  const ids: string[] = [];
  for (const columnName of columns) {
    ids.push(`${qualifiedTable}.${columnName}`);
  }
  return ids;
}
/**
 * Converts an accepted review decision into a manifest relationship.
 *
 * The relationship keeps the candidate id and relationship type, is marked
 * as `manual` with confidence `1`, and is flagged as a primary-key reference.
 * Table and column ids are recomputed from the endpoint's table parts rather
 * than copied from the decision.
 */
function relationshipFromDecision(decision: KloRelationshipReviewDecisionEntry): KloEnrichedRelationship {
  // Both endpoints are rebuilt the same way; the column list is copied so the
  // relationship does not share the decision's array.
  const toEndpoint = (side: KloRelationshipReviewDecisionEntry['from']) => ({
    tableId: tableId(side.table),
    columnIds: columnIds(side.table, side.columns),
    table: side.table,
    columns: [...side.columns],
  });
  const from = toEndpoint(decision.from);
  const to = toEndpoint(decision.to);
  return {
    id: decision.candidateId,
    source: 'manual',
    from,
    to,
    relationshipType: decision.relationshipType,
    confidence: 1,
    isPrimaryKeyReference: true,
  };
}
/**
 * Wraps relationships in an update that accepts all of them and rejects or
 * skips none. The accepted list is a shallow copy of the input.
 */
function relationshipUpdate(
  connectionId: string,
  relationships: readonly KloEnrichedRelationship[],
): KloRelationshipUpdate {
  const accepted = Array.from(relationships);
  return { connectionId, accepted, rejected: [], skipped: [] };
}
/**
 * Returns the raw sources directory recorded in the scan report.
 *
 * @throws Error when the report has no raw sources directory, since the
 *   manifest cannot be rewritten without it.
 */
function assertApplyableArtifacts(artifacts: ReadLocalScanRelationshipArtifactsResult): string {
  const { rawSourcesDir } = artifacts.report.artifactPaths;
  if (rawSourcesDir) {
    return rawSourcesDir;
  }
  throw new Error(`Scan run "${artifacts.runId}" does not have raw source artifacts for manifest rewriting`);
}
/**
 * Applies stored relationship review decisions of a scan run to the manifest.
 *
 * The selected accepted decisions are converted to manual relationships. For
 * a dry run, or when nothing was selected, the relationships are returned
 * without reading the snapshot or writing shards. Otherwise the structural
 * snapshot is loaded and the manifest shards are rewritten with the
 * relationships as accepted updates.
 *
 * @param project Project holding the scan artifacts and decisions file.
 * @param input Run id, selection, dry-run flag, and optional I/O overrides.
 * @returns Summary of what was selected, built, and written.
 * @throws Error when the scan run or its decisions file is missing, the
 *   selection is invalid, or the run has no raw sources directory.
 */
export async function applyLocalScanRelationshipReviewDecisions(
  project: KloLocalProject,
  input: ApplyLocalScanRelationshipReviewDecisionsInput,
): Promise<ApplyLocalScanRelationshipReviewDecisionsResult> {
  const loadArtifacts = input.readLocalScanRelationshipArtifacts ?? readLocalScanRelationshipArtifacts;
  const artifacts = await loadArtifacts(project, input.runId);
  if (!artifacts) {
    throw new Error(`Scan run "${input.runId}" was not found`);
  }

  const decisionsPath = decisionsPathFromRelationshipsPath(artifacts.paths.relationships);
  const storedDecisions = await readDecisionArtifact(project, decisionsPath, input.runId);
  const selected = selectAcceptedDecisions(storedDecisions, input);
  const relationships = selected.map(relationshipFromDecision);
  const dryRun = input.dryRun === true;

  // Fields shared by the preview result and the applied result.
  const summary = {
    runId: artifacts.runId,
    connectionId: artifacts.connectionId,
    syncId: artifacts.syncId,
    dryRun,
    decisionsPath,
    selectedDecisions: selected.length,
    appliedRelationships: relationships.length,
    relationships,
  };

  if (dryRun || relationships.length === 0) {
    return { ...summary, manifestShards: [], manifestShardsWritten: 0 };
  }

  const rawSourcesDir = assertApplyableArtifacts(artifacts);
  const loadSnapshot = input.readLocalScanStructuralSnapshot ?? readLocalScanStructuralSnapshot;
  const writeShards = input.writeLocalScanManifestShards ?? writeLocalScanManifestShards;
  const { driver, createdAt } = artifacts.report;
  const snapshot = await loadSnapshot({
    project,
    connectionId: artifacts.connectionId,
    driver,
    rawSourcesDir,
    extractedAtFallback: createdAt,
  });
  const manifest = await writeShards({
    project,
    connectionId: artifacts.connectionId,
    syncId: artifacts.syncId,
    driver,
    snapshot,
    dryRun: false,
    relationshipUpdate: relationshipUpdate(artifacts.connectionId, relationships),
  });
  return {
    ...summary,
    manifestShards: manifest.manifestShards,
    manifestShardsWritten: manifest.manifestShardsWritten,
  };
}

View file

@ -0,0 +1,365 @@
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { dirname, join } from 'node:path';
import { runLocalStageOnlyIngest, type SourceAdapter } from '../ingest/index.js';
import { initKloProject, loadKloProject } from '../project/index.js';
import { describe, expect, it } from 'vitest';
import { writeLocalScanRelationshipReviewDecision } from './relationship-review-decisions.js';
import type { KloRelationshipArtifact, KloRelationshipDiagnosticsArtifact } from './relationship-diagnostics.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import type { KloScanReport } from './types.js';
// Job id passed to the staged ingest and recorded as the scan report's run id.
const RUN_ID = 'scan-run-review';
// Sync id used in every artifact path below.
// NOTE(review): presumably equals the sync id the staged ingest derives from
// the fixed 2026-05-07T10:00:00Z clock and RUN_ID — confirm against the ingest.
const SYNC_ID = '2026-05-07-100000-scan-run-review';
/**
 * Writes `content` as UTF-8 to `relativePath` inside `projectDir`, creating
 * any missing parent directories first.
 */
async function writeProjectFile(projectDir: string, relativePath: string, content: string): Promise<void> {
  const target = join(projectDir, relativePath);
  const parentDir = dirname(target);
  await mkdir(parentDir, { recursive: true });
  await writeFile(target, content, 'utf-8');
}
/**
 * Initializes a `warehouse` project in `projectDir` and overwrites its
 * `klo.yaml` with a config declaring one read-only sqlite connection named
 * `warehouse` and the `live-database` ingest adapter.
 */
async function createProject(projectDir: string): Promise<void> {
  await initKloProject({ projectDir, projectName: 'warehouse' });
  // YAML nesting is carried by indentation: the connection settings must be
  // indented deeper than `warehouse:`, otherwise they parse as sibling keys of
  // `warehouse` (leaving the connection itself empty). The adapter list is
  // likewise nested under `adapters:`.
  const config = [
    'project: warehouse',
    'connections:',
    '  warehouse:',
    '    driver: sqlite',
    '    path: warehouse.db',
    '    readonly: true',
    'ingest:',
    '  adapters:',
    '    - live-database',
    '',
  ].join('\n');
  await writeFile(join(projectDir, 'klo.yaml'), config, 'utf-8');
}
/**
 * Minimal `live-database` source adapter stub for the staged ingest.
 *
 * `fetch` stages a connection file, an empty foreign-key file, and a single
 * `orders` table file; `detect` rewrites the connection file and reports a
 * match; `chunk` returns one work unit covering the `orders` table.
 */
function liveDatabaseAdapter(): SourceAdapter {
  const connectionJson = '{"connectionId":"warehouse"}\n';
  const foreignKeysJson = '{"foreignKeys":[]}\n';
  const ordersTableJson =
    '{"name":"orders","db":"public","columns":[{"name":"id","type":"integer","nullable":false,"primaryKey":true}]}\n';

  const adapter: SourceAdapter = {
    source: 'live-database',
    skillNames: ['live_database_ingest'],
    fetch: async (_pullConfig, stagedDir) => {
      const tablesDir = join(stagedDir, 'tables');
      await mkdir(tablesDir, { recursive: true });
      await writeFile(join(stagedDir, 'connection.json'), connectionJson, 'utf-8');
      await writeFile(join(stagedDir, 'foreign-keys.json'), foreignKeysJson, 'utf-8');
      await writeFile(join(stagedDir, 'tables', 'orders.json'), ordersTableJson, 'utf-8');
    },
    detect: async (stagedDir) => {
      await writeFile(join(stagedDir, 'connection.json'), connectionJson, 'utf-8');
      return true;
    },
    chunk: async () => {
      const ordersUnit = {
        unitKey: 'live-database-public-orders',
        rawFiles: ['tables/orders.json'],
        dependencyPaths: ['connection.json', 'foreign-keys.json'],
        peerFileIndex: [],
      };
      return { workUnits: [ordersUnit] };
    },
  };
  return adapter;
}
/**
 * Creates the test project and runs a stage-only `live-database` ingest for
 * the `warehouse` connection under job id `RUN_ID`, with the clock pinned to
 * 2026-05-07T10:00:00Z.
 */
async function createLiveDatabaseRun(projectDir: string): Promise<void> {
  await createProject(projectDir);
  const project = await loadKloProject({ projectDir });
  const fixedNow = (): Date => new Date('2026-05-07T10:00:00.000Z');
  const adapters = [liveDatabaseAdapter()];
  await runLocalStageOnlyIngest({
    project,
    adapters,
    adapter: 'live-database',
    connectionId: 'warehouse',
    jobId: RUN_ID,
    now: fixedNow,
  });
}
/**
 * Relationship artifact fixture with a single candidate in the `review`
 * bucket (orders.customer_id -> customers.id) and all other buckets empty.
 */
function reviewRelationships(): KloRelationshipArtifact {
  // Both endpoints are single-column references into the `public` schema.
  const publicColumn = (tableName: string, columnName: string) => ({
    tableId: tableName,
    columnIds: [`${tableName}.${columnName}`],
    table: { catalog: null, db: 'public', name: tableName },
    columns: [columnName],
  });
  const reviewReason = 'fk_score_review';
  return {
    connectionId: 'warehouse',
    accepted: [],
    review: [
      {
        id: 'orders:orders.customer_id->customers:customers.id',
        status: 'review',
        source: 'deterministic_name',
        from: publicColumn('orders', 'customer_id'),
        to: publicColumn('customers', 'id'),
        relationshipType: 'many_to_one',
        confidence: 0.62,
        pkScore: 0.91,
        fkScore: 0.62,
        score: 0.62,
        evidence: { sources: ['table_suffix'] },
        validation: { status: 'passed' },
        graph: { reasons: [reviewReason] },
        reasons: [reviewReason],
      },
    ],
    rejected: [],
    skipped: [],
  };
}
/**
 * Diagnostics fixture matching `reviewRelationships`: one candidate awaiting
 * review, none accepted, validation available, and no warnings.
 */
function diagnostics(): KloRelationshipDiagnosticsArtifact {
  const summary = { accepted: 0, review: 1, rejected: 0, skipped: 0 };
  const validation = { available: true, sqlAvailable: true, queryCount: 3 };
  const thresholds = { acceptThreshold: 0.85, reviewThreshold: 0.55 };
  const policy = {
    validationRequiredForManifest: true,
    maxCandidatesPerColumn: 25,
    profileSampleRows: 10000,
    validationConcurrency: 4,
  };
  return {
    connectionId: 'warehouse',
    generatedAt: '2026-05-07T10:00:00.000Z',
    summary,
    noAcceptedReason: 'relationship candidates require review before manifest writes',
    candidateCountsBySource: { deterministic_name: 1 },
    validation,
    thresholds,
    policy,
    warnings: [],
    profileWarnings: [],
  };
}
/**
 * Profile fixture for the sqlite `warehouse` connection: SQL available,
 * three queries counted, and no table or column profiles.
 */
function profile(): KloRelationshipProfileArtifact {
  const emptyProfile: KloRelationshipProfileArtifact = {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    tables: [],
    columns: {},
    queryCount: 3,
    warnings: [],
  };
  return emptyProfile;
}
/**
 * Scan report fixture for run `RUN_ID` / sync `SYNC_ID`: a non-dry-run,
 * relationships-mode sqlite scan with one candidate awaiting review, no
 * manifest shards, and three enrichment artifacts under the sync's
 * `enrichment` directory.
 */
function report(): KloScanReport {
  const rawSourcesDir = `raw-sources/warehouse/live-database/${SYNC_ID}`;
  const enrichmentDir = `${rawSourcesDir}/enrichment`;
  const enrichmentArtifacts = [
    `${enrichmentDir}/relationships.json`,
    `${enrichmentDir}/relationship-diagnostics.json`,
    `${enrichmentDir}/relationship-profile.json`,
  ];
  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    syncId: SYNC_ID,
    runId: RUN_ID,
    trigger: 'cli',
    mode: 'relationships',
    dryRun: false,
    artifactPaths: {
      rawSourcesDir,
      reportPath: `${rawSourcesDir}/scan-report.json`,
      manifestShards: [],
      enrichmentArtifacts,
    },
    diffSummary: {
      tablesAdded: 0,
      tablesModified: 0,
      tablesDeleted: 0,
      tablesUnchanged: 2,
      columnsAdded: 0,
      columnsModified: 0,
      columnsDeleted: 0,
    },
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 0,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'completed',
    },
    relationships: { accepted: 0, review: 1, rejected: 0, skipped: 0 },
    enrichmentState: {
      resumedStages: [],
      completedStages: ['relationships'],
      failedStages: [],
    },
    warnings: [],
    capabilityGaps: [],
    createdAt: '2026-05-07T10:00:00.000Z',
  };
}
/**
 * Writes the scan report and the three relationship enrichment fixtures
 * (relationships, diagnostics, profile) for `SYNC_ID` into the project, each
 * as pretty-printed JSON. Files are written one after another, report first.
 */
async function writeScanArtifacts(projectDir: string): Promise<void> {
  const syncDir = `raw-sources/warehouse/live-database/${SYNC_ID}`;
  const artifactFiles: Array<[string, unknown]> = [
    [`${syncDir}/scan-report.json`, report()],
    [`${syncDir}/enrichment/relationships.json`, reviewRelationships()],
    [`${syncDir}/enrichment/relationship-diagnostics.json`, diagnostics()],
    [`${syncDir}/enrichment/relationship-profile.json`, profile()],
  ];
  for (const [relativePath, payload] of artifactFiles) {
    await writeProjectFile(projectDir, relativePath, JSON.stringify(payload, null, 2));
  }
}
// Suite for writeLocalScanRelationshipReviewDecision. Each test works in its
// own temp project directory, which is removed in `finally`.
describe('relationship review decisions', () => {
// Happy path: the decision file is written next to relationships.json and the
// entry carries over the candidate's status, source, type, score, and reasons.
it('writes an accepted decision beside the scan relationship artifacts', async () => {
const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-decisions-'));
try {
await createLiveDatabaseRun(projectDir);
await writeScanArtifacts(projectDir);
const project = await loadKloProject({ projectDir });
const result = await writeLocalScanRelationshipReviewDecision(project, {
runId: 'scan-run-review',
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'accepted',
reviewer: 'Andrey',
note: 'Matches the warehouse model',
decidedAt: '2026-05-07T12:00:00.000Z',
});
expect(result).not.toBeNull();
if (!result) {
throw new Error('Expected relationship review decision to be written');
}
expect(result.path).toBe(
`raw-sources/warehouse/live-database/${SYNC_ID}/enrichment/relationship-review-decisions.json`,
);
expect(result.artifact.decisions).toHaveLength(1);
expect(result.decision).toMatchObject({
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'accepted',
previousStatus: 'review',
reviewer: 'Andrey',
note: 'Matches the warehouse model',
source: 'deterministic_name',
relationshipType: 'many_to_one',
score: 0.62,
reasons: ['fk_score_review'],
});
await expect(project.fileStore.readFile(result.path)).resolves.toMatchObject({
path: result.path,
content: expect.stringContaining('"decision": "accepted"'),
});
} finally {
await rm(projectDir, { recursive: true, force: true });
}
});
// A second decision for the same candidate replaces the first instead of
// being appended.
it('replaces the existing decision for the same candidate id', async () => {
const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-replace-'));
try {
await createLiveDatabaseRun(projectDir);
await writeScanArtifacts(projectDir);
const project = await loadKloProject({ projectDir });
await writeLocalScanRelationshipReviewDecision(project, {
runId: 'scan-run-review',
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'accepted',
reviewer: 'Andrey',
note: 'First decision',
decidedAt: '2026-05-07T12:00:00.000Z',
});
const replacement = await writeLocalScanRelationshipReviewDecision(project, {
runId: 'scan-run-review',
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'rejected',
reviewer: 'Andrey',
note: 'Reviewed against source data and rejected',
decidedAt: '2026-05-07T12:05:00.000Z',
});
expect(replacement).not.toBeNull();
if (!replacement) {
throw new Error('Expected replacement relationship review decision to be written');
}
expect(replacement.artifact.decisions).toHaveLength(1);
expect(replacement.artifact.decisions[0]).toMatchObject({
decision: 'rejected',
note: 'Reviewed against source data and rejected',
decidedAt: '2026-05-07T12:05:00.000Z',
});
} finally {
await rm(projectDir, { recursive: true, force: true });
}
});
// Only the project is created here (no ingest run), so the run id is unknown
// and the writer resolves to null rather than throwing.
it('returns null when the scan run does not exist', async () => {
const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-missing-run-'));
try {
await createProject(projectDir);
const project = await loadKloProject({ projectDir });
await expect(
writeLocalScanRelationshipReviewDecision(project, {
runId: 'missing-run',
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'accepted',
reviewer: 'Andrey',
note: null,
decidedAt: '2026-05-07T12:00:00.000Z',
}),
).resolves.toBeNull();
} finally {
await rm(projectDir, { recursive: true, force: true });
}
});
// The run exists, but the candidate id is not among its relationship edges.
it('rejects unknown candidate ids for an existing scan run', async () => {
const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-missing-candidate-'));
try {
await createLiveDatabaseRun(projectDir);
await writeScanArtifacts(projectDir);
const project = await loadKloProject({ projectDir });
await expect(
writeLocalScanRelationshipReviewDecision(project, {
runId: 'scan-run-review',
candidateId: 'orders:orders.unknown_id->customers:customers.id',
decision: 'accepted',
reviewer: 'Andrey',
note: null,
decidedAt: '2026-05-07T12:00:00.000Z',
}),
).rejects.toThrow(
'Relationship candidate "orders:orders.unknown_id->customers:customers.id" was not found in scan run "scan-run-review"',
);
} finally {
await rm(projectDir, { recursive: true, force: true });
}
});
});

View file

@ -0,0 +1,182 @@
import type { KloLocalProject } from '../project/index.js';
import type { KloRelationshipType } from './enrichment-types.js';
import { readLocalScanRelationshipArtifacts } from './relationship-artifacts.js';
import type {
KloRelationshipArtifactEdge,
KloRelationshipArtifactEndpoint,
} from './relationship-diagnostics.js';
import type { KloResolvedRelationshipStatus } from './relationship-graph-resolver.js';
// NOTE(review): presumably the author name and email recorded when the
// decisions file is written through the file store — the usage is outside the
// visible part of this module, confirm there.
const LOCAL_AUTHOR = 'klo';
const LOCAL_AUTHOR_EMAIL = 'klo@example.com';
// File name of the review decisions artifact; it replaces the trailing
// `relationships.json` of the relationships artifact path.
const DECISIONS_FILE = 'relationship-review-decisions.json';
/** Outcome a reviewer can record for a relationship candidate. */
export type KloRelationshipReviewDecisionValue = 'accepted' | 'rejected';
/** Input for recording one review decision against a scan run's candidate. */
export interface WriteLocalScanRelationshipReviewDecisionInput {
/** Scan run whose relationship artifacts contain the candidate. */
runId: string;
/** Id of the relationship candidate being reviewed. */
candidateId: string;
/** The reviewer's verdict. */
decision: KloRelationshipReviewDecisionValue;
/** Name of the reviewer. */
reviewer: string;
/** Optional free-text note; `null` when none was given. */
note: string | null;
/** Timestamp string of the decision. NOTE(review): the default used when omitted is not visible here. */
decidedAt?: string;
}
/**
 * One persisted review decision: the reviewer's verdict plus a copy of the
 * candidate's endpoints, type, source, scores, and reasons at decision time.
 */
export interface KloRelationshipReviewDecisionEntry {
/** Id of the reviewed candidate; decisions are unique per candidate id. */
candidateId: string;
/** The reviewer's verdict. */
decision: KloRelationshipReviewDecisionValue;
/** Status the candidate had in the scan artifacts when it was reviewed. */
previousStatus: KloResolvedRelationshipStatus;
/** Connection the scan run belongs to. */
connectionId: string;
/** Scan run the candidate came from. */
runId: string;
/** Sync the candidate came from. */
syncId: string;
/** Timestamp string of the decision. */
decidedAt: string;
/** Name of the reviewer. */
reviewer: string;
/** Optional free-text note; `null` when none was given. */
note: string | null;
/** Referencing endpoint, copied from the candidate. */
from: KloRelationshipArtifactEndpoint;
/** Referenced endpoint, copied from the candidate. */
to: KloRelationshipArtifactEndpoint;
/** Relationship type, copied from the candidate. */
relationshipType: KloRelationshipType;
/** Candidate source label, copied from the candidate. */
source: string;
/** Candidate score, copied from the candidate. */
score: number | null;
/** Candidate confidence, copied from the candidate. */
confidence: number;
/** Candidate primary-key score, copied from the candidate. */
pkScore: number | null;
/** Candidate foreign-key score, copied from the candidate. */
fkScore: number | null;
/** Copy of the candidate's reason codes. */
reasons: string[];
}
/** On-disk shape of the review decisions file for one scan run. */
export interface KloRelationshipReviewDecisionArtifact {
/** Connection the scan run belongs to. */
connectionId: string;
/** Scan run the decisions apply to. */
runId: string;
/** Sync the decisions apply to. */
syncId: string;
/** Timestamp string recorded for the artifact. */
generatedAt: string;
/** Recorded decisions. */
decisions: KloRelationshipReviewDecisionEntry[];
}
/** Result of recording a review decision. */
export interface WriteLocalScanRelationshipReviewDecisionResult {
/** Project-relative path of the decisions file. */
path: string;
/** The decision entry that was recorded. */
decision: KloRelationshipReviewDecisionEntry;
/** The full decisions artifact, including the recorded entry. */
artifact: KloRelationshipReviewDecisionArtifact;
}
/**
 * Derives the review decisions path from the relationships artifact path by
 * swapping a trailing `relationships.json` for the decisions file name.
 *
 * NOTE(review): a path that does not end in `relationships.json` is returned
 * unchanged, i.e. it points at the relationships artifact itself.
 */
function reviewDecisionPath(relationshipsPath: string): string {
  const relationshipsFileSuffix = /relationships\.json$/u;
  const decisionsPath = relationshipsPath.replace(relationshipsFileSuffix, DECISIONS_FILE);
  return decisionsPath;
}
/**
 * Flattens the accepted, review, and rejected relationship edges of a scan
 * result into one list, in that order. Skipped edges are not included, and a
 * missing result yields an empty list.
 */
function allCandidateEdges(result: Awaited<ReturnType<typeof readLocalScanRelationshipArtifacts>>): KloRelationshipArtifactEdge[] {
  const edges: KloRelationshipArtifactEdge[] = [];
  if (result) {
    const { accepted, review, rejected } = result.relationships;
    edges.push(...accepted, ...review, ...rejected);
  }
  return edges;
}
/**
 * Loads the decisions artifact stored at `path`, tolerating a missing or damaged file.
 *
 * Reading is best-effort: if the file cannot be read or is not valid JSON, the fallback
 * metadata with an empty decision list is returned. The parsed document is validated instead of
 * being trusted blindly:
 * - a document that is not a JSON object yields the fallback;
 * - metadata fields that are not strings are taken from `fallback`;
 * - entries of `decisions` that are not objects with a string `candidateId` are dropped, because
 *   later merging and sorting reads `candidateId` on every entry and would otherwise throw.
 *
 * @param project Project whose `fileStore` is used for reading.
 * @param path Location of the decisions artifact.
 * @param fallback Metadata used when the file (or a field of it) is unusable.
 * @returns The stored artifact, repaired with fallback values where necessary.
 */
async function readExistingDecisions(
  project: KloLocalProject,
  path: string,
  fallback: Omit<KloRelationshipReviewDecisionArtifact, 'decisions'>,
): Promise<KloRelationshipReviewDecisionArtifact> {
  let parsed: unknown;
  try {
    const raw = await project.fileStore.readFile(path);
    parsed = JSON.parse(raw.content);
  } catch {
    // Deliberate best-effort: an unreadable or unparsable file counts as "no decisions yet".
    return { ...fallback, decisions: [] };
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return { ...fallback, decisions: [] };
  }

  const stored = parsed as Partial<Record<keyof KloRelationshipReviewDecisionArtifact, unknown>>;
  const textOr = (value: unknown, otherwise: string): string => (typeof value === 'string' ? value : otherwise);
  const storedDecisions: unknown[] = Array.isArray(stored.decisions) ? stored.decisions : [];
  const decisions = storedDecisions.filter(
    (entry): entry is KloRelationshipReviewDecisionEntry =>
      typeof entry === 'object' &&
      entry !== null &&
      typeof (entry as { candidateId?: unknown }).candidateId === 'string',
  );

  return {
    connectionId: textOr(stored.connectionId, fallback.connectionId),
    runId: textOr(stored.runId, fallback.runId),
    syncId: textOr(stored.syncId, fallback.syncId),
    generatedAt: textOr(stored.generatedAt, fallback.generatedAt),
    decisions,
  };
}
/**
 * Builds the artifact entry for one review decision.
 *
 * The entry combines the reviewer's verdict with a snapshot of the candidate edge; the edge's
 * current status is stored as `previousStatus` and its `reasons` list is copied so the entry
 * does not share that array with the edge.
 *
 * @param input Candidate edge plus decision metadata.
 * @returns A new decision entry.
 */
function decisionEntry(input: {
  candidate: KloRelationshipArtifactEdge;
  connectionId: string;
  runId: string;
  syncId: string;
  decision: KloRelationshipReviewDecisionValue;
  reviewer: string;
  note: string | null;
  decidedAt: string;
}): KloRelationshipReviewDecisionEntry {
  const { candidate, connectionId, runId, syncId, decision, reviewer, note, decidedAt } = input;
  const {
    id: candidateId,
    status: previousStatus,
    from,
    to,
    relationshipType,
    source,
    score,
    confidence,
    pkScore,
    fkScore,
  } = candidate;

  // Property order is kept stable because the entry is serialised into the artifact file.
  return {
    candidateId,
    decision,
    previousStatus,
    connectionId,
    runId,
    syncId,
    decidedAt,
    reviewer,
    note,
    from,
    to,
    relationshipType,
    source,
    score,
    confidence,
    pkScore,
    fkScore,
    reasons: Array.from(candidate.reasons),
  };
}
/**
 * Inserts `next` into the decision list, replacing any entry with the same candidate id.
 *
 * @param existing Decisions recorded so far; not modified.
 * @param next Decision to add or replace.
 * @returns A new list ordered by `candidateId` (compared with `localeCompare`).
 */
function upsertDecision(
  existing: readonly KloRelationshipReviewDecisionEntry[],
  next: KloRelationshipReviewDecisionEntry,
): KloRelationshipReviewDecisionEntry[] {
  const merged = existing.filter((item) => item.candidateId !== next.candidateId);
  merged.push(next);
  merged.sort((left, right) => left.candidateId.localeCompare(right.candidateId));
  return merged;
}
/**
 * Records a reviewer's verdict for one relationship candidate of a scan run.
 *
 * The candidate is looked up among the run's accepted, review and rejected edges. The decision
 * is merged into the decisions artifact stored next to the relationships artifact (one entry
 * per candidate id) and the artifact is written back through the project's file store.
 *
 * @param project Project whose file store holds the scan artifacts.
 * @param input Run id, candidate id and decision details.
 * @returns The written path, entry and artifact, or `null` when the run has no relationship
 *   artifacts.
 * @throws Error when the candidate id does not exist in the run.
 */
export async function writeLocalScanRelationshipReviewDecision(
  project: KloLocalProject,
  input: WriteLocalScanRelationshipReviewDecisionInput,
): Promise<WriteLocalScanRelationshipReviewDecisionResult | null> {
  const { runId, candidateId } = input;

  const artifacts = await readLocalScanRelationshipArtifacts(project, runId);
  if (!artifacts) {
    return null;
  }

  const candidate = allCandidateEdges(artifacts).find((edge) => edge.id === candidateId);
  if (!candidate) {
    throw new Error(`Relationship candidate "${candidateId}" was not found in scan run "${runId}"`);
  }

  const decidedAt = input.decidedAt ?? new Date().toISOString();
  // Identifiers come from the artifacts that were read, not from the caller's input.
  const provenance = {
    connectionId: artifacts.connectionId,
    runId: artifacts.runId,
    syncId: artifacts.syncId,
  };

  const path = reviewDecisionPath(artifacts.paths.relationships);
  const existing = await readExistingDecisions(project, path, { ...provenance, generatedAt: decidedAt });

  const decision = decisionEntry({
    candidate,
    ...provenance,
    decision: input.decision,
    reviewer: input.reviewer,
    note: input.note,
    decidedAt,
  });
  const artifact: KloRelationshipReviewDecisionArtifact = {
    ...provenance,
    generatedAt: decidedAt,
    decisions: upsertDecision(existing.decisions, decision),
  };

  const serialized = `${JSON.stringify(artifact, null, 2)}\n`;
  const message = `scan(live-database): record relationship review decision runId=${input.runId}`;
  await project.fileStore.writeFile(path, serialized, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, message);

  return { path, decision, artifact };
}

View file

@ -0,0 +1,108 @@
import { describe, expect, it } from 'vitest';
import {
calibrateWeightsFromSyntheticFixtures,
defaultKloRelationshipScoreWeights,
normalizeKloRelationshipScoreWeights,
scoreKloRelationshipCandidate,
type KloRelationshipSignalVector,
} from './relationship-scoring.js';
/**
 * Test helper: returns a baseline signal vector with the given fields overridden.
 *
 * @param overrides Signal values replacing the baseline.
 */
function signals(overrides: Partial<KloRelationshipSignalVector> = {}): KloRelationshipSignalVector {
  const baseline: KloRelationshipSignalVector = {
    nameSimilarity: 0.5,
    typeCompatibility: 1,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.5,
    profileNullRate: 0.5,
    structuralPrior: 0.5,
  };
  return { ...baseline, ...overrides };
}
// Behavioural tests for the relationship scoring module: candidate scoring, weight
// normalisation, default weights and calibration from synthetic fixtures.
describe('relationship scoring', () => {
// A weakly named candidate with strong value/profile evidence must outscore a well-named
// candidate with weak evidence.
it('scores stronger evidence higher without hard-gating on names', () => {
const weakNameStrongProfile = scoreKloRelationshipCandidate(
signals({
nameSimilarity: 0.05,
typeCompatibility: 1,
valueOverlap: 0.7,
profileUniqueness: 1,
profileNullRate: 1,
structuralPrior: 0.7,
}),
);
const strongNameWeakProfile = scoreKloRelationshipCandidate(
signals({
nameSimilarity: 0.95,
typeCompatibility: 1,
valueOverlap: 0,
profileUniqueness: 0.3,
profileNullRate: 0.4,
structuralPrior: 0.5,
}),
);
expect(weakNameStrongProfile.score).toBeGreaterThan(strongNameWeakProfile.score);
expect(weakNameStrongProfile.contributions.profileUniqueness).toBeGreaterThan(0);
expect(weakNameStrongProfile.contributions.nameSimilarity).toBeLessThan(0.02);
});
// Negative and non-finite weights become 0, omitted keys count as 0, and the rest is rescaled
// to sum to 1.
it('normalizes partial and invalid weights into a usable vector', () => {
const weights = normalizeKloRelationshipScoreWeights({
nameSimilarity: 3,
typeCompatibility: -1,
valueOverlap: Number.POSITIVE_INFINITY,
profileUniqueness: 1,
});
const total = Object.values(weights).reduce((sum, value) => sum + value, 0);
expect(total).toBeCloseTo(1, 6);
expect(weights.nameSimilarity).toBeGreaterThan(weights.profileUniqueness);
expect(weights.typeCompatibility).toBe(0);
expect(weights.valueOverlap).toBe(0);
});
// Each call must hand out a fresh object with equal content.
it('returns deterministic defaults as a defensive copy', () => {
const first = defaultKloRelationshipScoreWeights();
const second = defaultKloRelationshipScoreWeights();
expect(first).toEqual(second);
expect(first).not.toBe(second);
expect(Object.values(first).reduce((sum, value) => sum + value, 0)).toBeCloseTo(1, 6);
});
// Any observation whose origin is not 'synthetic' makes calibration throw.
it('calibrates only from synthetic observations', () => {
expect(() =>
calibrateWeightsFromSyntheticFixtures([
{
fixtureId: 'chinook_with_declared_metadata',
origin: 'public',
expectedRelationship: true,
signals: signals({ nameSimilarity: 1 }),
},
]),
).toThrow(/synthetic/i);
});
// Signals that separate the positive from the negative fixture should receive more weight than
// signals that are identical in both.
it('calibrates deterministic weights from positive and negative synthetic observations', () => {
const weights = calibrateWeightsFromSyntheticFixtures([
{
fixtureId: 'synthetic_positive',
origin: 'synthetic',
expectedRelationship: true,
signals: signals({ nameSimilarity: 0.8, valueOverlap: 0.9, profileUniqueness: 1, profileNullRate: 1 }),
},
{
fixtureId: 'synthetic_negative',
origin: 'synthetic',
expectedRelationship: false,
signals: signals({ nameSimilarity: 0.2, valueOverlap: 0.1, profileUniqueness: 0.4, profileNullRate: 0.5 }),
},
]);
expect(Object.values(weights).reduce((sum, value) => sum + value, 0)).toBeCloseTo(1, 6);
expect(weights.valueOverlap).toBeGreaterThan(weights.structuralPrior);
expect(weights.profileUniqueness).toBeGreaterThan(weights.embeddingSimilarity);
});
});

View file

@ -0,0 +1,155 @@
/**
 * Names of all scoring signals. The array order is the iteration order used when weights are
 * normalised, contributions are summed and weights are calibrated.
 */
export const KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS = [
'nameSimilarity',
'typeCompatibility',
'valueOverlap',
'embeddingSimilarity',
'profileUniqueness',
'profileNullRate',
'structuralPrior',
] as const;
/** Union of the signal names listed in `KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS`. */
export type KloRelationshipScoreSignal = (typeof KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS)[number];
/** Where a calibration fixture comes from; calibration accepts only `'synthetic'` observations. */
export type KloRelationshipFixtureOrigin = 'synthetic' | 'public' | 'customer';
/**
 * One value per scoring signal for a relationship candidate. Scoring clamps every value to
 * [0, 1] (non-finite values become 0) and rounds it to three decimals before use.
 */
export interface KloRelationshipSignalVector {
nameSimilarity: number;
typeCompatibility: number;
valueOverlap: number;
embeddingSimilarity: number;
profileUniqueness: number;
profileNullRate: number;
structuralPrior: number;
}
/** One numeric weight per scoring signal. */
export type KloRelationshipScoreWeights = Record<KloRelationshipScoreSignal, number>;
/**
 * Result of scoring a candidate.
 *
 * - `score`: final confidence, clamped to [0, 1] and rounded to three decimals.
 * - `signals`: the sanitised input signals.
 * - `weights`: the normalised weights that were applied.
 * - `contributions`: per-signal product of signal and weight, rounded to six decimals.
 */
export interface KloRelationshipScoreBreakdown {
score: number;
signals: KloRelationshipSignalVector;
weights: KloRelationshipScoreWeights;
contributions: KloRelationshipScoreWeights;
}
/**
 * A labelled example used for weight calibration: the signals observed for a candidate and
 * whether a relationship was expected (`expectedRelationship`). `fixtureId` and `origin` are
 * quoted in the error raised for non-synthetic observations.
 */
export interface KloRelationshipScoringCalibrationObservation {
fixtureId: string;
origin: KloRelationshipFixtureOrigin;
expectedRelationship: boolean;
signals: KloRelationshipSignalVector;
}
// Built-in weights (they add up to 1). Used as the default argument for normalisation and
// scoring, as the fallback result when no usable weights are available, and as a small prior
// during calibration.
const DEFAULT_WEIGHTS: KloRelationshipScoreWeights = {
nameSimilarity: 0.24,
typeCompatibility: 0.1,
valueOverlap: 0.22,
embeddingSimilarity: 0.1,
profileUniqueness: 0.22,
profileNullRate: 0.08,
structuralPrior: 0.04,
};
/**
 * Restricts a value to the range [0, 1].
 *
 * @param value Any number, including NaN and the infinities.
 * @returns `value` limited to [0, 1]; 0 for NaN and for positive or negative infinity.
 */
function clampScore(value: number): number {
  return Number.isFinite(value) ? Math.min(1, Math.max(0, value)) : 0;
}
/**
 * Clamps a value to [0, 1] and rounds it to three decimal places.
 *
 * @param value Raw score.
 * @returns The clamped score with at most three decimals.
 */
function roundScore(value: number): number {
  const bounded = clampScore(value);
  return Number(bounded.toFixed(3));
}
/**
 * Returns a copy of the signal vector in which every signal is clamped to [0, 1] and rounded
 * to three decimals.
 *
 * @param signals Raw signal values.
 * @returns A new, sanitised vector; the input is not modified.
 */
function sanitizeSignalVector(signals: KloRelationshipSignalVector): KloRelationshipSignalVector {
  const {
    nameSimilarity,
    typeCompatibility,
    valueOverlap,
    embeddingSimilarity,
    profileUniqueness,
    profileNullRate,
    structuralPrior,
  } = signals;
  return {
    nameSimilarity: roundScore(nameSimilarity),
    typeCompatibility: roundScore(typeCompatibility),
    valueOverlap: roundScore(valueOverlap),
    embeddingSimilarity: roundScore(embeddingSimilarity),
    profileUniqueness: roundScore(profileUniqueness),
    profileNullRate: roundScore(profileNullRate),
    structuralPrior: roundScore(structuralPrior),
  };
}
/**
 * Returns the built-in score weights.
 *
 * @returns A fresh copy on every call, so callers may modify the result freely.
 */
export function defaultKloRelationshipScoreWeights(): KloRelationshipScoreWeights {
  return Object.assign({}, DEFAULT_WEIGHTS);
}
/**
 * Turns an arbitrary (possibly partial or invalid) set of weights into a vector that sums to 1.
 *
 * Missing, negative and non-finite weights count as 0. If nothing positive remains, the default
 * weights are returned. Extremely large finite weights whose sum overflows to `Infinity` are
 * rescaled by the largest weight first; without that step every normalised weight would be
 * `value / Infinity`, i.e. 0, and the result would sum to 0 rather than 1.
 *
 * @param weights Weights to normalise; defaults to the built-in weights.
 * @returns One non-negative weight per signal, together summing to 1.
 */
export function normalizeKloRelationshipScoreWeights(
  weights: Partial<KloRelationshipScoreWeights> = DEFAULT_WEIGHTS,
): KloRelationshipScoreWeights {
  let entries = KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS.map((key) => {
    const value = weights[key] ?? 0;
    return [key, Number.isFinite(value) ? Math.max(0, value) : 0] as const;
  });
  let total = entries.reduce((sum, [, value]) => sum + value, 0);

  if (total === Number.POSITIVE_INFINITY) {
    // Every entry is finite here, so the sum itself overflowed. Dividing by the largest entry
    // brings all values into [0, 1] while keeping their proportions.
    const largest = Math.max(...entries.map(([, value]) => value));
    entries = entries.map(([key, value]) => [key, value / largest] as const);
    total = entries.reduce((sum, [, value]) => sum + value, 0);
  }

  if (total <= 0) {
    return defaultKloRelationshipScoreWeights();
  }
  return Object.fromEntries(entries.map(([key, value]) => [key, value / total])) as KloRelationshipScoreWeights;
}
/**
 * Scores a relationship candidate from its signal vector.
 *
 * Signals are sanitised and weights normalised; each signal's contribution is its value times
 * its weight, rounded to six decimals. If `typeCompatibility` is 0 or less the score is 0;
 * otherwise it is `0.56 + 0.65 * (sum of contributions)`, clamped to [0, 1] and rounded to
 * three decimals.
 *
 * @param signals Raw signals of the candidate.
 * @param weights Optional weights; defaults to the built-in weights.
 * @returns The score together with the sanitised signals, applied weights and contributions.
 */
export function scoreKloRelationshipCandidate(
  signals: KloRelationshipSignalVector,
  weights: Partial<KloRelationshipScoreWeights> = DEFAULT_WEIGHTS,
): KloRelationshipScoreBreakdown {
  const cleanSignals = sanitizeSignalVector(signals);
  const appliedWeights = normalizeKloRelationshipScoreWeights(weights);

  const contributions = {} as KloRelationshipScoreWeights;
  let weightedSum = 0;
  for (const key of KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS) {
    const contribution = Number((cleanSignals[key] * appliedWeights[key]).toFixed(6));
    contributions[key] = contribution;
    weightedSum += contribution;
  }

  let confidence = 0;
  if (cleanSignals.typeCompatibility > 0) {
    confidence = 0.56 + weightedSum * 0.65;
  }

  return {
    score: roundScore(confidence),
    signals: cleanSignals,
    weights: appliedWeights,
    contributions,
  };
}
/**
 * Mean of one signal across a set of observations, with each value clamped to [0, 1] first.
 *
 * @param observations Observations to average over.
 * @param key Signal to read from each observation.
 * @returns The mean, or 0 when there are no observations.
 */
function averageSignal(
  observations: readonly KloRelationshipScoringCalibrationObservation[],
  key: KloRelationshipScoreSignal,
): number {
  const count = observations.length;
  if (count === 0) {
    return 0;
  }
  let sum = 0;
  for (const observation of observations) {
    sum += clampScore(observation.signals[key]);
  }
  return sum / count;
}
/**
 * Derives score weights from labelled synthetic observations.
 *
 * For every signal the raw weight is the amount by which its average over positive
 * observations exceeds its average over negative ones (never below 0), plus a quarter of the
 * signal's default weight. The raw weights are then normalised to sum to 1.
 *
 * @param observations Labelled observations; every one must have origin `'synthetic'`.
 * @returns Calibrated weights, or the default weights unless at least one positive and one
 *   negative observation are given.
 * @throws Error when any observation is not synthetic.
 */
export function calibrateWeightsFromSyntheticFixtures(
  observations: readonly KloRelationshipScoringCalibrationObservation[],
): KloRelationshipScoreWeights {
  for (const observation of observations) {
    if (observation.origin !== 'synthetic') {
      throw new Error(
        `Relationship score calibration accepts only synthetic fixtures; ${observation.fixtureId} is ${observation.origin}`,
      );
    }
  }

  const positives: KloRelationshipScoringCalibrationObservation[] = [];
  const negatives: KloRelationshipScoringCalibrationObservation[] = [];
  for (const observation of observations) {
    (observation.expectedRelationship ? positives : negatives).push(observation);
  }
  // Also covers an empty observation list.
  if (positives.length === 0 || negatives.length === 0) {
    return defaultKloRelationshipScoreWeights();
  }

  const rawWeights = {} as KloRelationshipScoreWeights;
  for (const key of KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS) {
    const gap = averageSignal(positives, key) - averageSignal(negatives, key);
    rawWeights[key] = Math.max(0, gap) + DEFAULT_WEIGHTS[key] * 0.25;
  }
  return normalizeKloRelationshipScoreWeights(rawWeights);
}

View file

@ -0,0 +1,241 @@
import type { KloLocalProject } from '../project/index.js';
import { describe, expect, it, vi } from 'vitest';
import {
adviseLocalRelationshipFeedbackThresholds,
buildKloRelationshipThresholdAdviceReport,
formatKloRelationshipThresholdAdviceMarkdown,
} from './relationship-threshold-advice.js';
import type {
ExportLocalRelationshipFeedbackLabelsResult,
KloRelationshipFeedbackLabel,
} from './relationship-feedback-export.js';
/**
 * Test helper: builds a feedback label from fixed defaults plus the given fields.
 *
 * `confidence` and `fkScore` default to the label's score and `pkScore` to `null`; any field
 * present in `input` replaces the corresponding default.
 *
 * @param input Required `candidateId`, `decision` and `score`, plus optional overrides.
 */
function label(
  input: Partial<KloRelationshipFeedbackLabel> & Pick<KloRelationshipFeedbackLabel, 'candidateId' | 'decision' | 'score'>,
): KloRelationshipFeedbackLabel {
  const { score, pkScore, fkScore } = input;
  const defaults: Omit<KloRelationshipFeedbackLabel, 'candidateId' | 'decision' | 'score'> = {
    schemaVersion: 1,
    previousStatus: 'review',
    connectionId: 'warehouse',
    runId: 'scan-run-a',
    syncId: 'sync-a',
    decidedAt: '2026-05-07T12:00:00.000Z',
    reviewer: 'Andrey',
    note: null,
    relationshipType: 'many_to_one',
    source: 'deterministic_name',
    confidence: score ?? 0,
    pkScore: pkScore ?? null,
    fkScore: fkScore ?? score,
    fromTable: 'public.orders',
    fromColumns: ['customer_id'],
    toTable: 'public.customers',
    toColumns: ['id'],
    reasons: [],
    artifactPath: 'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json',
  };
  return { ...defaults, ...input };
}
/**
 * Test helper: wraps labels in an export result with a matching summary, a fixed timestamp,
 * an "all" filter and no warnings.
 *
 * @param labels Labels to expose; the same array is returned inside the result.
 */
function feedback(labels: KloRelationshipFeedbackLabel[]): ExportLocalRelationshipFeedbackLabelsResult {
  const connectionIds = new Set<string>();
  const runKeys = new Set<string>();
  let accepted = 0;
  let rejected = 0;
  for (const item of labels) {
    if (item.decision === 'accepted') {
      accepted += 1;
    }
    if (item.decision === 'rejected') {
      rejected += 1;
    }
    connectionIds.add(item.connectionId);
    runKeys.add(`${item.connectionId}:${item.runId}`);
  }

  return {
    generatedAt: '2026-05-07T13:00:00.000Z',
    filters: { connectionId: null, decision: 'all' },
    summary: {
      total: labels.length,
      accepted,
      rejected,
      connections: connectionIds.size,
      runs: runKeys.size,
    },
    labels,
    warnings: [],
  };
}
// Behavioural tests for the threshold advice report: candidate evaluation, label-count gates,
// quality gates, the exporter wrapper and the text formatter.
describe('relationship threshold advice', () => {
// With accept=0.9 the rejected label scored 0.88 lands in the review band, so the accepted band
// is pure; with accept=0.85 it lands in the accepted band and precision drops below the gate.
it('selects the highest-quality threshold candidate when enough labels exist', () => {
const report = buildKloRelationshipThresholdAdviceReport(
feedback([
label({
candidateId: 'orders:orders.customer_id->customers:customers.id',
decision: 'accepted',
score: 0.91,
pkScore: 0.97,
fkScore: 0.91,
}),
label({
candidateId: 'orders:orders.account_id->accounts:accounts.id',
decision: 'accepted',
score: 0.61,
pkScore: 0.88,
fkScore: 0.61,
}),
label({
candidateId: 'orders:orders.note_id->notes:notes.id',
decision: 'rejected',
score: 0.21,
pkScore: 0.4,
fkScore: 0.21,
}),
label({
candidateId: 'orders:orders.region_id->regions:regions.id',
decision: 'rejected',
score: 0.88,
pkScore: 0.9,
fkScore: 0.88,
}),
]),
{
acceptThresholds: [0.9, 0.85],
reviewThresholds: [0.55],
minTotalLabels: 4,
minAcceptedLabels: 2,
minRejectedLabels: 2,
minAcceptedBandPrecision: 0.75,
minAcceptedOrReviewRecall: 0.75,
minRejectedBandPrecision: 0.75,
},
);
expect(report.status).toBe('ready');
expect(report.summary).toMatchObject({
totalLabels: 4,
scoredLabels: 4,
acceptedLabels: 2,
rejectedLabels: 2,
eligibleCandidates: 1,
});
expect(report.recommended).toMatchObject({
acceptThreshold: 0.9,
reviewThreshold: 0.55,
eligible: true,
acceptedBandPrecision: 1,
acceptedRecall: 0.5,
acceptedOrReviewRecall: 1,
rejectedBandPrecision: 1,
rejectedRecall: 1,
falseAcceptedRejectedLabels: 0,
falseRejectedAcceptedLabels: 0,
});
expect(report.candidates.map((candidate) => [candidate.acceptThreshold, candidate.reviewThreshold, candidate.eligible])).toEqual([
[0.9, 0.55, true],
[0.85, 0.55, false],
]);
});
// Too few labels: no recommendation is made, but the evaluated candidates stay in the report.
it('reports insufficient labels without hiding evaluated candidates', () => {
const report = buildKloRelationshipThresholdAdviceReport(
feedback([
label({ candidateId: 'orders:orders.customer_id->customers:customers.id', decision: 'accepted', score: 0.91 }),
label({ candidateId: 'orders:orders.note_id->notes:notes.id', decision: 'rejected', score: 0.21 }),
]),
{
acceptThresholds: [0.9],
reviewThresholds: [0.55],
minTotalLabels: 10,
minAcceptedLabels: 5,
minRejectedLabels: 5,
},
);
expect(report.status).toBe('insufficient_labels');
expect(report.recommended).toBeNull();
expect(report.summary).toMatchObject({
totalLabels: 2,
scoredLabels: 2,
acceptedLabels: 1,
rejectedLabels: 1,
eligibleCandidates: 1,
});
expect(report.reasons).toEqual([
'Need at least 10 scored labels; found 2.',
'Need at least 5 accepted labels; found 1.',
'Need at least 5 rejected labels; found 1.',
]);
expect(report.candidates).toHaveLength(1);
});
// Label counts pass, but the accepted band holds one accepted and one rejected label
// (precision 0.5), which is below the 0.9 gate.
it('reports no eligible thresholds when label counts pass but quality gates fail', () => {
const report = buildKloRelationshipThresholdAdviceReport(
feedback([
label({ candidateId: 'a', decision: 'accepted', score: 0.92 }),
label({ candidateId: 'b', decision: 'accepted', score: 0.58 }),
label({ candidateId: 'c', decision: 'rejected', score: 0.91 }),
label({ candidateId: 'd', decision: 'rejected', score: 0.2 }),
]),
{
acceptThresholds: [0.9],
reviewThresholds: [0.55],
minTotalLabels: 4,
minAcceptedLabels: 2,
minRejectedLabels: 2,
minAcceptedBandPrecision: 0.9,
},
);
expect(report.status).toBe('no_eligible_thresholds');
expect(report.recommended).toBeNull();
expect(report.reasons).toEqual(['No threshold candidate met the precision and recall gates.']);
expect(report.candidates[0]).toMatchObject({
acceptThreshold: 0.9,
reviewThreshold: 0.55,
eligible: false,
acceptedBandPrecision: 0.5,
});
});
// The wrapper must call the (injected) exporter with decision 'all' and pass its warnings through.
it('wraps the feedback exporter and preserves warnings', async () => {
const project = { projectDir: '/tmp/klo-project' } as KloLocalProject;
const exportLocalRelationshipFeedbackLabels = vi.fn(async () => ({
...feedback([]),
warnings: [
{
path: 'raw-sources/broken/live-database/sync/enrichment/relationship-review-decisions.json',
message: 'Unexpected token',
},
],
}));
const report = await adviseLocalRelationshipFeedbackThresholds(project, {
connectionId: 'warehouse',
exportLocalRelationshipFeedbackLabels,
minTotalLabels: 1,
});
expect(exportLocalRelationshipFeedbackLabels).toHaveBeenCalledWith(project, {
connectionId: 'warehouse',
decision: 'all',
});
expect(report.warnings).toEqual([
{
path: 'raw-sources/broken/live-database/sync/enrichment/relationship-review-decisions.json',
message: 'Unexpected token',
},
]);
});
// Spot-checks fragments of the formatted report rather than the full text.
it('formats a stable human-readable report', () => {
const report = buildKloRelationshipThresholdAdviceReport(
feedback([
label({ candidateId: 'orders:orders.customer_id->customers:customers.id', decision: 'accepted', score: 0.91 }),
label({ candidateId: 'orders:orders.account_id->accounts:accounts.id', decision: 'accepted', score: 0.61 }),
label({ candidateId: 'orders:orders.note_id->notes:notes.id', decision: 'rejected', score: 0.21 }),
label({ candidateId: 'orders:orders.region_id->regions:regions.id', decision: 'rejected', score: 0.88 }),
]),
{
acceptThresholds: [0.9],
reviewThresholds: [0.55],
minTotalLabels: 4,
minAcceptedLabels: 2,
minRejectedLabels: 2,
minAcceptedBandPrecision: 0.75,
},
);
expect(formatKloRelationshipThresholdAdviceMarkdown(report)).toContain('KLO relationship threshold advice');
expect(formatKloRelationshipThresholdAdviceMarkdown(report)).toContain('Status: ready');
expect(formatKloRelationshipThresholdAdviceMarkdown(report)).toContain('Recommended: accept=0.90 review=0.55');
expect(formatKloRelationshipThresholdAdviceMarkdown(report)).toContain('acceptedPrecision=1.000');
});
});

View file

@ -0,0 +1,335 @@
import type { KloLocalProject } from '../project/index.js';
import {
exportLocalRelationshipFeedbackLabels,
type ExportLocalRelationshipFeedbackLabelsInput,
type ExportLocalRelationshipFeedbackLabelsResult,
type KloRelationshipFeedbackExportWarning,
type KloRelationshipFeedbackLabel,
} from './relationship-feedback-export.js';
import type { KloResolvedRelationshipStatus } from './relationship-graph-resolver.js';
// Threshold grids evaluated when the caller does not supply its own. Only pairs whose accept
// threshold is strictly greater than the review threshold are evaluated.
const DEFAULT_ACCEPT_THRESHOLDS = [0.95, 0.9, 0.85, 0.8, 0.75] as const;
const DEFAULT_REVIEW_THRESHOLDS = [0.65, 0.6, 0.55, 0.5, 0.45] as const;
// Band a score falls into for a given threshold pair; `prediction` returns 'accepted',
// 'review' or 'rejected'.
type AdvicePredictedStatus = KloResolvedRelationshipStatus;
/**
 * Outcome of a threshold advice run:
 * - `'insufficient_labels'`: at least one label-count gate failed;
 * - `'ready'`: label counts pass and at least one candidate is eligible;
 * - `'no_eligible_thresholds'`: label counts pass but no candidate is eligible.
 */
export type KloRelationshipThresholdAdviceStatus = 'ready' | 'insufficient_labels' | 'no_eligible_thresholds';
/**
 * Optional settings for building a threshold advice report. Omitted fields use these defaults:
 * the built-in threshold grids, `minTotalLabels` 20, `minAcceptedLabels` 5,
 * `minRejectedLabels` 5, `minAcceptedBandPrecision` 0.9, `minAcceptedOrReviewRecall` 0.8 and
 * `minRejectedBandPrecision` 0.8.
 */
export interface BuildKloRelationshipThresholdAdviceReportInput {
acceptThresholds?: readonly number[];
reviewThresholds?: readonly number[];
minTotalLabels?: number;
minAcceptedLabels?: number;
minRejectedLabels?: number;
minAcceptedBandPrecision?: number;
minAcceptedOrReviewRecall?: number;
minRejectedBandPrecision?: number;
}
/**
 * Input for `adviseLocalRelationshipFeedbackThresholds`: the exporter's input without its
 * `decision` field (the advice function always requests `'all'`), the report settings, and an
 * optional replacement for the label exporter that is used instead of the imported one when
 * provided.
 */
export interface AdviseLocalRelationshipFeedbackThresholdsInput
extends Omit<ExportLocalRelationshipFeedbackLabelsInput, 'decision'>,
BuildKloRelationshipThresholdAdviceReportInput {
exportLocalRelationshipFeedbackLabels?: typeof exportLocalRelationshipFeedbackLabels;
}
/**
 * Evaluation of one (accept, review) threshold pair against the scored labels.
 *
 * A label is placed in the accepted band when its score is at least `acceptThreshold`, in the
 * review band when it is at least `reviewThreshold`, and in the rejected band otherwise.
 *
 * - `predictedAccepted` / `predictedReview` / `predictedRejected`: labels per band.
 * - `acceptedBandPrecision`: share of the accepted band whose decision is accepted.
 * - `acceptedRecall`: share of accepted decisions that fall in the accepted band.
 * - `acceptedOrReviewRecall`: share of accepted decisions that do not fall in the rejected band.
 * - `rejectedBandPrecision`: share of the rejected band whose decision is rejected.
 * - `rejectedRecall`: share of rejected decisions that do not fall in the accepted band
 *   (labels in the review band count as recalled).
 * - `falseAcceptedRejectedLabels`: rejected decisions inside the accepted band.
 * - `falseRejectedAcceptedLabels`: accepted decisions inside the rejected band.
 * - `eligible`: both the accepted and the rejected band are non-empty and the three
 *   precision/recall gates are met.
 *
 * Ratios are rounded to three decimals and are `null` when their denominator is 0.
 */
export interface KloRelationshipThresholdAdviceCandidate {
acceptThreshold: number;
reviewThreshold: number;
eligible: boolean;
predictedAccepted: number;
predictedReview: number;
predictedRejected: number;
acceptedBandPrecision: number | null;
acceptedRecall: number | null;
acceptedOrReviewRecall: number | null;
rejectedBandPrecision: number | null;
rejectedRecall: number | null;
falseAcceptedRejectedLabels: number;
falseRejectedAcceptedLabels: number;
}
/**
 * Complete threshold advice report.
 *
 * - `generatedAt`, `filters` and `warnings` are taken over from the exported feedback.
 * - `gates` echoes the label-count and quality gates that were applied.
 * - `summary` counts labels (accepted/rejected counts cover scored labels only) and candidates.
 * - `recommended` is the best-ranked eligible candidate when `status` is `'ready'`, else `null`.
 * - `candidates` lists every evaluated threshold pair, best first.
 * - `reasons` explains a status other than `'ready'`.
 */
export interface KloRelationshipThresholdAdviceReport {
generatedAt: string;
filters: ExportLocalRelationshipFeedbackLabelsResult['filters'];
status: KloRelationshipThresholdAdviceStatus;
gates: {
minTotalLabels: number;
minAcceptedLabels: number;
minRejectedLabels: number;
minAcceptedBandPrecision: number;
minAcceptedOrReviewRecall: number;
minRejectedBandPrecision: number;
};
summary: {
totalLabels: number;
scoredLabels: number;
unscoredLabels: number;
acceptedLabels: number;
rejectedLabels: number;
evaluatedCandidates: number;
eligibleCandidates: number;
};
recommended: KloRelationshipThresholdAdviceCandidate | null;
candidates: KloRelationshipThresholdAdviceCandidate[];
reasons: string[];
warnings: KloRelationshipFeedbackExportWarning[];
}
// Report settings after defaults have been applied; both threshold lists are sorted in
// descending order.
interface ResolvedAdviceInput {
acceptThresholds: number[];
reviewThresholds: number[];
minTotalLabels: number;
minAcceptedLabels: number;
minRejectedLabels: number;
minAcceptedBandPrecision: number;
minAcceptedOrReviewRecall: number;
minRejectedBandPrecision: number;
}
/**
 * Applies defaults to the report settings.
 *
 * @param input Caller-supplied settings; not modified.
 * @returns Settings with every field set; both threshold lists are copies sorted from highest
 *   to lowest.
 */
function resolveInput(input: BuildKloRelationshipThresholdAdviceReportInput): ResolvedAdviceInput {
  const descending = (values: readonly number[]): number[] =>
    Array.from(values).sort((left, right) => right - left);

  const acceptThresholds = descending(input.acceptThresholds ?? DEFAULT_ACCEPT_THRESHOLDS);
  const reviewThresholds = descending(input.reviewThresholds ?? DEFAULT_REVIEW_THRESHOLDS);

  return {
    acceptThresholds,
    reviewThresholds,
    minTotalLabels: input.minTotalLabels ?? 20,
    minAcceptedLabels: input.minAcceptedLabels ?? 5,
    minRejectedLabels: input.minRejectedLabels ?? 5,
    minAcceptedBandPrecision: input.minAcceptedBandPrecision ?? 0.9,
    minAcceptedOrReviewRecall: input.minAcceptedOrReviewRecall ?? 0.8,
    minRejectedBandPrecision: input.minRejectedBandPrecision ?? 0.8,
  };
}
/**
 * Rounds a metric to three decimal places.
 *
 * @param value Metric to round.
 */
function roundMetric(value: number): number {
  const thousandths = Math.round(value * 1000);
  return thousandths / 1000;
}
/**
 * Divides two counts and rounds the quotient to three decimals.
 *
 * @param numerator Count in the numerator.
 * @param denominator Count in the denominator.
 * @returns The rounded quotient, or `null` when the denominator is 0.
 */
function ratio(numerator: number, denominator: number): number | null {
  if (denominator === 0) {
    return null;
  }
  return roundMetric(numerator / denominator);
}
/**
 * Classifies a score against a pair of thresholds.
 *
 * @param score Score of the label.
 * @param acceptThreshold Minimum score for the accepted band.
 * @param reviewThreshold Minimum score for the review band.
 * @returns `'accepted'`, `'review'` or `'rejected'`; both thresholds are inclusive.
 */
function prediction(score: number, acceptThreshold: number, reviewThreshold: number): AdvicePredictedStatus {
  const clearsAccept = score >= acceptThreshold;
  const clearsReview = score >= reviewThreshold;
  return clearsAccept ? 'accepted' : clearsReview ? 'review' : 'rejected';
}
/**
 * Checks a metric against a gate.
 *
 * @param value Metric, or `null` when it could not be computed.
 * @param minimum Gate the metric has to reach.
 * @returns `true` only when the metric exists and is at least `minimum`.
 */
function isMetricAtLeast(value: number | null, minimum: number): boolean {
  if (value === null) {
    return false;
  }
  return value >= minimum;
}
/**
 * Evaluates one (accept, review) threshold pair against the labels.
 *
 * Labels without a score are ignored. Every scored label is placed in the accepted, review or
 * rejected band, and the band is compared with the reviewer's decision to obtain precision,
 * recall and error counts. The pair is eligible when the accepted and the rejected band are
 * both non-empty and accepted-band precision, accepted-or-review recall and rejected-band
 * precision all reach their gates.
 *
 * @param labels Feedback labels to evaluate.
 * @param acceptThreshold Minimum score for the accepted band.
 * @param reviewThreshold Minimum score for the review band.
 * @param gates Resolved quality gates.
 * @returns Metrics and eligibility for the threshold pair.
 */
function thresholdCandidate(
  labels: readonly KloRelationshipFeedbackLabel[],
  acceptThreshold: number,
  reviewThreshold: number,
  gates: ResolvedAdviceInput,
): KloRelationshipThresholdAdviceCandidate {
  const scored = labels.filter(
    (label): label is KloRelationshipFeedbackLabel & { score: number } => label.score !== null,
  );

  // Totals per reviewer decision.
  let acceptedDecisions = 0;
  let rejectedDecisions = 0;
  // Band sizes.
  let acceptedBand = 0;
  let reviewBand = 0;
  let rejectedBand = 0;
  // Band/decision combinations.
  let acceptedBandAccepted = 0;
  let acceptedBandRejected = 0;
  let rejectedBandRejected = 0;
  let rejectedBandAccepted = 0;
  // Decisions that did not end up in the opposite band.
  let acceptedOutsideRejectedBand = 0;
  let rejectedOutsideAcceptedBand = 0;

  for (const label of scored) {
    const status = prediction(label.score, acceptThreshold, reviewThreshold);
    const wasAccepted = label.decision === 'accepted';
    const wasRejected = label.decision === 'rejected';

    if (wasAccepted) {
      acceptedDecisions += 1;
    }
    if (wasRejected) {
      rejectedDecisions += 1;
    }

    if (status === 'accepted') {
      acceptedBand += 1;
      if (wasAccepted) {
        acceptedBandAccepted += 1;
      }
      if (wasRejected) {
        acceptedBandRejected += 1;
      }
    } else if (status === 'review') {
      reviewBand += 1;
    } else if (status === 'rejected') {
      rejectedBand += 1;
      if (wasRejected) {
        rejectedBandRejected += 1;
      }
      if (wasAccepted) {
        rejectedBandAccepted += 1;
      }
    }

    if (wasAccepted && status !== 'rejected') {
      acceptedOutsideRejectedBand += 1;
    }
    if (wasRejected && status !== 'accepted') {
      rejectedOutsideAcceptedBand += 1;
    }
  }

  const acceptedBandPrecision = ratio(acceptedBandAccepted, acceptedBand);
  const acceptedOrReviewRecall = ratio(acceptedOutsideRejectedBand, acceptedDecisions);
  const rejectedBandPrecision = ratio(rejectedBandRejected, rejectedBand);

  const bandsPopulated = acceptedBand > 0 && rejectedBand > 0;
  const gatesMet =
    isMetricAtLeast(acceptedBandPrecision, gates.minAcceptedBandPrecision) &&
    isMetricAtLeast(acceptedOrReviewRecall, gates.minAcceptedOrReviewRecall) &&
    isMetricAtLeast(rejectedBandPrecision, gates.minRejectedBandPrecision);

  return {
    acceptThreshold,
    reviewThreshold,
    eligible: bandsPopulated && gatesMet,
    predictedAccepted: acceptedBand,
    predictedReview: reviewBand,
    predictedRejected: rejectedBand,
    acceptedBandPrecision,
    acceptedRecall: ratio(acceptedBandAccepted, acceptedDecisions),
    acceptedOrReviewRecall,
    rejectedBandPrecision,
    rejectedRecall: ratio(rejectedOutsideAcceptedBand, rejectedDecisions),
    falseAcceptedRejectedLabels: acceptedBandRejected,
    falseRejectedAcceptedLabels: rejectedBandAccepted,
  };
}
/**
 * Sort key for an optional metric: a missing metric ranks below every real value.
 *
 * @param value Metric or `null`.
 * @returns The metric itself, or -1 when it is missing.
 */
function metricRank(value: number | null): number {
  return value == null ? -1 : value;
}
/**
 * Orders threshold candidates from best to worst without modifying the input.
 *
 * Ranking criteria, each one used only to break ties of the previous ones: eligible before
 * ineligible, higher accepted-band precision, higher accepted-or-review recall, higher
 * rejected-band precision, higher accept threshold, higher review threshold. Missing metrics
 * rank lowest.
 *
 * @param candidates Candidates to order.
 * @returns A new, sorted array.
 */
function sortCandidates(
  candidates: readonly KloRelationshipThresholdAdviceCandidate[],
): KloRelationshipThresholdAdviceCandidate[] {
  type Candidate = KloRelationshipThresholdAdviceCandidate;
  const criteria: ReadonlyArray<(left: Candidate, right: Candidate) => number> = [
    (left, right) => Number(right.eligible) - Number(left.eligible),
    (left, right) => metricRank(right.acceptedBandPrecision) - metricRank(left.acceptedBandPrecision),
    (left, right) => metricRank(right.acceptedOrReviewRecall) - metricRank(left.acceptedOrReviewRecall),
    (left, right) => metricRank(right.rejectedBandPrecision) - metricRank(left.rejectedBandPrecision),
    (left, right) => right.acceptThreshold - left.acceptThreshold,
    (left, right) => right.reviewThreshold - left.reviewThreshold,
  ];

  const ranked = Array.from(candidates);
  ranked.sort((left, right) => {
    let verdict = 0;
    for (const compare of criteria) {
      verdict = compare(left, right);
      // Any truthy difference decides; 0 (and NaN) fall through to the next criterion.
      if (verdict) {
        break;
      }
    }
    return verdict;
  });
  return ranked;
}
/**
 * Checks the label-count gates and explains every gate that is not met.
 *
 * Only labels with a score are counted.
 *
 * @param labels Feedback labels.
 * @param gates Resolved settings containing the minimum counts.
 * @returns One message per failed gate (total, accepted, rejected — in that order); empty when
 *   all gates pass.
 */
function labelGateReasons(labels: readonly KloRelationshipFeedbackLabel[], gates: ResolvedAdviceInput): string[] {
  let scoredCount = 0;
  let acceptedCount = 0;
  let rejectedCount = 0;
  for (const label of labels) {
    if (label.score === null) {
      continue;
    }
    scoredCount += 1;
    if (label.decision === 'accepted') {
      acceptedCount += 1;
    }
    if (label.decision === 'rejected') {
      rejectedCount += 1;
    }
  }

  const checks: ReadonlyArray<readonly [found: number, minimum: number, noun: string]> = [
    [scoredCount, gates.minTotalLabels, 'scored'],
    [acceptedCount, gates.minAcceptedLabels, 'accepted'],
    [rejectedCount, gates.minRejectedLabels, 'rejected'],
  ];
  return checks
    .filter(([found, minimum]) => found < minimum)
    .map(([found, minimum, noun]) => `Need at least ${minimum} ${noun} labels; found ${found}.`);
}
/**
 * Builds a threshold advice report from exported review feedback.
 *
 * Every accept threshold is combined with every review threshold below it, each pair is
 * evaluated against the labels, and the results are ranked. The report status is
 * `'insufficient_labels'` when a label-count gate fails, `'ready'` when at least one pair is
 * eligible, and `'no_eligible_thresholds'` otherwise. A recommendation is made only for
 * `'ready'`.
 *
 * @param feedback Exported feedback labels with their filters and warnings.
 * @param input Optional threshold grids and gates; defaults apply to omitted fields.
 * @returns The complete advice report.
 */
export function buildKloRelationshipThresholdAdviceReport(
  feedback: ExportLocalRelationshipFeedbackLabelsResult,
  input: BuildKloRelationshipThresholdAdviceReportInput = {},
): KloRelationshipThresholdAdviceReport {
  const gates = resolveInput(input);
  const { labels } = feedback;

  // Evaluate the grid: accept thresholds outermost, review thresholds innermost.
  const evaluated: KloRelationshipThresholdAdviceCandidate[] = [];
  for (const acceptThreshold of gates.acceptThresholds) {
    for (const reviewThreshold of gates.reviewThresholds) {
      if (acceptThreshold > reviewThreshold) {
        evaluated.push(thresholdCandidate(labels, acceptThreshold, reviewThreshold, gates));
      }
    }
  }
  const candidates = sortCandidates(evaluated);
  const eligibleCandidates = candidates.filter((candidate) => candidate.eligible);
  const labelReasons = labelGateReasons(labels, gates);

  let status: KloRelationshipThresholdAdviceStatus;
  let reasons: string[];
  if (labelReasons.length > 0) {
    status = 'insufficient_labels';
    reasons = labelReasons;
  } else if (eligibleCandidates.length > 0) {
    status = 'ready';
    reasons = [];
  } else {
    status = 'no_eligible_thresholds';
    reasons = ['No threshold candidate met the precision and recall gates.'];
  }

  const scored = labels.filter((label) => label.score !== null);
  const acceptedCount = scored.filter((label) => label.decision === 'accepted').length;
  const rejectedCount = scored.filter((label) => label.decision === 'rejected').length;

  return {
    generatedAt: feedback.generatedAt,
    filters: feedback.filters,
    status,
    gates: {
      minTotalLabels: gates.minTotalLabels,
      minAcceptedLabels: gates.minAcceptedLabels,
      minRejectedLabels: gates.minRejectedLabels,
      minAcceptedBandPrecision: gates.minAcceptedBandPrecision,
      minAcceptedOrReviewRecall: gates.minAcceptedOrReviewRecall,
      minRejectedBandPrecision: gates.minRejectedBandPrecision,
    },
    summary: {
      totalLabels: labels.length,
      scoredLabels: scored.length,
      unscoredLabels: labels.length - scored.length,
      acceptedLabels: acceptedCount,
      rejectedLabels: rejectedCount,
      evaluatedCandidates: candidates.length,
      eligibleCandidates: eligibleCandidates.length,
    },
    recommended: status === 'ready' ? eligibleCandidates[0] ?? null : null,
    candidates,
    reasons,
    warnings: [...feedback.warnings],
  };
}
/**
 * Exports the project's review feedback and turns it into a threshold advice report.
 *
 * Labels of every decision are requested (`decision: 'all'`); only `connectionId` is forwarded
 * from `input` to the exporter. The whole `input` is then used as the report settings.
 *
 * @param project Project to export feedback from.
 * @param input Connection filter, report settings and an optional replacement exporter.
 * @returns The advice report built from the exported labels.
 */
export async function adviseLocalRelationshipFeedbackThresholds(
  project: KloLocalProject,
  input: AdviseLocalRelationshipFeedbackThresholdsInput = {},
): Promise<KloRelationshipThresholdAdviceReport> {
  const loadLabels = input.exportLocalRelationshipFeedbackLabels ?? exportLocalRelationshipFeedbackLabels;
  const exported = await loadLabels(project, { connectionId: input.connectionId, decision: 'all' });
  return buildKloRelationshipThresholdAdviceReport(exported, input);
}
/**
 * Renders a metric for the text report.
 *
 * @param value Metric or `null`.
 * @returns The metric with three decimals, or `'n/a'` when it is missing.
 */
function formatMetric(value: number | null): string {
  if (value === null) {
    return 'n/a';
  }
  return value.toFixed(3);
}
/**
 * Renders one threshold candidate as a single line of `key=value` pairs.
 *
 * Thresholds are printed with two decimals, metrics with three (or `n/a`), counts as they are.
 *
 * @param candidate Candidate to render.
 */
function candidateLine(candidate: KloRelationshipThresholdAdviceCandidate): string {
  const fields: ReadonlyArray<readonly [string, string | number]> = [
    ['accept', candidate.acceptThreshold.toFixed(2)],
    ['review', candidate.reviewThreshold.toFixed(2)],
    ['eligible', candidate.eligible ? 'yes' : 'no'],
    ['acceptedPrecision', formatMetric(candidate.acceptedBandPrecision)],
    ['acceptedRecall', formatMetric(candidate.acceptedRecall)],
    ['acceptedOrReviewRecall', formatMetric(candidate.acceptedOrReviewRecall)],
    ['rejectedPrecision', formatMetric(candidate.rejectedBandPrecision)],
    ['rejectedRecall', formatMetric(candidate.rejectedRecall)],
    ['falseAcceptedRejected', candidate.falseAcceptedRejectedLabels],
    ['falseRejectedAccepted', candidate.falseRejectedAcceptedLabels],
  ];
  return fields.map(([key, value]) => `${key}=${value}`).join(' ');
}
/**
 * Renders an advice report as human-readable text.
 *
 * The output starts with a fixed header (title, timestamp, filter, status, label counts,
 * gates, candidate counts, recommendation) and is followed by optional "Reasons",
 * "Top candidates" and "Warnings" sections. At most five candidates and five warnings are
 * listed. The text ends with a newline.
 *
 * @param report Report to render.
 */
export function formatKloRelationshipThresholdAdviceMarkdown(report: KloRelationshipThresholdAdviceReport): string {
  const { summary, gates, recommended } = report;

  const recommendation = recommended
    ? `accept=${recommended.acceptThreshold.toFixed(2)} review=${recommended.reviewThreshold.toFixed(2)}`
    : 'none';

  const header = [
    'KLO relationship threshold advice',
    `Generated: ${report.generatedAt}`,
    `Filter connection: ${report.filters.connectionId ?? 'all'}`,
    `Status: ${report.status}`,
    `Labels: total=${summary.totalLabels} scored=${summary.scoredLabels} accepted=${summary.acceptedLabels} rejected=${summary.rejectedLabels}`,
    `Gates: minTotal=${gates.minTotalLabels} minAccepted=${gates.minAcceptedLabels} minRejected=${gates.minRejectedLabels} acceptedPrecision=${gates.minAcceptedBandPrecision.toFixed(3)} acceptedOrReviewRecall=${gates.minAcceptedOrReviewRecall.toFixed(3)} rejectedPrecision=${gates.minRejectedBandPrecision.toFixed(3)}`,
    `Evaluated candidates: ${summary.evaluatedCandidates}`,
    `Eligible candidates: ${summary.eligibleCandidates}`,
    `Recommended: ${recommendation}`,
  ];

  // A section is a blank line, a title and one bullet per item; empty sections are omitted.
  const section = (title: string, items: readonly string[]): string[] =>
    items.length > 0 ? ['', title, ...items.map((item) => ` - ${item}`)] : [];

  const lines = [
    ...header,
    ...section('Reasons', report.reasons),
    ...section(
      'Top candidates',
      report.candidates.slice(0, 5).map((candidate) => candidateLine(candidate)),
    ),
    ...section(
      'Warnings',
      report.warnings.slice(0, 5).map((warning) => `${warning.path}: ${warning.message}`),
    ),
  ];
  return `${lines.join('\n')}\n`;
}

View file

@ -0,0 +1,492 @@
import Database from 'better-sqlite3';
import { afterEach, describe, expect, it } from 'vitest';
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
import { generateKloRelationshipDiscoveryCandidates } from './relationship-candidates.js';
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
import { profileKloRelationshipSchema } from './relationship-profiling.js';
import { validateKloRelationshipDiscoveryCandidates } from './relationship-validation.js';
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanContext } from './types.js';
/**
 * Test executor that runs queries against a throwaway in-memory SQLite database and
 * counts how many queries were issued through {@link InMemorySqliteExecutor.executeReadOnly}.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /**
   * Runs `input.sql` and converts the row objects into the header/row-array result shape.
   * Headers are taken from the keys of the first row, so an empty result has no headers.
   */
  executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const headers = Object.keys(records[0] ?? {});
    const cells = records.map((record) => headers.map((header) => record[header]));
    return Promise.resolve({
      headers,
      rows: cells,
      totalRows: records.length,
      rowCount: records.length,
    });
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
/**
 * Builds an enriched-column fixture for `tableId.name`.
 *
 * Defaults describe a nullable, non-key INTEGER column; any property present in `overrides`
 * replaces the corresponding default because the overrides are spread last.
 */
function column(tableId: string, name: string, overrides: Partial<KloEnrichedColumn> = {}): KloEnrichedColumn {
  const defaults = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
/**
 * Builds an enabled enriched-table fixture named `name`.
 *
 * Every supplied column is copied and re-pointed at this table: its `tableId` becomes `name`
 * and its `tableRef` becomes the table's own reference object.
 */
function table(name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns = columns.map((item) => ({ ...item, tableId: name, tableRef: ref }));
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
/**
 * Builds an enriched-schema fixture for the `warehouse` connection with no declared relationships.
 *
 * When `tables` is omitted the default layout is used: `accounts (id, name)`,
 * `users (id, account_id)` and `invoices (id, account_id)`, where every integer column is NOT NULL.
 */
function schema(tables?: KloEnrichedTable[]): KloEnrichedSchema {
  const requiredInteger = (tableId: string, name: string): KloEnrichedColumn =>
    column(tableId, name, { nullable: false });
  const buildDefaultTables = (): KloEnrichedTable[] => [
    table('accounts', [
      requiredInteger('accounts', 'id'),
      column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
    ]),
    table('users', [requiredInteger('users', 'id'), requiredInteger('users', 'account_id')]),
    table('invoices', [requiredInteger('invoices', 'id'), requiredInteger('invoices', 'account_id')]),
  ];
  return {
    connectionId: 'warehouse',
    tables: tables ?? buildDefaultTables(),
    relationships: [],
  };
}
describe('relationship validation', () => {
let executor: InMemorySqliteExecutor | null = null;
afterEach(() => {
executor?.close();
executor = null;
});
it('accepts a relationship-discovery candidate with unique parent values and full source coverage', async () => {
// Fixture: every users.account_id value (1, 2, 3) exists in accounts.id, so the users -> accounts
// candidate is expected to pass every validation gate. The invoices rows (one dangling 999) are
// present in the database but filtered out of the candidate list below.
executor = new InMemorySqliteExecutor();
executor.db.exec(`
CREATE TABLE accounts (id INTEGER, name TEXT);
CREATE TABLE users (id INTEGER, account_id INTEGER);
CREATE TABLE invoices (id INTEGER, account_id INTEGER);
INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 999);
`);
const testSchema = schema();
const profiles = await profileKloRelationshipSchema({
connectionId: 'warehouse',
driver: 'sqlite',
schema: testSchema,
executor,
ctx: { runId: 'validate-test' },
});
// Only validate candidates whose source table is `users`.
const candidates = generateKloRelationshipDiscoveryCandidates(testSchema).filter(
(candidate) => candidate.from.table.name === 'users',
);
const validated = await validateKloRelationshipDiscoveryCandidates({
connectionId: 'warehouse',
driver: 'sqlite',
candidates,
profiles,
executor,
ctx: { runId: 'validate-test' },
});
expect(validated).toHaveLength(1);
expect(validated[0]).toMatchObject({
from: { table: { name: 'users' }, columns: ['account_id'] },
to: { table: { name: 'accounts' }, columns: ['id'] },
status: 'accepted',
score: expect.any(Number),
validation: {
targetUniqueness: 1,
sourceCoverage: 1,
violationCount: 0,
violationRatio: 0,
reasons: expect.arrayContaining(['validation_passed']),
},
});
// 0.85 matches the default accept threshold used by the validator.
expect(validated[0]?.score).toBeGreaterThanOrEqual(0.85);
});
it('rejects a candidate with missing parent values and records the deterministic reason', async () => {
// Fixture: invoices.account_id holds 1, 999 and 1000 while accounts.id only holds 1 and 2, so
// one of three distinct source values is covered and two are violations.
executor = new InMemorySqliteExecutor();
executor.db.exec(`
CREATE TABLE accounts (id INTEGER, name TEXT);
CREATE TABLE users (id INTEGER, account_id INTEGER);
CREATE TABLE invoices (id INTEGER, account_id INTEGER);
INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 999), (22, 1000);
`);
const testSchema = schema();
const profiles = await profileKloRelationshipSchema({
connectionId: 'warehouse',
driver: 'sqlite',
schema: testSchema,
executor,
ctx: { runId: 'validate-test' },
});
// Only validate candidates whose source table is `invoices`.
const candidates = generateKloRelationshipDiscoveryCandidates(testSchema).filter(
(candidate) => candidate.from.table.name === 'invoices',
);
const validated = await validateKloRelationshipDiscoveryCandidates({
connectionId: 'warehouse',
driver: 'sqlite',
candidates,
profiles,
executor,
ctx: { runId: 'validate-test' },
settings: {
minSourceCoverage: 0.9,
maxViolationRatio: 0.01,
},
});
expect(validated).toHaveLength(1);
expect(validated[0]).toMatchObject({
from: { table: { name: 'invoices' }, columns: ['account_id'] },
to: { table: { name: 'accounts' }, columns: ['id'] },
status: 'rejected',
validation: {
sourceCoverage: 1 / 3,
violationCount: 2,
violationRatio: 2 / 3,
reasons: expect.arrayContaining(['low_source_coverage', 'excessive_violations']),
},
});
});
it('keeps over-budget candidates review-only without executing coverage SQL for them', async () => {
// With a validation budget of 1, only the highest-confidence candidate (users, forced to 0.99)
// should trigger a coverage query; the invoices candidate (0.5) must come back as `review`.
executor = new InMemorySqliteExecutor();
executor.db.exec(`
CREATE TABLE accounts (id INTEGER, name TEXT);
CREATE TABLE users (id INTEGER, account_id INTEGER);
CREATE TABLE invoices (id INTEGER, account_id INTEGER);
INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 3);
`);
const testSchema = schema();
const profiles = await profileKloRelationshipSchema({
connectionId: 'warehouse',
driver: 'sqlite',
schema: testSchema,
executor,
ctx: { runId: 'validate-budget-profile' },
});
// Reset the counter so only queries issued by validation are counted.
executor.queryCount = 0;
const candidates = generateKloRelationshipDiscoveryCandidates(testSchema).map((candidate) => ({
...candidate,
confidence: candidate.from.table.name === 'users' ? 0.99 : 0.5,
}));
const validated = await validateKloRelationshipDiscoveryCandidates({
connectionId: 'warehouse',
driver: 'sqlite',
candidates,
profiles,
executor,
ctx: { runId: 'validate-budget' },
tableCount: testSchema.tables.length,
settings: {
validationBudget: 1,
},
});
expect(executor.queryCount).toBe(1);
expect(validated).toHaveLength(2);
expect(validated.find((candidate) => candidate.from.table.name === 'users')).toMatchObject({
status: 'accepted',
validation: { reasons: expect.arrayContaining(['validation_passed']) },
});
expect(validated.find((candidate) => candidate.from.table.name === 'invoices')).toMatchObject({
status: 'review',
validation: {
reasons: ['validation_unattempted'],
},
});
});
it('treats validation budget zero as review-only validation without coverage SQL', async () => {
// A budget of 0 must defer every candidate: no coverage query runs and the single
// users -> accounts candidate is reported as `review` with zero checked values.
executor = new InMemorySqliteExecutor();
executor.db.exec(`
CREATE TABLE accounts (id INTEGER, name TEXT);
CREATE TABLE users (id INTEGER, account_id INTEGER);
INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
`);
const testSchema = schema([
table('accounts', [
column('accounts', 'id', { nullable: false }),
column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
]),
table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id', { nullable: false })]),
]);
const profiles = await profileKloRelationshipSchema({
connectionId: 'warehouse',
driver: 'sqlite',
schema: testSchema,
executor,
ctx: { runId: 'validate-zero-budget-profile' },
});
// Reset the counter so only queries issued by validation are counted.
executor.queryCount = 0;
const candidates = generateKloRelationshipDiscoveryCandidates(testSchema);
const validated = await validateKloRelationshipDiscoveryCandidates({
connectionId: 'warehouse',
driver: 'sqlite',
candidates,
profiles,
executor,
ctx: { runId: 'validate-zero-budget' },
tableCount: testSchema.tables.length,
settings: {
validationBudget: 0,
},
});
expect(executor.queryCount).toBe(0);
expect(validated).toHaveLength(1);
expect(validated[0]).toMatchObject({
status: 'review',
score: expect.any(Number),
validation: {
checkedValues: 0,
reasons: ['validation_unattempted'],
},
});
});
it('marks rejected LLM proposals with the spec rejection reason', async () => {
// Fixture: orders.buyer_ref holds 98 and 99, neither of which exists in customers.id, so an
// LLM-proposed orders.buyer_ref -> customers.id relationship must fail validation.
executor = new InMemorySqliteExecutor();
executor.db.exec(`
CREATE TABLE customers (id INTEGER);
CREATE TABLE orders (buyer_ref INTEGER);
INSERT INTO customers (id) VALUES (1), (2);
INSERT INTO orders (buyer_ref) VALUES (98), (99);
`);
const testSchema = schema([
table('customers', [column('customers', 'id', { nullable: false })]),
table('orders', [column('orders', 'buyer_ref')]),
]);
const profiles = await profileKloRelationshipSchema({
connectionId: 'warehouse',
driver: 'sqlite',
schema: testSchema,
executor,
ctx: { runId: 'llm-rejected-validation' },
});
// Generate a base candidate from a schema that names the column `customer_id`, then rewrite
// it below so it points at `buyer_ref` and is tagged as an LLM proposal.
const [candidate] = generateKloRelationshipDiscoveryCandidates(
schema([
table('customers', [column('customers', 'id', { nullable: false })]),
table('orders', [column('orders', 'customer_id')]),
]),
);
if (!candidate) {
throw new Error('Expected base candidate');
}
const llmCandidate = {
...candidate,
id: 'orders:(orders.buyer_ref)->customers:(customers.id)',
from: { ...candidate.from, columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
source: 'llm_proposal' as const,
evidence: {
...candidate.evidence,
reasons: ['llm_proposal'],
llmConfidence: 0.84,
llmRationale: 'Buyer references should map to customers.',
},
};
const [validated] = await validateKloRelationshipDiscoveryCandidates({
connectionId: 'warehouse',
driver: 'sqlite',
candidates: [llmCandidate],
profiles,
executor,
ctx: { runId: 'llm-rejected-validation' },
});
expect(validated?.status).toBe('rejected');
expect(validated?.validation.reasons).toEqual(
expect.arrayContaining(['low_source_coverage', 'llm_proposed_but_validation_failed']),
);
});
it('limits validation query concurrency', async () => {
  // Register the database with the suite-level handle so the afterEach hook closes it even
  // when an assertion or query below throws (previously it was closed only on success).
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
CREATE TABLE accounts (id INTEGER NOT NULL);
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
CREATE TABLE invoices (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
INSERT INTO accounts VALUES (1), (2);
INSERT INTO orders VALUES (10, 1), (11, 2);
INSERT INTO invoices VALUES (20, 1), (21, 2);
`);
  let active = 0;
  let maxActive = 0;
  // Wrapper that tracks how many queries are in flight at once. Coverage queries are delayed
  // by 10 ms so overlapping executions would be observable.
  const throttled = {
    executeReadOnly: async (input: KloReadOnlyQueryInput, ctx: KloScanContext) => {
      active += 1;
      maxActive = Math.max(maxActive, active);
      try {
        await new Promise((resolve) => setTimeout(resolve, input.sql.includes('WITH child_values') ? 10 : 0));
        return await sqlite.executeReadOnly(input, ctx);
      } finally {
        // Always release the slot, otherwise a failing query would inflate later readings.
        active -= 1;
      }
    },
  };
  const testSchema = schema([
    table('accounts', [column('accounts', 'id', { nullable: false })]),
    table('orders', [column('orders', 'id', { nullable: false }), column('orders', 'account_id')]),
    table('invoices', [column('invoices', 'id', { nullable: false }), column('invoices', 'account_id')]),
  ]);
  const profiles = await profileKloRelationshipSchema({
    connectionId: 'warehouse',
    driver: 'sqlite',
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validation-concurrency-profile' },
  });
  const candidates = generateKloRelationshipDiscoveryCandidates(testSchema);
  await validateKloRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    driver: 'sqlite',
    candidates,
    profiles,
    executor: throttled,
    ctx: { runId: 'validation-concurrency' },
    settings: { concurrency: 1 },
  });
  expect(maxActive).toBe(1);
});
it('pins column_suffix_match validation scoring for plan-code suffix candidates', async () => {
// Uses a hand-built candidate, hand-built profiles and a stub executor (no database) so the
// expected score depends only on the candidate confidence and the stubbed coverage numbers.
const candidate = {
id: 'mart:(current_plan_code)->plans:(plan_code)',
from: {
tableId: 'mart-account-segments-id',
columnIds: ['current-plan-code-col'],
table: { catalog: null, db: null, name: 'mart_account_segments' },
columns: ['current_plan_code'],
},
to: {
tableId: 'plans-id',
columnIds: ['plan-code-col'],
table: { catalog: null, db: null, name: 'stg_plans' },
columns: ['plan_code'],
},
relationshipType: 'many_to_one' as const,
confidence: 0.902,
source: 'column_suffix_match' as const,
status: 'review' as const,
evidence: {
sourceColumnBase: 'current_plan',
targetTableBase: 'plan',
targetColumnBase: 'plan_code',
targetKeyScore: 0.86,
nameScore: 0.78,
reasons: ['column_suffix_match', 'profile_unique_target'],
},
};
// Profile keys follow the `<table name>.<column name>` form.
const profiles = {
connectionId: 'warehouse',
driver: 'sqlite',
sqlAvailable: true,
queryCount: 0,
tables: [],
warnings: [],
columns: {
'mart_account_segments.current_plan_code': {
table: { catalog: null, db: null, name: 'mart_account_segments' },
column: 'current_plan_code',
nativeType: 'TEXT',
normalizedType: 'text',
rowCount: 4,
nullCount: 0,
distinctCount: 4,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['basic', 'enterprise', 'free', 'pro'],
minTextLength: 4,
maxTextLength: 10,
},
'stg_plans.plan_code': {
table: { catalog: null, db: null, name: 'stg_plans' },
column: 'plan_code',
nativeType: 'TEXT',
normalizedType: 'text',
rowCount: 4,
nullCount: 0,
distinctCount: 4,
uniquenessRatio: 1,
nullRate: 0,
sampleValues: ['basic', 'enterprise', 'free', 'pro'],
minTextLength: 4,
maxTextLength: 10,
},
},
} satisfies KloRelationshipProfileArtifact;
// Stub executor: reports 4 distinct child values, all found among 4 parent values.
const executor = {
async executeReadOnly() {
return {
headers: ['child_distinct', 'parent_distinct', 'overlap', 'violation_count'],
rows: [[4, 4, 4, 0]],
rowCount: 1,
totalRows: 1,
};
},
};
const [validated] = await validateKloRelationshipDiscoveryCandidates({
connectionId: 'warehouse',
driver: 'sqlite',
candidates: [candidate],
profiles,
executor,
ctx: { runId: 'rule-b-validation-score' },
});
// Pinned value: a candidate confidence of 0.902 with perfect uniqueness, coverage and no violations.
expect(validated).toMatchObject({
status: 'accepted',
score: 0.98,
validation: {
targetUniqueness: 1,
sourceCoverage: 1,
violationRatio: 0,
reasons: ['validation_passed'],
},
});
});
});

View file

@ -0,0 +1,370 @@
import type { KloRelationshipEndpoint } from './enrichment-types.js';
import { applyKloRelationshipValidationBudget, type KloRelationshipValidationBudget } from './relationship-budget.js';
import type { KloRelationshipDiscoveryCandidate } from './relationship-candidates.js';
import {
formatKloRelationshipTableRef,
type KloRelationshipProfileArtifact,
type KloRelationshipReadOnlyExecutor,
quoteKloRelationshipIdentifier,
} from './relationship-profiling.js';
import type { KloConnectionDriver, KloQueryResult, KloScanContext } from './types.js';
/** Outcome assigned to a relationship candidate by validation. */
export type KloValidatedRelationshipStatus = 'accepted' | 'review' | 'rejected';
/** Tunable thresholds and limits for relationship validation. */
export interface KloRelationshipValidationSettings {
/** Minimum score for a candidate without failed gates to be `accepted`. */
acceptThreshold: number;
/** Minimum score for a candidate that is not accepted to remain in `review`. */
reviewThreshold: number;
/** Target uniqueness ratio below which `low_target_uniqueness` is recorded. */
minTargetUniqueness: number;
/** Source coverage below which `low_source_coverage` is recorded. */
minSourceCoverage: number;
/** Violation ratio above which `excessive_violations` is recorded. */
maxViolationRatio: number;
/** Cap on the number of distinct source values sampled by the coverage query. */
maxDistinctSourceValues: number;
/** Maximum number of candidates validated at the same time. */
concurrency: number;
/** Optional budget forwarded to `applyKloRelationshipValidationBudget`; deferred candidates are returned for review. */
validationBudget?: KloRelationshipValidationBudget;
}
/** Measurements and reason codes recorded for a validated (or deferred) candidate. */
export interface KloRelationshipValidationEvidence {
/** Uniqueness ratio of the target column, taken from its profile. */
targetUniqueness: number;
/** Share of checked distinct source values that were found in the target column. */
sourceCoverage: number;
/** Number of checked distinct source values with no matching target value. */
violationCount: number;
/** `violationCount` divided by the number of checked distinct source values. */
violationRatio: number;
/** Null rate of the source column, taken from its profile. */
sourceNullRate: number;
/** Null rate of the target column, taken from its profile. */
targetNullRate: number;
/** Distinct non-null source values seen by the coverage query (or the profiled count when no query ran). */
childDistinct: number;
/** Distinct non-null target values seen by the coverage query (or the profiled count when no query ran). */
parentDistinct: number;
/** Number of checked distinct source values that matched a target value. */
overlap: number;
/** Number of distinct source values actually checked; 0 when no coverage query ran. */
checkedValues: number;
/** Reason codes such as `validation_passed`, `low_source_coverage` or `validation_unattempted`. */
reasons: string[];
}
/** A discovery candidate whose status has been replaced by the validation outcome, plus its score and evidence. */
export interface KloValidatedRelationshipDiscoveryCandidate
extends Omit<KloRelationshipDiscoveryCandidate, 'status'> {
/** Validation outcome. */
status: KloValidatedRelationshipStatus;
/** Validation score, rounded to three decimals. */
score: number;
/** Measurements and reason codes backing `status`. */
validation: KloRelationshipValidationEvidence;
}
/** Input for `validateKloRelationshipDiscoveryCandidates`. */
export interface ValidateKloRelationshipDiscoveryCandidatesInput {
/** Connection the coverage queries are issued against. */
connectionId: string;
/** Driver used to pick identifier quoting and row-limit syntax. */
driver: KloConnectionDriver;
/** Candidates to validate; results are returned in the same order. */
candidates: readonly KloRelationshipDiscoveryCandidate[];
/** Column profiles, looked up by `<table name>.<column name>`. */
profiles: KloRelationshipProfileArtifact;
/** Query executor; when null every candidate is returned for review without validation. */
executor: KloRelationshipReadOnlyExecutor | null;
/** Scan context forwarded to the executor. */
ctx: KloScanContext;
/** Table count forwarded to the validation budget; when omitted and no budget is set, all candidates are validated. */
tableCount?: number;
/** Overrides for the default validation settings. */
settings?: Partial<KloRelationshipValidationSettings>;
}
/** Settings applied when the caller does not override them. */
const DEFAULT_SETTINGS: KloRelationshipValidationSettings = {
  acceptThreshold: 0.85,
  reviewThreshold: 0.55,
  minTargetUniqueness: 0.9,
  minSourceCoverage: 0.9,
  maxViolationRatio: 0.01,
  maxDistinctSourceValues: 10000,
  concurrency: 4,
};

/**
 * Combines caller-supplied overrides with {@link DEFAULT_SETTINGS}.
 *
 * Overrides whose value is `undefined` are ignored. A plain object spread would copy an
 * explicit `undefined` over the default (for example `{ concurrency: undefined }`), leaving
 * the validator with unusable thresholds or limits.
 *
 * @param settings - Partial overrides, or `undefined` to use the defaults as-is.
 * @returns A complete settings object.
 */
function mergeSettings(
  settings: Partial<KloRelationshipValidationSettings> | undefined,
): KloRelationshipValidationSettings {
  const definedOverrides = Object.fromEntries(
    Object.entries(settings ?? {}).filter(([, value]) => value !== undefined),
  ) as Partial<KloRelationshipValidationSettings>;
  return { ...DEFAULT_SETTINGS, ...definedOverrides };
}
/** Builds the `<table>.<column>` key used to look up a column profile. */
function profileKey(table: string, column: string): string {
  const separator = '.';
  return table + separator + column;
}
/**
 * Returns the first column of a relationship endpoint.
 *
 * @throws Error when the endpoint has no first column (or it is an empty string).
 */
function singleRelationshipColumn(endpointValue: KloRelationshipEndpoint): string {
  const [firstColumn] = endpointValue.columns;
  if (firstColumn) {
    return firstColumn;
  }
  throw new Error(`Expected relationship endpoint ${endpointValue.table.name} to contain one column`);
}
/** Returns the position of `header` in the result headers (case-insensitive), or -1 when absent. */
function headerIndex(result: KloQueryResult, header: string): number {
  const wanted = header.toLowerCase();
  return result.headers.findIndex((candidate) => candidate.toLowerCase() === wanted);
}

/** Returns the first row of the result, or an empty row when the result has none. */
function firstRow(result: KloQueryResult): unknown[] {
  return result.rows[0] ?? [];
}

/**
 * Reads the first-row value under `header` as a finite number.
 *
 * Numbers, bigints and non-blank numeric strings are converted; anything else — a missing
 * header or row, `null`, a blank string, or a value that does not convert to a finite
 * number — yields 0. Returning 0 instead of NaN keeps the derived ratios and scores well-defined.
 */
function numberAt(result: KloQueryResult, header: string): number {
  const value = firstRow(result)[headerIndex(result, header)];
  let parsed: number;
  if (typeof value === 'number') {
    parsed = value;
  } else if (typeof value === 'bigint') {
    parsed = Number(value);
  } else if (typeof value === 'string' && value.trim() !== '') {
    parsed = Number(value);
  } else {
    return 0;
  }
  return Number.isFinite(parsed) ? parsed : 0;
}
/**
 * Returns the trailing ` LIMIT n` clause for drivers that support it.
 * SQL Server gets an empty string because its row cap is expressed with `TOP` instead.
 * The limit is floored and clamped to at least 1.
 */
function limitSql(driver: KloConnectionDriver, limit: number): string {
  const usesTopSyntax = driver === 'sqlserver';
  return usesTopSyntax ? '' : ` LIMIT ${Math.max(1, Math.floor(limit))}`;
}
/**
 * Returns the ` TOP (n)` modifier for SQL Server and an empty string for every other driver.
 * The limit is floored and clamped to at least 1.
 */
function topSql(driver: KloConnectionDriver, limit: number): string {
  if (driver !== 'sqlserver') {
    return '';
  }
  const rowCap = Math.max(1, Math.floor(limit));
  return ` TOP (${rowCap})`;
}
/**
 * Builds the single-row coverage query for one candidate.
 *
 * The query compares the distinct non-null child values (capped at `maxDistinctSourceValues`
 * via `TOP` or `LIMIT`, depending on the driver) with the distinct non-null parent values and
 * returns `child_distinct`, `parent_distinct`, `overlap` and `violation_count`.
 *
 * NOTE(review): tables are referenced by name only (catalog and db are passed as null) —
 * confirm this is intended for schema-qualified tables.
 */
function buildCoverageSql(input: {
  driver: KloConnectionDriver;
  childTable: string;
  childColumn: string;
  parentTable: string;
  parentColumn: string;
  maxDistinctSourceValues: number;
}): string {
  const { driver, maxDistinctSourceValues } = input;
  const tableRef = (name: string): string => formatKloRelationshipTableRef(driver, { catalog: null, db: null, name });
  const quoted = (identifier: string): string => quoteKloRelationshipIdentifier(driver, identifier);

  const childTable = tableRef(input.childTable);
  const parentTable = tableRef(input.parentTable);
  const childColumn = quoted(input.childColumn);
  const parentColumn = quoted(input.parentColumn);
  const limit = limitSql(driver, maxDistinctSourceValues);
  const top = topSql(driver, maxDistinctSourceValues);

  const fragments: string[] = [];
  fragments.push('WITH child_values AS (');
  fragments.push(
    `SELECT DISTINCT${top} ${childColumn} AS value FROM ${childTable} WHERE ${childColumn} IS NOT NULL${limit}`,
  );
  fragments.push('), parent_values AS (');
  fragments.push(`SELECT DISTINCT ${parentColumn} AS value FROM ${parentTable} WHERE ${parentColumn} IS NOT NULL`);
  fragments.push(')');
  fragments.push('SELECT');
  fragments.push('(SELECT COUNT(*) FROM child_values) AS child_distinct,');
  fragments.push('(SELECT COUNT(*) FROM parent_values) AS parent_distinct,');
  // A child value with a matching parent row counts as overlap, otherwise as a violation.
  fragments.push('SUM(CASE WHEN parent_values.value IS NOT NULL THEN 1 ELSE 0 END) AS overlap,');
  fragments.push('SUM(CASE WHEN parent_values.value IS NULL THEN 1 ELSE 0 END) AS violation_count');
  fragments.push('FROM child_values');
  fragments.push('LEFT JOIN parent_values ON child_values.value = parent_values.value');
  return fragments.join(' ');
}
/**
 * Combines candidate confidence and validation measurements into one score.
 *
 * Weights: 20% candidate confidence, 30% target uniqueness, 40% source coverage and
 * 10% absence of violations. The result is capped at 1 and rounded to three decimals.
 */
function score(input: {
  candidateConfidence: number;
  targetUniqueness: number;
  sourceCoverage: number;
  violationRatio: number;
}): number {
  const { candidateConfidence, targetUniqueness, sourceCoverage, violationRatio } = input;
  // A violation ratio above 1 must not push the score negative.
  const violationScore = Math.max(0, 1 - violationRatio);
  const weighted =
    0.2 * candidateConfidence + 0.3 * targetUniqueness + 0.4 * sourceCoverage + 0.1 * violationScore;
  const capped = Math.min(1, weighted);
  return Number(capped.toFixed(3));
}
/**
 * Maps a score and its reason codes to a validation status.
 *
 * Any failed gate (`low_target_uniqueness`, `low_source_coverage`, `excessive_violations`)
 * rejects the candidate regardless of score. Otherwise the score is compared with the accept
 * threshold, then the review threshold; anything lower is rejected.
 *
 * The accept branch used to re-check the three failed-gate reasons, but those conditions were
 * unreachable after the early rejection above and have been removed.
 */
function statusFor(input: {
  score: number;
  reasons: readonly string[];
  settings: KloRelationshipValidationSettings;
}): KloValidatedRelationshipStatus {
  const rejectingReasons = ['low_target_uniqueness', 'low_source_coverage', 'excessive_violations'];
  if (rejectingReasons.some((reason) => input.reasons.includes(reason))) {
    return 'rejected';
  }
  if (input.score >= input.settings.acceptThreshold) {
    return 'accepted';
  }
  if (input.score >= input.settings.reviewThreshold) {
    return 'review';
  }
  return 'rejected';
}
/**
 * Maps `inputs` through an async function with at most `concurrency` calls in flight.
 *
 * Results are returned in input order. The concurrency is floored and clamped to at least 1;
 * a value that is not a number (for example an unset setting) falls back to 1 instead of
 * starting no workers and returning an array of holes. When one call rejects, the returned
 * promise rejects with that error and the remaining workers stop picking up new inputs, so
 * no further work is started whose result would be discarded.
 *
 * @param inputs - Items to process.
 * @param concurrency - Maximum number of concurrent `mapOne` calls.
 * @param mapOne - Async mapper applied to each item.
 * @returns The mapped values, index-aligned with `inputs`.
 */
async function mapWithConcurrency<TInput, TOutput>(
  inputs: readonly TInput[],
  concurrency: number,
  mapOne: (input: TInput) => Promise<TOutput>,
): Promise<TOutput[]> {
  const requested = Math.floor(concurrency);
  const safeConcurrency = Number.isNaN(requested) ? 1 : Math.max(1, requested);
  const outputs: TOutput[] = new Array(inputs.length);
  let nextIndex = 0;
  let failed = false;

  async function worker(): Promise<void> {
    while (!failed && nextIndex < inputs.length) {
      // Claim the index synchronously so no two workers process the same input.
      const index = nextIndex;
      nextIndex += 1;
      try {
        outputs[index] = await mapOne(inputs[index] as TInput);
      } catch (error: unknown) {
        failed = true;
        throw error;
      }
    }
  }

  await Promise.all(Array.from({ length: Math.min(safeConcurrency, inputs.length) }, () => worker()));
  return outputs;
}
/**
 * Produces a `review` result for a candidate whose coverage query was not run.
 *
 * The score is 60% of the candidate confidence (rounded to three decimals). Coverage figures
 * are zeroed and the violation ratio is set to 1, while uniqueness, null rates and distinct
 * counts are filled from the column profiles when they exist (0 otherwise).
 *
 * @param reason - The single reason code recorded on the result.
 */
function reviewWithoutValidation(
  candidate: KloRelationshipDiscoveryCandidate,
  profiles: KloRelationshipProfileArtifact,
  reason: 'validation_unavailable' | 'profile_unavailable' | 'validation_unattempted',
): KloValidatedRelationshipDiscoveryCandidate {
  const fromColumn = singleRelationshipColumn(candidate.from);
  const toColumn = singleRelationshipColumn(candidate.to);
  const fromProfile = profiles.columns[profileKey(candidate.from.table.name, fromColumn)];
  const toProfile = profiles.columns[profileKey(candidate.to.table.name, toColumn)];

  const discountedScore = Number((candidate.confidence * 0.6).toFixed(3));
  const validation: KloRelationshipValidationEvidence = {
    targetUniqueness: toProfile?.uniquenessRatio ?? 0,
    sourceCoverage: 0,
    violationCount: 0,
    violationRatio: 1,
    sourceNullRate: fromProfile?.nullRate ?? 0,
    targetNullRate: toProfile?.nullRate ?? 0,
    childDistinct: fromProfile?.distinctCount ?? 0,
    parentDistinct: toProfile?.distinctCount ?? 0,
    overlap: 0,
    checkedValues: 0,
    reasons: [reason],
  };
  return { ...candidate, status: 'review', score: discountedScore, validation };
}
/**
 * Validates relationship-discovery candidates against the database.
 *
 * Flow:
 * 1. Without an executor, or when the profiles report SQL as unavailable, every candidate is
 *    returned as `review` with `validation_unavailable`.
 * 2. The validation budget splits candidates into those to validate and those deferred;
 *    deferred candidates are returned as `review` with `validation_unattempted`.
 * 3. Each candidate to validate runs one coverage query (bounded by `settings.concurrency`)
 *    and is scored and classified from the result.
 *
 * @returns One result per input candidate, in the original order.
 * @throws Error when a candidate ends up with no result after budgeting.
 */
export async function validateKloRelationshipDiscoveryCandidates(
  input: ValidateKloRelationshipDiscoveryCandidatesInput,
): Promise<KloValidatedRelationshipDiscoveryCandidate[]> {
  const settings = mergeSettings(input.settings);
  if (!input.executor || !input.profiles.sqlAvailable) {
    return input.candidates.map((candidate) =>
      reviewWithoutValidation(candidate, input.profiles, 'validation_unavailable'),
    );
  }
  const readOnlyExecutor = input.executor;

  const validateOne = async (
    candidate: KloRelationshipDiscoveryCandidate,
  ): Promise<KloValidatedRelationshipDiscoveryCandidate> => {
    const sourceColumn = singleRelationshipColumn(candidate.from);
    const targetColumn = singleRelationshipColumn(candidate.to);
    const sourceTable = candidate.from.table.name;
    const targetTable = candidate.to.table.name;
    const sourceProfile = input.profiles.columns[profileKey(sourceTable, sourceColumn)];
    const targetProfile = input.profiles.columns[profileKey(targetTable, targetColumn)];
    if (!sourceProfile || !targetProfile) {
      return reviewWithoutValidation(candidate, input.profiles, 'profile_unavailable');
    }

    const coverageSql = buildCoverageSql({
      driver: input.driver,
      childTable: sourceTable,
      childColumn: sourceColumn,
      parentTable: targetTable,
      parentColumn: targetColumn,
      maxDistinctSourceValues: settings.maxDistinctSourceValues,
    });
    const result = await readOnlyExecutor.executeReadOnly(
      { connectionId: input.connectionId, sql: coverageSql, maxRows: 1 },
      input.ctx,
    );

    const childDistinct = numberAt(result, 'child_distinct');
    const parentDistinct = numberAt(result, 'parent_distinct');
    const overlap = numberAt(result, 'overlap');
    const violationCount = numberAt(result, 'violation_count');
    // With no source values to check, treat the candidate as uncovered and fully violating.
    const hasSourceValues = childDistinct !== 0;
    const sourceCoverage = hasSourceValues ? overlap / childDistinct : 0;
    const violationRatio = hasSourceValues ? violationCount / childDistinct : 1;
    const targetUniqueness = targetProfile.uniquenessRatio;

    const gates: Array<[boolean, string]> = [
      [targetUniqueness < settings.minTargetUniqueness, 'low_target_uniqueness'],
      [sourceCoverage < settings.minSourceCoverage, 'low_source_coverage'],
      [violationRatio > settings.maxViolationRatio, 'excessive_violations'],
    ];
    const reasons: string[] = gates.filter(([failed]) => failed).map(([, reason]) => reason);
    if (reasons.length === 0) {
      reasons.push('validation_passed');
    }

    const candidateScore = score({
      candidateConfidence: candidate.confidence,
      targetUniqueness,
      sourceCoverage,
      violationRatio,
    });
    const candidateStatus = statusFor({ score: candidateScore, reasons, settings });
    // Flag LLM-proposed relationships that the data did not confirm.
    if (candidateStatus === 'rejected' && candidate.source === 'llm_proposal') {
      reasons.push('llm_proposed_but_validation_failed');
    }

    return {
      ...candidate,
      status: candidateStatus,
      score: candidateScore,
      validation: {
        targetUniqueness,
        sourceCoverage,
        violationCount,
        violationRatio,
        sourceNullRate: sourceProfile.nullRate,
        targetNullRate: targetProfile.nullRate,
        childDistinct,
        parentDistinct,
        overlap,
        checkedValues: childDistinct,
        reasons,
      },
    };
  };

  // Without an explicit budget, validate everything unless a table count was supplied.
  const budgeted = applyKloRelationshipValidationBudget({
    candidates: input.candidates,
    tableCount: input.tableCount ?? 0,
    budget: settings.validationBudget ?? (input.tableCount === undefined ? 'all' : undefined),
    score: (candidate) => candidate.confidence,
  });
  const validated = await mapWithConcurrency(
    budgeted.toValidate.map((entry) => entry.candidate),
    settings.concurrency,
    validateOne,
  );

  // Re-assemble validated and deferred results by their position in the original list.
  const resultsByIndex = new Map<number, KloValidatedRelationshipDiscoveryCandidate>();
  budgeted.toValidate.forEach((entry, position) => {
    const outcome = validated[position];
    if (entry.originalIndex !== undefined && outcome) {
      resultsByIndex.set(entry.originalIndex, outcome);
    }
  });
  for (const entry of budgeted.deferred) {
    resultsByIndex.set(
      entry.originalIndex,
      reviewWithoutValidation(entry.candidate, input.profiles, 'validation_unattempted'),
    );
  }

  return input.candidates.map((_, index) => {
    const outcome = resultsByIndex.get(index);
    if (outcome) {
      return outcome;
    }
    throw new Error(`Missing relationship validation result for candidate at index ${index}`);
  });
}

View file

@ -0,0 +1,237 @@
import { mkdirSync } from 'node:fs';
import { dirname } from 'node:path';
import Database from 'better-sqlite3';
import type {
KloScanEnrichmentCompletedStage,
KloScanEnrichmentFailedStage,
KloScanEnrichmentStageLookup,
KloScanEnrichmentStageRecord,
KloScanEnrichmentStateStore,
} from './enrichment-state.js';
import type { KloScanEnrichmentStage, KloScanMode } from './types.js';
/** Options for constructing a `SqliteLocalScanEnrichmentStateStore`. */
export interface SqliteLocalScanEnrichmentStateStoreOptions {
/** Path of the SQLite database file; its parent directory is created when missing. */
dbPath: string;
}
/** Shape of a row in the `local_scan_enrichment_stages` table. */
interface StageRow {
run_id: string;
connection_id: string;
sync_id: string;
mode: KloScanMode;
stage: KloScanEnrichmentStage;
input_hash: string;
status: 'completed' | 'failed';
// JSON-encoded stage output; NULL for failed stages.
output_json: string | null;
// Failure description; NULL for completed stages.
error_message: string | null;
updated_at: string;
}
/**
 * Converts a database row into a stage record.
 *
 * Completed rows get their `output_json` parsed (a NULL column parses to `null`); every other
 * row is treated as failed, with a fallback message when `error_message` is NULL.
 *
 * @throws SyntaxError when a completed row holds malformed JSON.
 */
function parseStageRow<TOutput = unknown>(row: StageRow): KloScanEnrichmentStageRecord<TOutput> {
  const identity = {
    runId: row.run_id,
    connectionId: row.connection_id,
    syncId: row.sync_id,
    mode: row.mode,
    stage: row.stage,
    inputHash: row.input_hash,
  };
  if (row.status !== 'completed') {
    return {
      ...identity,
      status: 'failed',
      output: null,
      errorMessage: row.error_message ?? 'Unknown enrichment stage failure',
      updatedAt: row.updated_at,
    };
  }
  const output = JSON.parse(row.output_json ?? 'null') as TOutput;
  return {
    ...identity,
    status: 'completed',
    output,
    errorMessage: null,
    updatedAt: row.updated_at,
  };
}
/**
 * Reports whether a run id starts with an ASCII letter or digit and otherwise contains
 * only ASCII letters, digits, `_`, `.` and `-`.
 */
function isSafeRunId(runId: string): boolean {
  const safeRunIdPattern = /^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/;
  return safeRunIdPattern.test(runId);
}
/**
 * Enrichment-stage state store backed by a local SQLite database.
 *
 * Each (run id, stage) pair owns exactly one row; saving a stage again overwrites the
 * previous row, whether it was completed or failed.
 */
export class SqliteLocalScanEnrichmentStateStore implements KloScanEnrichmentStateStore {
  private readonly db: Database.Database;

  /**
   * Opens the database at `options.dbPath`, creating the parent directory, the stage table
   * and its index when they do not exist yet. The journal mode is switched to WAL.
   */
  constructor(options: SqliteLocalScanEnrichmentStateStoreOptions) {
    mkdirSync(dirname(options.dbPath), { recursive: true });
    this.db = new Database(options.dbPath);
    this.db.pragma('journal_mode = WAL');
    this.db.exec(`
CREATE TABLE IF NOT EXISTS local_scan_enrichment_stages (
run_id TEXT NOT NULL,
stage TEXT NOT NULL,
input_hash TEXT NOT NULL,
connection_id TEXT NOT NULL,
sync_id TEXT NOT NULL,
mode TEXT NOT NULL,
status TEXT NOT NULL,
output_json TEXT,
error_message TEXT,
updated_at TEXT NOT NULL,
PRIMARY KEY (run_id, stage)
);
CREATE INDEX IF NOT EXISTS local_scan_enrichment_stages_run_idx
ON local_scan_enrichment_stages (run_id, updated_at, stage);
`);
  }

  /**
   * Looks up a completed stage for the given run, stage and input hash.
   *
   * @returns The completed stage, or `null` when the run id is not a safe identifier or no
   * completed row matches all three keys.
   */
  async findCompletedStage<TOutput = unknown>(
    input: KloScanEnrichmentStageLookup,
  ): Promise<KloScanEnrichmentCompletedStage<TOutput> | null> {
    if (!isSafeRunId(input.runId)) {
      return null;
    }
    const row = this.db
      .prepare(
        `
SELECT *
FROM local_scan_enrichment_stages
WHERE run_id = ?
AND stage = ?
AND input_hash = ?
AND status = 'completed'
`,
      )
      .get(input.runId, input.stage, input.inputHash) as StageRow | undefined;
    if (!row) {
      return null;
    }
    const parsed = parseStageRow<TOutput>(row);
    return parsed.status === 'completed' ? parsed : null;
  }

  /**
   * Inserts or overwrites the row for `(runId, stage)` as a completed stage.
   *
   * The output is stored as JSON. An `undefined` output is stored as JSON `null`, because
   * `JSON.stringify(undefined)` returns `undefined`, which cannot be bound as a SQLite parameter.
   */
  async saveCompletedStage<TOutput = unknown>(
    input: Omit<KloScanEnrichmentCompletedStage<TOutput>, 'status' | 'errorMessage'>,
  ): Promise<void> {
    this.db
      .prepare(
        `
INSERT INTO local_scan_enrichment_stages (
run_id,
stage,
input_hash,
connection_id,
sync_id,
mode,
status,
output_json,
error_message,
updated_at
)
VALUES (
@runId,
@stage,
@inputHash,
@connectionId,
@syncId,
@mode,
'completed',
@outputJson,
NULL,
@updatedAt
)
ON CONFLICT(run_id, stage) DO UPDATE SET
input_hash = excluded.input_hash,
connection_id = excluded.connection_id,
sync_id = excluded.sync_id,
mode = excluded.mode,
status = excluded.status,
output_json = excluded.output_json,
error_message = excluded.error_message,
updated_at = excluded.updated_at
`,
      )
      .run({
        runId: input.runId,
        stage: input.stage,
        inputHash: input.inputHash,
        connectionId: input.connectionId,
        syncId: input.syncId,
        mode: input.mode,
        outputJson: JSON.stringify(input.output ?? null),
        updatedAt: input.updatedAt,
      });
  }

  /**
   * Inserts or overwrites the row for `(runId, stage)` as a failed stage, storing the error
   * message and clearing any previously stored output.
   */
  async saveFailedStage(input: Omit<KloScanEnrichmentFailedStage, 'status' | 'output'>): Promise<void> {
    this.db
      .prepare(
        `
INSERT INTO local_scan_enrichment_stages (
run_id,
stage,
input_hash,
connection_id,
sync_id,
mode,
status,
output_json,
error_message,
updated_at
)
VALUES (
@runId,
@stage,
@inputHash,
@connectionId,
@syncId,
@mode,
'failed',
NULL,
@errorMessage,
@updatedAt
)
ON CONFLICT(run_id, stage) DO UPDATE SET
input_hash = excluded.input_hash,
connection_id = excluded.connection_id,
sync_id = excluded.sync_id,
mode = excluded.mode,
status = excluded.status,
output_json = excluded.output_json,
error_message = excluded.error_message,
updated_at = excluded.updated_at
`,
      )
      .run({
        runId: input.runId,
        stage: input.stage,
        inputHash: input.inputHash,
        connectionId: input.connectionId,
        syncId: input.syncId,
        mode: input.mode,
        errorMessage: input.errorMessage,
        updatedAt: input.updatedAt,
      });
  }

  /**
   * Lists every stored stage of a run, ordered by update time and then stage name.
   *
   * @returns The stage records, or an empty list when the run id is not a safe identifier.
   */
  async listRunStages(runId: string): Promise<KloScanEnrichmentStageRecord[]> {
    if (!isSafeRunId(runId)) {
      return [];
    }
    const rows = this.db
      .prepare(
        `
SELECT *
FROM local_scan_enrichment_stages
WHERE run_id = ?
ORDER BY updated_at ASC, stage ASC
`,
      )
      .all(runId) as StageRow[];
    return rows.map((row) => parseStageRow(row));
  }
}

View file

@ -0,0 +1,24 @@
import { describe, expect, it } from 'vitest';
import { inferKloDimensionType, kloColumnTypeMappingFromNative, normalizeKloNativeType } from './type-normalization.js';
describe('KLO scan type normalization', () => {
// Normalization lower-cases, drops parenthesised arguments and trims; blank input maps to 'unknown'.
it('normalizes native database type strings', () => {
expect(normalizeKloNativeType(' NUMERIC(12, 2) ')).toBe('numeric');
expect(normalizeKloNativeType('TIMESTAMP WITH TIME ZONE')).toBe('timestamp with time zone');
expect(normalizeKloNativeType('')).toBe('unknown');
});
// One representative native type per dimension type.
it('infers dimension types from native types', () => {
expect(inferKloDimensionType('BOOLEAN')).toBe('boolean');
expect(inferKloDimensionType('timestamp with time zone')).toBe('time');
expect(inferKloDimensionType('decimal(10,2)')).toBe('number');
expect(inferKloDimensionType('varchar(255)')).toBe('string');
});
// The mapping combines the normalized type name with the inferred dimension type.
it('builds a complete column type mapping', () => {
expect(kloColumnTypeMappingFromNative('BIGINT')).toEqual({
normalizedType: 'bigint',
dimensionType: 'number',
});
});
});

View file

@ -0,0 +1,32 @@
import type { KloSchemaDimensionType } from './types.js';
/** Pair of type descriptors derived from one native database type string. */
export interface KloColumnTypeMapping {
/** Normalized type name; `kloColumnTypeMappingFromNative` fills it via `normalizeKloNativeType`. */
normalizedType: string;
/** Coarse dimension category; `kloColumnTypeMappingFromNative` fills it via `inferKloDimensionType`. */
dimensionType: KloSchemaDimensionType;
}
/**
 * Normalizes a native database type string so equivalent spellings compare equal.
 *
 * The value is lower-cased, every parenthesized argument list is removed
 * (including nested ones such as `Nullable(Decimal(10, 2))`), runs of
 * whitespace collapse to a single space, and the result is trimmed.
 *
 * @param nativeType - Type string as reported by the data source.
 * @returns The normalized type, or `'unknown'` when nothing remains.
 */
export function normalizeKloNativeType(nativeType: string): string {
  let stripped = nativeType.toLowerCase();
  // Remove innermost groups until the string stops changing. A single
  // `\([^)]*\)` pass ends at the first `)` and leaves a dangling `)` behind
  // whenever argument lists are nested.
  for (;;) {
    const next = stripped.replace(/\([^()]*\)/g, '');
    if (next === stripped) {
      break;
    }
    stripped = next;
  }
  const normalized = stripped.replace(/\s+/g, ' ').trim();
  return normalized.length > 0 ? normalized : 'unknown';
}
// The patterns below are matched against the output of normalizeKloNativeType
// (lower-case, parenthesized arguments removed). They are hoisted so the
// regular expressions are built once rather than on every call.
const KLO_BOOLEAN_TYPE_PATTERN = /\b(?:bool|boolean)\b/;
// Besides the generic names, covers suffixed variants such as timestamptz,
// timestamp_ntz/_ltz/_tz, datetime2, datetimeoffset, datetime64 and date32,
// which `\btimestamp\b` / `\bdate\b` alone do not match.
const KLO_TIME_TYPE_PATTERN =
  /\b(?:date(?:32)?|datetime(?:2|64|offset)?|smalldatetime|time(?:tz)?|timestamp(?:tz|_ntz|_ltz|_tz)?)\b/;
// Besides the generic names, covers sized and prefixed variants such as
// int64, uint32, int8, float64, tinyint, mediumint, byteint and bignumeric.
// `interval` is intentionally not matched: `int` must be a whole word.
const KLO_NUMBER_TYPE_PATTERN =
  /\b(?:u?int(?:2|4|8|16|32|64|128|256)?|integer|bigint|smallint|tinyint|mediumint|byteint|decimal|bigdecimal|numeric|bignumeric|number|float(?:4|8|32|64)?|double|real)\b/;

/**
 * Infers the coarse dimension category of a column from its native type.
 *
 * The native type is normalized first, then checked in order against the
 * boolean, time, and number patterns; the first match wins.
 *
 * @param nativeType - Type string as reported by the data source.
 * @returns `'boolean'`, `'time'`, or `'number'` on a match, otherwise `'string'`.
 */
export function inferKloDimensionType(nativeType: string): KloSchemaDimensionType {
  const normalized = normalizeKloNativeType(nativeType);
  if (KLO_BOOLEAN_TYPE_PATTERN.test(normalized)) {
    return 'boolean';
  }
  if (KLO_TIME_TYPE_PATTERN.test(normalized)) {
    return 'time';
  }
  if (KLO_NUMBER_TYPE_PATTERN.test(normalized)) {
    return 'number';
  }
  return 'string';
}
/**
 * Builds the full type mapping for a native type string by combining its
 * normalized name with its inferred dimension category.
 *
 * @param nativeType - Type string as reported by the data source.
 * @returns The normalized type and dimension type for `nativeType`.
 */
export function kloColumnTypeMappingFromNative(nativeType: string): KloColumnTypeMapping {
  const normalizedType = normalizeKloNativeType(nativeType);
  const dimensionType = inferKloDimensionType(nativeType);
  return { normalizedType, dimensionType };
}

View file

@ -0,0 +1,258 @@
import { describe, expect, it } from 'vitest';
import {
createKloConnectorCapabilities,
type KloEventPropertyDiscovery,
type KloEventPropertyDiscoveryInput,
type KloEventPropertyValuesInput,
type KloEventPropertyValuesResult,
type KloEventStreamDiscoveryPort,
type KloEventTypeDiscovery,
type KloEventTypeDiscoveryInput,
type KloNetworkEndpoint,
type KloNetworkTunnelPort,
type KloQueryResult,
type KloScanConnector,
type KloScanContext,
type KloScanInput,
type KloSchemaSnapshot,
} from './types.js';
// Contract tests for the scan type surface: they pin the capability defaults
// and check that the exported types can describe connectors, event-stream
// discovery ports, query results, and network tunnel ports.
describe('KLO scan contract types', () => {
  // Fixtures reused by the connector-facing cases.
  const scanContext: KloScanContext = { runId: 'scan-run-1' };
  // Returns a fresh table reference each time so expected and actual values
  // never share an object.
  const eventsTable = () => ({ catalog: '157881', db: null, name: 'events' });

  it('defaults to structural-only connector capabilities', () => {
    const defaults = createKloConnectorCapabilities();

    expect(defaults).toEqual({
      structuralIntrospection: true,
      tableSampling: false,
      columnSampling: false,
      columnStats: false,
      readOnlySql: false,
      nestedAnalysis: false,
      eventStreamDiscovery: false,
      formalForeignKeys: false,
      estimatedRowCounts: false,
    });
  });

  it('keeps structural introspection mandatory when optional capabilities are enabled', () => {
    const enabled = createKloConnectorCapabilities({
      tableSampling: true,
      readOnlySql: true,
      eventStreamDiscovery: true,
      estimatedRowCounts: true,
    });

    expect(enabled).toEqual({
      structuralIntrospection: true,
      tableSampling: true,
      columnSampling: false,
      columnStats: false,
      readOnlySql: true,
      nestedAnalysis: false,
      eventStreamDiscovery: true,
      formalForeignKeys: false,
      estimatedRowCounts: true,
    });
  });

  it('describes the connector surface without requiring enrichment methods', async () => {
    const scanInput: KloScanInput = {
      connectionId: 'warehouse',
      driver: 'postgres',
      scope: { schemas: ['public'] },
      mode: 'structural',
    };
    const snapshot: KloSchemaSnapshot = {
      connectionId: 'warehouse',
      driver: 'postgres',
      extractedAt: '2026-04-29T00:00:00.000Z',
      scope: { schemas: ['public'] },
      metadata: { source: 'unit-test' },
      tables: [
        {
          catalog: null,
          db: 'public',
          name: 'orders',
          kind: 'table',
          comment: 'Customer orders',
          estimatedRows: 42,
          columns: [
            {
              name: 'id',
              nativeType: 'integer',
              normalizedType: 'integer',
              dimensionType: 'number',
              nullable: false,
              primaryKey: true,
              comment: 'Primary key',
            },
          ],
          foreignKeys: [],
        },
      ],
    };
    // Only the mandatory members are provided; no sampling or stats methods.
    const connector: KloScanConnector = {
      id: 'test-postgres',
      driver: 'postgres',
      capabilities: createKloConnectorCapabilities({ estimatedRowCounts: true }),
      introspect: async (input: KloScanInput, ctx: KloScanContext) => {
        expect(input.connectionId).toBe('warehouse');
        expect(ctx.runId).toBe('scan-run-1');
        return snapshot;
      },
    };

    await expect(connector.introspect(scanInput, scanContext)).resolves.toEqual(snapshot);
  });

  it('models optional event-stream discovery as a connector capability and port', async () => {
    const discoveredEventTypes: KloEventTypeDiscovery[] = [{ value: '$pageview', count: 42 }];
    const discoveredPropertyKeys: KloEventPropertyDiscovery[] = [{ key: '$browser', count: 31 }];
    const discoveredPropertyValues: KloEventPropertyValuesResult = {
      values: ['Chrome', 'Safari'],
      cardinality: 2,
    };

    // Each port method checks the exact input it receives before answering.
    const discovery: KloEventStreamDiscoveryPort = {
      listEventTypes: async (input: KloEventTypeDiscoveryInput) => {
        expect(input).toEqual({
          connectionId: 'product',
          table: eventsTable(),
          eventColumn: 'event',
          limit: 2,
          minCount: 30,
          lookbackDays: 14,
        });
        return discoveredEventTypes;
      },
      listPropertyKeys: async (input: KloEventPropertyDiscoveryInput) => {
        expect(input).toEqual({
          connectionId: 'product',
          table: eventsTable(),
          jsonColumn: 'properties',
          sampleSize: 1000,
          limit: 5,
          lookbackDays: 7,
        });
        return discoveredPropertyKeys;
      },
      listPropertyValues: async (input: KloEventPropertyValuesInput) => {
        expect(input).toEqual({
          connectionId: 'product',
          table: eventsTable(),
          jsonColumn: 'properties',
          propertyKey: '$browser',
          limit: 3,
          maxCardinality: 1000,
          lookbackDays: 30,
        });
        return discoveredPropertyValues;
      },
    };

    const connector: KloScanConnector = {
      id: 'posthog:product',
      driver: 'posthog',
      capabilities: createKloConnectorCapabilities({ eventStreamDiscovery: true }),
      eventStreamDiscovery: discovery,
      introspect: async () => ({
        connectionId: 'product',
        driver: 'posthog',
        extractedAt: '2026-04-29T00:00:00.000Z',
        scope: { catalogs: ['157881'] },
        metadata: {},
        tables: [],
      }),
    };

    const port = connector.eventStreamDiscovery;

    await expect(
      port?.listEventTypes(
        {
          connectionId: 'product',
          table: eventsTable(),
          eventColumn: 'event',
          limit: 2,
          minCount: 30,
          lookbackDays: 14,
        },
        scanContext,
      ),
    ).resolves.toEqual([{ value: '$pageview', count: 42 }]);

    await expect(
      port?.listPropertyKeys(
        {
          connectionId: 'product',
          table: eventsTable(),
          jsonColumn: 'properties',
          sampleSize: 1000,
          limit: 5,
          lookbackDays: 7,
        },
        scanContext,
      ),
    ).resolves.toEqual([{ key: '$browser', count: 31 }]);

    await expect(
      port?.listPropertyValues(
        {
          connectionId: 'product',
          table: eventsTable(),
          jsonColumn: 'properties',
          propertyKey: '$browser',
          limit: 3,
          maxCardinality: 1000,
          lookbackDays: 30,
        },
        scanContext,
      ),
    ).resolves.toEqual({ values: ['Chrome', 'Safari'], cardinality: 2 });
  });

  it('keeps read-only query results separate from schema snapshots', () => {
    const queryResult: KloQueryResult = {
      headers: ['id', 'amount'],
      headerTypes: ['integer', 'numeric'],
      rows: [[1, 10.5]],
      totalRows: 1,
      rowCount: 1,
    };

    expect(queryResult).toEqual({
      headers: ['id', 'amount'],
      headerTypes: ['integer', 'numeric'],
      rows: [[1, 10.5]],
      totalRows: 1,
      rowCount: 1,
    });
  });

  it('models host-provided network tunnel endpoint resolution without app imports', async () => {
    const endpoint: KloNetworkEndpoint = {
      host: '127.0.0.1',
      port: 15432,
      close: async () => undefined,
    };
    const tunnelPort: KloNetworkTunnelPort<{ networkProxy?: { type: 'ssh_tunnel' } }> = {
      resolveEndpoint: async (input) => {
        expect(input).toEqual({
          connectionId: 'warehouse',
          driver: 'postgres',
          host: 'db.internal',
          port: 5432,
          connection: { networkProxy: { type: 'ssh_tunnel' } },
        });
        return endpoint;
      },
    };

    const resolved = tunnelPort.resolveEndpoint({
      connectionId: 'warehouse',
      driver: 'postgres',
      host: 'db.internal',
      port: 5432,
      connection: { networkProxy: { type: 'ssh_tunnel' } },
    });

    // Identity check: the port must hand back the very endpoint object it was given.
    await expect(resolved).resolves.toBe(endpoint);
  });
});

View file

@ -0,0 +1,391 @@
/**
 * Driver identifiers used to tag scan inputs, schema snapshots, connectors,
 * tunnel requests, and scan reports.
 *
 * NOTE(review): both 'postgres' and 'postgresql' are listed — presumably
 * aliases for the same driver; confirm against the connector registry.
 */
export type KloConnectionDriver =
| 'sqlite'
| 'postgres'
| 'postgresql'
| 'sqlserver'
| 'bigquery'
| 'snowflake'
| 'posthog'
| 'mysql'
| 'clickhouse';
/** Scan mode accepted by `KloScanInput.mode` and recorded in `KloScanReport.mode`. */
export type KloScanMode = 'structural' | 'relationships' | 'enriched';
/** Origin of a scan, as recorded in `KloScanReport.trigger`. */
export type KloScanTrigger = 'cli' | 'mcp' | 'schema_scan' | 'scheduled' | 'manual';
/**
 * Capability flags advertised by a scan connector.
 *
 * `structuralIntrospection` is typed as the literal `true`, so it cannot be
 * switched off; every other flag is a plain boolean.
 *
 * NOTE(review): the flags presumably line up with the optional members of
 * `KloScanConnector` (e.g. `tableSampling` with `sampleTable`), but nothing in
 * this file enforces that pairing — confirm against the scan runner.
 */
export interface KloConnectorCapabilities {
structuralIntrospection: true;
tableSampling: boolean;
columnSampling: boolean;
columnStats: boolean;
readOnlySql: boolean;
nestedAnalysis: boolean;
eventStreamDiscovery: boolean;
formalForeignKeys: boolean;
estimatedRowCounts: boolean;
}
/** Every capability flag except `structuralIntrospection`, each made optional. */
export type KloOptionalConnectorCapabilities = Partial<Omit<KloConnectorCapabilities, 'structuralIntrospection'>>;
/**
 * Builds a complete capability record from a partial set of optional flags.
 *
 * `structuralIntrospection` is always `true`; every flag that is omitted (or
 * nullish) becomes `false`.
 *
 * @param capabilities - Optional flags to enable; defaults to none.
 * @returns A capability record with every flag populated.
 */
export function createKloConnectorCapabilities(
  capabilities: KloOptionalConnectorCapabilities = {},
): KloConnectorCapabilities {
  // `??` rather than a destructuring default so that null is treated like undefined.
  const enabled = (flag: boolean | undefined): boolean => flag ?? false;

  return {
    structuralIntrospection: true,
    tableSampling: enabled(capabilities.tableSampling),
    columnSampling: enabled(capabilities.columnSampling),
    columnStats: enabled(capabilities.columnStats),
    readOnlySql: enabled(capabilities.readOnlySql),
    nestedAnalysis: enabled(capabilities.nestedAnalysis),
    eventStreamDiscovery: enabled(capabilities.eventStreamDiscovery),
    formalForeignKeys: enabled(capabilities.formalForeignKeys),
    estimatedRowCounts: enabled(capabilities.estimatedRowCounts),
  };
}
/**
 * Scope of a scan or snapshot, expressed as optional lists of catalogs,
 * schemas, and datasets. Every list may be omitted.
 */
export interface KloSchemaScope {
catalogs?: string[];
schemas?: string[];
datasets?: string[];
}
/** Kind of relation a `KloSchemaTable` entry represents. */
export type KloSchemaTableKind = 'table' | 'view' | 'external' | 'event_stream';
/** Coarse value category assigned to a column (see `KloSchemaColumn.dimensionType`). */
export type KloSchemaDimensionType = 'time' | 'string' | 'number' | 'boolean';
/** One column of a table in a schema snapshot. */
export interface KloSchemaColumn {
name: string;
/** Type string as reported for the column. */
nativeType: string;
/** Normalized form of the native type. */
normalizedType: string;
/** Coarse category of the column's values. */
dimensionType: KloSchemaDimensionType;
nullable: boolean;
primaryKey: boolean;
/** Column comment, or `null` when there is none. */
comment: string | null;
}
/**
 * A foreign key from one column of the owning table to a column of a target
 * table. The target's catalog, database, and the constraint name are nullable.
 */
export interface KloSchemaForeignKey {
fromColumn: string;
toCatalog: string | null;
toDb: string | null;
toTable: string;
toColumn: string;
constraintName: string | null;
}
/** One table-like relation in a schema snapshot, with its columns and foreign keys. */
export interface KloSchemaTable {
catalog: string | null;
db: string | null;
name: string;
kind: KloSchemaTableKind;
/** Table comment, or `null` when there is none. */
comment: string | null;
/** Estimated row count, or `null` when no estimate is available. */
estimatedRows: number | null;
columns: KloSchemaColumn[];
foreignKeys: KloSchemaForeignKey[];
}
/** Result of `KloScanConnector.introspect`: the tables found for one connection. */
export interface KloSchemaSnapshot {
connectionId: string;
driver: KloConnectionDriver;
/** Extraction timestamp as a string; the tests in this package use ISO-8601 values. */
extractedAt: string;
scope: KloSchemaScope;
tables: KloSchemaTable[];
/** Free-form metadata attached to the snapshot. */
metadata: Record<string, unknown>;
}
/** Credential reference of kind `'env'`, identified by `name`. */
export interface KloCredentialEnvReference {
kind: 'env';
// presumably the environment variable name (tests use 'DATABASE_URL') — confirm
name: string;
}
/** Credential reference of kind `'file'`, identified by `path`. */
export interface KloCredentialFileReference {
kind: 'file';
path: string;
}
/** Credential envelope of kind `'resolved'` that carries the credential values inline. */
export interface KloResolvedCredentialEnvelope {
kind: 'resolved';
source: 'standalone' | 'host';
/** Credential values keyed by name; value types are not constrained. */
values: Record<string, unknown>;
// NOTE(review): presumably set once secret values have been masked — confirm
// against the redaction helpers in credentials.ts.
redacted?: boolean;
}
/** Any credential form accepted by `KloScanInput.credentials`, discriminated by `kind`. */
export type KloCredentialEnvelope =
| KloCredentialEnvReference
| KloCredentialFileReference
| KloResolvedCredentialEnvelope;
/** Host/port pair returned by a tunnel port, with an optional async `close` hook. */
export interface KloNetworkEndpoint {
host: string;
port: number;
// presumably releases the underlying tunnel — confirm with the host implementation
close?: () => Promise<void>;
}
/**
 * Input to `KloNetworkTunnelPort.resolveEndpoint`: the connection being opened
 * plus the host and port it targets.
 *
 * @typeParam TConnection - Shape of the host-defined `connection` payload.
 */
export interface KloNetworkTunnelRequest<TConnection = Record<string, unknown>> {
connectionId: string;
driver: KloConnectionDriver;
host: string;
port: number;
connection: TConnection;
}
/**
 * Port that maps a tunnel request to a network endpoint.
 *
 * @typeParam TConnection - Shape of the host-defined `connection` payload.
 */
export interface KloNetworkTunnelPort<TConnection = Record<string, unknown>> {
/** Resolves to an endpoint, or to `null`. NOTE(review): `null` presumably means no tunnel applies — confirm. */
resolveEndpoint(input: KloNetworkTunnelRequest<TConnection>): Promise<KloNetworkEndpoint | null>;
}
/**
 * Input passed to `KloScanConnector.introspect`. Only `connectionId` and
 * `driver` are required.
 */
export interface KloScanInput {
connectionId: string;
driver: KloConnectionDriver;
scope?: KloSchemaScope;
mode?: KloScanMode;
dryRun?: boolean;
detectRelationships?: boolean;
credentials?: KloCredentialEnvelope;
/** Free-form metadata supplied with the scan request. */
metadata?: Record<string, unknown>;
}
/** Options accepted by `KloProgressPort.update`. */
export interface KloProgressUpdateOptions {
// NOTE(review): semantics of a transient update are defined by the progress sink — confirm
transient?: boolean;
}
/** Port through which a scan reports progress. */
export interface KloProgressPort {
/** Reports a progress value with an optional message and options. */
update(progress: number, message?: string, options?: KloProgressUpdateOptions): Promise<void>;
/** Returns another progress port for a phase. NOTE(review): `weight` presumably scales the phase's share of progress — confirm. */
startPhase(weight: number): KloProgressPort;
}
/** Logger port with four levels; each call takes a message and optional metadata. */
export interface KloScanLoggerPort {
debug(message: string, metadata?: Record<string, unknown>): void;
info(message: string, metadata?: Record<string, unknown>): void;
warn(message: string, metadata?: Record<string, unknown>): void;
error(message: string, metadata?: Record<string, unknown>): void;
}
/**
 * Per-run context handed to connector and discovery methods. Only `runId` is
 * required.
 */
export interface KloScanContext {
runId: string;
// presumably used to cancel an in-flight scan — confirm with callers
signal?: AbortSignal;
progress?: KloProgressPort;
logger?: KloScanLoggerPort;
}
/** Reference to a table by name, with nullable catalog and database qualifiers. */
export interface KloTableRef {
catalog: string | null;
db: string | null;
name: string;
}
/** Input to `KloScanConnector.sampleTable`. */
export interface KloTableSampleInput {
connectionId: string;
table: KloTableRef;
/** Optional list of columns. NOTE(review): presumably all columns are sampled when omitted — confirm. */
columns?: string[];
limit: number;
}
/** Result of `KloScanConnector.sampleTable`: headers, rows of untyped cells, and a total row count. */
export interface KloTableSampleResult {
headers: string[];
rows: unknown[][];
totalRows: number;
}
/** Input to `KloScanConnector.sampleColumn`. */
export interface KloColumnSampleInput {
connectionId: string;
table: KloTableRef;
column: string;
limit: number;
}
/** Result of `KloScanConnector.sampleColumn`; the two counts are nullable. */
export interface KloColumnSampleResult {
values: unknown[];
nullCount: number | null;
distinctCount: number | null;
}
/** Input to `KloScanConnector.columnStats`. */
export interface KloColumnStatsInput {
connectionId: string;
table: KloTableRef;
column: string;
}
/**
 * Result of `KloScanConnector.columnStats`. `min` and `max` are untyped;
 * the average and the two counts are nullable.
 */
export interface KloColumnStatsResult {
min: unknown;
max: unknown;
average: number | null;
nullCount: number | null;
distinctCount: number | null;
}
/** Input to `KloEventStreamDiscoveryPort.listEventTypes`. */
export interface KloEventTypeDiscoveryInput {
connectionId: string;
table: KloTableRef;
/** Name of the column holding the event value. */
eventColumn: string;
limit: number;
minCount?: number;
lookbackDays?: number;
}
/** One entry returned by `listEventTypes`: an event value and its count. */
export interface KloEventTypeDiscovery {
value: string;
count: number;
}
/** Input to `KloEventStreamDiscoveryPort.listPropertyKeys`. */
export interface KloEventPropertyDiscoveryInput {
connectionId: string;
table: KloTableRef;
/** Name of the column holding the properties. */
jsonColumn: string;
sampleSize: number;
limit: number;
lookbackDays?: number;
}
/** One entry returned by `listPropertyKeys`: a property key and its count. */
export interface KloEventPropertyDiscovery {
key: string;
count: number;
}
/** Input to `KloEventStreamDiscoveryPort.listPropertyValues`. */
export interface KloEventPropertyValuesInput {
connectionId: string;
table: KloTableRef;
/** Name of the column holding the properties. */
jsonColumn: string;
/** Property key whose values are requested. */
propertyKey: string;
limit: number;
// NOTE(review): presumably properties above this cardinality are skipped — confirm
maxCardinality?: number;
lookbackDays?: number;
}
/** Result of `listPropertyValues`: the values found and a cardinality figure. */
export interface KloEventPropertyValuesResult {
values: string[];
cardinality: number;
}
/**
 * Optional connector port for discovering event types, property keys, and
 * property values. Every method takes an input object and the scan context.
 */
export interface KloEventStreamDiscoveryPort {
listEventTypes(input: KloEventTypeDiscoveryInput, ctx: KloScanContext): Promise<KloEventTypeDiscovery[]>;
listPropertyKeys(input: KloEventPropertyDiscoveryInput, ctx: KloScanContext): Promise<KloEventPropertyDiscovery[]>;
// May resolve to null; the condition for that is decided by the implementation.
listPropertyValues(
input: KloEventPropertyValuesInput,
ctx: KloScanContext,
): Promise<KloEventPropertyValuesResult | null>;
}
/** Input to `KloScanConnector.executeReadOnly`: a SQL string and an optional row cap. */
export interface KloReadOnlyQueryInput {
connectionId: string;
sql: string;
maxRows?: number;
}
/**
 * Result of `KloScanConnector.executeReadOnly`.
 *
 * NOTE(review): how `totalRows` and the nullable `rowCount` differ is not
 * visible in this file — confirm with a connector implementation.
 */
export interface KloQueryResult {
headers: string[];
/** Optional type strings for the headers. */
headerTypes?: string[];
rows: unknown[][];
totalRows: number;
rowCount: number | null;
}
/**
 * Contract implemented by a scan connector.
 *
 * `id`, `driver`, `capabilities`, and `introspect` are required. The event
 * stream port and the sampling, statistics, query, and cleanup methods are
 * optional.
 */
export interface KloScanConnector {
id: string;
driver: KloConnectionDriver;
capabilities: KloConnectorCapabilities;
eventStreamDiscovery?: KloEventStreamDiscoveryPort;
/** Produces the schema snapshot for the given scan input. */
introspect(input: KloScanInput, ctx: KloScanContext): Promise<KloSchemaSnapshot>;
sampleColumn?(input: KloColumnSampleInput, ctx: KloScanContext): Promise<KloColumnSampleResult>;
sampleTable?(input: KloTableSampleInput, ctx: KloScanContext): Promise<KloTableSampleResult>;
// May resolve to null; the condition for that is decided by the implementation.
columnStats?(input: KloColumnStatsInput, ctx: KloScanContext): Promise<KloColumnStatsResult | null>;
executeReadOnly?(input: KloReadOnlyQueryInput, ctx: KloScanContext): Promise<KloQueryResult>;
cleanup?(): Promise<void>;
}
/** Port that turns batches of text into numeric vectors. */
export interface KloEmbeddingPort {
// presumably the length of each returned vector — confirm
dimensions: number;
// presumably the largest `texts` array `embedBatch` accepts — confirm
maxBatchSize: number;
/** Resolves to one number array per input batch call; see implementation for ordering guarantees. */
embedBatch(texts: string[]): Promise<number[][]>;
}
/** Counts of tables and columns created, updated, and deleted, as reported in `KloScanReport.structuralSyncStats`. */
export interface KloStructuralSyncStats {
tablesCreated: number;
tablesUpdated: number;
tablesDeleted: number;
columnsCreated: number;
columnsUpdated: number;
columnsDeleted: number;
}
/**
 * Counts of added, modified, deleted, and unchanged tables, plus added,
 * modified, and deleted columns, as reported in `KloScanReport.diffSummary`.
 */
export interface KloScanDiffSummary {
tablesAdded: number;
tablesModified: number;
tablesDeleted: number;
tablesUnchanged: number;
columnsAdded: number;
columnsModified: number;
columnsDeleted: number;
}
/** Locations of the artifacts a scan produced; the directory and report path are nullable. */
export interface KloScanArtifactPaths {
rawSourcesDir: string | null;
reportPath: string | null;
manifestShards: string[];
enrichmentArtifacts: string[];
}
/** Machine-readable codes carried by `KloScanWarning.code`. */
export type KloScanWarningCode =
| 'connector_capability_missing'
| 'sampling_failed'
| 'statistics_failed'
| 'llm_unavailable'
| 'embedding_unavailable'
| 'scan_enrichment_backend_not_configured'
| 'relationship_validation_failed'
| 'relationship_llm_invalid_reference'
| 'relationship_llm_proposal_failed'
| 'credential_redacted'
| 'enrichment_failed';
/** A warning recorded in `KloScanReport.warnings`. */
export interface KloScanWarning {
code: KloScanWarningCode;
message: string;
/** Table the warning refers to, when applicable. */
table?: string;
/** Column the warning refers to, when applicable. */
column?: string;
recoverable: boolean;
/** Free-form metadata attached to the warning. */
metadata?: Record<string, unknown>;
}
/** Outcome of each enrichment step; every field is `'skipped'`, `'completed'`, or `'failed'`. */
export interface KloScanEnrichmentSummary {
dataDictionary: 'skipped' | 'completed' | 'failed';
tableDescriptions: 'skipped' | 'completed' | 'failed';
columnDescriptions: 'skipped' | 'completed' | 'failed';
embeddings: 'skipped' | 'completed' | 'failed';
deterministicRelationships: 'skipped' | 'completed' | 'failed';
llmRelationshipValidation: 'skipped' | 'completed' | 'failed';
statisticalValidation: 'skipped' | 'completed' | 'failed';
}
/** Counts of relationships by outcome, as reported in `KloScanReport.relationships`. */
export interface KloScanRelationshipSummary {
accepted: number;
review: number;
rejected: number;
skipped: number;
}
/** Names of the enrichment stages listed in `KloScanEnrichmentStateSummary`. */
export type KloScanEnrichmentStage = 'descriptions' | 'embeddings' | 'relationships';
/** Enrichment stages grouped by whether they were resumed, completed, or failed. */
export interface KloScanEnrichmentStateSummary {
resumedStages: KloScanEnrichmentStage[];
completedStages: KloScanEnrichmentStage[];
failedStages: KloScanEnrichmentStage[];
}
/**
 * Report describing one scan run: its identity, how it was triggered, and
 * summaries of the structural diff, enrichment, relationships, and warnings.
 */
export interface KloScanReport {
connectionId: string;
driver: KloConnectionDriver;
syncId: string;
runId: string;
trigger: KloScanTrigger;
mode: KloScanMode;
dryRun: boolean;
artifactPaths: KloScanArtifactPaths;
diffSummary: KloScanDiffSummary;
manifestShardsWritten: number;
structuralSyncStats: KloStructuralSyncStats;
enrichment: KloScanEnrichmentSummary;
/** Names of optional capability flags; presumably those the connector lacked for this scan — confirm. */
capabilityGaps: Array<keyof Omit<KloConnectorCapabilities, 'structuralIntrospection'>>;
warnings: KloScanWarning[];
relationships: KloScanRelationshipSummary;
enrichmentState: KloScanEnrichmentStateSummary;
/** Creation timestamp as a string. */
createdAt: string;
}