mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-22 08:38:08 +02:00
Initial open-source release
This commit is contained in:
commit
1a42152e6f
1199 changed files with 257054 additions and 0 deletions
183
packages/context/src/scan/credentials.test.ts
Normal file
183
packages/context/src/scan/credentials.test.ts
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
REDACTED_KLO_CREDENTIAL_VALUE,
|
||||
redactKloCredentialEnvelope,
|
||||
redactKloCredentialValue,
|
||||
redactKloScanMetadata,
|
||||
redactKloScanReport,
|
||||
redactKloScanWarning,
|
||||
} from './credentials.js';
|
||||
import type { KloCredentialEnvelope, KloScanReport, KloScanWarning } from './types.js';
|
||||
|
||||
// Specification for the redaction helpers exported from ./credentials.js.
describe('KLO scan credential redaction', () => {
  it('keeps credential references inspectable', () => {
    // `env` and `file` envelopes carry a name/path rather than resolved values,
    // and are expected to come back equal to what went in.
    const references: KloCredentialEnvelope[] = [
      { kind: 'env', name: 'DATABASE_URL' },
      { kind: 'file', path: '~/.config/klo/warehouse' },
    ];

    for (const reference of references) {
      expect(redactKloCredentialEnvelope(reference)).toEqual(reference);
    }
  });

  it('redacts resolved credential envelope values recursively', () => {
    const resolvedEnvelope: KloCredentialEnvelope = {
      kind: 'resolved',
      source: 'host',
      values: {
        username: 'readonly',
        password: 'secret-password', // pragma: allowlist secret
        nested: {
          api_key: 'phx_123', // pragma: allowlist secret
          warehouse: 'compute_wh',
        },
        headers: [{ authorizationToken: 'token-value' }, { label: 'safe' }],
      },
    };

    // Sensitive keys are replaced at every depth (objects and arrays); safe keys survive.
    expect(redactKloCredentialEnvelope(resolvedEnvelope)).toEqual({
      kind: 'resolved',
      source: 'host',
      redacted: true,
      values: {
        username: 'readonly',
        password: REDACTED_KLO_CREDENTIAL_VALUE,
        nested: {
          api_key: REDACTED_KLO_CREDENTIAL_VALUE,
          warehouse: 'compute_wh',
        },
        headers: [{ authorizationToken: REDACTED_KLO_CREDENTIAL_VALUE }, { label: 'safe' }],
      },
    });
  });

  it('redacts scan metadata fields that commonly contain secrets', () => {
    const sanitizedMetadata = redactKloScanMetadata({
      driver: 'postgres',
      url: 'postgres://user:pass@example.test/db', // pragma: allowlist secret
      serviceAccountJson: {
        client_email: 'reader@example.test',
        private_key: 'pem-value', // pragma: allowlist secret
      },
      safeCount: 3,
    });

    expect(sanitizedMetadata).toEqual({
      driver: 'postgres',
      url: REDACTED_KLO_CREDENTIAL_VALUE,
      serviceAccountJson: {
        client_email: 'reader@example.test',
        private_key: REDACTED_KLO_CREDENTIAL_VALUE,
      },
      safeCount: 3,
    });
  });

  it('redacts scan warning messages and metadata without hiding safe context', () => {
    const warning: KloScanWarning = {
      code: 'sampling_failed',
      message: 'sample failed for postgres://reader:secret@example.test/db', // pragma: allowlist secret
      recoverable: true,
      metadata: {
        table: 'orders',
        url: 'postgres://reader:secret@example.test/db', // pragma: allowlist secret
        nested: {
          api_key: 'sk_test_123', // pragma: allowlist secret
          schema: 'public',
        },
      },
    };

    const sanitizedWarning = redactKloScanWarning(warning);

    // In the free-text message only the password part of the URL is masked,
    // whereas the `url` metadata field is replaced wholesale.
    expect(sanitizedWarning).toEqual({
      code: 'sampling_failed',
      message: 'sample failed for postgres://reader:<redacted>@example.test/db',
      recoverable: true,
      metadata: {
        table: 'orders',
        url: REDACTED_KLO_CREDENTIAL_VALUE,
        nested: {
          api_key: REDACTED_KLO_CREDENTIAL_VALUE,
          schema: 'public',
        },
      },
    });
  });

  it('redacts scan report warning metadata recursively', () => {
    const report: KloScanReport = {
      connectionId: 'warehouse',
      driver: 'postgres',
      syncId: 'sync-1',
      runId: 'run-1',
      trigger: 'cli',
      mode: 'structural',
      dryRun: false,
      artifactPaths: {
        rawSourcesDir: 'raw-sources/warehouse/live-database/sync-1',
        reportPath: 'raw-sources/warehouse/live-database/sync-1/scan-report.json',
        manifestShards: [],
        enrichmentArtifacts: [],
      },
      diffSummary: {
        tablesAdded: 0,
        tablesModified: 0,
        tablesDeleted: 0,
        tablesUnchanged: 0,
        columnsAdded: 0,
        columnsModified: 0,
        columnsDeleted: 0,
      },
      manifestShardsWritten: 0,
      structuralSyncStats: {
        tablesCreated: 0,
        tablesUpdated: 0,
        tablesDeleted: 0,
        columnsCreated: 0,
        columnsUpdated: 0,
        columnsDeleted: 0,
      },
      enrichment: {
        dataDictionary: 'skipped',
        tableDescriptions: 'skipped',
        columnDescriptions: 'skipped',
        embeddings: 'skipped',
        deterministicRelationships: 'skipped',
        llmRelationshipValidation: 'skipped',
        statisticalValidation: 'skipped',
      },
      capabilityGaps: [],
      warnings: [
        {
          code: 'credential_redacted',
          message: 'metadata redacted',
          recoverable: true,
          metadata: {
            credentials_json: '{"private_key":"pem-value"}', // pragma: allowlist secret
            safeCount: 2,
          },
        },
      ],
      relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
      enrichmentState: {
        resumedStages: [],
        completedStages: [],
        failedStages: [],
      },
      createdAt: '2026-04-29T00:00:00.000Z',
    };

    const sanitizedReport = redactKloScanReport(report);

    expect(sanitizedReport.warnings[0]?.metadata).toEqual({
      credentials_json: REDACTED_KLO_CREDENTIAL_VALUE,
      safeCount: 2,
    });
    // The input report must be left untouched; the expectation is spelled out as a
    // fresh literal so an in-place mutation cannot slip through.
    expect(report.warnings[0]?.metadata).toEqual({
      credentials_json: '{"private_key":"pem-value"}', // pragma: allowlist secret
      safeCount: 2,
    });
  });

  it('redacts standalone primitive credential values only when the field key is sensitive', () => {
    const sensitiveResult = redactKloCredentialValue('password', 'abc');
    const harmlessResult = redactKloCredentialValue('schema', 'public');

    expect(sensitiveResult).toBe(REDACTED_KLO_CREDENTIAL_VALUE);
    expect(harmlessResult).toBe('public');
  });
});
|
||||
50
packages/context/src/scan/credentials.ts
Normal file
50
packages/context/src/scan/credentials.ts
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
import {
|
||||
redactKloSensitiveMetadata,
|
||||
redactKloSensitiveText,
|
||||
redactKloSensitiveValue,
|
||||
REDACTED_KLO_CREDENTIAL_VALUE,
|
||||
} from '../core/redaction.js';
|
||||
import type { KloCredentialEnvelope, KloScanReport, KloScanWarning } from './types.js';
|
||||
|
||||
export { REDACTED_KLO_CREDENTIAL_VALUE };
|
||||
|
||||
/**
 * Redacts a single value according to the key it is stored under.
 *
 * Thin scan-facing alias for `redactKloSensitiveValue` from the core redaction
 * module; the decision about which keys count as sensitive lives there.
 *
 * @param key - Field name the value belongs to.
 * @param value - Value to inspect.
 * @returns Whatever `redactKloSensitiveValue` returns for the pair.
 */
export function redactKloCredentialValue(key: string, value: unknown): unknown {
  const redactedValue = redactKloSensitiveValue(key, value);
  return redactedValue;
}
|
||||
|
||||
/**
 * Redacts a scan metadata record.
 *
 * Thin scan-facing alias for `redactKloSensitiveMetadata` from the core
 * redaction module.
 *
 * @param metadata - Arbitrary key/value metadata attached to scan artifacts.
 * @returns The record produced by `redactKloSensitiveMetadata`.
 */
export function redactKloScanMetadata(metadata: Record<string, unknown>): Record<string, unknown> {
  const redactedMetadata = redactKloSensitiveMetadata(metadata);
  return redactedMetadata;
}
|
||||
|
||||
/**
 * Produces a version of a credential envelope that is safe to display or persist.
 *
 * - Envelopes whose `kind` is not `'resolved'` are returned as-is (the very same
 *   object, not a copy).
 * - `'resolved'` envelopes are rebuilt from `kind`, `source`, a `redacted: true`
 *   marker and the `values` passed through {@link redactKloScanMetadata}; no
 *   other property of the input is carried over.
 *
 * @param envelope - Envelope to sanitise.
 * @returns The original reference envelope, or a new redacted `'resolved'` envelope.
 */
export function redactKloCredentialEnvelope(envelope: KloCredentialEnvelope): KloCredentialEnvelope {
  if (envelope.kind === 'resolved') {
    const { source, values } = envelope;
    return {
      kind: 'resolved',
      source,
      redacted: true,
      values: redactKloScanMetadata(values),
    };
  }

  return envelope;
}
|
||||
|
||||
/**
 * Returns a copy of a scan warning with secrets removed.
 *
 * The `message` is always passed through `redactKloSensitiveText`. When the
 * warning carries truthy `metadata`, that record is additionally passed through
 * {@link redactKloScanMetadata}; otherwise the `metadata` property is copied
 * over exactly as it was. The input object is not modified.
 *
 * @param warning - Warning to sanitise.
 * @returns A new warning object.
 */
export function redactKloScanWarning(warning: KloScanWarning): KloScanWarning {
  const withSafeMessage: KloScanWarning = {
    ...warning,
    message: redactKloSensitiveText(warning.message),
  };

  if (warning.metadata) {
    return { ...withSafeMessage, metadata: redactKloScanMetadata(warning.metadata) };
  }

  return withSafeMessage;
}
|
||||
|
||||
/**
 * Returns a shallow copy of a scan report whose `warnings` have each been
 * passed through {@link redactKloScanWarning}.
 *
 * Every other report property is copied by reference; the input report and its
 * warnings array are left untouched.
 *
 * @param report - Report to sanitise.
 * @returns A new report object with redacted warnings.
 */
export function redactKloScanReport(report: KloScanReport): KloScanReport {
  const warnings = report.warnings.map((entry) => redactKloScanWarning(entry));
  return { ...report, warnings };
}
|
||||
114
packages/context/src/scan/data-dictionary.test.ts
Normal file
114
packages/context/src/scan/data-dictionary.test.ts
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
defaultKloDataDictionarySettings,
|
||||
isKloDataDictionaryCandidate,
|
||||
shouldKloSampleColumnForDictionary,
|
||||
} from './data-dictionary.js';
|
||||
|
||||
// Exclusion patterns shipped with the default settings; most cases below use them.
const defaultPatterns = defaultKloDataDictionarySettings.excludePatterns;

describe('KLO scan data dictionary policy', () => {
  it('includes text-like and boolean categorical types', () => {
    // [column type, column name] pairs that must be accepted.
    const eligibleColumns: Array<[string, string]> = [
      ['varchar(50)', 'status'],
      ['VARCHAR', 'category'],
      ['text', 'region'],
      ['string', 'payment_method'],
      ['nvarchar(100)', 'tier'],
      ['enum', 'status'],
      ['boolean', 'active'],
      ['bool', 'verified'],
      ['character varying(50)', 'region'],
      ['character(1)', 'flag'],
      ['ntext', 'category'],
    ];

    for (const [columnType, columnName] of eligibleColumns) {
      expect(isKloDataDictionaryCandidate(columnType, columnName, defaultPatterns)).toBe(true);
    }
  });

  it('excludes non-categorical primitive types', () => {
    const rejectedColumns: Array<[string, string]> = [
      ['integer', 'count'],
      ['bigint', 'total'],
      ['timestamp', 'created'],
      ['date', 'birth'],
      ['numeric', 'amount'],
      ['decimal(10,2)', 'price'],
      ['float', 'rate'],
    ];

    for (const [columnType, columnName] of rejectedColumns) {
      expect(isKloDataDictionaryCandidate(columnType, columnName, defaultPatterns)).toBe(false);
    }
  });

  it('excludes configured high-cardinality or sensitive name patterns', () => {
    const rejectedColumns: Array<[string, string]> = [
      ['varchar', 'user_id'],
      ['varchar', 'session_uuid'],
      ['varchar', 'api_key'],
      ['varchar', 'password_hash'],
      ['varchar', 'auth_token'],
      ['varchar', 'id'],
      ['varchar', 'created_at'],
      ['varchar', 'birth_date'],
      ['text', 'description'],
      ['text', 'email_body'],
      ['varchar', 'image_url'],
      ['varchar', 'email'],
      ['varchar', 'phone_number'],
      ['varchar', 'street_address'],
    ];

    for (const [columnType, columnName] of rejectedColumns) {
      expect(isKloDataDictionaryCandidate(columnType, columnName, defaultPatterns)).toBe(false);
    }
  });

  it('keeps business categorical names eligible', () => {
    const eligibleNames = [
      'status',
      'region',
      'country',
      'payment_method',
      'currency',
      'plan',
      'category',
      'tier',
      'gender',
      'language',
      'order_type',
      'order_status',
    ];

    for (const columnName of eligibleNames) {
      expect(isKloDataDictionaryCandidate('varchar', columnName, defaultPatterns)).toBe(true);
    }
  });

  it('respects host-provided exclusion patterns and skips invalid regex patterns', () => {
    const hostPatterns = ['company'];
    const brokenPatterns = ['[invalid', '(unclosed'];

    expect(isKloDataDictionaryCandidate('varchar', 'company_size', hostPatterns)).toBe(false);
    expect(isKloDataDictionaryCandidate('varchar', 'status', hostPatterns)).toBe(true);
    // Patterns that fail to compile are ignored rather than raising.
    expect(isKloDataDictionaryCandidate('varchar', 'status', brokenPatterns)).toBe(true);
  });

  it('skips columns that already have persisted dictionary state', () => {
    // One row per decision branch, in the order the policy evaluates them.
    const scenarios = [
      {
        columnName: 'status',
        sampleValues: ['paid'],
        cardinality: null,
        expected: { sample: false, reason: 'already_populated' },
      },
      {
        columnName: 'empty_status',
        sampleValues: null,
        cardinality: 0,
        expected: { sample: false, reason: 'empty_column' },
      },
      {
        columnName: 'customer_name',
        sampleValues: null,
        cardinality: 300,
        expected: { sample: false, reason: 'high_cardinality' },
      },
      {
        columnName: 'status',
        sampleValues: null,
        cardinality: null,
        expected: { sample: true },
      },
    ];

    for (const { expected, ...column } of scenarios) {
      const decision = shouldKloSampleColumnForDictionary({
        columnType: 'varchar',
        ...column,
        settings: defaultKloDataDictionarySettings,
      });

      expect(decision).toEqual(expected);
    }
  });
});
|
||||
109
packages/context/src/scan/data-dictionary.ts
Normal file
109
packages/context/src/scan/data-dictionary.ts
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
/** Tunables for the data-dictionary (categorical value sampling) policy. */
export interface KloDataDictionarySettings {
  /** Columns whose known cardinality is strictly greater than this are skipped as `high_cardinality`. */
  cardinalityThreshold: number;
  /** NOTE(review): not read in this module; presumably caps how many distinct values are persisted — confirm with the consumer. */
  maxValuesToStore: number;
  /** NOTE(review): not read in this module; presumably the row sample size used when sampling — confirm with the consumer. */
  sampleSize: number;
  /** NOTE(review): not read in this module; presumably toggles use of database-provided statistics — confirm with the consumer. */
  useDbStatistics: boolean;
  /** Regular-expression sources, compiled case-insensitively and tested against the column name; a match excludes the column. */
  excludePatterns: Array<string>;
}
|
||||
|
||||
/**
 * Settings used when the host does not supply its own.
 *
 * `excludePatterns` are regular-expression sources matched against the column
 * name; the grouping comments below are editorial only.
 */
export const defaultKloDataDictionarySettings: KloDataDictionarySettings = {
  cardinalityThreshold: 200,
  maxValuesToStore: 100,
  sampleSize: 10000,
  useDbStatistics: true,
  excludePatterns: [
    // Identifiers, keys, hashes and tokens.
    '_id$',
    '_uuid$',
    '_key$',
    '_hash$',
    '_token$',
    '^id$',
    '^uuid$',
    // Temporal columns.
    '_at$',
    '_date$',
    '_time$',
    // Free-form text.
    'description$',
    'comment$',
    'notes?$',
    'message$',
    'body$',
    'content$',
    // Locations of resources.
    '_url$',
    '_path$',
    // Contact details.
    'email$',
    '^phone',
    'address$',
  ],
};
|
||||
|
||||
/**
 * Why a column is not sampled for the data dictionary:
 * - `not_candidate`: type or name fails the candidate check;
 * - `already_populated`: sample values are already present;
 * - `empty_column`: known cardinality is exactly 0;
 * - `high_cardinality`: known cardinality exceeds the configured threshold.
 */
export type KloDataDictionarySkipReason = 'not_candidate' | 'already_populated' | 'empty_column' | 'high_cardinality';
|
||||
|
||||
/** Outcome of the sampling policy for one column. */
export interface KloDataDictionarySampleDecision {
  /** `true` when the column should be sampled. */
  sample: boolean;
  /** Set by the policy whenever `sample` is `false`; omitted when sampling should proceed. */
  reason?: KloDataDictionarySkipReason;
}
|
||||
|
||||
/** Everything the sampling policy needs to know about one column. */
export interface KloDataDictionaryColumnState {
  /** Database type of the column, e.g. `varchar(50)`. */
  columnType: string;
  /** Column name; matched against the exclusion patterns. */
  columnName: string;
  /** Previously stored sample values; a non-empty list means the column is already populated. */
  sampleValues?: ReadonlyArray<string> | null;
  /** Known distinct-value count; `null` or `undefined` is treated as unknown. */
  cardinality?: number | null;
  /** Policy settings to evaluate the column against. */
  settings: KloDataDictionarySettings;
}
|
||||
|
||||
/**
 * Column types considered categorical. Anchored at the start only, so type
 * arguments such as `varchar(50)` or `character varying(50)` still match.
 */
const categoricalCandidateTypes = /^(n?varchar|n?char|n?text|string|character|enum|bool(ean)?)/i;

/**
 * Decides whether a column is worth considering for the data dictionary.
 *
 * A column qualifies when its type starts with one of the categorical type
 * names and its name matches none of the exclusion patterns. Both checks are
 * case-insensitive.
 *
 * Exclusion patterns are host-configurable, so bad entries are tolerated
 * rather than fatal:
 * - a pattern that does not compile as a regular expression is ignored;
 * - an empty pattern is ignored as well, because `new RegExp('')` matches every
 *   string and a single blank entry would otherwise exclude every column.
 *
 * @param columnType - Database type of the column, e.g. `varchar(50)`.
 * @param columnName - Name of the column.
 * @param excludePatterns - Regular-expression sources tested against the
 *   column name; defaults to the patterns of the default settings.
 * @returns `true` when the column is a candidate, `false` otherwise.
 */
export function isKloDataDictionaryCandidate(
  columnType: string,
  columnName: string,
  excludePatterns: readonly string[] = defaultKloDataDictionarySettings.excludePatterns,
): boolean {
  const typeLower = columnType.toLowerCase();
  const nameLower = columnName.toLowerCase();

  if (!categoricalCandidateTypes.test(typeLower)) {
    return false;
  }

  for (const patternText of excludePatterns) {
    // A blank entry carries no intent; compiling it would match every name.
    if (patternText === '') {
      continue;
    }

    let pattern: RegExp;
    try {
      pattern = new RegExp(patternText, 'i');
    } catch {
      // Deliberately best-effort: an invalid host pattern must not break the scan.
      continue;
    }

    if (pattern.test(nameLower)) {
      return false;
    }
  }

  return true;
}
|
||||
|
||||
/**
 * Applies the data-dictionary sampling policy to one column.
 *
 * Checks run in this order and the first hit wins:
 * 1. sample values already present -> `already_populated`
 * 2. known cardinality of exactly 0 -> `empty_column`
 * 3. known cardinality above `settings.cardinalityThreshold` -> `high_cardinality`
 * 4. type/name rejected by {@link isKloDataDictionaryCandidate} -> `not_candidate`
 *
 * A missing (`null`/`undefined`) cardinality is treated as unknown and skips
 * checks 2 and 3.
 *
 * @param input - Column state plus the settings to evaluate it against.
 * @returns `{ sample: true }`, or `{ sample: false, reason }` naming the first failed check.
 */
export function shouldKloSampleColumnForDictionary(
  input: KloDataDictionaryColumnState,
): KloDataDictionarySampleDecision {
  const { columnType, columnName, settings } = input;
  const storedValues = input.sampleValues ?? [];
  const knownCardinality = input.cardinality ?? null;

  if (storedValues.length > 0) {
    return { sample: false, reason: 'already_populated' };
  }

  if (knownCardinality !== null) {
    if (knownCardinality === 0) {
      return { sample: false, reason: 'empty_column' };
    }
    if (knownCardinality > settings.cardinalityThreshold) {
      return { sample: false, reason: 'high_cardinality' };
    }
  }

  const isCandidate = isKloDataDictionaryCandidate(columnType, columnName, settings.excludePatterns);
  return isCandidate ? { sample: true } : { sample: false, reason: 'not_candidate' };
}
|
||||
318
packages/context/src/scan/description-generation.test.ts
Normal file
318
packages/context/src/scan/description-generation.test.ts
Normal file
|
|
@ -0,0 +1,318 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
|
||||
// Swap only `generateText` from the `ai` package for a mock; every other export stays real.
vi.mock('ai', async (importOriginal) => {
  const realModule = await importOriginal<typeof import('ai')>();
  const partiallyMocked = { ...realModule, generateText: vi.fn() };
  return partiallyMocked;
});
|
||||
|
||||
import { generateText } from 'ai';
|
||||
import {
|
||||
buildKloColumnDescriptionPrompt,
|
||||
buildKloDataSourceDescriptionPrompt,
|
||||
buildKloTableDescriptionPrompt,
|
||||
type KloDescriptionCachePort,
|
||||
KloDescriptionGenerator,
|
||||
} from './description-generation.js';
|
||||
import { createKloConnectorCapabilities, type KloScanConnector } from './types.js';
|
||||
|
||||
/**
 * In-memory description cache for the tests.
 *
 * Keys are the non-empty parts of catalog/db/table (and column) joined by `.`;
 * connection keys are prefixed with `__connection:`. `get` and `set` are spies,
 * and `get` resolves to `null` for an unknown key.
 *
 * @param initial - Entries the cache starts out with.
 */
function createCache(initial: Record<string, string> = {}): KloDescriptionCachePort {
  const store = new Map(Object.entries(initial));
  const joinPresent = (parts: unknown[]) => parts.filter(Boolean).join('.');

  return {
    buildTableKey: (table) => joinPresent([table.catalog, table.db, table.name]),
    buildColumnKey: (table, columnName) => joinPresent([table.catalog, table.db, table.name, columnName]),
    buildConnectionKey: (connectionName) => `__connection:${connectionName}`,
    get: vi.fn(async (key: string) => {
      const cached = store.get(key);
      return cached ?? null;
    }),
    set: vi.fn(async (key: string, value: string) => {
      store.set(key, value);
    }),
  };
}
|
||||
|
||||
/**
 * Builds a stub LLM provider and primes the mocked `generateText` to resolve
 * with `text`.
 *
 * Note the side effect: every call re-configures the shared `generateText`
 * mock, so the most recent call decides what later generations return.
 *
 * @param text - Text the mocked `generateText` resolves with.
 */
function createLlmProvider(text = 'generated description') {
  vi.mocked(generateText).mockResolvedValue({ text } as never);

  // A fresh config object per call, with prompt caching switched off.
  const buildPromptCachingConfig = () => ({
    enabled: false,
    systemTtl: '1h',
    toolsTtl: '1h',
    historyTtl: '5m',
    cacheSystem: true,
    cacheTools: true,
    cacheHistory: true,
    vertexFallbackTo5m: false,
  });

  const provider = {
    getModel: vi.fn().mockReturnValue({ modelId: 'claude-sonnet-4-6', provider: 'anthropic' }),
    getModelByName: vi.fn(),
    cacheMarker: vi.fn(),
    repairToolCallHandler: vi.fn(),
    thinkingProviderOptions: vi.fn(),
    telemetryConfig: vi.fn(),
    promptCachingConfig: vi.fn(buildPromptCachingConfig),
    activeBackend: vi.fn(() => 'anthropic'),
  };

  // The stub only covers what these tests exercise, hence the loose cast.
  return provider as any;
}
|
||||
|
||||
/**
 * Builds a fake Postgres scan connector whose sampling methods are spies
 * returning fixed data. `introspect` rejects, because description generation
 * is not expected to call it.
 */
function createConnector(): KloScanConnector {
  const capabilities = createKloConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    nestedAnalysis: true,
  });

  return {
    id: 'test-connector',
    driver: 'postgres',
    capabilities,
    introspect: vi.fn(async () => {
      throw new Error('introspection is not used by description generation');
    }),
    sampleColumn: vi.fn(async () => {
      return {
        values: ['paid', 'refunded', null],
        nullCount: 1,
        distinctCount: 2,
      };
    }),
    sampleTable: vi.fn(async () => {
      return {
        headers: ['id', 'status', 'amount'],
        rows: [
          [1, 'paid', 20],
          [2, 'refunded', 10],
        ],
        totalRows: 2,
      };
    }),
  };
}
|
||||
|
||||
// Pure prompt-construction checks; no LLM or connector is involved.
describe('KLO description prompt builders', () => {
  it('builds column prompts with sample values, source descriptions, and nested BigQuery guidance', () => {
    const columnPrompt = buildKloColumnDescriptionPrompt({
      columnName: 'payload',
      columnValues: [{ nested: true }, '[1,2]'],
      tableContext: 'Table: events | Columns: payload | Data source: BIGQUERY',
      dataSourceType: 'BIGQUERY',
      supportsNestedAnalysis: true,
      rawDescriptions: { db: 'Raw event payload', ai: 'Old AI text', user: 'User text' },
    });

    const requiredFragments = [
      '<table_context> Table: events | Columns: payload | Data source: BIGQUERY </table_context>',
      '<column_name> payload </column_name>',
      '<sample_values> [object Object], [1,2] </sample_values>',
      '<db_documentation> Raw event payload </db_documentation>',
      'nested/structured data',
    ];
    // Descriptions from the `ai` and `user` sources must not be fed back into the prompt.
    const forbiddenFragments = ['Old AI text', 'User text'];

    for (const fragment of requiredFragments) {
      expect(columnPrompt).toContain(fragment);
    }
    for (const fragment of forbiddenFragments) {
      expect(columnPrompt).not.toContain(fragment);
    }
  });

  it('builds table and data-source prompts from sampled rows', () => {
    const ordersSample = {
      headers: ['id', 'status'],
      rows: [
        [1, 'paid'],
        [2, 'refunded'],
      ],
      totalRows: 2,
    };

    const tablePrompt = buildKloTableDescriptionPrompt({
      tableName: 'orders',
      sampleData: ordersSample,
      dataSourceType: 'POSTGRESQL',
      rawDescriptions: { dbt: 'Fact table for commerce orders' },
    });
    expect(tablePrompt).toContain('status: paid, refunded');

    const dataSourcePrompt = buildKloDataSourceDescriptionPrompt({
      tableSamples: [['orders', ordersSample]],
      dataSourceType: 'POSTGRESQL',
    });
    expect(dataSourcePrompt).toContain('orders (2 columns, 2 sample rows)');
  });
});
|
||||
|
||||
// End-to-end behaviour of the generator against a mocked LLM, cache and connector.
describe('KloDescriptionGenerator', () => {
  it('generates column descriptions with pre-fetched values, cache hits, and word-limit metadata', async () => {
    // `cached_status` is seeded in the cache, `status` has to be generated.
    const cache = createCache({ 'warehouse.public.orders.cached_status': 'Cached status description' });
    const llmProvider = createLlmProvider('Payment state');
    const connector = createConnector();
    const generator = new KloDescriptionGenerator({
      llmProvider,
      cache,
      settings: {
        columnMaxWords: 12,
        tableMaxWords: 18,
        dataSourceMaxWords: 24,
        temperature: 0.2,
        concurrencyLimit: 2,
      },
    });

    const analysis = await generator.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector,
      context: { runId: 'run-1' },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: {
        catalog: 'warehouse',
        db: 'public',
        name: 'orders',
        columns: [
          { name: 'status', sampleValues: ['paid', 'refunded'], rawDescriptions: { db: 'Payment lifecycle' } },
          { name: 'cached_status', sampleValues: ['open'] },
        ],
      },
      skipExisting: false,
      existingDescriptions: {},
    });

    expect(analysis).toEqual({
      columnDescriptions: [
        ['status', 'Payment state'],
        ['cached_status', 'Cached status description'],
      ],
      processedColumns: ['status'],
      skippedColumns: ['cached_status'],
    });
    // Both columns arrived with sample values, so no sampling round-trip is expected.
    expect(connector.sampleColumn).not.toHaveBeenCalled();

    const wordLimitMessage = expect.objectContaining({
      role: 'user',
      content: expect.stringContaining('Please provide a concise description in 12 words or less.'),
    });
    expect(generateText).toHaveBeenCalledWith(
      expect.objectContaining({
        temperature: 0.2,
        messages: expect.arrayContaining([wordLimitMessage]),
      }),
    );
  });

  it('samples through the connector when column values are not pre-fetched', async () => {
    const connector = createConnector();
    const generator = new KloDescriptionGenerator({
      llmProvider: createLlmProvider('Current order state'),
      settings: {
        columnMaxWords: 12,
        tableMaxWords: 18,
        dataSourceMaxWords: 24,
      },
    });

    const analysis = await generator.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector,
      context: { runId: 'run-1' },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: {
        catalog: null,
        db: 'public',
        name: 'orders',
        columns: [{ name: 'status' }],
      },
    });

    const expectedSampleRequest = {
      connectionId: 'conn-1',
      table: { catalog: null, db: 'public', name: 'orders' },
      column: 'status',
      limit: 50,
    };
    expect(connector.sampleColumn).toHaveBeenCalledWith(expectedSampleRequest, { runId: 'run-1' });
    expect(analysis.columnDescriptions).toEqual([['status', 'Current order state']]);
  });

  it('samples through a description sampling port without requiring structural introspection', async () => {
    // A bare sampling port: it has sampling methods but no `introspect`.
    const sampler = {
      id: 'description-sampler:conn-1',
      sampleColumn: vi.fn(async () => {
        return {
          values: ['paid', 'refunded'],
          nullCount: null,
          distinctCount: null,
        };
      }),
      sampleTable: vi.fn(async () => {
        return {
          headers: ['id', 'status'],
          rows: [[1, 'paid']],
          totalRows: 1,
        };
      }),
    };
    const generator = new KloDescriptionGenerator({
      llmProvider: createLlmProvider('Generated through sampler'),
      settings: {
        columnMaxWords: 12,
        tableMaxWords: 18,
        dataSourceMaxWords: 24,
      },
    });

    const analysis = await generator.generateColumnDescriptions({
      connectionId: 'conn-1',
      connector: sampler,
      context: { runId: 'run-1' },
      dataSourceType: 'POSTGRESQL',
      supportsNestedAnalysis: false,
      table: {
        catalog: null,
        db: 'public',
        name: 'orders',
        columns: [{ name: 'status' }],
      },
    });

    expect(analysis.columnDescriptions).toEqual([['status', 'Generated through sampler']]);

    const expectedSampleRequest = {
      connectionId: 'conn-1',
      table: { catalog: null, db: 'public', name: 'orders' },
      column: 'status',
      limit: 50,
    };
    expect(sampler.sampleColumn).toHaveBeenCalledWith(expectedSampleRequest, { runId: 'run-1' });
    expect('introspect' in sampler).toBe(false);
  });

  it('generates and caches table and data-source descriptions', async () => {
    const cache = createCache();
    const connector = createConnector();
    const generator = new KloDescriptionGenerator({
      llmProvider: createLlmProvider('Commerce orders'),
      cache,
      settings: {
        columnMaxWords: 12,
        tableMaxWords: 18,
        dataSourceMaxWords: 24,
        concurrencyLimit: 2,
      },
    });

    const tableDescription = generator.generateTableDescription({
      connectionId: 'conn-1',
      connector,
      context: { runId: 'run-1' },
      dataSourceType: 'POSTGRESQL',
      table: { catalog: 'warehouse', db: 'public', name: 'orders', rawDescriptions: { db: 'Raw orders' } },
    });
    await expect(tableDescription).resolves.toBe('Commerce orders');

    const dataSourceDescription = generator.generateDataSourceDescription({
      connectionId: 'conn-1',
      connector,
      context: { runId: 'run-1' },
      dataSourceType: 'POSTGRESQL',
      tables: [
        { catalog: 'warehouse', db: 'public', name: 'orders' },
        { catalog: 'warehouse', db: 'public', name: 'customers' },
      ],
      connectionName: 'Warehouse',
    });
    await expect(dataSourceDescription).resolves.toBe('Commerce orders');

    // Both results must have been written back under their respective cache keys.
    expect(cache.set).toHaveBeenCalledWith('warehouse.public.orders', 'Commerce orders');
    expect(cache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders');
  });
});
|
||||
582
packages/context/src/scan/description-generation.ts
Normal file
582
packages/context/src/scan/description-generation.ts
Normal file
|
|
@ -0,0 +1,582 @@
|
|||
import type { KloLlmProvider } from '@klo/llm';
|
||||
import { generateKloText } from '../llm/index.js';
|
||||
import type {
|
||||
KloColumnSampleInput,
|
||||
KloColumnSampleResult,
|
||||
KloScanContext,
|
||||
KloScanLoggerPort,
|
||||
KloTableRef,
|
||||
KloTableSampleInput,
|
||||
KloTableSampleResult,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Port for a key/value store of generated descriptions.
 *
 * Key construction is part of the port so the host decides the key scheme.
 */
export interface KloDescriptionCachePort {
  /** Builds the cache key for a table description. */
  buildTableKey(table: KloTableRef): string;
  /** Builds the cache key for the description of one column of a table. */
  buildColumnKey(table: KloTableRef, columnName: string): string;
  /** Builds the cache key for a connection-level (data source) description. */
  buildConnectionKey(connectionName: string): string;
  /** Looks up a cached description; resolves to `null` when there is none. */
  get(key: string): Promise<string | null>;
  /** Stores a description under the given key. */
  set(key: string, value: string): Promise<void>;
}
|
||||
|
||||
/**
 * Minimal sampling surface needed for description generation.
 *
 * Both sampling methods are optional, so an implementation may offer either,
 * both, or neither.
 */
export interface KloDescriptionSamplingPort {
  /** Identifier of the sampling source. */
  id: string;
  /** Samples values of a single column. */
  sampleColumn?(input: KloColumnSampleInput, ctx: KloScanContext): Promise<KloColumnSampleResult>;
  /** Samples rows of a table. */
  sampleTable?(input: KloTableSampleInput, ctx: KloScanContext): Promise<KloTableSampleResult>;
}
|
||||
|
||||
/** Caller-supplied settings for description generation. */
export interface KloDescriptionGenerationSettings {
  /** Word limit for column descriptions. */
  columnMaxWords: number;
  /** Word limit for table descriptions. */
  tableMaxWords: number;
  /** Word limit for data-source descriptions. */
  dataSourceMaxWords: number;
  /** Optional sampling temperature for the text generation call. */
  temperature?: number;
  /** Optional concurrency cap. NOTE(review): the default applied when omitted is not visible here. */
  concurrencyLimit?: number;
}
|
||||
|
||||
/**
 * Internal form of the generation settings: same fields as the public
 * settings, except that `concurrencyLimit` is mandatory here.
 */
interface ResolvedKloDescriptionGenerationSettings {
  /** Word limit for column descriptions. */
  columnMaxWords: number;
  /** Word limit for table descriptions. */
  tableMaxWords: number;
  /** Word limit for data-source descriptions. */
  dataSourceMaxWords: number;
  /** Optional sampling temperature for the text generation call. */
  temperature?: number;
  /** Concurrency cap, always present once settings are resolved. */
  concurrencyLimit: number;
}
|
||||
|
||||
/** A column for which a description may be generated. */
export interface KloDescriptionColumn {
  /** Column name. */
  name: string;
  /** Optional database type of the column. */
  type?: string;
  /** Existing descriptions keyed by their source (e.g. `db`, `ai`, `user`). */
  rawDescriptions?: Record<string, string>;
  /** Pre-fetched sample values, if the caller already has them. */
  sampleValues?: Array<unknown>;
}
|
||||
|
||||
/** A table reference together with the columns to describe. */
export interface KloDescriptionColumnTable extends KloTableRef {
  /** Columns of the table that are subject to description generation. */
  columns: Array<KloDescriptionColumn>;
}
|
||||
|
||||
/** A table reference plus any descriptions that already exist for the table. */
export interface KloDescriptionTableInput extends KloTableRef {
  /** Existing table descriptions keyed by their source. */
  rawDescriptions?: Record<string, string>;
}
|
||||
|
||||
/** Result of generating descriptions for the columns of one table. */
export interface KloColumnAnalysisResult {
  /** `[columnName, description]` pairs; the description may be `null`. */
  columnDescriptions: [string, string | null][];
  /** Names of the columns reported as processed. */
  processedColumns: string[];
  /** Names of the columns reported as skipped. */
  skippedColumns: string[];
}
|
||||
|
||||
/** Input for building the column-description prompt. */
export interface KloColumnDescriptionPromptInput {
  /** Name of the column being described. */
  columnName: string;
  /** Sample values of the column. */
  columnValues: Array<unknown>;
  /** Free-text context about the surrounding table. */
  tableContext: string;
  /** Data source type label, e.g. `BIGQUERY` or `POSTGRESQL`. */
  dataSourceType: string;
  /** Whether the data source supports nested analysis. */
  supportsNestedAnalysis: boolean;
  /** Existing descriptions keyed by their source. */
  rawDescriptions?: Record<string, string>;
}
|
||||
|
||||
/** Input for building the table-description prompt. */
export interface KloTableDescriptionPromptInput {
  /** Name of the table being described. */
  tableName: string;
  /** Sampled rows of the table. */
  sampleData: KloTableSampleResult;
  /** Data source type label, e.g. `POSTGRESQL`. */
  dataSourceType: string;
  /** Existing descriptions keyed by their source. */
  rawDescriptions?: Record<string, string>;
}
|
||||
|
||||
export interface KloDataSourceDescriptionPromptInput {
|
||||
tableSamples: Array<[string, KloTableSampleResult]>;
|
||||
dataSourceType: string;
|
||||
}
|
||||
|
||||
export interface KloGenerateColumnDescriptionsInput {
|
||||
connectionId: string;
|
||||
connector: KloDescriptionSamplingPort;
|
||||
context: KloScanContext;
|
||||
dataSourceType: string;
|
||||
supportsNestedAnalysis: boolean;
|
||||
table: KloDescriptionColumnTable;
|
||||
skipExisting?: boolean;
|
||||
existingDescriptions?: Record<string, string | null>;
|
||||
}
|
||||
|
||||
export interface KloGenerateTableDescriptionInput {
|
||||
connectionId: string;
|
||||
connector: KloDescriptionSamplingPort;
|
||||
context: KloScanContext;
|
||||
dataSourceType: string;
|
||||
table: KloDescriptionTableInput;
|
||||
}
|
||||
|
||||
export interface KloGenerateDataSourceDescriptionInput {
|
||||
connectionId: string;
|
||||
connector: KloDescriptionSamplingPort;
|
||||
context: KloScanContext;
|
||||
dataSourceType: string;
|
||||
tables: KloTableRef[];
|
||||
connectionName?: string;
|
||||
}
|
||||
|
||||
export interface KloDescriptionGeneratorOptions {
|
||||
llmProvider: KloLlmProvider;
|
||||
cache?: KloDescriptionCachePort;
|
||||
logger?: KloScanLoggerPort;
|
||||
settings: KloDescriptionGenerationSettings;
|
||||
}
|
||||
|
||||
interface ColumnTaskResult {
|
||||
columnName: string;
|
||||
description: string | null;
|
||||
processed: boolean;
|
||||
skipped: boolean;
|
||||
}
|
||||
|
||||
/**
 * Lists the externally sourced descriptions that may be quoted in a prompt.
 *
 * Entries keyed `ai` or `user` are left out, as are entries whose text is empty.
 *
 * @param rawDescriptions - Description text keyed by source name, if any.
 * @returns `[source, text]` pairs in the object's own enumeration order.
 */
function descriptionSources(rawDescriptions: Record<string, string> | undefined): Array<[string, string]> {
  const usable: Array<[string, string]> = [];
  for (const [origin, body] of Object.entries(rawDescriptions ?? {})) {
    if (origin === 'ai' || origin === 'user' || !body) {
      continue;
    }
    usable.push([origin, body]);
  }
  return usable;
}
|
||||
|
||||
/**
 * Turns any thrown value into loggable text.
 *
 * @param error - The caught value.
 * @returns The `message` of an `Error`, otherwise the value's `String()` form.
 */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
||||
|
||||
/**
 * Copies only the identifying fields of a table, dropping any extra properties
 * carried by richer table objects.
 *
 * @param table - Any object that satisfies `KloTableRef`.
 * @returns A new object holding just `catalog`, `db`, and `name`.
 */
function toTableRef(table: KloTableRef): KloTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
|
||||
|
||||
/**
 * Runs `worker` over every item with at most `concurrencyLimit` calls in flight.
 *
 * Each result is stored at the index of the item that produced it, so the
 * output order matches the input order regardless of completion order.
 *
 * @param items - Inputs to process; entries that are `undefined` are skipped.
 * @param concurrencyLimit - Maximum number of simultaneous workers. Values below
 *   one and `NaN` fall back to a single worker; fractional values are rounded down.
 * @param worker - Async function invoked with each item and its index.
 * @returns Results positioned by input index.
 * @throws Rejects with the first error thrown by `worker`.
 */
async function runWithConcurrency<TInput, TOutput>(
  items: readonly TInput[],
  concurrencyLimit: number,
  worker: (item: TInput, index: number) => Promise<TOutput>,
): Promise<TOutput[]> {
  const results: TOutput[] = [];
  let nextIndex = 0;

  // A NaN limit would propagate through Math.min/Math.max and yield
  // `Array.from({ length: NaN })`, i.e. no workers and no work done at all.
  const requestedLimit = Number.isNaN(concurrencyLimit) ? 1 : Math.floor(concurrencyLimit);
  const workerCount = Math.max(1, Math.min(requestedLimit, items.length || 1));

  await Promise.all(
    Array.from({ length: workerCount }, async () => {
      // Claiming an index happens synchronously before the first await, so two
      // workers never pick up the same item.
      while (nextIndex < items.length) {
        const index = nextIndex;
        nextIndex += 1;
        const item = items[index];
        if (item !== undefined) {
          results[index] = await worker(item, index);
        }
      }
    }),
  );

  return results;
}
|
||||
|
||||
/**
 * Adds the closing word-budget instruction to a prompt.
 *
 * @param prompt - Prompt text to extend.
 * @param maxWords - Word budget quoted in the instruction.
 * @returns The prompt, a blank line, then the word-limit sentence.
 */
export function appendKloWordLimitInstruction(prompt: string, maxWords: number): string {
  const instruction = 'Please provide a concise description in ' + maxWords + ' words or less.';
  return [prompt, instruction].join('\n\n');
}
|
||||
|
||||
/**
 * Builds the LLM prompt used to describe a single column.
 *
 * Only the first five entries of `columnValues` are considered; `null` and
 * `undefined` among them are left out of the rendered sample list. Descriptions
 * from other sources (see `descriptionSources`) are quoted in
 * `<source_documentation>` tags. For `BIGQUERY` inputs with
 * `supportsNestedAnalysis`, a note is appended when a sample looks nested.
 *
 * @param input - Column name, sample values, table context, and data source type.
 * @returns The trimmed prompt text.
 */
export function buildKloColumnDescriptionPrompt(input: KloColumnDescriptionPromptInput): string {
  // Objects and arrays are rendered as JSON: `String()` would print
  // "[object Object]" for structs and drop the brackets of arrays, hiding the
  // very structure the nested-data check below looks for.
  const formatSample = (value: unknown): string => {
    if (typeof value === 'object' && value !== null && !(value instanceof Date)) {
      try {
        const json: string | undefined = JSON.stringify(value);
        return json ?? String(value);
      } catch {
        // e.g. BigInt members or circular references
        return String(value);
      }
    }
    return String(value);
  };

  const sampleValues = input.columnValues.slice(0, 5);
  const valuesStr = sampleValues
    .filter((value) => value !== null && value !== undefined)
    .map((value) => formatSample(value))
    .join(', ');

  let prompt = `Analyze this database column and provide a concise description:

<table_context> ${input.tableContext} </table_context>

<column_name> ${input.columnName} </column_name>

<sample_values> ${valuesStr} </sample_values>
`;

  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    prompt += '\nExisting descriptions from other sources:\n';
    for (const [source, text] of sources) {
      prompt += `<${source}_documentation> ${text} </${source}_documentation>\n`;
    }
    prompt +=
      '\nSynthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.\n';
  }

  prompt += `
Provide a brief description of what this column contains without repeating the column name.
Focus on the data's meaning and business purpose. Start directly with the content description.
Example:
"first names of individuals, likely employees or contacts" instead of "The column contains first names..."
"Job titles or roles of individuals..." instead of "This column contains job titles..."
`;

  if (input.dataSourceType === 'BIGQUERY' && input.supportsNestedAnalysis) {
    const hasNestedData = sampleValues.some((value) => {
      const text = formatSample(value);
      return text.includes('nested') || text.includes('{') || text.includes('[');
    });
    if (hasNestedData) {
      prompt +=
        '\nNote: This column contains nested/structured data (JSON, STRUCT, or ARRAY) - describe its general business purpose and data organization.';
    }
  }

  return prompt.trim();
}
|
||||
|
||||
/**
 * Builds the LLM prompt used to describe a table.
 *
 * At most the first ten headers are listed, each with the non-null values found
 * in the first three sample rows. Descriptions from other sources (see
 * `descriptionSources`) are appended when present, and `BIGQUERY` inputs get an
 * extra note about complex data types.
 *
 * @param input - Table name, sampled headers/rows, data source type, and raw descriptions.
 * @returns The trimmed prompt text.
 */
export function buildKloTableDescriptionPrompt(input: KloTableDescriptionPromptInput): string {
  // Objects and arrays are rendered as JSON; `String()` would print
  // "[object Object]" for structs and give the model nothing to work with.
  const formatSample = (value: unknown): string => {
    if (typeof value === 'object' && value !== null && !(value instanceof Date)) {
      try {
        const json: string | undefined = JSON.stringify(value);
        return json ?? String(value);
      } catch {
        // e.g. BigInt members or circular references
        return String(value);
      }
    }
    return String(value);
  };

  const columnInfo: string[] = [];
  for (let index = 0; index < Math.min(input.sampleData.headers.length, 10); index += 1) {
    const header = input.sampleData.headers[index];
    const sampleValues = input.sampleData.rows
      .slice(0, 3)
      .map((row) => row[index])
      .filter((value) => value !== null && value !== undefined);
    columnInfo.push(`${header}: ${sampleValues.map((value) => formatSample(value)).join(', ')}`);
  }

  let prompt = `
Analyze this database table and provide a concise description:

Table: ${input.tableName}
Columns and sample data: ${columnInfo.join(' | ')}
Total rows in sample: ${input.sampleData.rows.length}
Data source type: ${input.dataSourceType}
`;

  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    prompt += '\n Existing descriptions from other sources:\n';
    for (const [source, text] of sources) {
      prompt += ` ${source}: ${text}\n`;
    }
    prompt +=
      '\n Synthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.\n';
  }

  if (input.dataSourceType === 'BIGQUERY') {
    prompt +=
      "\nNote (Don't include this note in the final answer.): This is a BigQuery table which may contain nested structures, arrays, or other complex data types.";
  }

  prompt += `

Provide a brief description of what this table represents and its business purpose.
Do NOT list or describe individual columns or fields.
Start directly with the content description without mentioning the table name.
Focus on the data's meaning and business purpose.
Example: "Information about healthcare professionals used for workforce management" instead of "The blahblah table contains information about healthcare professionals including their names, titles..."
`;

  return prompt.trim();
}
|
||||
|
||||
/**
 * Builds the LLM prompt used to describe a whole data source.
 *
 * Each sampled table is summarised by its name, header count, and sample-row
 * count; no cell values are included. `BIGQUERY` inputs get an extra note
 * between the summary and the writing guidance.
 *
 * @param input - Sampled tables as `[name, sample]` pairs plus the data source type.
 * @returns The trimmed prompt text.
 */
export function buildKloDataSourceDescriptionPrompt(input: KloDataSourceDescriptionPromptInput): string {
  const tableSummaries: string[] = [];
  for (const [tableName, sampleData] of input.tableSamples) {
    tableSummaries.push(`${tableName} (${sampleData.headers.length} columns, ${sampleData.rows.length} sample rows)`);
  }

  const overview = [
    'Analyze this database and provide a concise description:',
    '',
    `Tables: ${tableSummaries.join(' | ')}`,
    `Total tables analyzed: ${input.tableSamples.length}`,
    `Data source type: ${input.dataSourceType}`,
  ];

  // The empty strings reproduce the blank lines between the sections.
  const bridge =
    input.dataSourceType === 'BIGQUERY'
      ? [
          '',
          "Note (Don't include this note in the final answer): This is a BigQuery dataset which may contain large-scale analytics data, nested structures, and complex data types.",
          '',
        ]
      : ['', ''];

  const guidance = [
    'Provide a direct, concise description of what this database represents and its business purpose.',
    'Do NOT start with phrases like "This database appears to represent" or "This BigQuery dataset".',
    'Start directly with the domain or business area description.',
    'Focus on the overall data model and its intended use.',
    'Example: "Healthcare-related database with a focus on patient management..." instead of "This database appears to represent a healthcare-related system..."',
  ];

  return [...overview, ...bridge, ...guidance].join('\n').trim();
}
|
||||
|
||||
/**
 * Produces column, table, and data-source descriptions by sampling data through
 * a connector and prompting the configured LLM provider.
 *
 * When a cache port is supplied, descriptions are looked up before any sampling
 * happens and stored after a successful generation. Placeholder text returned
 * when the LLM call fails or yields nothing is never cached, so a transient
 * provider error is retried on the next run instead of being served forever.
 */
export class KloDescriptionGenerator {
  private readonly llmProvider: KloLlmProvider;
  private readonly cache?: KloDescriptionCachePort;
  private readonly logger?: KloScanLoggerPort;
  private readonly settings: ResolvedKloDescriptionGenerationSettings;

  /**
   * @param options - LLM provider, optional cache and logger, and generation
   *   settings. `concurrencyLimit` defaults to 5 when omitted.
   */
  constructor(options: KloDescriptionGeneratorOptions) {
    this.llmProvider = options.llmProvider;
    this.cache = options.cache;
    this.logger = options.logger;
    this.settings = {
      columnMaxWords: options.settings.columnMaxWords,
      tableMaxWords: options.settings.tableMaxWords,
      dataSourceMaxWords: options.settings.dataSourceMaxWords,
      // Only set the key when a temperature was given, so the resolved settings
      // never carry an explicit `temperature: undefined`.
      ...(options.settings.temperature !== undefined ? { temperature: options.settings.temperature } : {}),
      concurrencyLimit: options.settings.concurrencyLimit ?? 5,
    };
  }

  /**
   * Describes every column of `input.table`, running up to
   * `settings.concurrencyLimit` columns at a time.
   *
   * @param input - Connection, connector, table, and skip/existing-description options.
   * @returns One `[name, description]` pair per column, plus the names of the
   *   columns that were freshly processed and of those that were skipped
   *   (existing or cached description reused).
   */
  async generateColumnDescriptions(input: KloGenerateColumnDescriptionsInput): Promise<KloColumnAnalysisResult> {
    const columnsToProcess = input.table.columns;
    const tableContext = `Table: ${input.table.name} | Columns: ${columnsToProcess.map((column) => column.name).join(', ')} | Data source: ${input.dataSourceType}`;

    const results = await runWithConcurrency(columnsToProcess, this.settings.concurrencyLimit, async (column) =>
      this.generateOneColumnDescription(input, column, tableContext),
    );

    const columnDescriptions: Array<[string, string | null]> = [];
    const processedColumns: string[] = [];
    const skippedColumns: string[] = [];

    for (const result of results) {
      columnDescriptions.push([result.columnName, result.description]);
      if (result.skipped) {
        skippedColumns.push(result.columnName);
      } else if (result.processed) {
        processedColumns.push(result.columnName);
      }
    }

    return {
      columnDescriptions,
      processedColumns,
      skippedColumns,
    };
  }

  /**
   * Describes one table from a 20-row sample.
   *
   * @param input - Connection, connector, table reference, and raw descriptions.
   * @returns The cached or generated description. Returns `'Table not found'`
   *   when the connector cannot sample tables or when sampling throws.
   */
  async generateTableDescription(input: KloGenerateTableDescriptionInput): Promise<string> {
    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildTableKey(tableRef);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    if (!input.connector.sampleTable) {
      this.logger?.warn('KLO scan connector does not support table sampling for table description generation', {
        connectorId: input.connector.id,
        table: input.table.name,
      });
      return 'Table not found';
    }

    try {
      const sampleData = await input.connector.sampleTable(
        {
          connectionId: input.connectionId,
          table: tableRef,
          limit: 20,
        },
        input.context,
      );
      const prompt = buildKloTableDescriptionPrompt({
        tableName: input.table.name,
        sampleData,
        dataSourceType: input.dataSourceType,
        rawDescriptions: input.table.rawDescriptions,
      });
      const outcome = await this.generateAiDescription(
        prompt,
        this.settings.tableMaxWords,
        'klo-table-description',
      );
      // Failure placeholders are returned to the caller but must not be cached.
      if (cacheKey && outcome.generated) {
        await this.cache?.set(cacheKey, outcome.text);
      }
      return outcome.text;
    } catch (error) {
      this.logger?.error(`Error generating table description: ${errorMessage(error)}`);
      return 'Table not found';
    }
  }

  /**
   * Describes the data source from 5-row samples of its first ten tables.
   *
   * The cache is only consulted and written when `input.connectionName` is set.
   * Tables whose sampling throws are logged and left out of the prompt.
   *
   * @param input - Connection, connector, table list, and optional connection name.
   * @returns The cached or generated description, or a fixed fallback sentence
   *   when there are no tables, none can be sampled, or prompt generation throws.
   */
  async generateDataSourceDescription(input: KloGenerateDataSourceDescriptionInput): Promise<string> {
    if (input.tables.length === 0) {
      return 'No tables found in database';
    }

    const cacheKey = input.connectionName ? this.cache?.buildConnectionKey(input.connectionName) : undefined;
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    if (!input.connector.sampleTable) {
      this.logger?.warn('KLO scan connector does not support table sampling for data-source description generation', {
        connectorId: input.connector.id,
      });
      return 'No accessible tables found in database';
    }

    const tablesToAnalyze = input.tables.slice(0, 10);
    const tableSamples = await runWithConcurrency(tablesToAnalyze, this.settings.concurrencyLimit, async (table) => {
      try {
        // Presence of `sampleTable` was checked above; the narrowing does not
        // carry into this closure, hence the assertion.
        const sampleData = await input.connector.sampleTable!(
          {
            connectionId: input.connectionId,
            table: toTableRef(table),
            limit: 5,
          },
          input.context,
        );
        return [table.name, sampleData] as [string, KloTableSampleResult];
      } catch (error) {
        this.logger?.warn(`Failed to sample table '${table.name}' for data source analysis - ${errorMessage(error)}`);
        return null;
      }
    });

    const accessibleSamples = tableSamples.filter(
      (sample): sample is [string, KloTableSampleResult] => sample !== null,
    );
    if (accessibleSamples.length === 0) {
      return 'No accessible tables found in database';
    }

    try {
      const prompt = buildKloDataSourceDescriptionPrompt({
        tableSamples: accessibleSamples,
        dataSourceType: input.dataSourceType,
      });
      const outcome = await this.generateAiDescription(
        prompt,
        this.settings.dataSourceMaxWords,
        'klo-data-source-description',
      );
      // Failure placeholders are returned to the caller but must not be cached.
      if (cacheKey && outcome.generated) {
        await this.cache?.set(cacheKey, outcome.text);
      }
      return outcome.text;
    } catch (error) {
      this.logger?.error(`Error generating data source description: ${errorMessage(error)}`);
      return 'Failed to generate data source description';
    }
  }

  /**
   * Describes a single column, reusing an existing or cached description when
   * possible and sampling up to 50 values through the connector when the column
   * carries no sample values of its own.
   *
   * @returns A task result; `description` is `null` when the column cannot be
   *   sampled or has no non-null values, and an error sentence when a step throws.
   */
  private async generateOneColumnDescription(
    input: KloGenerateColumnDescriptionsInput,
    column: KloDescriptionColumn,
    tableContext: string,
  ): Promise<ColumnTaskResult> {
    const existingDescription = input.existingDescriptions?.[column.name];
    if (input.skipExisting && existingDescription) {
      return {
        columnName: column.name,
        description: existingDescription,
        skipped: true,
        processed: false,
      };
    }

    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildColumnKey(tableRef, column.name);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return {
          columnName: column.name,
          description: cached,
          skipped: true,
          processed: false,
        };
      }
    }

    try {
      let columnValues = column.sampleValues;
      if (!columnValues || columnValues.length === 0) {
        if (!input.connector.sampleColumn) {
          this.logger?.warn('KLO scan connector does not support column sampling for column description generation', {
            connectorId: input.connector.id,
            table: input.table.name,
            column: column.name,
          });
          return {
            columnName: column.name,
            description: null,
            skipped: false,
            processed: false,
          };
        }

        const sample = await input.connector.sampleColumn(
          {
            connectionId: input.connectionId,
            table: tableRef,
            column: column.name,
            limit: 50,
          },
          input.context,
        );
        columnValues = sample.values;
      }

      const nonNullValues = (columnValues ?? []).filter((value) => value !== null && value !== undefined);
      if (nonNullValues.length === 0) {
        return {
          columnName: column.name,
          description: null,
          skipped: false,
          processed: false,
        };
      }

      const prompt = buildKloColumnDescriptionPrompt({
        columnName: column.name,
        columnValues: nonNullValues,
        tableContext,
        dataSourceType: input.dataSourceType,
        supportsNestedAnalysis: input.supportsNestedAnalysis,
        rawDescriptions: column.rawDescriptions,
      });
      const outcome = await this.generateAiDescription(
        prompt,
        this.settings.columnMaxWords,
        'klo-column-description',
      );

      // Failure placeholders are returned to the caller but must not be cached.
      if (cacheKey && outcome.generated) {
        await this.cache?.set(cacheKey, outcome.text);
      }

      return {
        columnName: column.name,
        description: outcome.text,
        skipped: false,
        processed: true,
      };
    } catch (error) {
      this.logger?.error(`Error analyzing column '${column.name}': ${errorMessage(error)}`);
      return {
        columnName: column.name,
        description: `Error generating description: ${errorMessage(error)}`,
        skipped: false,
        processed: false,
      };
    }
  }

  /**
   * Sends the prompt, with the word-limit instruction appended, to the LLM.
   *
   * Never throws: provider errors are logged and turned into placeholder text.
   *
   * @param prompt - Prompt body without the word-limit instruction.
   * @param maxWords - Word budget appended to the prompt.
   * @param _operationName - Label for the operation; currently unused.
   * @returns The trimmed model text with `generated: true`, or a placeholder
   *   sentence with `generated: false` when the call failed or returned nothing.
   */
  private async generateAiDescription(
    prompt: string,
    maxWords: number,
    _operationName: string,
  ): Promise<{ text: string; generated: boolean }> {
    try {
      const text = await generateKloText({
        llmProvider: this.llmProvider,
        role: 'candidateExtraction',
        prompt: appendKloWordLimitInstruction(prompt, maxWords),
        temperature: this.settings.temperature,
      });
      const description = text.trim();
      if (!description) {
        return { text: 'Failed to generate description', generated: false };
      }
      return { text: description, generated: true };
    } catch (error) {
      this.logger?.error(`Error generating AI description: ${errorMessage(error)}`);
      return { text: `Error generating description: ${errorMessage(error)}`, generated: false };
    }
  }
}
|
||||
47
packages/context/src/scan/embedding-text.test.ts
Normal file
47
packages/context/src/scan/embedding-text.test.ts
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { buildKloColumnEmbeddingText } from './embedding-text.js';
|
||||
|
||||
describe('KLO scan embedding text', () => {
|
||||
it('builds column embedding text with table, description, FK, and sample-value context', () => {
|
||||
expect(
|
||||
buildKloColumnEmbeddingText({
|
||||
tableName: 'orders',
|
||||
columnName: 'status',
|
||||
columnType: 'varchar',
|
||||
resolvedDescription: 'Payment lifecycle state',
|
||||
sampleValues: ['paid', 'refunded', 'pending'],
|
||||
resolvedTableDescription: 'Customer orders',
|
||||
foreignKeys: {
|
||||
outgoing: [{ toTable: 'customers', toColumn: 'id' }],
|
||||
incoming: [{ fromTable: 'refunds', fromColumn: 'order_status' }],
|
||||
},
|
||||
maxSampleValues: 2,
|
||||
}),
|
||||
).toBe(
|
||||
'orders.status (varchar). Table: Customer orders. Payment lifecycle state. FK -> customers.id. FK <- refunds.order_status. Values: paid, refunded',
|
||||
);
|
||||
});
|
||||
|
||||
it('omits optional sections when the scan has no enrichment context yet', () => {
|
||||
expect(
|
||||
buildKloColumnEmbeddingText({
|
||||
tableName: 'orders',
|
||||
columnName: 'id',
|
||||
columnType: 'integer',
|
||||
resolvedDescription: null,
|
||||
}),
|
||||
).toBe('orders.id (integer)');
|
||||
});
|
||||
|
||||
it('keeps all available sample values when no explicit max is supplied', () => {
|
||||
expect(
|
||||
buildKloColumnEmbeddingText({
|
||||
tableName: 'orders',
|
||||
columnName: 'status',
|
||||
columnType: 'varchar',
|
||||
resolvedDescription: null,
|
||||
sampleValues: ['paid', 'refunded'],
|
||||
}),
|
||||
).toBe('orders.status (varchar). Values: paid, refunded');
|
||||
});
|
||||
});
|
||||
45
packages/context/src/scan/embedding-text.ts
Normal file
45
packages/context/src/scan/embedding-text.ts
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
export interface KloColumnEmbeddingForeignKeys {
|
||||
outgoing: Array<{ toTable: string; toColumn: string }>;
|
||||
incoming: Array<{ fromTable: string; fromColumn: string }>;
|
||||
}
|
||||
|
||||
export interface KloColumnEmbeddingTextInput {
|
||||
tableName: string;
|
||||
columnName: string;
|
||||
columnType: string;
|
||||
resolvedDescription: string | null;
|
||||
sampleValues?: readonly string[] | null;
|
||||
resolvedTableDescription?: string | null;
|
||||
foreignKeys?: KloColumnEmbeddingForeignKeys | null;
|
||||
maxSampleValues?: number;
|
||||
}
|
||||
|
||||
/**
 * Composes the text that is embedded for a column.
 *
 * Segments are emitted in a fixed order and joined with `'. '`: the qualified
 * column with its type, the table description, the column description, one
 * entry per outgoing then incoming foreign key, and finally the sample values.
 * Empty or missing segments are left out.
 *
 * @param input - Column identity plus optional descriptions, foreign keys, and
 *   sample values. At most `maxSampleValues` (20 when omitted) values are listed.
 * @returns The joined embedding text.
 */
export function buildKloColumnEmbeddingText(input: KloColumnEmbeddingTextInput): string {
  const { foreignKeys, sampleValues } = input;
  const segments = [`${input.tableName}.${input.columnName} (${input.columnType})`];

  if (input.resolvedTableDescription) {
    segments.push(`Table: ${input.resolvedTableDescription}`);
  }
  if (input.resolvedDescription) {
    segments.push(input.resolvedDescription);
  }
  if (foreignKeys) {
    segments.push(
      ...foreignKeys.outgoing.map((link) => `FK -> ${link.toTable}.${link.toColumn}`),
      ...foreignKeys.incoming.map((link) => `FK <- ${link.fromTable}.${link.fromColumn}`),
    );
  }
  if (sampleValues && sampleValues.length > 0) {
    const shown = sampleValues.slice(0, input.maxSampleValues ?? 20);
    segments.push(`Values: ${shown.join(', ')}`);
  }

  return segments.join('. ');
}
|
||||
175
packages/context/src/scan/enrichment-state.test.ts
Normal file
175
packages/context/src/scan/enrichment-state.test.ts
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import {
|
||||
completedKloScanEnrichmentStateSummary,
|
||||
computeKloScanEnrichmentInputHash,
|
||||
summarizeKloScanEnrichmentState,
|
||||
} from './enrichment-state.js';
|
||||
import { SqliteLocalScanEnrichmentStateStore } from './sqlite-local-enrichment-state-store.js';
|
||||
import type { KloSchemaSnapshot } from './types.js';
|
||||
|
||||
const snapshot: KloSchemaSnapshot = {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
extractedAt: '2026-04-29T12:00:00.000Z',
|
||||
scope: { schemas: ['public'] },
|
||||
metadata: {},
|
||||
tables: [
|
||||
{
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'orders',
|
||||
kind: 'table',
|
||||
comment: null,
|
||||
estimatedRows: 1,
|
||||
foreignKeys: [],
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: true,
|
||||
comment: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
describe('scan enrichment state', () => {
|
||||
let tempDir: string;
|
||||
let store: SqliteLocalScanEnrichmentStateStore;
|
||||
|
||||
beforeEach(async () => {
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'klo-scan-enrichment-state-'));
|
||||
store = new SqliteLocalScanEnrichmentStateStore({ dbPath: join(tempDir, 'db.sqlite') });
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('computes stable input hashes without depending on object key order', () => {
|
||||
const first = computeKloScanEnrichmentInputHash({
|
||||
snapshot,
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
|
||||
});
|
||||
const second = computeKloScanEnrichmentInputHash({
|
||||
snapshot: { ...snapshot, metadata: {} },
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
providerIdentity: { llmModel: 'a', embeddingDimensions: 8, provider: 'deterministic' },
|
||||
});
|
||||
const firstTable = snapshot.tables[0];
|
||||
if (!firstTable) {
|
||||
throw new Error('Expected test snapshot table');
|
||||
}
|
||||
const changed = computeKloScanEnrichmentInputHash({
|
||||
snapshot: { ...snapshot, tables: [{ ...firstTable, name: 'orders_v2' }] },
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8, llmModel: 'a' },
|
||||
});
|
||||
|
||||
expect(first).toMatch(/^[a-f0-9]{64}$/);
|
||||
expect(second).toBe(first);
|
||||
expect(changed).not.toBe(first);
|
||||
});
|
||||
|
||||
it('persists completed stages and ignores stale hashes', async () => {
|
||||
const inputHash = computeKloScanEnrichmentInputHash({
|
||||
snapshot,
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 8 },
|
||||
});
|
||||
|
||||
await store.saveCompletedStage({
|
||||
runId: 'scan-run-1',
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-1',
|
||||
mode: 'enriched',
|
||||
stage: 'descriptions',
|
||||
inputHash,
|
||||
output: [{ table: { catalog: null, db: 'public', name: 'orders' }, tableDescription: 'Orders' }],
|
||||
updatedAt: '2026-04-29T12:01:00.000Z',
|
||||
});
|
||||
|
||||
await expect(
|
||||
store.findCompletedStage({
|
||||
runId: 'scan-run-1',
|
||||
stage: 'descriptions',
|
||||
inputHash,
|
||||
}),
|
||||
).resolves.toMatchObject({
|
||||
runId: 'scan-run-1',
|
||||
stage: 'descriptions',
|
||||
status: 'completed',
|
||||
output: [{ table: { catalog: null, db: 'public', name: 'orders' }, tableDescription: 'Orders' }],
|
||||
});
|
||||
|
||||
await expect(
|
||||
store.findCompletedStage({
|
||||
runId: 'scan-run-1',
|
||||
stage: 'descriptions',
|
||||
inputHash: 'different-hash',
|
||||
}),
|
||||
).resolves.toBeNull();
|
||||
});
|
||||
|
||||
it('records failed stages without making them reusable', async () => {
|
||||
await store.saveFailedStage({
|
||||
runId: 'scan-run-2',
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-2',
|
||||
mode: 'enriched',
|
||||
stage: 'embeddings',
|
||||
inputHash: 'hash-2',
|
||||
errorMessage: 'embedding service timed out',
|
||||
updatedAt: '2026-04-29T12:02:00.000Z',
|
||||
});
|
||||
|
||||
await expect(
|
||||
store.findCompletedStage({
|
||||
runId: 'scan-run-2',
|
||||
stage: 'embeddings',
|
||||
inputHash: 'hash-2',
|
||||
}),
|
||||
).resolves.toBeNull();
|
||||
|
||||
await expect(store.listRunStages('scan-run-2')).resolves.toEqual([
|
||||
expect.objectContaining({
|
||||
runId: 'scan-run-2',
|
||||
stage: 'embeddings',
|
||||
status: 'failed',
|
||||
errorMessage: 'embedding service timed out',
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
||||
it('summarizes resumed, completed, and failed stages for reports', () => {
|
||||
expect(
|
||||
summarizeKloScanEnrichmentState({
|
||||
resumedStages: ['descriptions'],
|
||||
completedStages: ['descriptions', 'embeddings'],
|
||||
failedStages: ['relationships'],
|
||||
}),
|
||||
).toEqual({
|
||||
resumedStages: ['descriptions'],
|
||||
completedStages: ['descriptions', 'embeddings'],
|
||||
failedStages: ['relationships'],
|
||||
});
|
||||
|
||||
expect(completedKloScanEnrichmentStateSummary()).toEqual({
|
||||
resumedStages: [],
|
||||
completedStages: [],
|
||||
failedStages: [],
|
||||
});
|
||||
});
|
||||
});
|
||||
108
packages/context/src/scan/enrichment-state.ts
Normal file
108
packages/context/src/scan/enrichment-state.ts
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import type { KloScanEnrichmentStage, KloScanEnrichmentStateSummary, KloScanMode, KloSchemaSnapshot } from './types.js';
|
||||
|
||||
export const KLO_SCAN_ENRICHMENT_STAGES: readonly KloScanEnrichmentStage[] = [
|
||||
'descriptions',
|
||||
'embeddings',
|
||||
'relationships',
|
||||
] as const;
|
||||
|
||||
export interface KloScanEnrichmentStageLookup {
|
||||
runId: string;
|
||||
stage: KloScanEnrichmentStage;
|
||||
inputHash: string;
|
||||
}
|
||||
|
||||
export interface KloScanEnrichmentCompletedStage<TOutput = unknown> {
|
||||
runId: string;
|
||||
connectionId: string;
|
||||
syncId: string;
|
||||
mode: KloScanMode;
|
||||
stage: KloScanEnrichmentStage;
|
||||
inputHash: string;
|
||||
status: 'completed';
|
||||
output: TOutput;
|
||||
errorMessage: null;
|
||||
updatedAt: string;
|
||||
}
|
||||
|
||||
export interface KloScanEnrichmentFailedStage {
|
||||
runId: string;
|
||||
connectionId: string;
|
||||
syncId: string;
|
||||
mode: KloScanMode;
|
||||
stage: KloScanEnrichmentStage;
|
||||
inputHash: string;
|
||||
status: 'failed';
|
||||
output: null;
|
||||
errorMessage: string;
|
||||
updatedAt: string;
|
||||
}
|
||||
|
||||
export type KloScanEnrichmentStageRecord<TOutput = unknown> =
|
||||
| KloScanEnrichmentCompletedStage<TOutput>
|
||||
| KloScanEnrichmentFailedStage;
|
||||
|
||||
export interface KloScanEnrichmentStateStore {
|
||||
findCompletedStage<TOutput = unknown>(
|
||||
input: KloScanEnrichmentStageLookup,
|
||||
): Promise<KloScanEnrichmentCompletedStage<TOutput> | null>;
|
||||
saveCompletedStage<TOutput = unknown>(
|
||||
input: Omit<KloScanEnrichmentCompletedStage<TOutput>, 'status' | 'errorMessage'>,
|
||||
): Promise<void>;
|
||||
saveFailedStage(input: Omit<KloScanEnrichmentFailedStage, 'status' | 'output'>): Promise<void>;
|
||||
listRunStages(runId: string): Promise<KloScanEnrichmentStageRecord[]>;
|
||||
}
|
||||
|
||||
export interface ComputeKloScanEnrichmentInputHashInput {
|
||||
snapshot: KloSchemaSnapshot;
|
||||
mode: KloScanMode;
|
||||
detectRelationships: boolean;
|
||||
providerIdentity: Record<string, unknown>;
|
||||
relationshipSettings?: unknown;
|
||||
}
|
||||
|
||||
/**
 * Serializes a value to JSON with a deterministic key order, for hashing.
 *
 * Follows `JSON.stringify` semantics so equivalent inputs always produce the
 * same text:
 * - object keys are sorted by UTF-16 code unit, not by locale collation, so the
 *   output does not depend on the runtime's locale;
 * - object properties whose value is `undefined` are omitted, so an explicit
 *   `undefined` hashes the same as an absent property;
 * - `undefined` (and other non-serializable values) inside arrays become `null`;
 * - values exposing `toJSON` (for example `Date`) are serialized through it.
 *
 * @param value - Any JSON-like value.
 * @returns Canonical JSON text; `'null'` for a top-level non-serializable value.
 */
function stableJson(value: unknown): string {
  if (Array.isArray(value)) {
    return `[${value.map((item) => stableJson(item)).join(',')}]`;
  }
  if (value && typeof value === 'object') {
    const withToJson = value as { toJSON?: unknown };
    if (typeof withToJson.toJSON === 'function') {
      return stableJson((withToJson.toJSON as () => unknown).call(value));
    }
    const entries = Object.entries(value as Record<string, unknown>)
      .filter(([, item]) => item !== undefined)
      .sort(([left], [right]) => (left < right ? -1 : left > right ? 1 : 0));
    return `{${entries.map(([key, item]) => `${JSON.stringify(key)}:${stableJson(item)}`).join(',')}}`;
  }
  // JSON.stringify yields undefined for undefined, functions, and symbols.
  const primitive: string | undefined = JSON.stringify(value);
  return primitive ?? 'null';
}
|
||||
|
||||
/**
 * Fingerprints the inputs of an enrichment run.
 *
 * @param input - Snapshot, mode, relationship flag, provider identity, and
 *   optional relationship settings.
 * @returns The hex-encoded SHA-256 digest of the input's `stableJson` text.
 */
export function computeKloScanEnrichmentInputHash(input: ComputeKloScanEnrichmentInputHashInput): string {
  const canonical = stableJson(input);
  const hasher = createHash('sha256');
  hasher.update(canonical);
  return hasher.digest('hex');
}
|
||||
|
||||
/**
 * Deduplicates stages and puts them in the order of
 * `KLO_SCAN_ENRICHMENT_STAGES`; values not in that list are dropped.
 *
 * @param stages - Stages in any order, possibly repeated.
 * @returns Each listed stage at most once, in canonical order.
 */
function uniqueStages(stages: KloScanEnrichmentStage[]): KloScanEnrichmentStage[] {
  const requested = new Set<KloScanEnrichmentStage>(stages);
  return KLO_SCAN_ENRICHMENT_STAGES.filter((stage) => requested.has(stage));
}
|
||||
|
||||
/**
 * Creates a summary with no resumed, completed, or failed stages.
 *
 * @returns A new summary object holding three new empty arrays.
 */
export function completedKloScanEnrichmentStateSummary(): KloScanEnrichmentStateSummary {
  const summary: KloScanEnrichmentStateSummary = {
    resumedStages: [],
    completedStages: [],
    failedStages: [],
  };
  return summary;
}
|
||||
|
||||
/**
 * Normalizes a stage summary for reporting by passing each list through
 * `uniqueStages`.
 *
 * @param input - Stage lists as collected during a run.
 * @returns A new summary with each list normalized.
 */
export function summarizeKloScanEnrichmentState(input: KloScanEnrichmentStateSummary): KloScanEnrichmentStateSummary {
  const resumedStages = uniqueStages(input.resumedStages);
  const completedStages = uniqueStages(input.completedStages);
  const failedStages = uniqueStages(input.failedStages);
  return { resumedStages, completedStages, failedStages };
}
|
||||
42
packages/context/src/scan/enrichment-summary.test.ts
Normal file
42
packages/context/src/scan/enrichment-summary.test.ts
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
failedKloScanEnrichmentSummary,
|
||||
kloScanErrorMessage,
|
||||
skippedKloScanEnrichmentSummary,
|
||||
} from './enrichment-summary.js';
|
||||
|
||||
// Covers the failure summaries produced per scan mode and the rendering of thrown values.
describe('KLO scan enrichment summaries', () => {
  it('keeps structural scans skipped when no enrichment was requested', () => {
    const summary = failedKloScanEnrichmentSummary('structural', false);

    expect(summary).toEqual(skippedKloScanEnrichmentSummary);
  });

  it('marks relationship stages failed when relationship detection fails', () => {
    const summary = failedKloScanEnrichmentSummary('relationships', true);

    // Only the deterministic and statistical relationship stages flip to failed.
    expect(summary).toEqual({
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'failed',
    });
  });

  it('marks every enriched-only stage failed when full enrichment fails', () => {
    const summary = failedKloScanEnrichmentSummary('enriched', true);

    expect(summary).toEqual({
      dataDictionary: 'failed',
      tableDescriptions: 'failed',
      columnDescriptions: 'failed',
      embeddings: 'failed',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'failed',
      statisticalValidation: 'failed',
    });
  });

  it('formats unknown thrown values for scan warnings', () => {
    const thrownError = new Error('gateway timeout');
    const thrownObject = { code: 'E_SCAN' };

    expect(kloScanErrorMessage(thrownError)).toBe('gateway timeout');
    expect(kloScanErrorMessage('plain failure')).toBe('plain failure');
    expect(kloScanErrorMessage(thrownObject)).toBe('{"code":"E_SCAN"}');
  });
});
|
||||
52
packages/context/src/scan/enrichment-summary.ts
Normal file
52
packages/context/src/scan/enrichment-summary.ts
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import type { KloScanEnrichmentSummary, KloScanMode } from './types.js';
|
||||
|
||||
/**
 * Summary in which every enrichment stage is marked `'skipped'`.
 *
 * The object is exported and shared between callers; treat it as read-only.
 */
export const skippedKloScanEnrichmentSummary: KloScanEnrichmentSummary = {
  dataDictionary: 'skipped',
  tableDescriptions: 'skipped',
  columnDescriptions: 'skipped',
  embeddings: 'skipped',
  deterministicRelationships: 'skipped',
  llmRelationshipValidation: 'skipped',
  statisticalValidation: 'skipped',
};

/**
 * Builds the enrichment summary to report for a scan that failed.
 *
 * - `'enriched'` mode: every stage is `'failed'`.
 * - `'relationships'` mode, or any other mode with `detectRelationships` set:
 *   `deterministicRelationships` and `statisticalValidation` are `'failed'`,
 *   everything else is `'skipped'`.
 * - otherwise: every stage is `'skipped'`.
 *
 * @param mode - scan mode that was running
 * @param detectRelationships - whether relationship detection was requested (default `false`)
 * @returns a freshly allocated summary; never the shared
 *   `skippedKloScanEnrichmentSummary` object itself
 */
export function failedKloScanEnrichmentSummary(
  mode: KloScanMode,
  detectRelationships = false,
): KloScanEnrichmentSummary {
  if (mode === 'enriched') {
    return {
      dataDictionary: 'failed',
      tableDescriptions: 'failed',
      columnDescriptions: 'failed',
      embeddings: 'failed',
      deterministicRelationships: 'failed',
      llmRelationshipValidation: 'failed',
      statisticalValidation: 'failed',
    };
  }

  if (mode === 'relationships' || detectRelationships) {
    return {
      ...skippedKloScanEnrichmentSummary,
      deterministicRelationships: 'failed',
      statisticalValidation: 'failed',
    };
  }

  // Return a copy: handing out the shared constant would let a caller that
  // adjusts its result silently change the constant for everyone else.
  return { ...skippedKloScanEnrichmentSummary };
}
|
||||
|
||||
/**
 * Renders an arbitrary thrown value as text.
 *
 * - `Error` instances yield their `message`.
 * - Strings are returned unchanged.
 * - Other values are serialized with `JSON.stringify`; when that throws or
 *   produces no text, `String(error)` is used instead.
 *
 * @param error - the caught value, of any type
 * @returns a string in every case handled above
 */
export function kloScanErrorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === 'string') {
    return error;
  }

  try {
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols, so its result has to be checked before it is returned.
    const serialized: string | undefined = JSON.stringify(error);
    if (serialized !== undefined) {
      return serialized;
    }
  } catch {
    // Circular structures and bigint values make JSON.stringify throw; fall through.
  }

  return String(error);
}
|
||||
159
packages/context/src/scan/enrichment-types.test.ts
Normal file
159
packages/context/src/scan/enrichment-types.test.ts
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type {
|
||||
KloColumnSampleUpdate,
|
||||
KloDescriptionUpdate,
|
||||
KloEmbeddingUpdate,
|
||||
KloEnrichedSchema,
|
||||
KloJoinUpdate,
|
||||
KloRelationshipEndpoint,
|
||||
KloRelationshipUpdate,
|
||||
KloScanMetadataStore,
|
||||
KloStructuralSyncPlan,
|
||||
} from './enrichment-types.js';
|
||||
|
||||
// Type-level contract checks: each literal must be assignable to the declared
// enrichment type, and a minimal in-memory store must satisfy KloScanMetadataStore.
describe('KLO scan enrichment contracts', () => {
  it('models an enriched schema with reusable table, column, and relationship metadata', () => {
    const statusColumn: KloEnrichedSchema['tables'][number]['columns'][number] = {
      id: 'column-orders-status',
      tableId: 'table-orders',
      tableRef: { catalog: 'analytics', db: 'public', name: 'orders' },
      name: 'status',
      nativeType: 'varchar',
      normalizedType: 'string',
      dimensionType: 'string',
      nullable: false,
      primaryKey: false,
      parentColumnId: null,
      descriptions: { db: 'Status code' },
      embedding: [0.1, 0.2],
      sampleValues: ['paid', 'refunded'],
      cardinality: 2,
    };

    const ordersTable: KloEnrichedSchema['tables'][number] = {
      id: 'table-orders',
      ref: { catalog: 'analytics', db: 'public', name: 'orders' },
      enabled: true,
      descriptions: { db: 'Raw orders', ai: 'Customer orders' },
      columns: [statusColumn],
    };

    const ordersToCustomers: KloEnrichedSchema['relationships'][number] = {
      id: 'rel-orders-customers',
      source: 'formal',
      from: {
        tableId: 'table-orders',
        columnIds: ['column-orders-customer-id'],
        table: { catalog: 'analytics', db: 'public', name: 'orders' },
        columns: ['customer_id'],
      },
      to: {
        tableId: 'table-customers',
        columnIds: ['column-customers-id'],
        table: { catalog: 'analytics', db: 'public', name: 'customers' },
        columns: ['id'],
      },
      relationshipType: 'many_to_one',
      confidence: 1,
      isPrimaryKeyReference: true,
    };

    const schema: KloEnrichedSchema = {
      connectionId: 'warehouse',
      tables: [ordersTable],
      relationships: [ordersToCustomers],
    };

    expect(schema.tables[0].columns[0].sampleValues).toEqual(['paid', 'refunded']);
    expect(schema.relationships[0].source).toBe('formal');
  });

  it('models metadata-store updates without requiring a concrete store implementation', async () => {
    const syncPlan: KloStructuralSyncPlan = {
      connectionId: 'warehouse',
      snapshotId: 'snapshot-1',
      operations: [{ kind: 'create_table', table: 'orders' }],
    };
    const descriptions: KloDescriptionUpdate = {
      connectionId: 'warehouse',
      table: { catalog: 'analytics', db: 'public', name: 'orders' },
      source: 'ai',
      tableDescription: 'Customer orders',
      columnDescriptions: { status: 'Payment lifecycle state' },
    };
    const samples: KloColumnSampleUpdate = {
      columnId: 'column-orders-status',
      sampleValues: ['paid', 'refunded'],
      cardinality: 2,
    };
    const embeddings: KloEmbeddingUpdate = {
      columnId: 'column-orders-status',
      text: 'orders.status (varchar). Values: paid, refunded',
      embedding: [0.25, 0.75],
    };
    const relationships: KloRelationshipUpdate = {
      connectionId: 'warehouse',
      accepted: [],
      rejected: [],
      skipped: [{ reason: 'missing parent table', relationshipId: 'candidate-1' }],
    };

    // Each update method asserts that it received exactly the payload passed below.
    const store: KloScanMetadataStore = {
      loadSchema: async () => null,
      applyStructuralPlan: async (received) => ({
        connectionId: received.connectionId,
        tables: [],
        relationships: [],
      }),
      updateDescriptions: async (received) => {
        expect(received).toEqual(descriptions);
      },
      updateColumnSamples: async (received) => {
        expect(received).toEqual([samples]);
      },
      updateColumnEmbeddings: async (received) => {
        expect(received).toEqual([embeddings]);
      },
      updateInferredRelationships: async (received) => {
        expect(received).toEqual(relationships);
      },
    };

    await expect(store.loadSchema('warehouse')).resolves.toBeNull();
    await expect(store.applyStructuralPlan(syncPlan)).resolves.toEqual({
      connectionId: 'warehouse',
      tables: [],
      relationships: [],
    });
    await expect(store.updateDescriptions(descriptions)).resolves.toBeUndefined();
    await expect(store.updateColumnSamples([samples])).resolves.toBeUndefined();
    await expect(store.updateColumnEmbeddings([embeddings])).resolves.toBeUndefined();
    await expect(store.updateInferredRelationships(relationships)).resolves.toBeUndefined();
  });
});
|
||||
|
||||
// Multi-column relationships: endpoints and join updates carry their columns as ordered lists.
describe('relationship tuple contracts', () => {
  it('represents relationship endpoints and join updates as ordered column tuples', () => {
    const orderLinesEndpoint: KloRelationshipEndpoint = {
      tableId: 'public.order_lines',
      columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
      table: { catalog: null, db: 'public', name: 'order_lines' },
      columns: ['order_id', 'line_number'],
    };
    const joinUpdate: KloJoinUpdate = {
      connectionId: 'warehouse',
      fromTable: 'order_line_allocations',
      fromColumns: ['order_id', 'line_number'],
      toTable: 'order_lines',
      toColumns: ['order_id', 'line_number'],
      relationship: 'many_to_one',
      author: 'klo',
      authorEmail: 'klo@example.com',
    };

    expect(orderLinesEndpoint.columns).toEqual(['order_id', 'line_number']);
    expect(orderLinesEndpoint.columnIds).toEqual(['public.order_lines.order_id', 'public.order_lines.line_number']);
    expect(joinUpdate.fromColumns).toEqual(['order_id', 'line_number']);
    expect(joinUpdate.toColumns).toEqual(['order_id', 'line_number']);
  });
});
|
||||
130
packages/context/src/scan/enrichment-types.ts
Normal file
130
packages/context/src/scan/enrichment-types.ts
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
import type { KloSchemaDimensionType, KloTableRef } from './types.js';
|
||||
|
||||
/**
 * Origin label of a description. The listed literals are the known sources;
 * the `(string & {})` member additionally admits any other string while
 * keeping the literals visible to editor completion.
 */
export type KloDescriptionSource = 'ai' | 'db' | 'dbt' | 'user' | (string & {});

/** How a relationship came to be known. */
export type KloRelationshipSource = 'formal' | 'inferred' | 'manual';

/** Cardinality of a relationship, read from the `from` side to the `to` side. */
export type KloRelationshipType = 'many_to_one' | 'one_to_many' | 'one_to_one';
|
||||
|
||||
/** A column together with the metadata attached to it by enrichment. */
export interface KloEnrichedColumn {
  /** Identifier of this column. */
  id: string;
  /** Identifier of the table the column belongs to. */
  tableId: string;
  /** Reference (catalog / db / name) of that table. */
  tableRef: KloTableRef;
  /** Column name. */
  name: string;
  /** Type name as reported by the source. */
  nativeType: string;
  /** Normalized type name. */
  normalizedType: string;
  dimensionType: KloSchemaDimensionType;
  nullable: boolean;
  primaryKey: boolean;
  /** Identifier of a parent column, or `null` when there is none. */
  parentColumnId: string | null;
  /** Description text keyed by its source; any source may be absent. */
  descriptions: Partial<Record<KloDescriptionSource, string>>;
  /** Embedding vector, or `null` when none is stored. */
  embedding: number[] | null;
  /** Sampled values, or `null` when none are stored. */
  sampleValues: string[] | null;
  /** NOTE(review): presumably the distinct-value count — confirm with the producer. */
  cardinality: number | null;
}
|
||||
|
||||
/** A table together with its enriched columns. */
export interface KloEnrichedTable {
  /** Identifier of this table. */
  id: string;
  /** Reference (catalog / db / name) of the table. */
  ref: KloTableRef;
  enabled: boolean;
  /** Description text keyed by its source; any source may be absent. */
  descriptions: Partial<Record<KloDescriptionSource, string>>;
  columns: KloEnrichedColumn[];
}
|
||||
|
||||
/**
 * One side of a relationship: a table plus a list of its columns, given both
 * by identifier (`columnIds`) and by name (`columns`).
 */
export interface KloRelationshipEndpoint {
  tableId: string;
  /** Column identifiers; more than one entry describes a multi-column endpoint. */
  columnIds: string[];
  table: KloTableRef;
  /** Column names; presumably index-aligned with `columnIds` — TODO confirm. */
  columns: string[];
}
|
||||
|
||||
/** A relationship between two endpoints, with its origin and scoring metadata. */
export interface KloEnrichedRelationship {
  /** Identifier of this relationship. */
  id: string;
  /** How the relationship came to be known. */
  source: KloRelationshipSource;
  from: KloRelationshipEndpoint;
  to: KloRelationshipEndpoint;
  relationshipType: KloRelationshipType;
  /** NOTE(review): numeric score; the valid range is not fixed by this type — confirm. */
  confidence: number;
  isPrimaryKeyReference: boolean;
}
|
||||
|
||||
/** Enriched tables and relationships recorded for one connection. */
export interface KloEnrichedSchema {
  /** Identifier of the connection the schema belongs to. */
  connectionId: string;
  tables: KloEnrichedTable[];
  relationships: KloEnrichedRelationship[];
}
|
||||
|
||||
/** A list of structural operations to apply for one connection and snapshot. */
export interface KloStructuralSyncPlan {
  connectionId: string;
  snapshotId: string;
  /** Operation records; their shape is left open by this type. */
  operations: Array<Record<string, unknown>>;
}
|
||||
|
||||
/** Description changes for one table and, optionally, some of its columns. */
export interface KloDescriptionUpdate {
  connectionId: string;
  /** Table the descriptions apply to. */
  table: KloTableRef;
  /** Source the descriptions are attributed to. */
  source: KloDescriptionSource;
  /** New table description; omitted when no table description is supplied. */
  tableDescription?: string;
  /** Column descriptions keyed by string; a value may be a string or `null`. */
  columnDescriptions?: Record<string, string | null>;
}
|
||||
|
||||
/**
 * Metadata field names given as a fixed, readonly tuple of string literals.
 *
 * Exported because nothing inside this module references the list; as a
 * module-private constant it was unreachable and is reported by
 * `noUnusedLocals`.
 */
export const PREFERRED_METADATA_FIELD_NAMES = [
  'tags',
  'constraints',
  'enum_values',
  'freshness',
  'tests',
  'lineage',
] as const;
|
||||
|
||||
/** Free-form metadata fields for one table and, optionally, some of its columns. */
export interface KloMetadataUpdate {
  connectionId: string;
  /** Table the fields apply to. */
  table: KloTableRef;
  /** Source the fields are attributed to. */
  source: KloDescriptionSource;
  /** Table-level fields keyed by field name. */
  tableFields?: Record<string, unknown>;
  /** Two-level map; presumably column name → field name → value — TODO confirm. */
  columnFields?: Record<string, Record<string, unknown>>;
}
|
||||
|
||||
/** A join between two tables, each side given as a table name plus a column list. */
export interface KloJoinUpdate {
  connectionId: string;
  fromTable: string;
  /** Columns on the `from` side. */
  fromColumns: string[];
  toTable: string;
  /** Columns on the `to` side; presumably index-aligned with `fromColumns` — TODO confirm. */
  toColumns: string[];
  relationship: KloRelationshipType;
  /** Name recorded as the author of the update. */
  author: string;
  /** Email address recorded for the author. */
  authorEmail: string;
}
|
||||
|
||||
/** Sample values and cardinality to store for one column. */
export interface KloColumnSampleUpdate {
  /** Identifier of the column being updated. */
  columnId: string;
  /** Sampled values, or `null`. */
  sampleValues: string[] | null;
  /** Cardinality figure, or `null`. */
  cardinality: number | null;
}
|
||||
|
||||
/** An embedding vector to store for one column, together with a text string. */
export interface KloEmbeddingUpdate {
  /** Identifier of the column being updated. */
  columnId: string;
  /** NOTE(review): presumably the text the vector was computed from — confirm. */
  text: string;
  /** The embedding vector. */
  embedding: number[];
}
|
||||
|
||||
/** A relationship that was skipped, with the reason given for skipping it. */
export interface KloSkippedRelationship {
  /** Identifier of the skipped relationship. */
  relationshipId: string;
  /** Explanation text for the skip. */
  reason: string;
}
|
||||
|
||||
/** Relationship results for one connection, split into accepted, rejected, and skipped. */
export interface KloRelationshipUpdate {
  connectionId: string;
  accepted: KloEnrichedRelationship[];
  rejected: KloEnrichedRelationship[];
  /** Skipped entries carry only an identifier and a reason. */
  skipped: KloSkippedRelationship[];
}
|
||||
|
||||
/**
 * Asynchronous storage contract for scan metadata. Every method returns a
 * promise; the update methods resolve with no value.
 */
export interface KloScanMetadataStore {
  /** Resolves to the schema for `connectionId`, or `null`. */
  loadSchema(connectionId: string): Promise<KloEnrichedSchema | null>;
  /** Takes a structural plan and resolves to an enriched schema. */
  applyStructuralPlan(plan: KloStructuralSyncPlan): Promise<KloEnrichedSchema>;
  /** Takes description changes for one table. */
  updateDescriptions(input: KloDescriptionUpdate): Promise<void>;
  /** Takes a batch of per-column sample updates. */
  updateColumnSamples(input: KloColumnSampleUpdate[]): Promise<void>;
  /** Takes a batch of per-column embedding updates. */
  updateColumnEmbeddings(input: KloEmbeddingUpdate[]): Promise<void>;
  /** Takes the accepted / rejected / skipped relationship results for one connection. */
  updateInferredRelationships(input: KloRelationshipUpdate): Promise<void>;
}
|
||||
400
packages/context/src/scan/index.ts
Normal file
400
packages/context/src/scan/index.ts
Normal file
|
|
@ -0,0 +1,400 @@
|
|||
export {
|
||||
REDACTED_KLO_CREDENTIAL_VALUE,
|
||||
redactKloCredentialEnvelope,
|
||||
redactKloCredentialValue,
|
||||
redactKloScanMetadata,
|
||||
redactKloScanReport,
|
||||
redactKloScanWarning,
|
||||
} from './credentials.js';
|
||||
export type {
|
||||
KloDataDictionaryColumnState,
|
||||
KloDataDictionarySampleDecision,
|
||||
KloDataDictionarySettings,
|
||||
KloDataDictionarySkipReason,
|
||||
} from './data-dictionary.js';
|
||||
export {
|
||||
defaultKloDataDictionarySettings,
|
||||
isKloDataDictionaryCandidate,
|
||||
shouldKloSampleColumnForDictionary,
|
||||
} from './data-dictionary.js';
|
||||
export type {
|
||||
KloColumnAnalysisResult,
|
||||
KloColumnDescriptionPromptInput,
|
||||
KloDataSourceDescriptionPromptInput,
|
||||
KloDescriptionCachePort,
|
||||
KloDescriptionColumn,
|
||||
KloDescriptionColumnTable,
|
||||
KloDescriptionGenerationSettings,
|
||||
KloDescriptionGeneratorOptions,
|
||||
KloDescriptionSamplingPort,
|
||||
KloDescriptionTableInput,
|
||||
KloGenerateColumnDescriptionsInput,
|
||||
KloGenerateDataSourceDescriptionInput,
|
||||
KloGenerateTableDescriptionInput,
|
||||
KloTableDescriptionPromptInput,
|
||||
} from './description-generation.js';
|
||||
export {
|
||||
appendKloWordLimitInstruction,
|
||||
buildKloColumnDescriptionPrompt,
|
||||
buildKloDataSourceDescriptionPrompt,
|
||||
buildKloTableDescriptionPrompt,
|
||||
KloDescriptionGenerator,
|
||||
} from './description-generation.js';
|
||||
export type { KloColumnEmbeddingForeignKeys, KloColumnEmbeddingTextInput } from './embedding-text.js';
|
||||
export { buildKloColumnEmbeddingText } from './embedding-text.js';
|
||||
export type {
|
||||
ComputeKloScanEnrichmentInputHashInput,
|
||||
KloScanEnrichmentCompletedStage,
|
||||
KloScanEnrichmentFailedStage,
|
||||
KloScanEnrichmentStageLookup,
|
||||
KloScanEnrichmentStageRecord,
|
||||
KloScanEnrichmentStateStore,
|
||||
} from './enrichment-state.js';
|
||||
export {
|
||||
completedKloScanEnrichmentStateSummary,
|
||||
computeKloScanEnrichmentInputHash,
|
||||
KLO_SCAN_ENRICHMENT_STAGES,
|
||||
summarizeKloScanEnrichmentState,
|
||||
} from './enrichment-state.js';
|
||||
export {
|
||||
failedKloScanEnrichmentSummary,
|
||||
kloScanErrorMessage,
|
||||
skippedKloScanEnrichmentSummary,
|
||||
} from './enrichment-summary.js';
|
||||
export type {
|
||||
KloColumnSampleUpdate,
|
||||
KloDescriptionSource,
|
||||
KloDescriptionUpdate,
|
||||
KloEmbeddingUpdate,
|
||||
KloEnrichedColumn,
|
||||
KloEnrichedRelationship,
|
||||
KloEnrichedSchema,
|
||||
KloEnrichedTable,
|
||||
KloRelationshipEndpoint,
|
||||
KloRelationshipSource,
|
||||
KloRelationshipType,
|
||||
KloRelationshipUpdate,
|
||||
KloScanMetadataStore,
|
||||
KloSkippedRelationship,
|
||||
KloStructuralSyncPlan,
|
||||
} from './enrichment-types.js';
|
||||
export type {
|
||||
DeterministicLocalScanEnrichmentProviderOptions,
|
||||
KloLocalScanEnrichmentInput,
|
||||
KloLocalScanEnrichmentProviders,
|
||||
KloLocalScanEnrichmentResult,
|
||||
} from './local-enrichment.js';
|
||||
export {
|
||||
createDeterministicLocalScanEnrichmentProviders,
|
||||
runLocalScanEnrichment,
|
||||
snapshotToKloEnrichedSchema,
|
||||
} from './local-enrichment.js';
|
||||
export type {
|
||||
WriteLocalScanEnrichmentArtifactsInput,
|
||||
WriteLocalScanEnrichmentArtifactsResult,
|
||||
WriteLocalScanManifestShardsInput,
|
||||
WriteLocalScanManifestShardsResult,
|
||||
} from './local-enrichment-artifacts.js';
|
||||
export {
|
||||
writeLocalScanEnrichmentArtifacts,
|
||||
writeLocalScanManifestShards,
|
||||
} from './local-enrichment-artifacts.js';
|
||||
export type {
|
||||
LocalScanMcpOptions,
|
||||
LocalScanRunResult,
|
||||
LocalScanStatusResponse,
|
||||
RunLocalScanOptions,
|
||||
} from './local-scan.js';
|
||||
export { getLocalScanReport, getLocalScanStatus, runLocalScan } from './local-scan.js';
|
||||
export type { ReadLocalScanStructuralSnapshotInput } from './local-structural-artifacts.js';
|
||||
export { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
|
||||
export type {
|
||||
KloEnrichmentScanPhaseResult,
|
||||
KloScanOrchestratorOptions,
|
||||
KloScanOrchestratorRunInput,
|
||||
KloScanOrchestratorRunResult,
|
||||
KloStructuralScanPhaseResult,
|
||||
} from './orchestrator.js';
|
||||
export { KloScanOrchestrator } from './orchestrator.js';
|
||||
export type {
|
||||
KloRelationshipArtifactStatus,
|
||||
ReadLocalScanRelationshipArtifactsResult,
|
||||
} from './relationship-artifacts.js';
|
||||
export { readLocalScanRelationshipArtifacts } from './relationship-artifacts.js';
|
||||
export type {
|
||||
KloRelationshipBenchmarkReport,
|
||||
KloRelationshipBenchmarkReportCase,
|
||||
KloRelationshipBenchmarkReportCaseStatus,
|
||||
} from './relationship-benchmark-report.js';
|
||||
export {
|
||||
buildKloRelationshipBenchmarkReport,
|
||||
formatKloRelationshipBenchmarkReportMarkdown,
|
||||
} from './relationship-benchmark-report.js';
|
||||
export type {
|
||||
KloRelationshipBenchmarkCaseResult,
|
||||
KloRelationshipBenchmarkDetectedLink,
|
||||
KloRelationshipBenchmarkDetectedPk,
|
||||
KloRelationshipBenchmarkDetector,
|
||||
KloRelationshipBenchmarkDetectorInput,
|
||||
KloRelationshipBenchmarkDetectorResult,
|
||||
KloRelationshipBenchmarkExpectedLink,
|
||||
KloRelationshipBenchmarkExpectedLinks,
|
||||
KloRelationshipBenchmarkExpectedPk,
|
||||
KloRelationshipBenchmarkFixture,
|
||||
KloRelationshipBenchmarkMetrics,
|
||||
KloRelationshipBenchmarkMode,
|
||||
KloRelationshipBenchmarkStatus,
|
||||
KloRelationshipBenchmarkSuiteResult,
|
||||
KloRelationshipBenchmarkTier,
|
||||
} from './relationship-benchmarks.js';
|
||||
export {
|
||||
currentKloRelationshipBenchmarkDetector,
|
||||
kloRelationshipBenchmarkDetectorWithLlm,
|
||||
KLO_RELATIONSHIP_BENCHMARK_MODES,
|
||||
KLO_RELATIONSHIP_BENCHMARK_TIERS,
|
||||
loadKloRelationshipBenchmarkFixture,
|
||||
loadKloRelationshipBenchmarkFixtures,
|
||||
maskKloRelationshipBenchmarkSnapshot,
|
||||
runKloRelationshipBenchmarkCase,
|
||||
runKloRelationshipBenchmarkSuite,
|
||||
} from './relationship-benchmarks.js';
|
||||
export type {
|
||||
ApplyKloRelationshipValidationBudgetInput,
|
||||
KloRelationshipBudgetedCandidate,
|
||||
KloRelationshipValidationBudget,
|
||||
KloRelationshipValidationBudgetResult,
|
||||
} from './relationship-budget.js';
|
||||
export {
|
||||
applyKloRelationshipValidationBudget,
|
||||
defaultKloRelationshipValidationBudget,
|
||||
} from './relationship-budget.js';
|
||||
export type {
|
||||
KloRelationshipDiscoveryCandidate,
|
||||
KloRelationshipDiscoveryCandidateEvidence,
|
||||
KloRelationshipDiscoveryCandidateOptions,
|
||||
KloRelationshipDiscoveryCandidateSource,
|
||||
KloRelationshipDiscoveryCandidateStatus,
|
||||
KloRelationshipInferredTargetPk,
|
||||
} from './relationship-candidates.js';
|
||||
export {
|
||||
generateKloRelationshipDiscoveryCandidates,
|
||||
inferKloRelationshipTargetPks,
|
||||
mergeKloRelationshipDiscoveryCandidates,
|
||||
} from './relationship-candidates.js';
|
||||
export type {
|
||||
DiscoverKloCompositeRelationshipsInput,
|
||||
DiscoverKloCompositeRelationshipsResult,
|
||||
KloCompositePrimaryKeyCandidate,
|
||||
KloCompositeRelationshipCandidate,
|
||||
KloCompositeRelationshipStatus,
|
||||
KloCompositeRelationshipTupleEndpoint,
|
||||
KloCompositeRelationshipValidationEvidence,
|
||||
} from './relationship-composite-candidates.js';
|
||||
export { discoverKloCompositeRelationships } from './relationship-composite-candidates.js';
|
||||
export type {
|
||||
BuildKloRelationshipArtifactsInput,
|
||||
BuildKloRelationshipDiagnosticsInput,
|
||||
EmptyKloRelationshipProfileArtifactInput,
|
||||
KloRelationshipArtifact,
|
||||
KloRelationshipArtifactEdge,
|
||||
KloRelationshipArtifactEndpoint,
|
||||
KloRelationshipDiagnosticsArtifact,
|
||||
KloRelationshipDiagnosticsSummary,
|
||||
KloRelationshipDiagnosticsThresholds,
|
||||
KloRelationshipDiagnosticsValidation,
|
||||
} from './relationship-diagnostics.js';
|
||||
export {
|
||||
buildKloRelationshipArtifacts,
|
||||
buildKloRelationshipDiagnostics,
|
||||
emptyKloRelationshipProfileArtifact,
|
||||
} from './relationship-diagnostics.js';
|
||||
export type {
|
||||
BuildKloRelationshipFeedbackCalibrationReportInput,
|
||||
CalibrateLocalRelationshipFeedbackLabelsInput,
|
||||
KloRelationshipFeedbackCalibrationBucket,
|
||||
KloRelationshipFeedbackCalibrationLabel,
|
||||
KloRelationshipFeedbackCalibrationReport,
|
||||
} from './relationship-feedback-calibration.js';
|
||||
export {
|
||||
buildKloRelationshipFeedbackCalibrationReport,
|
||||
calibrateLocalRelationshipFeedbackLabels,
|
||||
formatKloRelationshipFeedbackCalibrationMarkdown,
|
||||
} from './relationship-feedback-calibration.js';
|
||||
export type {
|
||||
ExportLocalRelationshipFeedbackLabelsInput,
|
||||
ExportLocalRelationshipFeedbackLabelsResult,
|
||||
KloRelationshipFeedbackDecisionFilter,
|
||||
KloRelationshipFeedbackExportWarning,
|
||||
KloRelationshipFeedbackLabel,
|
||||
} from './relationship-feedback-export.js';
|
||||
export {
|
||||
exportLocalRelationshipFeedbackLabels,
|
||||
formatKloRelationshipFeedbackLabelsJsonl,
|
||||
} from './relationship-feedback-export.js';
|
||||
export {
|
||||
collectKloFormalMetadataRelationships,
|
||||
type KloFormalMetadataRelationshipCollection,
|
||||
} from './relationship-formal-metadata.js';
|
||||
export type {
|
||||
KloRelationshipGraphResolutionResult,
|
||||
KloRelationshipGraphResolverSettings,
|
||||
KloResolvedRelationshipDiscoveryCandidate,
|
||||
KloResolvedRelationshipGraphEvidence,
|
||||
KloResolvedRelationshipPk,
|
||||
KloResolvedRelationshipPkEvidence,
|
||||
KloResolvedRelationshipStatus,
|
||||
ResolveKloRelationshipGraphInput,
|
||||
} from './relationship-graph-resolver.js';
|
||||
export { resolveKloRelationshipGraph } from './relationship-graph-resolver.js';
|
||||
export type {
|
||||
KloRelationshipLlmProposalGenerateText,
|
||||
KloRelationshipLlmProposalResult,
|
||||
KloRelationshipLlmProposalSettings,
|
||||
ProposeKloRelationshipCandidatesWithLlmInput,
|
||||
} from './relationship-llm-proposal.js';
|
||||
export { proposeKloRelationshipCandidatesWithLlm } from './relationship-llm-proposal.js';
|
||||
export type {
|
||||
KloRelationshipLocalityCandidateTable,
|
||||
LocalKloRelationshipCandidateTablesInput,
|
||||
} from './relationship-locality.js';
|
||||
export { localCandidateTables } from './relationship-locality.js';
|
||||
export type {
|
||||
KloRelationshipNormalizedName,
|
||||
KloRelationshipTokenInput,
|
||||
} from './relationship-name-similarity.js';
|
||||
export {
|
||||
normalizeKloRelationshipName,
|
||||
pluralizeKloRelationshipToken,
|
||||
singularizeKloRelationshipToken,
|
||||
tokenizeKloRelationshipName,
|
||||
tokenSimilarity,
|
||||
} from './relationship-name-similarity.js';
|
||||
export type {
|
||||
DiscoverKloRelationshipsInput,
|
||||
DiscoverKloRelationshipsResult,
|
||||
} from './relationship-discovery.js';
|
||||
export { discoverKloRelationships } from './relationship-discovery.js';
|
||||
export type {
|
||||
KloRelationshipColumnProfile,
|
||||
KloRelationshipProfileArtifact,
|
||||
KloRelationshipReadOnlyExecutor,
|
||||
KloRelationshipTableProfile,
|
||||
ProfileKloRelationshipSchemaInput,
|
||||
} from './relationship-profiling.js';
|
||||
export {
|
||||
formatKloRelationshipTableRef,
|
||||
profileKloRelationshipSchema,
|
||||
quoteKloRelationshipIdentifier,
|
||||
} from './relationship-profiling.js';
|
||||
export type {
|
||||
AppliedRelationshipReviewDecision,
|
||||
ApplyLocalScanRelationshipReviewDecisionsInput,
|
||||
ApplyLocalScanRelationshipReviewDecisionsResult,
|
||||
} from './relationship-review-apply.js';
|
||||
export { applyLocalScanRelationshipReviewDecisions } from './relationship-review-apply.js';
|
||||
export type {
|
||||
KloRelationshipReviewDecisionArtifact,
|
||||
KloRelationshipReviewDecisionEntry,
|
||||
KloRelationshipReviewDecisionValue,
|
||||
WriteLocalScanRelationshipReviewDecisionInput,
|
||||
WriteLocalScanRelationshipReviewDecisionResult,
|
||||
} from './relationship-review-decisions.js';
|
||||
export { writeLocalScanRelationshipReviewDecision } from './relationship-review-decisions.js';
|
||||
export type {
|
||||
KloRelationshipFixtureOrigin,
|
||||
KloRelationshipScoreBreakdown,
|
||||
KloRelationshipScoreSignal,
|
||||
KloRelationshipScoreWeights,
|
||||
KloRelationshipScoringCalibrationObservation,
|
||||
KloRelationshipSignalVector,
|
||||
} from './relationship-scoring.js';
|
||||
export {
|
||||
calibrateWeightsFromSyntheticFixtures,
|
||||
defaultKloRelationshipScoreWeights,
|
||||
KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS,
|
||||
normalizeKloRelationshipScoreWeights,
|
||||
scoreKloRelationshipCandidate,
|
||||
} from './relationship-scoring.js';
|
||||
export type {
|
||||
AdviseLocalRelationshipFeedbackThresholdsInput,
|
||||
BuildKloRelationshipThresholdAdviceReportInput,
|
||||
KloRelationshipThresholdAdviceCandidate,
|
||||
KloRelationshipThresholdAdviceReport,
|
||||
KloRelationshipThresholdAdviceStatus,
|
||||
} from './relationship-threshold-advice.js';
|
||||
export {
|
||||
adviseLocalRelationshipFeedbackThresholds,
|
||||
buildKloRelationshipThresholdAdviceReport,
|
||||
formatKloRelationshipThresholdAdviceMarkdown,
|
||||
} from './relationship-threshold-advice.js';
|
||||
export type {
|
||||
KloRelationshipValidationEvidence,
|
||||
KloRelationshipValidationSettings,
|
||||
KloValidatedRelationshipDiscoveryCandidate,
|
||||
KloValidatedRelationshipStatus,
|
||||
ValidateKloRelationshipDiscoveryCandidatesInput,
|
||||
} from './relationship-validation.js';
|
||||
export { validateKloRelationshipDiscoveryCandidates } from './relationship-validation.js';
|
||||
export type { SqliteLocalScanEnrichmentStateStoreOptions } from './sqlite-local-enrichment-state-store.js';
|
||||
export { SqliteLocalScanEnrichmentStateStore } from './sqlite-local-enrichment-state-store.js';
|
||||
export type { KloColumnTypeMapping } from './type-normalization.js';
|
||||
export {
|
||||
inferKloDimensionType,
|
||||
kloColumnTypeMappingFromNative,
|
||||
normalizeKloNativeType,
|
||||
} from './type-normalization.js';
|
||||
export type {
|
||||
KloColumnSampleInput,
|
||||
KloColumnSampleResult,
|
||||
KloColumnStatsInput,
|
||||
KloColumnStatsResult,
|
||||
KloConnectionDriver,
|
||||
KloConnectorCapabilities,
|
||||
KloCredentialEnvelope,
|
||||
KloCredentialEnvReference,
|
||||
KloCredentialFileReference,
|
||||
KloEmbeddingPort,
|
||||
KloEventPropertyDiscovery,
|
||||
KloEventPropertyDiscoveryInput,
|
||||
KloEventPropertyValuesInput,
|
||||
KloEventPropertyValuesResult,
|
||||
KloEventStreamDiscoveryPort,
|
||||
KloEventTypeDiscovery,
|
||||
KloEventTypeDiscoveryInput,
|
||||
KloNetworkEndpoint,
|
||||
KloNetworkTunnelPort,
|
||||
KloNetworkTunnelRequest,
|
||||
KloOptionalConnectorCapabilities,
|
||||
KloProgressPort,
|
||||
KloProgressUpdateOptions,
|
||||
KloQueryResult,
|
||||
KloReadOnlyQueryInput,
|
||||
KloResolvedCredentialEnvelope,
|
||||
KloScanArtifactPaths,
|
||||
KloScanConnector,
|
||||
KloScanContext,
|
||||
KloScanDiffSummary,
|
||||
KloScanEnrichmentStage,
|
||||
KloScanEnrichmentStateSummary,
|
||||
KloScanEnrichmentSummary,
|
||||
KloScanInput,
|
||||
KloScanLoggerPort,
|
||||
KloScanMode,
|
||||
KloScanRelationshipSummary,
|
||||
KloScanReport,
|
||||
KloScanTrigger,
|
||||
KloScanWarning,
|
||||
KloScanWarningCode,
|
||||
KloSchemaColumn,
|
||||
KloSchemaDimensionType,
|
||||
KloSchemaForeignKey,
|
||||
KloSchemaScope,
|
||||
KloSchemaSnapshot,
|
||||
KloSchemaTable,
|
||||
KloSchemaTableKind,
|
||||
KloStructuralSyncStats,
|
||||
KloTableRef,
|
||||
KloTableSampleInput,
|
||||
KloTableSampleResult,
|
||||
} from './types.js';
|
||||
export { createKloConnectorCapabilities } from './types.js';
|
||||
852
packages/context/src/scan/local-enrichment-artifacts.test.ts
Normal file
852
packages/context/src/scan/local-enrichment-artifacts.test.ts
Normal file
|
|
@ -0,0 +1,852 @@
|
|||
import { mkdtemp, readFile, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import YAML from 'yaml';
|
||||
import { initKloProject, type KloLocalProject } from '../project/index.js';
|
||||
import type { KloLocalScanEnrichmentResult } from './local-enrichment.js';
|
||||
import { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards } from './local-enrichment-artifacts.js';
|
||||
import type { KloSchemaSnapshot } from './types.js';
|
||||
|
||||
// Two-table fixture in the `public` schema: `customers`, and `orders` with a
// foreign key from `orders.customer_id` to `customers.id`.
const snapshot: KloSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-04-29T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [
    // customers: a single primary-key column and no foreign keys.
    {
      catalog: null,
      db: 'public',
      name: 'customers',
      kind: 'table',
      comment: 'DB customer table',
      estimatedRows: 2,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'DB customer id',
        },
      ],
    },
    // orders: primary key `id` plus `customer_id`, which references customers.id.
    {
      catalog: null,
      db: 'public',
      name: 'orders',
      kind: 'table',
      comment: 'DB orders table',
      estimatedRows: 3,
      foreignKeys: [
        {
          fromColumn: 'customer_id',
          toCatalog: null,
          toDb: 'public',
          toTable: 'customers',
          toColumn: 'id',
          constraintName: 'orders_customer_id_fkey',
        },
      ],
      columns: [
        {
          name: 'id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: true,
          comment: 'DB order id',
        },
        {
          name: 'customer_id',
          nativeType: 'integer',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: false,
          comment: 'DB customer id',
        },
      ],
    },
  ],
};
|
||||
|
||||
function enrichment(): KloLocalScanEnrichmentResult {
|
||||
return {
|
||||
snapshot,
|
||||
summary: {
|
||||
dataDictionary: 'completed',
|
||||
tableDescriptions: 'completed',
|
||||
columnDescriptions: 'completed',
|
||||
embeddings: 'completed',
|
||||
deterministicRelationships: 'completed',
|
||||
llmRelationshipValidation: 'skipped',
|
||||
statisticalValidation: 'skipped',
|
||||
},
|
||||
relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
|
||||
state: {
|
||||
resumedStages: [],
|
||||
completedStages: ['descriptions', 'embeddings', 'relationships'],
|
||||
failedStages: [],
|
||||
},
|
||||
warnings: [],
|
||||
descriptionUpdates: [
|
||||
{
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
tableDescription: 'AI orders table',
|
||||
columnDescriptions: {
|
||||
id: 'AI order id',
|
||||
customer_id: 'AI customer reference',
|
||||
},
|
||||
},
|
||||
{
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
tableDescription: 'AI customers table',
|
||||
columnDescriptions: {
|
||||
id: 'AI customer id',
|
||||
},
|
||||
},
|
||||
],
|
||||
embeddingUpdates: [
|
||||
{ columnId: 'public.orders.id', text: 'orders id', embedding: [0.1, 0.2] },
|
||||
{ columnId: 'public.orders.customer_id', text: 'orders customer_id', embedding: [0.3, 0.4] },
|
||||
],
|
||||
relationshipUpdate: {
|
||||
connectionId: 'warehouse',
|
||||
accepted: [
|
||||
{
|
||||
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
|
||||
source: 'inferred',
|
||||
from: {
|
||||
tableId: 'public.orders',
|
||||
columnIds: ['public.orders.customer_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['customer_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.customers',
|
||||
columnIds: ['public.customers.id'],
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 0.95,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
],
|
||||
rejected: [],
|
||||
skipped: [],
|
||||
},
|
||||
relationshipProfile: {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
sqlAvailable: true,
|
||||
queryCount: 6,
|
||||
tables: [{ table: { catalog: null, db: 'public', name: 'customers' }, rowCount: 2 }],
|
||||
columns: {
|
||||
'customers.id': {
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
column: 'id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 2,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['1', '2'],
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
},
|
||||
},
|
||||
warnings: [],
|
||||
},
|
||||
resolvedRelationships: [
|
||||
{
|
||||
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
|
||||
source: 'llm_proposal',
|
||||
status: 'accepted',
|
||||
from: {
|
||||
tableId: 'public.orders',
|
||||
columnIds: ['public.orders.customer_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['customer_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.customers',
|
||||
columnIds: ['public.customers.id'],
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 0.92,
|
||||
pkScore: 0.95,
|
||||
fkScore: 0.91,
|
||||
score: 0.9,
|
||||
evidence: {
|
||||
sourceColumnBase: 'buyer',
|
||||
targetTableBase: 'customer',
|
||||
targetColumnBase: 'id',
|
||||
targetKeyScore: 0.88,
|
||||
nameScore: 0.45,
|
||||
reasons: ['llm_proposal', 'llm_pk_proposal'],
|
||||
llmConfidence: 0.89,
|
||||
llmRationale: 'Buyer reference values align with customer identifiers.',
|
||||
},
|
||||
validation: {
|
||||
targetUniqueness: 1,
|
||||
sourceCoverage: 1,
|
||||
violationCount: 0,
|
||||
violationRatio: 0,
|
||||
sourceNullRate: 0,
|
||||
targetNullRate: 0,
|
||||
childDistinct: 2,
|
||||
parentDistinct: 2,
|
||||
overlap: 2,
|
||||
checkedValues: 2,
|
||||
reasons: ['validation_passed'],
|
||||
},
|
||||
graph: {
|
||||
targetPkScore: 0.95,
|
||||
incomingCandidateCount: 1,
|
||||
conflictRank: 1,
|
||||
reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'],
|
||||
},
|
||||
},
|
||||
],
|
||||
compositeRelationships: null,
|
||||
};
|
||||
}
|
||||
|
||||
describe('writeLocalScanEnrichmentArtifacts', () => {
|
||||
let tempDir: string;
|
||||
let project: KloLocalProject;
|
||||
|
||||
beforeEach(async () => {
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'klo-local-enrichment-artifacts-'));
|
||||
project = await initKloProject({
|
||||
projectDir: join(tempDir, 'project'),
|
||||
projectName: 'warehouse',
|
||||
});
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('writes enrichment artifacts and manifest shards while preserving external descriptions', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
YAML.stringify(
|
||||
{
|
||||
tables: {
|
||||
orders: {
|
||||
table: 'public.orders',
|
||||
descriptions: { user: 'Pinned analyst description', ai: 'Old AI description' },
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
type: 'number',
|
||||
descriptions: { user: 'Pinned id description', ai: 'Old AI id' },
|
||||
},
|
||||
{ name: 'customer_id', type: 'number' },
|
||||
],
|
||||
joins: [
|
||||
{
|
||||
to: 'customers',
|
||||
on: 'orders.id = customers.id',
|
||||
relationship: 'many_to_one',
|
||||
source: 'manual',
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
{ indent: 2, lineWidth: 0 },
|
||||
),
|
||||
'klo',
|
||||
'klo@example.com',
|
||||
'Seed manifest shard',
|
||||
);
|
||||
|
||||
const result = await writeLocalScanEnrichmentArtifacts({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-1',
|
||||
driver: 'postgres',
|
||||
enrichment: enrichment(),
|
||||
dryRun: false,
|
||||
relationshipSettings: {
|
||||
enabled: true,
|
||||
llmProposals: false,
|
||||
validationRequiredForManifest: true,
|
||||
acceptThreshold: 0.91,
|
||||
reviewThreshold: 0.61,
|
||||
maxLlmTablesPerBatch: 12,
|
||||
maxCandidatesPerColumn: 7,
|
||||
profileSampleRows: 500,
|
||||
validationConcurrency: 2,
|
||||
},
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
enrichmentArtifacts: [
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json',
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/embeddings.json',
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json',
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json',
|
||||
],
|
||||
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
|
||||
manifestShardsWritten: 1,
|
||||
});
|
||||
|
||||
await expect(
|
||||
readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/descriptions.json'),
|
||||
'utf-8',
|
||||
),
|
||||
).resolves.toContain('AI orders table');
|
||||
|
||||
const relationshipsRaw = await readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationships.json'),
|
||||
'utf-8',
|
||||
);
|
||||
const relationshipsArtifact = JSON.parse(relationshipsRaw) as {
|
||||
accepted: Array<{
|
||||
id: string;
|
||||
status: string;
|
||||
source: string;
|
||||
pkScore: number;
|
||||
fkScore: number;
|
||||
evidence: unknown;
|
||||
reasons: string[];
|
||||
validation: unknown;
|
||||
graph: unknown;
|
||||
}>;
|
||||
review: unknown[];
|
||||
rejected: unknown[];
|
||||
skipped: unknown[];
|
||||
};
|
||||
expect(relationshipsArtifact.accepted).toHaveLength(1);
|
||||
expect(relationshipsArtifact.accepted[0]).toMatchObject({
|
||||
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
|
||||
status: 'accepted',
|
||||
source: 'llm_proposal',
|
||||
pkScore: 0.95,
|
||||
fkScore: 0.91,
|
||||
evidence: expect.objectContaining({
|
||||
llmConfidence: 0.89,
|
||||
llmRationale: 'Buyer reference values align with customer identifiers.',
|
||||
}),
|
||||
reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
|
||||
validation: expect.objectContaining({ reasons: ['validation_passed'] }),
|
||||
graph: expect.objectContaining({ reasons: ['target_pk_score_passed', 'validation_passed', 'fk_score_passed'] }),
|
||||
});
|
||||
expect(relationshipsArtifact.review).toEqual([]);
|
||||
expect(relationshipsArtifact.rejected).toEqual([]);
|
||||
expect(relationshipsArtifact.skipped).toEqual([]);
|
||||
|
||||
const profileRaw = await readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json'),
|
||||
'utf-8',
|
||||
);
|
||||
expect(JSON.parse(profileRaw)).toMatchObject({
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
sqlAvailable: true,
|
||||
queryCount: 6,
|
||||
warnings: [],
|
||||
});
|
||||
|
||||
const diagnosticsRaw = await readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-diagnostics.json'),
|
||||
'utf-8',
|
||||
);
|
||||
expect(JSON.parse(diagnosticsRaw)).toMatchObject({
|
||||
connectionId: 'warehouse',
|
||||
summary: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
|
||||
noAcceptedReason: null,
|
||||
candidateCountsBySource: { llm_proposal: 1 },
|
||||
validation: { available: true, sqlAvailable: true, queryCount: 6 },
|
||||
thresholds: { acceptThreshold: 0.91, reviewThreshold: 0.61 },
|
||||
policy: {
|
||||
validationRequiredForManifest: true,
|
||||
maxCandidatesPerColumn: 7,
|
||||
profileSampleRows: 500,
|
||||
validationConcurrency: 2,
|
||||
},
|
||||
profileWarnings: [],
|
||||
});
|
||||
|
||||
const manifestRaw = await readFile(
|
||||
join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
|
||||
'utf-8',
|
||||
);
|
||||
const manifest = YAML.parse(manifestRaw) as {
|
||||
tables: {
|
||||
orders: {
|
||||
descriptions: Record<string, string>;
|
||||
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
|
||||
joins: Array<{ to: string; on: string; source: string }>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
expect(manifest.tables.orders.descriptions).toEqual({
|
||||
user: 'Pinned analyst description',
|
||||
db: 'DB orders table',
|
||||
ai: 'AI orders table',
|
||||
});
|
||||
expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
|
||||
user: 'Pinned id description',
|
||||
db: 'DB order id',
|
||||
ai: 'AI order id',
|
||||
});
|
||||
expect(manifest.tables.orders.joins).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.customer_id = customers.id',
|
||||
source: 'formal',
|
||||
}),
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.id = customers.id',
|
||||
source: 'manual',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('writes formal accepted relationships into relationship artifacts and manifest shards', async () => {
|
||||
const source = enrichment();
|
||||
const formalEnrichment: KloLocalScanEnrichmentResult = {
|
||||
...source,
|
||||
relationshipUpdate: {
|
||||
connectionId: 'warehouse',
|
||||
accepted: [
|
||||
{
|
||||
id: 'public.orders:public.orders.customer_id->public.customers:public.customers.id',
|
||||
source: 'formal',
|
||||
from: {
|
||||
tableId: 'public.orders',
|
||||
columnIds: ['public.orders.customer_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['customer_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.customers',
|
||||
columnIds: ['public.customers.id'],
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 1,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
],
|
||||
rejected: [],
|
||||
skipped: [],
|
||||
},
|
||||
resolvedRelationships: [],
|
||||
compositeRelationships: null,
|
||||
};
|
||||
|
||||
const result = await writeLocalScanEnrichmentArtifacts({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
driver: 'sqlite',
|
||||
syncId: 'sync-formal',
|
||||
enrichment: formalEnrichment,
|
||||
relationshipSettings: {
|
||||
enabled: true,
|
||||
llmProposals: false,
|
||||
validationRequiredForManifest: true,
|
||||
acceptThreshold: 0.85,
|
||||
reviewThreshold: 0.55,
|
||||
maxLlmTablesPerBatch: 40,
|
||||
maxCandidatesPerColumn: 25,
|
||||
profileSampleRows: 10000,
|
||||
validationConcurrency: 4,
|
||||
},
|
||||
dryRun: false,
|
||||
});
|
||||
|
||||
const relationshipsPath = 'raw-sources/warehouse/live-database/sync-formal/enrichment/relationships.json';
|
||||
const relationships = JSON.parse((await project.fileStore.readFile(relationshipsPath)).content) as {
|
||||
accepted: Array<{ source: string; reasons: string[] }>;
|
||||
};
|
||||
expect(relationships.accepted).toEqual([
|
||||
expect.objectContaining({
|
||||
source: 'formal',
|
||||
reasons: ['formal_metadata_accepted'],
|
||||
}),
|
||||
]);
|
||||
|
||||
const manifestPath = result.manifestShards[0];
|
||||
if (!manifestPath) {
|
||||
throw new Error('Expected manifest shard path');
|
||||
}
|
||||
const manifest = YAML.parse((await project.fileStore.readFile(manifestPath)).content) as {
|
||||
tables: { orders: { joins: Array<{ to: string; on: string; source: string }> } };
|
||||
};
|
||||
expect(manifest.tables.orders.joins).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.customer_id = customers.id',
|
||||
source: 'formal',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('writes manually applied relationship joins with manual source', async () => {
|
||||
const result = await writeLocalScanManifestShards({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-manual',
|
||||
driver: 'postgres',
|
||||
snapshot,
|
||||
dryRun: false,
|
||||
relationshipUpdate: {
|
||||
connectionId: 'warehouse',
|
||||
accepted: [
|
||||
{
|
||||
id: 'public.orders:(public.orders.customer_id)->public.customers:(public.customers.id)',
|
||||
source: 'manual',
|
||||
from: {
|
||||
tableId: 'public.orders',
|
||||
columnIds: ['public.orders.customer_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['customer_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.customers',
|
||||
columnIds: ['public.customers.id'],
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 1,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
],
|
||||
rejected: [],
|
||||
skipped: [],
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.manifestShardsWritten).toBe(1);
|
||||
const shard = YAML.parse(await readFile(join(tempDir, 'project/semantic-layer/warehouse/_schema/public.yaml'), 'utf8'));
|
||||
expect(shard.tables.orders.joins).toContainEqual({
|
||||
to: 'customers',
|
||||
on: 'orders.customer_id = customers.id',
|
||||
relationship: 'many_to_one',
|
||||
source: 'manual',
|
||||
});
|
||||
});
|
||||
|
||||
it('writes accepted composite relationships to relationship artifacts and manifest shards', async () => {
|
||||
const compositeSnapshot: KloSchemaSnapshot = {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
extractedAt: '2026-05-07T12:00:00.000Z',
|
||||
scope: { schemas: ['public'] },
|
||||
metadata: {},
|
||||
tables: [
|
||||
{
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'order_lines',
|
||||
kind: 'table',
|
||||
comment: null,
|
||||
estimatedRows: 2,
|
||||
foreignKeys: [],
|
||||
columns: [
|
||||
{
|
||||
name: 'order_id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: null,
|
||||
},
|
||||
{
|
||||
name: 'line_number',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'order_line_allocations',
|
||||
kind: 'table',
|
||||
comment: null,
|
||||
estimatedRows: 2,
|
||||
foreignKeys: [],
|
||||
columns: [
|
||||
{
|
||||
name: 'order_id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: null,
|
||||
},
|
||||
{
|
||||
name: 'line_number',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
};
|
||||
const compositeEnrichment: KloLocalScanEnrichmentResult = Object.assign(enrichment(), {
|
||||
snapshot: compositeSnapshot,
|
||||
relationships: { accepted: 1, review: 0, rejected: 0, skipped: 0 },
|
||||
descriptionUpdates: [],
|
||||
embeddingUpdates: [],
|
||||
relationshipUpdate: {
|
||||
connectionId: 'warehouse',
|
||||
accepted: [
|
||||
{
|
||||
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
|
||||
source: 'inferred',
|
||||
from: {
|
||||
tableId: 'public.order_line_allocations',
|
||||
columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
|
||||
table: { catalog: null, db: 'public', name: 'order_line_allocations' },
|
||||
columns: ['order_id', 'line_number'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.order_lines',
|
||||
columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
|
||||
table: { catalog: null, db: 'public', name: 'order_lines' },
|
||||
columns: ['order_id', 'line_number'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 0.95,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
],
|
||||
rejected: [],
|
||||
skipped: [],
|
||||
},
|
||||
resolvedRelationships: [],
|
||||
compositeRelationships: [
|
||||
{
|
||||
id: 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
|
||||
source: 'composite_profile_match',
|
||||
status: 'accepted',
|
||||
from: {
|
||||
tableId: 'public.order_line_allocations',
|
||||
columnIds: ['public.order_line_allocations.order_id', 'public.order_line_allocations.line_number'],
|
||||
table: { catalog: null, db: 'public', name: 'order_line_allocations' },
|
||||
columns: ['order_id', 'line_number'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'public.order_lines',
|
||||
columnIds: ['public.order_lines.order_id', 'public.order_lines.line_number'],
|
||||
table: { catalog: null, db: 'public', name: 'order_lines' },
|
||||
columns: ['order_id', 'line_number'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 0.95,
|
||||
validation: {
|
||||
targetUniqueness: 1,
|
||||
sourceCoverage: 1,
|
||||
violationCount: 0,
|
||||
violationRatio: 0,
|
||||
childDistinct: 2,
|
||||
parentDistinct: 2,
|
||||
overlap: 2,
|
||||
reasons: ['composite_validation_passed'],
|
||||
},
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const result = await writeLocalScanEnrichmentArtifacts({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
syncId: 'sync-composite',
|
||||
enrichment: compositeEnrichment,
|
||||
relationshipSettings: {
|
||||
enabled: true,
|
||||
llmProposals: false,
|
||||
validationRequiredForManifest: true,
|
||||
acceptThreshold: 0.85,
|
||||
reviewThreshold: 0.55,
|
||||
maxLlmTablesPerBatch: 40,
|
||||
maxCandidatesPerColumn: 25,
|
||||
profileSampleRows: 10000,
|
||||
validationConcurrency: 4,
|
||||
},
|
||||
dryRun: false,
|
||||
});
|
||||
|
||||
const relationships = JSON.parse(
|
||||
(await project.fileStore.readFile('raw-sources/warehouse/live-database/sync-composite/enrichment/relationships.json'))
|
||||
.content,
|
||||
) as { accepted: Array<{ from: { columns: string[] }; to: { columns: string[] }; reasons: string[] }> };
|
||||
expect(relationships.accepted[0]).toMatchObject({
|
||||
from: { columns: ['order_id', 'line_number'] },
|
||||
to: { columns: ['order_id', 'line_number'] },
|
||||
reasons: ['composite_validation_passed'],
|
||||
});
|
||||
|
||||
const manifestPath = result.manifestShards[0];
|
||||
if (!manifestPath) {
|
||||
throw new Error('Expected manifest shard path');
|
||||
}
|
||||
const manifest = YAML.parse((await project.fileStore.readFile(manifestPath)).content) as {
|
||||
tables: { order_line_allocations: { joins: Array<{ to: string; on: string; source: string }> } };
|
||||
};
|
||||
expect(manifest.tables.order_line_allocations.joins).toEqual([
|
||||
{
|
||||
to: 'order_lines',
|
||||
on: 'order_line_allocations.order_id = order_lines.order_id AND order_line_allocations.line_number = order_lines.line_number',
|
||||
relationship: 'many_to_one',
|
||||
source: 'inferred',
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('writes structural manifest shards without enrichment artifacts', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
YAML.stringify(
|
||||
{
|
||||
tables: {
|
||||
orders: {
|
||||
table: 'public.orders',
|
||||
descriptions: { user: 'Pinned structural description', ai: 'Old generated text' },
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
type: 'number',
|
||||
descriptions: { user: 'Pinned structural id', ai: 'Old generated id' },
|
||||
},
|
||||
{ name: 'customer_id', type: 'number' },
|
||||
],
|
||||
joins: [
|
||||
{
|
||||
to: 'customers',
|
||||
on: 'orders.id = customers.id',
|
||||
relationship: 'many_to_one',
|
||||
source: 'manual',
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
{ indent: 2, lineWidth: 0 },
|
||||
),
|
||||
'klo',
|
||||
'klo@example.com',
|
||||
'Seed structural manifest shard',
|
||||
);
|
||||
|
||||
const result = await writeLocalScanManifestShards({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-structural-1',
|
||||
driver: 'postgres',
|
||||
snapshot,
|
||||
dryRun: false,
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
|
||||
manifestShardsWritten: 1,
|
||||
});
|
||||
|
||||
await expect(
|
||||
readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-structural-1/enrichment/descriptions.json'),
|
||||
'utf-8',
|
||||
),
|
||||
).rejects.toMatchObject({ code: 'ENOENT' });
|
||||
|
||||
const manifestRaw = await readFile(
|
||||
join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'),
|
||||
'utf-8',
|
||||
);
|
||||
const manifest = YAML.parse(manifestRaw) as {
|
||||
tables: {
|
||||
orders: {
|
||||
descriptions: Record<string, string>;
|
||||
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
|
||||
joins: Array<{ to: string; on: string; source: string }>;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
expect(manifest.tables.orders.descriptions).toEqual({
|
||||
user: 'Pinned structural description',
|
||||
db: 'DB orders table',
|
||||
});
|
||||
expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
|
||||
user: 'Pinned structural id',
|
||||
db: 'DB order id',
|
||||
});
|
||||
expect(manifest.tables.orders.joins).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.customer_id = customers.id',
|
||||
source: 'formal',
|
||||
}),
|
||||
expect.objectContaining({
|
||||
to: 'customers',
|
||||
on: 'orders.id = customers.id',
|
||||
source: 'manual',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('returns planned empty paths without writing files during dry runs', async () => {
|
||||
const result = await writeLocalScanEnrichmentArtifacts({
|
||||
project,
|
||||
connectionId: 'warehouse',
|
||||
syncId: 'sync-dry-run',
|
||||
driver: 'postgres',
|
||||
enrichment: enrichment(),
|
||||
dryRun: true,
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
enrichmentArtifacts: [],
|
||||
manifestShards: [],
|
||||
manifestShardsWritten: 0,
|
||||
});
|
||||
await expect(
|
||||
readFile(
|
||||
join(project.projectDir, 'raw-sources/warehouse/live-database/sync-dry-run/enrichment/descriptions.json'),
|
||||
'utf-8',
|
||||
),
|
||||
).rejects.toMatchObject({ code: 'ENOENT' });
|
||||
});
|
||||
});
|
||||
417
packages/context/src/scan/local-enrichment-artifacts.ts
Normal file
417
packages/context/src/scan/local-enrichment-artifacts.ts
Normal file
|
|
@ -0,0 +1,417 @@
|
|||
import YAML from 'yaml';
|
||||
import {
|
||||
buildLiveDatabaseManifestShards,
|
||||
type LiveDatabaseManifestExistingDescriptions,
|
||||
type LiveDatabaseManifestJoinData,
|
||||
type LiveDatabaseManifestJoinEntry,
|
||||
type LiveDatabaseManifestShard,
|
||||
type LiveDatabaseManifestTableData,
|
||||
} from '../ingest/index.js';
|
||||
import type { KloScanRelationshipConfig } from '../project/config.js';
|
||||
import type { KloLocalProject } from '../project/index.js';
|
||||
import type { KloLocalScanEnrichmentResult } from './local-enrichment.js';
|
||||
import {
|
||||
buildKloRelationshipArtifacts,
|
||||
buildKloRelationshipDiagnostics,
|
||||
emptyKloRelationshipProfileArtifact,
|
||||
} from './relationship-diagnostics.js';
|
||||
import type { KloConnectionDriver, KloSchemaColumn, KloSchemaSnapshot, KloSchemaTable } from './types.js';
|
||||
|
||||
// Path segments used when building the locations this module writes to:
// LIVE_DATABASE_ADAPTER is the adapter segment of the enrichment artifact directory, and
// SL_DIR_PREFIX / SCHEMA_DIR surround the connection id in the manifest shard directory.
// NOTE(review): LOCAL_AUTHOR / LOCAL_AUTHOR_EMAIL look like the author identity handed to
// file-store writes; their use is not visible in this part of the file, so confirm.
const LIVE_DATABASE_ADAPTER = 'live-database';
|
||||
const LOCAL_AUTHOR = 'klo';
|
||||
const LOCAL_AUTHOR_EMAIL = 'klo@example.com';
|
||||
const SCHEMA_DIR = '_schema';
|
||||
const SL_DIR_PREFIX = 'semantic-layer';
|
||||
|
||||
/**
 * Arguments for writing the manifest shards of one connection from a schema snapshot.
 *
 * `descriptionUpdates` and `relationshipUpdate` are optional and reuse the shapes carried by
 * the enrichment result, so shards can be requested from a bare snapshot alone.
 * NOTE(review): `dryRun` presumably suppresses all file writes; confirm against the writer.
 */
export interface WriteLocalScanManifestShardsInput {
|
||||
project: KloLocalProject;
|
||||
connectionId: string;
|
||||
syncId: string;
|
||||
driver: KloConnectionDriver;
|
||||
snapshot: KloSchemaSnapshot;
|
||||
dryRun: boolean;
|
||||
descriptionUpdates?: KloLocalScanEnrichmentResult['descriptionUpdates'];
|
||||
relationshipUpdate?: KloLocalScanEnrichmentResult['relationshipUpdate'];
|
||||
}
|
||||
|
||||
/**
 * Outcome of a manifest shard write: the shard paths and a count of shards written.
 * NOTE(review): presumably both are empty/zero for a dry run; confirm against the writer.
 */
export interface WriteLocalScanManifestShardsResult {
|
||||
manifestShards: string[];
|
||||
manifestShardsWritten: number;
|
||||
}
|
||||
|
||||
/**
 * Arguments for writing the enrichment artifacts of one sync of a connection.
 *
 * Unlike the manifest-shard input, the schema snapshot is not passed separately; the whole
 * enrichment result is supplied instead. `relationshipSettings` is optional.
 */
export interface WriteLocalScanEnrichmentArtifactsInput {
|
||||
project: KloLocalProject;
|
||||
connectionId: string;
|
||||
syncId: string;
|
||||
driver: KloConnectionDriver;
|
||||
enrichment: KloLocalScanEnrichmentResult;
|
||||
dryRun: boolean;
|
||||
relationshipSettings?: KloScanRelationshipConfig;
|
||||
}
|
||||
|
||||
/**
 * Manifest shard result extended with the paths of the enrichment artifact files.
 */
export interface WriteLocalScanEnrichmentArtifactsResult extends WriteLocalScanManifestShardsResult {
|
||||
enrichmentArtifacts: string[];
|
||||
}
|
||||
|
||||
/**
 * State recovered from manifest shards that already exist on disk.
 *
 * `descriptions` is keyed by table name.
 * NOTE(review): `preservedJoins` is presumably keyed by table name as well; confirm.
 */
interface ExistingManifestState {
|
||||
descriptions: Map<string, LiveDatabaseManifestExistingDescriptions>;
|
||||
preservedJoins: Map<string, LiveDatabaseManifestJoinEntry[]>;
|
||||
}
|
||||
|
||||
/** Shorthand for the description-update list carried by an enrichment result. */
type LocalDescriptionUpdates = KloLocalScanEnrichmentResult['descriptionUpdates'];
|
||||
|
||||
/**
 * Returns the project-relative directory that holds the enrichment artifacts
 * of a single sync for the given connection.
 */
function artifactDir(connectionId: string, syncId: string): string {
  const segments = ['raw-sources', connectionId, LIVE_DATABASE_ADAPTER, syncId, 'enrichment'];
  return segments.join('/');
}
|
||||
|
||||
/**
 * Returns the project-relative directory that holds the manifest shards
 * of the given connection.
 */
function schemaDir(connectionId: string): string {
  return [SL_DIR_PREFIX, connectionId, SCHEMA_DIR].join('/');
}
|
||||
|
||||
/**
 * Collects the descriptions known for a table, keyed by where they came from.
 *
 * `db` carries the table comment read from the database and `ai` carries the generated
 * description from the matching entry of `descriptionUpdates`. Empty values are left out.
 *
 * The matching update must have the same table name, and its `db` / `catalog` qualifiers
 * must not contradict the table's. Matching on the name alone would hand the first
 * update named e.g. `orders` to every schema that contains an `orders` table.
 *
 * @param table - Snapshot table to describe.
 * @param descriptionUpdates - Generated descriptions; defaults to none.
 * @returns The collected descriptions, or `undefined` when there are none.
 */
function tableDescription(
  table: KloSchemaTable,
  descriptionUpdates: LocalDescriptionUpdates = [],
): Record<string, string> | undefined {
  // A qualifier missing on either side acts as a wildcard, so unqualified references
  // keep matching by name; only two different explicit qualifiers rule a match out.
  const qualifierMatches = (left: string | null | undefined, right: string | null | undefined): boolean =>
    left == null || right == null || left === right;

  const update = descriptionUpdates.find(
    (candidate) =>
      candidate.table.name === table.name &&
      qualifierMatches(candidate.table.db, table.db) &&
      qualifierMatches(candidate.table.catalog, table.catalog),
  );

  const descriptions: Record<string, string> = {};
  if (table.comment) {
    descriptions.db = table.comment;
  }
  if (update?.tableDescription) {
    descriptions.ai = update.tableDescription;
  }
  return Object.keys(descriptions).length > 0 ? descriptions : undefined;
}
|
||||
|
||||
/**
 * Collects the descriptions known for a column, keyed by where they came from.
 *
 * `db` carries the column comment read from the database and `ai` carries the generated
 * description for that column from the matching entry of `descriptionUpdates`.
 * Empty values are left out.
 *
 * The matching update must have the same table name, and its `db` / `catalog` qualifiers
 * must not contradict the table's. Matching on the name alone would apply the column
 * descriptions of one schema's table to a same-named table in another schema.
 *
 * @param table - Snapshot table that owns the column.
 * @param column - Snapshot column to describe.
 * @param descriptionUpdates - Generated descriptions; defaults to none.
 * @returns The collected descriptions, or `undefined` when there are none.
 */
function columnDescription(
  table: KloSchemaTable,
  column: KloSchemaColumn,
  descriptionUpdates: LocalDescriptionUpdates = [],
): Record<string, string> | undefined {
  // A qualifier missing on either side acts as a wildcard, so unqualified references
  // keep matching by name; only two different explicit qualifiers rule a match out.
  const qualifierMatches = (left: string | null | undefined, right: string | null | undefined): boolean =>
    left == null || right == null || left === right;

  const update = descriptionUpdates.find(
    (candidate) =>
      candidate.table.name === table.name &&
      qualifierMatches(candidate.table.db, table.db) &&
      qualifierMatches(candidate.table.catalog, table.catalog),
  );
  const aiDescription = update?.columnDescriptions[column.name] ?? null;

  const descriptions: Record<string, string> = {};
  if (column.comment) {
    descriptions.db = column.comment;
  }
  if (aiDescription) {
    descriptions.ai = aiDescription;
  }
  return Object.keys(descriptions).length > 0 ? descriptions : undefined;
}
|
||||
|
||||
/**
 * Converts every table of a schema snapshot into the table data consumed by the
 * manifest shard builder, attaching table and column descriptions along the way.
 *
 * A column carries `pk: true` only when it is a primary key and `nullable: false`
 * only when it is explicitly non-nullable; otherwise those keys are omitted.
 */
function snapshotTablesToManifestData(
  snapshot: KloSchemaSnapshot,
  descriptionUpdates: LocalDescriptionUpdates = [],
): LiveDatabaseManifestTableData[] {
  const manifestTables: LiveDatabaseManifestTableData[] = [];

  for (const table of snapshot.tables) {
    const descriptions = tableDescription(table, descriptionUpdates);

    const columns = table.columns.map((column) => {
      const primaryKeyFlag = column.primaryKey ? { pk: true } : {};
      const nullabilityFlag = column.nullable === false ? { nullable: false } : {};
      return {
        name: column.name,
        type: column.dimensionType,
        ...primaryKeyFlag,
        ...nullabilityFlag,
        descriptions: columnDescription(table, column, descriptionUpdates),
      };
    });

    manifestTables.push({
      name: table.name,
      catalog: table.catalog,
      db: table.db,
      descriptions,
      columns,
    });
  }

  return manifestTables;
}
|
||||
|
||||
/**
 * Derives one `formal` many-to-one join for every foreign key declared
 * on the tables of the snapshot, in table and declaration order.
 */
function formalJoins(snapshot: KloSchemaSnapshot): LiveDatabaseManifestJoinData[] {
  return snapshot.tables.flatMap((table) =>
    table.foreignKeys.map(
      (foreignKey): LiveDatabaseManifestJoinData => ({
        fromTable: table.name,
        fromColumns: [foreignKey.fromColumn],
        toTable: foreignKey.toTable,
        toColumns: [foreignKey.toColumn],
        relationship: 'many_to_one',
        source: 'formal',
      }),
    ),
  );
}
|
||||
|
||||
/**
 * Maps the accepted relationships of a relationship update onto manifest join data.
 * Returns an empty list when there is no update.
 */
function acceptedRelationshipJoins(
  relationshipUpdate: KloLocalScanEnrichmentResult['relationshipUpdate'] | undefined,
): LiveDatabaseManifestJoinData[] {
  const joins: LiveDatabaseManifestJoinData[] = [];
  for (const accepted of relationshipUpdate?.accepted ?? []) {
    const { from, to } = accepted;
    joins.push({
      fromTable: from.table.name,
      fromColumns: from.columns,
      toTable: to.table.name,
      toColumns: to.columns,
      relationship: accepted.relationshipType,
      source: accepted.source,
    });
  }
  return joins;
}
|
||||
|
||||
/**
 * Assembles the full join list for a snapshot: manually sourced accepted relationships
 * first, then the joins derived from declared foreign keys, then every other accepted
 * relationship.
 */
function relationshipJoins(
  snapshot: KloSchemaSnapshot,
  relationshipUpdate: KloLocalScanEnrichmentResult['relationshipUpdate'] | undefined,
): LiveDatabaseManifestJoinData[] {
  const manualJoins: LiveDatabaseManifestJoinData[] = [];
  const generatedJoins: LiveDatabaseManifestJoinData[] = [];

  // Split the accepted joins in one pass while keeping their relative order.
  for (const join of acceptedRelationshipJoins(relationshipUpdate)) {
    if (join.source === 'manual') {
      manualJoins.push(join);
    } else {
      generatedJoins.push(join);
    }
  }

  return manualJoins.concat(formalJoins(snapshot), generatedJoins);
}
|
||||
|
||||
/**
 * Indexes the snapshot by table name, mapping each table to the set of its column names.
 * When two tables share a name, the later one's columns replace the earlier one's.
 */
function validColumns(snapshot: KloSchemaSnapshot): Map<string, Set<string>> {
  const columnsByTable = new Map<string, Set<string>>();
  for (const table of snapshot.tables) {
    const columnNames = new Set<string>();
    for (const column of table.columns) {
      columnNames.add(column.name);
    }
    columnsByTable.set(table.name, columnNames);
  }
  return columnsByTable;
}
|
||||
|
||||
/**
 * Checks whether a join condition still refers only to columns that exist.
 *
 * The `on` expression is split on `AND` (case-insensitive) and each term of the form
 * `table.column = table.column` is checked against `columnsByTable`. A term fails only
 * when its table is known and the named column is missing from it; unknown tables are
 * given the benefit of the doubt.
 *
 * Terms that do not have that simple shape cannot be verified and are skipped, so they
 * never decide the outcome. The remaining terms are still checked, which keeps the
 * result independent of the order in which the terms are written.
 *
 * @param join - Manifest join entry whose `on` expression is inspected.
 * @param columnsByTable - Known column names per table name.
 * @returns `false` when any verifiable term names a missing column, otherwise `true`.
 */
function joinReferencesExistingColumns(
  join: LiveDatabaseManifestJoinEntry,
  columnsByTable: Map<string, Set<string>>,
): boolean {
  const simpleEquality = /^(\w+)\.(\w+)\s*=\s*(\w+)\.(\w+)$/u;

  for (const rawTerm of join.on.split(/\s+AND\s+/iu)) {
    // Trim so padding around the expression does not make a checkable term look opaque.
    const match = simpleEquality.exec(rawTerm.trim());
    if (!match) {
      continue;
    }
    const [, leftTable, leftColumn, rightTable, rightColumn] = match;
    if (!leftTable || !leftColumn || !rightTable || !rightColumn) {
      continue;
    }

    const leftColumns = columnsByTable.get(leftTable);
    const rightColumns = columnsByTable.get(rightTable);
    const leftMissing = leftColumns !== undefined && !leftColumns.has(leftColumn);
    const rightMissing = rightColumns !== undefined && !rightColumns.has(rightColumn);
    if (leftMissing || rightMissing) {
      return false;
    }
  }

  return true;
}
|
||||
|
||||
/**
 * Reads the manifest shards already stored for a connection and collects what should survive a
 * rewrite: existing table/column descriptions and previously recorded manual or inferred joins.
 *
 * Loading is best-effort. If the shard directory cannot be listed, empty state is returned. A shard
 * that cannot be read or processed is abandoned at the point of failure and the next file is tried.
 *
 * @param project - Local project whose file store holds the shards.
 * @param connectionId - Connection whose schema directory is scanned for `.yaml` files.
 * @param snapshot - Current schema snapshot; entries for tables not in it are ignored, and joins
 *   are kept only when their target table and referenced columns still exist.
 * @returns Descriptions and preserved joins keyed by table name.
 */
async function loadExistingManifestState(
  project: KloLocalProject,
  connectionId: string,
  snapshot: KloSchemaSnapshot,
): Promise<ExistingManifestState> {
  const descriptions = new Map<string, LiveDatabaseManifestExistingDescriptions>();
  const preservedJoins = new Map<string, LiveDatabaseManifestJoinEntry[]>();
  const validTableNames = new Set(snapshot.tables.map((table) => table.name));
  const columnsByTable = validColumns(snapshot);

  // A join is carried over only if it was manual/inferred and still points at live schema objects.
  const isPreservableJoin = (join: LiveDatabaseManifestJoinEntry): boolean => {
    if (join.source !== 'manual' && join.source !== 'inferred') {
      return false;
    }
    if (!validTableNames.has(join.to)) {
      return false;
    }
    return joinReferencesExistingColumns(join, columnsByTable);
  };

  let shardFiles: string[];
  try {
    const listing = await project.fileStore.listFiles(schemaDir(connectionId));
    shardFiles = listing.files.filter((file) => file.endsWith('.yaml'));
  } catch {
    // No readable schema directory: nothing to preserve.
    return { descriptions, preservedJoins };
  }

  for (const shardFile of shardFiles) {
    try {
      const { content } = await project.fileStore.readFile(shardFile);
      const shard = YAML.parse(content) as LiveDatabaseManifestShard | null;
      if (!shard?.tables) {
        continue;
      }

      for (const [tableName, entry] of Object.entries(shard.tables)) {
        if (!validTableNames.has(tableName)) {
          continue;
        }

        // Only columns that actually carry descriptions are recorded.
        const describedColumns = (entry.columns ?? []).flatMap((column) =>
          column.descriptions ? ([[column.name, { ...column.descriptions }]] as const) : [],
        );
        descriptions.set(tableName, {
          table: entry.descriptions ? { ...entry.descriptions } : undefined,
          columns: new Map(describedColumns),
        });

        const keptJoins = (entry.joins ?? []).filter(isPreservableJoin);
        if (keptJoins.length > 0) {
          preservedJoins.set(tableName, keptJoins);
        }
      }
    } catch {
      // Unreadable or malformed shard: move on to the next file.
    }
  }

  return { descriptions, preservedJoins };
}
|
||||
|
||||
/**
 * Writes a value to the project's file store as pretty-printed JSON (2-space indent) followed by
 * a trailing newline.
 *
 * @param project - Local project whose file store receives the file.
 * @param path - Destination path of the artifact.
 * @param value - Value to serialize.
 * @param commitMessage - Message passed to the file store together with the local author identity.
 */
async function writeJsonArtifact(
  project: KloLocalProject,
  path: string,
  value: unknown,
  commitMessage: string,
): Promise<void> {
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  const { fileStore } = project;
  await fileStore.writeFile(path, serialized, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, commitMessage);
}
|
||||
|
||||
/**
 * Builds the live-database manifest shards for a scan and writes each one as YAML under the
 * connection's schema directory.
 *
 * Existing descriptions and preserved joins are loaded first and handed to the shard builder.
 * Shards are written one at a time in `localeCompare` order of their keys.
 *
 * @param input - Project, connection, snapshot, description/relationship updates and sync id.
 *   When `input.dryRun` is set nothing is read or written.
 * @returns Paths of the shards written and their count (empty / 0 for a dry run).
 */
export async function writeLocalScanManifestShards(
  input: WriteLocalScanManifestShardsInput,
): Promise<WriteLocalScanManifestShardsResult> {
  if (input.dryRun) {
    return { manifestShards: [], manifestShardsWritten: 0 };
  }

  const existing = await loadExistingManifestState(input.project, input.connectionId, input.snapshot);
  const built = buildLiveDatabaseManifestShards({
    connectionType: input.driver.toUpperCase(),
    tables: snapshotTablesToManifestData(input.snapshot, input.descriptionUpdates),
    joins: relationshipJoins(input.snapshot, input.relationshipUpdate),
    existingDescriptions: existing.descriptions,
    existingPreservedJoins: existing.preservedJoins,
    mapColumnType: (dimensionType) => dimensionType,
  });

  const orderedShards = Array.from(built.shards.entries());
  orderedShards.sort((a, b) => a[0].localeCompare(b[0]));

  const writtenPaths: string[] = [];
  for (const [shardKey, shard] of orderedShards) {
    const shardPath = `${schemaDir(input.connectionId)}/${shardKey}.yaml`;
    const yamlText = YAML.stringify(shard, { indent: 2, lineWidth: 0 });
    const message = `scan(${LIVE_DATABASE_ADAPTER}): write manifest shard ${shardKey} syncId=${input.syncId}`;
    await input.project.fileStore.writeFile(shardPath, yamlText, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, message);
    writtenPaths.push(shardPath);
  }

  return {
    manifestShards: writtenPaths,
    manifestShardsWritten: writtenPaths.length,
  };
}
|
||||
|
||||
/**
 * Persists the JSON artifacts of a scan enrichment run and then rewrites the manifest shards.
 *
 * Write order:
 * 1. `descriptions.json` - only when table or column descriptions completed.
 * 2. `embeddings.json` - only when embeddings completed.
 * 3. `relationships.json`, `relationship-profile.json`, `relationship-diagnostics.json` - always.
 * 4. Manifest shards via `writeLocalScanManifestShards`.
 *
 * When no relationship profile is present on the enrichment result, an empty profile artifact with
 * reason `relationship_profiling_not_run` is written instead.
 *
 * @param input - Project, connection, sync id, driver, enrichment result and optional relationship
 *   settings. When `input.dryRun` is set nothing is written.
 * @returns Artifact paths written plus the manifest shard paths and count (all empty for a dry run).
 */
export async function writeLocalScanEnrichmentArtifacts(
  input: WriteLocalScanEnrichmentArtifactsInput,
): Promise<WriteLocalScanEnrichmentArtifactsResult> {
  if (input.dryRun) {
    return { enrichmentArtifacts: [], manifestShards: [], manifestShardsWritten: 0 };
  }

  const { project, connectionId, syncId, driver, enrichment, relationshipSettings } = input;
  const enrichmentRoot = artifactDir(connectionId, syncId);
  const artifactPath = (fileName: string): string => `${enrichmentRoot}/${fileName}`;
  const commitMessage = (subject: string): string =>
    `scan(${LIVE_DATABASE_ADAPTER}): write ${subject} syncId=${syncId}`;

  const enrichmentArtifacts: string[] = [];
  const { summary } = enrichment;

  if (summary.tableDescriptions === 'completed' || summary.columnDescriptions === 'completed') {
    const descriptionsArtifact = artifactPath('descriptions.json');
    enrichmentArtifacts.push(descriptionsArtifact);
    await writeJsonArtifact(
      project,
      descriptionsArtifact,
      enrichment.descriptionUpdates,
      commitMessage('enrichment descriptions'),
    );
  }

  if (summary.embeddings === 'completed') {
    const embeddingsArtifact = artifactPath('embeddings.json');
    enrichmentArtifacts.push(embeddingsArtifact);
    await writeJsonArtifact(
      project,
      embeddingsArtifact,
      enrichment.embeddingUpdates,
      commitMessage('enrichment embeddings'),
    );
  }

  // The three relationship artifacts are always produced, even when relationship detection did not run.
  const relationshipsArtifact = artifactPath('relationships.json');
  const relationshipProfileArtifact = artifactPath('relationship-profile.json');
  const relationshipDiagnosticsArtifact = artifactPath('relationship-diagnostics.json');
  enrichmentArtifacts.push(relationshipsArtifact, relationshipProfileArtifact, relationshipDiagnosticsArtifact);

  // `null` means "no resolved relationships available" and is passed on as `undefined`;
  // any other nullish value falls back to an empty list.
  const resolvedRelationships =
    enrichment.resolvedRelationships === null ? undefined : (enrichment.resolvedRelationships ?? []);
  const relationshipArtifacts = buildKloRelationshipArtifacts({
    connectionId,
    resolvedRelationships,
    compositeRelationships: enrichment.compositeRelationships ?? undefined,
    relationshipUpdate: enrichment.relationshipUpdate ?? {
      connectionId,
      accepted: [],
      rejected: [],
      skipped: [],
    },
  });

  const relationshipProfile =
    enrichment.relationshipProfile ??
    emptyKloRelationshipProfileArtifact({
      connectionId,
      driver,
      reason: 'relationship_profiling_not_run',
    });

  let thresholds;
  let policy;
  if (relationshipSettings) {
    thresholds = {
      acceptThreshold: relationshipSettings.acceptThreshold,
      reviewThreshold: relationshipSettings.reviewThreshold,
    };
    policy = {
      validationRequiredForManifest: relationshipSettings.validationRequiredForManifest,
      maxCandidatesPerColumn: relationshipSettings.maxCandidatesPerColumn,
      profileSampleRows: relationshipSettings.profileSampleRows,
      validationConcurrency: relationshipSettings.validationConcurrency,
    };
  }
  const relationshipDiagnostics = buildKloRelationshipDiagnostics({
    connectionId,
    artifacts: relationshipArtifacts,
    profile: relationshipProfile,
    warnings: enrichment.warnings,
    thresholds,
    policy,
  });

  await writeJsonArtifact(
    project,
    relationshipsArtifact,
    relationshipArtifacts,
    commitMessage('enrichment relationships'),
  );
  await writeJsonArtifact(
    project,
    relationshipProfileArtifact,
    relationshipProfile,
    commitMessage('relationship profile'),
  );
  await writeJsonArtifact(
    project,
    relationshipDiagnosticsArtifact,
    relationshipDiagnostics,
    commitMessage('relationship diagnostics'),
  );

  const { manifestShards, manifestShardsWritten } = await writeLocalScanManifestShards({
    project,
    connectionId,
    syncId,
    driver,
    snapshot: enrichment.snapshot,
    descriptionUpdates: enrichment.descriptionUpdates,
    relationshipUpdate: enrichment.relationshipUpdate,
    dryRun: false,
  });

  return { enrichmentArtifacts, manifestShards, manifestShardsWritten };
}
|
||||
742
packages/context/src/scan/local-enrichment.test.ts
Normal file
742
packages/context/src/scan/local-enrichment.test.ts
Normal file
|
|
@ -0,0 +1,742 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { buildDefaultKloProjectConfig } from '../project/config.js';
|
||||
import type {
|
||||
KloScanEnrichmentCompletedStage,
|
||||
KloScanEnrichmentFailedStage,
|
||||
KloScanEnrichmentStageLookup,
|
||||
KloScanEnrichmentStateStore,
|
||||
} from './enrichment-state.js';
|
||||
import {
|
||||
createDeterministicLocalScanEnrichmentProviders,
|
||||
runLocalScanEnrichment,
|
||||
snapshotToKloEnrichedSchema,
|
||||
} from './local-enrichment.js';
|
||||
import { createLocalScanEnrichmentProvidersFromConfig } from './local-scan.js';
|
||||
import {
|
||||
createKloConnectorCapabilities,
|
||||
type KloQueryResult,
|
||||
type KloReadOnlyQueryInput,
|
||||
type KloScanConnector,
|
||||
type KloScanContext,
|
||||
type KloSchemaSnapshot,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Shared Postgres fixture: `public.customers` (id) and `public.orders` (id, customer_id),
 * with no declared foreign keys.
 */
const snapshot: KloSchemaSnapshot = ((): KloSchemaSnapshot => {
  type FixtureTable = KloSchemaSnapshot['tables'][number];
  type FixtureColumn = FixtureTable['columns'][number];

  // Every fixture column is a non-nullable integer; only name, comment and key flag vary.
  const integerColumn = (name: string, comment: string, primaryKey: boolean): FixtureColumn => ({
    name,
    nativeType: 'integer',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey,
    comment,
  });

  const publicTable = (
    name: string,
    comment: string,
    estimatedRows: number,
    columns: FixtureColumn[],
  ): FixtureTable => ({
    catalog: null,
    db: 'public',
    name,
    kind: 'table',
    comment,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'postgres',
    extractedAt: '2026-04-29T12:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: {},
    tables: [
      publicTable('customers', 'Customer accounts', 2, [integerColumn('id', 'Customer id', true)]),
      publicTable('orders', 'Customer orders', 3, [
        integerColumn('id', 'Order id', true),
        integerColumn('customer_id', 'Customer id', false),
      ]),
    ],
  };
})();
|
||||
|
||||
/**
 * Builds a fresh mock Postgres scan connector for a test.
 *
 * It advertises table sampling, column sampling, read-only SQL and column stats, returns the shared
 * `snapshot` fixture from `introspect`, and serves fixed sample data. It does not define
 * `executeReadOnly`; tests that need SQL execution add it by spreading this object.
 */
function connector(): KloScanConnector {
  const capabilities = createKloConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    readOnlySql: true,
    columnStats: true,
  });
  const introspect = vi.fn(async () => snapshot);
  const sampleTable = vi.fn(async () => ({
    headers: ['id', 'customer_id'],
    rows: [[1, 10]],
    totalRows: 1,
  }));
  const sampleColumn = vi.fn(async () => ({
    values: ['10', '11'],
    nullCount: 0,
    distinctCount: 2,
  }));

  return {
    id: 'test:warehouse',
    driver: 'postgres',
    capabilities,
    introspect,
    sampleTable,
    sampleColumn,
  };
}
|
||||
|
||||
/**
 * Test helper that runs read-only queries against an in-memory SQLite database.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');

  /**
   * Executes `input.sql` and returns the rows in tabular form.
   *
   * Headers are taken from the keys of the first row, so an empty result has no headers.
   * The scan context argument is ignored.
   */
  executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const firstRecord = records[0];
    const headers = Object.keys(firstRecord ?? {});
    const rows = records.map((record) => headers.map((header) => record[header]));
    const result = {
      headers,
      rows,
      totalRows: records.length,
      rowCount: records.length,
    };
    return Promise.resolve(result);
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Returns a fresh SQLite fixture with `accounts` (id) and `orders` (id, account_id) that declares
 * neither primary keys nor foreign keys.
 */
function noDeclaredRelationshipSnapshot(): KloSchemaSnapshot {
  type FixtureTable = KloSchemaSnapshot['tables'][number];
  type FixtureColumn = FixtureTable['columns'][number];

  // All columns are uncommented, non-nullable, non-key INTEGERs.
  const plainIntegerColumn = (name: string): FixtureColumn => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const bareTable = (name: string, estimatedRows: number, columnNames: string[]): FixtureTable => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns: columnNames.map((columnName) => plainIntegerColumn(columnName)),
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [bareTable('accounts', 2, ['id']), bareTable('orders', 3, ['id', 'account_id'])],
  };
}
|
||||
|
||||
/**
 * In-memory enrichment state store for tests.
 *
 * Stage records are keyed by `runId:stage`, so saving a stage again for the same run overwrites the
 * previous record regardless of its status.
 */
function memoryEnrichmentStateStore(): KloScanEnrichmentStateStore {
  const stagesByKey = new Map<string, KloScanEnrichmentCompletedStage | KloScanEnrichmentFailedStage>();

  function stageKey(input: Pick<KloScanEnrichmentStageLookup, 'runId' | 'stage'>): string {
    return `${input.runId}:${input.stage}`;
  }

  return {
    // A stage is reusable only if it completed and was computed from the same input hash.
    async findCompletedStage<TOutput>(input: KloScanEnrichmentStageLookup) {
      const stored = stagesByKey.get(stageKey(input));
      if (stored && stored.status === 'completed' && stored.inputHash === input.inputHash) {
        return stored as KloScanEnrichmentCompletedStage<TOutput>;
      }
      return null;
    },
    async saveCompletedStage(input) {
      const completed = { ...input, status: 'completed', errorMessage: null } as const;
      stagesByKey.set(stageKey(input), completed);
    },
    async saveFailedStage(input) {
      const failed = { ...input, status: 'failed', output: null } as const;
      stagesByKey.set(stageKey(input), failed);
    },
    async listRunStages(runId) {
      const stages: Array<KloScanEnrichmentCompletedStage | KloScanEnrichmentFailedStage> = [];
      for (const stage of stagesByKey.values()) {
        if (stage.runId === runId) {
          stages.push(stage);
        }
      }
      return stages;
    },
  };
}
|
||||
|
||||
describe('local scan enrichment', () => {
|
||||
it('maps a scan snapshot into relationship detector schema', () => {
|
||||
const schema = snapshotToKloEnrichedSchema(snapshot);
|
||||
|
||||
expect(schema.connectionId).toBe('warehouse');
|
||||
expect(schema.tables).toHaveLength(2);
|
||||
expect(schema.tables[1]?.columns.map((column) => column.name)).toEqual(['id', 'customer_id']);
|
||||
expect(schema.tables[1]?.columns[1]).toMatchObject({
|
||||
id: 'public.orders.customer_id',
|
||||
tableId: 'public.orders',
|
||||
primaryKey: false,
|
||||
sampleValues: null,
|
||||
embedding: null,
|
||||
});
|
||||
});
|
||||
|
||||
it('maps snapshot foreign keys into formal schema relationships', () => {
|
||||
const source = noDeclaredRelationshipSnapshot();
|
||||
const snapshotWithForeignKey = {
|
||||
...source,
|
||||
tables: source.tables.map((table) =>
|
||||
table.name === 'orders'
|
||||
? {
|
||||
...table,
|
||||
foreignKeys: [
|
||||
{
|
||||
fromColumn: 'account_id',
|
||||
toCatalog: null,
|
||||
toDb: null,
|
||||
toTable: 'accounts',
|
||||
toColumn: 'id',
|
||||
constraintName: 'orders_account_id_fkey',
|
||||
},
|
||||
],
|
||||
}
|
||||
: table.name === 'accounts'
|
||||
? {
|
||||
...table,
|
||||
columns: table.columns.map((column) =>
|
||||
column.name === 'id' ? { ...column, primaryKey: true } : column,
|
||||
),
|
||||
}
|
||||
: table,
|
||||
),
|
||||
};
|
||||
|
||||
const schema = snapshotToKloEnrichedSchema(snapshotWithForeignKey);
|
||||
|
||||
expect(schema.relationships).toEqual([
|
||||
{
|
||||
id: 'orders:(orders.account_id)->accounts:(accounts.id)',
|
||||
source: 'formal',
|
||||
from: {
|
||||
tableId: 'orders',
|
||||
columnIds: ['orders.account_id'],
|
||||
table: { catalog: null, db: null, name: 'orders' },
|
||||
columns: ['account_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'accounts',
|
||||
columnIds: ['accounts.id'],
|
||||
table: { catalog: null, db: null, name: 'accounts' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 1,
|
||||
isPrimaryKeyReference: true,
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('runs deterministic relationship detection for relationship scans', async () => {
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'relationships',
|
||||
detectRelationships: true,
|
||||
connector: connector(),
|
||||
context: { runId: 'scan-run-1' },
|
||||
providers: null,
|
||||
});
|
||||
|
||||
expect(result.summary).toMatchObject({
|
||||
deterministicRelationships: 'completed',
|
||||
llmRelationshipValidation: 'skipped',
|
||||
embeddings: 'skipped',
|
||||
});
|
||||
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
|
||||
expect(result.summary.statisticalValidation).toBe('skipped');
|
||||
expect(result.warnings).toContainEqual({
|
||||
code: 'relationship_validation_failed',
|
||||
message: 'KLO scan connector advertises readOnlySql but does not expose executeReadOnly',
|
||||
recoverable: true,
|
||||
metadata: { capability: 'readOnlySql' },
|
||||
});
|
||||
});
|
||||
|
||||
it('runs relationship discovery with connector SQL evidence', async () => {
|
||||
const executor = new InMemorySqliteExecutor();
|
||||
try {
|
||||
executor.db.exec(`
|
||||
CREATE TABLE accounts (id INTEGER NOT NULL);
|
||||
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
|
||||
INSERT INTO accounts (id) VALUES (1), (2);
|
||||
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
|
||||
`);
|
||||
const scanConnector = {
|
||||
...connector(),
|
||||
driver: 'sqlite' as const,
|
||||
capabilities: createKloConnectorCapabilities({ readOnlySql: true, columnStats: true }),
|
||||
introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
|
||||
executeReadOnly: executor.executeReadOnly.bind(executor),
|
||||
};
|
||||
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'relationships',
|
||||
detectRelationships: true,
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-relationship-discovery' },
|
||||
providers: null,
|
||||
});
|
||||
|
||||
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
|
||||
expect(result.summary.statisticalValidation).toBe('completed');
|
||||
expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
|
||||
expect(result.resolvedRelationships).toEqual([
|
||||
expect.objectContaining({
|
||||
status: 'accepted',
|
||||
from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
|
||||
to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
|
||||
}),
|
||||
]);
|
||||
expect(result.relationshipUpdate?.accepted).toHaveLength(1);
|
||||
} finally {
|
||||
executor.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('honors scan relationship config when LLM proposals are disabled', async () => {
|
||||
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
|
||||
const getModel = vi.fn(() => ({ modelId: 'provider/language-model', provider: 'gateway' }));
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'relationships',
|
||||
detectRelationships: true,
|
||||
connector: connector(),
|
||||
context: { runId: 'scan-run-llm-disabled' },
|
||||
providers: {
|
||||
...providers,
|
||||
llm: {
|
||||
...providers.llm,
|
||||
getModel: getModel as never,
|
||||
},
|
||||
},
|
||||
relationshipSettings: {
|
||||
...buildDefaultKloProjectConfig('warehouse').scan.relationships,
|
||||
llmProposals: false,
|
||||
maxLlmTablesPerBatch: 40,
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.summary.llmRelationshipValidation).toBe('skipped');
|
||||
expect(getModel).not.toHaveBeenCalledWith('candidateExtraction');
|
||||
});
|
||||
|
||||
it('skips relationship detection when scan relationships are disabled', async () => {
|
||||
const settings = {
|
||||
...buildDefaultKloProjectConfig('warehouse').scan.relationships,
|
||||
enabled: false,
|
||||
};
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
connector: connector(),
|
||||
context: { runId: 'disabled-relationships' },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders(),
|
||||
relationshipSettings: settings,
|
||||
});
|
||||
|
||||
expect(result.summary.deterministicRelationships).toBe('skipped');
|
||||
expect(result.summary.statisticalValidation).toBe('skipped');
|
||||
expect(result.summary.llmRelationshipValidation).toBe('skipped');
|
||||
expect(result.relationships).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
|
||||
expect(result.relationshipUpdate).toBeNull();
|
||||
expect(result.relationshipProfile).toBeNull();
|
||||
expect(result.resolvedRelationships).toBeNull();
|
||||
});
|
||||
|
||||
it('runs configured deterministic enrichment with descriptions and embeddings', async () => {
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
connector: connector(),
|
||||
context: { runId: 'scan-run-2' },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
|
||||
});
|
||||
|
||||
expect(result.summary).toMatchObject({
|
||||
dataDictionary: 'completed',
|
||||
tableDescriptions: 'completed',
|
||||
columnDescriptions: 'completed',
|
||||
embeddings: 'completed',
|
||||
deterministicRelationships: 'completed',
|
||||
});
|
||||
expect(result.embeddingUpdates).toHaveLength(3);
|
||||
expect(result.embeddingUpdates[0]?.embedding).toHaveLength(6);
|
||||
expect(result.snapshot).toEqual(snapshot);
|
||||
expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
|
||||
});
|
||||
|
||||
it('reports enrichment progress for countable stages', async () => {
|
||||
const events: Array<{ progress: number; message?: string; transient?: boolean }> = [];
|
||||
const progress = {
|
||||
async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
|
||||
events.push({ progress: progressValue, message, transient: options?.transient });
|
||||
},
|
||||
startPhase() {
|
||||
return progress;
|
||||
},
|
||||
};
|
||||
|
||||
await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
connector: connector(),
|
||||
context: { runId: 'scan-run-progress', progress },
|
||||
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
|
||||
});
|
||||
|
||||
expect(events).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({ message: 'Generating descriptions 1/2 tables', transient: true }),
|
||||
expect.objectContaining({ message: 'Generating descriptions 2/2 tables', transient: true }),
|
||||
expect.objectContaining({ message: 'Building embeddings 1/1 batches', transient: true }),
|
||||
expect.objectContaining({ message: 'Detecting relationships' }),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('reports progress before enrichment connector introspection starts', async () => {
|
||||
const events: Array<{ progress: number; message?: string; transient?: boolean }> = [];
|
||||
const progress = {
|
||||
async update(progressValue: number, message?: string, options?: { transient?: boolean }) {
|
||||
events.push({ progress: progressValue, message, transient: options?.transient });
|
||||
},
|
||||
startPhase() {
|
||||
return progress;
|
||||
},
|
||||
};
|
||||
const scanConnector = {
|
||||
...connector(),
|
||||
introspect: vi.fn(async () => {
|
||||
expect(events).toContainEqual(expect.objectContaining({ message: 'Loading enrichment schema snapshot' }));
|
||||
return snapshot;
|
||||
}),
|
||||
};
|
||||
|
||||
await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'relationships',
|
||||
detectRelationships: true,
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-progress-before-introspection', progress },
|
||||
providers: null,
|
||||
});
|
||||
|
||||
expect(scanConnector.introspect).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('splits enrichment embedding requests by provider batch size', async () => {
|
||||
const manyColumnSnapshot: KloSchemaSnapshot = {
|
||||
...snapshot,
|
||||
tables: [
|
||||
{
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'wide_orders',
|
||||
kind: 'table',
|
||||
comment: 'Wide order facts',
|
||||
estimatedRows: 3,
|
||||
foreignKeys: [],
|
||||
columns: Array.from({ length: 5 }, (_, index) => ({
|
||||
name: `metric_${index + 1}`,
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number' as const,
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
comment: `Metric ${index + 1}`,
|
||||
})),
|
||||
},
|
||||
],
|
||||
};
|
||||
const scanConnector = {
|
||||
...connector(),
|
||||
introspect: vi.fn(async () => manyColumnSnapshot),
|
||||
};
|
||||
const deterministicProviders = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 3 });
|
||||
const embedBatch = vi.fn(async (texts: string[]) => {
|
||||
if (texts.length > 2) {
|
||||
throw new Error(`Embedding batch size ${texts.length} exceeds maximum 2`);
|
||||
}
|
||||
return texts.map((_, index) => [index, index + 1, index + 2]);
|
||||
});
|
||||
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: false,
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-batched-embeddings' },
|
||||
providers: {
|
||||
llm: deterministicProviders.llm,
|
||||
embedding: {
|
||||
dimensions: 3,
|
||||
maxBatchSize: 2,
|
||||
embedBatch,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.embeddingUpdates).toHaveLength(5);
|
||||
expect(embedBatch.mock.calls.map(([texts]) => texts).map((texts) => texts.length)).toEqual([2, 2, 1]);
|
||||
});
|
||||
|
||||
it('reuses completed description and embedding stages for the same run id and snapshot hash', async () => {
|
||||
const stateStore = memoryEnrichmentStateStore();
|
||||
const scanConnector = connector();
|
||||
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
|
||||
|
||||
const first = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-resume-1' },
|
||||
providers,
|
||||
stateStore,
|
||||
syncId: 'sync-resume-1',
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
|
||||
});
|
||||
|
||||
const getModel = vi.spyOn(providers.llm, 'getModel');
|
||||
const embedBatch = vi.spyOn(providers.embedding, 'embedBatch');
|
||||
const second = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: true,
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-resume-1' },
|
||||
providers,
|
||||
stateStore,
|
||||
syncId: 'sync-resume-1',
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
|
||||
});
|
||||
|
||||
expect(first.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
||||
expect(first.state.resumedStages).toEqual([]);
|
||||
expect(second.state.resumedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
||||
expect(second.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
||||
expect(getModel).not.toHaveBeenCalled();
|
||||
expect(embedBatch).not.toHaveBeenCalled();
|
||||
expect(second.descriptionUpdates).toEqual(first.descriptionUpdates);
|
||||
expect(second.embeddingUpdates).toEqual(first.embeddingUpdates);
|
||||
expect(second.relationships).toEqual(first.relationships);
|
||||
});
|
||||
|
||||
it('does not reuse completed stages when the snapshot changes', async () => {
|
||||
const stateStore = memoryEnrichmentStateStore();
|
||||
const providers = createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 });
|
||||
const scanConnector = connector();
|
||||
|
||||
await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: false,
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-resume-hash' },
|
||||
providers,
|
||||
stateStore,
|
||||
syncId: 'sync-resume-hash',
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
|
||||
});
|
||||
|
||||
const firstTable = snapshot.tables[0];
|
||||
if (!firstTable) {
|
||||
throw new Error('Expected test snapshot table');
|
||||
}
|
||||
const changedConnector = {
|
||||
...connector(),
|
||||
introspect: vi.fn(async () => ({
|
||||
...snapshot,
|
||||
tables: [{ ...firstTable, name: 'customers' }],
|
||||
})),
|
||||
};
|
||||
const getModel = vi.spyOn(providers.llm, 'getModel');
|
||||
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: false,
|
||||
connector: changedConnector,
|
||||
context: { runId: 'scan-run-resume-hash' },
|
||||
providers,
|
||||
stateStore,
|
||||
syncId: 'sync-resume-hash',
|
||||
providerIdentity: { provider: 'deterministic', embeddingDimensions: 6 },
|
||||
});
|
||||
|
||||
expect(result.state.resumedStages).toEqual([]);
|
||||
expect(result.state.completedStages).toEqual(['descriptions', 'embeddings', 'relationships']);
|
||||
expect(getModel).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('runs providerless enriched scans as relationship-only discovery enrichment', async () => {
|
||||
const executor = new InMemorySqliteExecutor();
|
||||
try {
|
||||
executor.db.exec(`
|
||||
CREATE TABLE accounts (id INTEGER NOT NULL);
|
||||
CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
|
||||
INSERT INTO accounts (id) VALUES (1), (2);
|
||||
INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
|
||||
`);
|
||||
const scanConnector = {
|
||||
...connector(),
|
||||
driver: 'sqlite' as const,
|
||||
capabilities: createKloConnectorCapabilities({ readOnlySql: true, columnStats: true }),
|
||||
introspect: vi.fn(async () => noDeclaredRelationshipSnapshot()),
|
||||
executeReadOnly: executor.executeReadOnly.bind(executor),
|
||||
};
|
||||
|
||||
const result = await runLocalScanEnrichment({
|
||||
connectionId: 'warehouse',
|
||||
mode: 'enriched',
|
||||
detectRelationships: false,
|
||||
connector: scanConnector,
|
||||
context: { runId: 'scan-run-providerless-enriched' },
|
||||
providers: null,
|
||||
});
|
||||
|
||||
expect(result.summary).toEqual({
|
||||
dataDictionary: 'skipped',
|
||||
tableDescriptions: 'skipped',
|
||||
columnDescriptions: 'skipped',
|
||||
embeddings: 'skipped',
|
||||
deterministicRelationships: 'completed',
|
||||
llmRelationshipValidation: 'skipped',
|
||||
statisticalValidation: 'completed',
|
||||
});
|
||||
expect(result.descriptionUpdates).toEqual([]);
|
||||
expect(result.embeddingUpdates).toEqual([]);
|
||||
expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
|
||||
expect(result.relationshipUpdate?.accepted).toHaveLength(1);
|
||||
expect(result.relationshipProfile).toMatchObject({ sqlAvailable: true });
|
||||
expect(result.resolvedRelationships).toEqual([
|
||||
expect.objectContaining({
|
||||
status: 'accepted',
|
||||
from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
|
||||
to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
|
||||
}),
|
||||
]);
|
||||
expect(result.warnings).toContainEqual({
|
||||
code: 'scan_enrichment_backend_not_configured',
|
||||
message:
|
||||
'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.',
|
||||
recoverable: true,
|
||||
metadata: {
|
||||
skippedStages: ['descriptions', 'embeddings'],
|
||||
relationshipDetection: true,
|
||||
},
|
||||
});
|
||||
} finally {
|
||||
executor.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('resolves gateway LLM providers and OpenAI embeddings from local scan config', () => {
|
||||
const createKloLlmProvider = vi.fn(() => ({
|
||||
getModel: vi.fn().mockReturnValue({ modelId: 'provider/language-model', provider: 'gateway' }),
|
||||
}));
|
||||
const createKloEmbeddingProvider = vi.fn(() => ({
|
||||
dimensions: 1536,
|
||||
maxBatchSize: 8,
|
||||
embed: vi.fn(),
|
||||
[['embed', 'Many'].join('')]: vi.fn(),
|
||||
}));
|
||||
|
||||
const providers = createLocalScanEnrichmentProvidersFromConfig(
|
||||
{
|
||||
mode: 'llm',
|
||||
embeddings: {
|
||||
backend: 'openai',
|
||||
model: 'provider/embedding-model',
|
||||
dimensions: 1536,
|
||||
batchSize: 8,
|
||||
openai: { api_key: 'env:OPENAI_API_KEY' },
|
||||
},
|
||||
},
|
||||
{
|
||||
provider: {
|
||||
backend: 'gateway',
|
||||
gateway: {},
|
||||
},
|
||||
models: { default: 'provider/language-model' },
|
||||
},
|
||||
{
|
||||
createKloLlmProvider: createKloLlmProvider as any,
|
||||
createKloEmbeddingProvider: createKloEmbeddingProvider as any,
|
||||
env: { OPENAI_API_KEY: 'openai-key' },
|
||||
},
|
||||
);
|
||||
|
||||
expect(providers?.embedding.dimensions).toBe(1536);
|
||||
expect(providers?.embedding.maxBatchSize).toBe(8);
|
||||
expect(createKloLlmProvider).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ backend: 'gateway', modelSlots: { default: 'provider/language-model' } }),
|
||||
);
|
||||
expect(createKloEmbeddingProvider).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ backend: 'openai', model: 'provider/embedding-model' }),
|
||||
);
|
||||
});
|
||||
});
|
||||
659
packages/context/src/scan/local-enrichment.ts
Normal file
659
packages/context/src/scan/local-enrichment.ts
Normal file
|
|
@ -0,0 +1,659 @@
|
|||
import type { KloLlmProvider } from '@klo/llm';
|
||||
import { buildDefaultKloProjectConfig, type KloScanRelationshipConfig } from '../project/config.js';
|
||||
import { type KloDescriptionColumnTable, KloDescriptionGenerator } from './description-generation.js';
|
||||
import { buildKloColumnEmbeddingText } from './embedding-text.js';
|
||||
import {
|
||||
completedKloScanEnrichmentStateSummary,
|
||||
computeKloScanEnrichmentInputHash,
|
||||
type KloScanEnrichmentStateStore,
|
||||
summarizeKloScanEnrichmentState,
|
||||
} from './enrichment-state.js';
|
||||
import { skippedKloScanEnrichmentSummary } from './enrichment-summary.js';
|
||||
import type {
|
||||
KloEmbeddingUpdate,
|
||||
KloEnrichedColumn,
|
||||
KloEnrichedRelationship,
|
||||
KloEnrichedSchema,
|
||||
KloEnrichedTable,
|
||||
KloRelationshipEndpoint,
|
||||
KloRelationshipUpdate,
|
||||
} from './enrichment-types.js';
|
||||
import type { KloCompositeRelationshipCandidate } from './relationship-composite-candidates.js';
|
||||
import type { KloResolvedRelationshipDiscoveryCandidate } from './relationship-graph-resolver.js';
|
||||
import { discoverKloRelationships } from './relationship-discovery.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type {
|
||||
KloEmbeddingPort,
|
||||
KloProgressPort,
|
||||
KloScanConnector,
|
||||
KloScanContext,
|
||||
KloScanEnrichmentStage,
|
||||
KloScanEnrichmentStateSummary,
|
||||
KloScanEnrichmentSummary,
|
||||
KloScanMode,
|
||||
KloScanRelationshipSummary,
|
||||
KloScanWarning,
|
||||
KloSchemaColumn,
|
||||
KloSchemaForeignKey,
|
||||
KloSchemaSnapshot,
|
||||
KloSchemaTable,
|
||||
KloTableRef,
|
||||
} from './types.js';
|
||||
|
||||
/** Optional settings for `createDeterministicLocalScanEnrichmentProviders`. */
export interface DeterministicLocalScanEnrichmentProviderOptions {
  /** Length of every generated embedding vector; the factory uses 8 when this is omitted. */
  embeddingDimensions?: number;
  /** `maxBatchSize` exposed on the embedding port; the factory uses 64 when this is omitted. */
  maxBatchSize?: number;
}
|
||||
|
||||
/** The pair of backends an enriched local scan needs. */
export interface KloLocalScanEnrichmentProviders {
  /** LLM provider handed to the description generator and to relationship discovery. */
  llm: KloLlmProvider;
  /** Embedding port whose `embedBatch` and `maxBatchSize` drive column embedding generation. */
  embedding: KloEmbeddingPort;
}
|
||||
|
||||
/** Arguments accepted by `runLocalScanEnrichment`. */
export interface KloLocalScanEnrichmentInput {
  /** Connection being scanned; forwarded to introspection and recorded with every stage. */
  connectionId: string;
  /**
   * Scan mode. `'enriched'` runs descriptions and embeddings when providers are present;
   * `'relationships'` and `'enriched'` both request relationship detection.
   */
  mode: KloScanMode;
  /** Requests relationship detection in any mode; treated as `false` in the input hash when omitted. */
  detectRelationships?: boolean;
  /** Connector used for introspection and passed on to description generation and relationship discovery. */
  connector: KloScanConnector;
  /** Scan context; its `runId` identifies persisted stages and its `progress` port (if any) receives updates. */
  context: KloScanContext;
  /** Enrichment backends, or `null` to skip descriptions/embeddings (a warning is emitted in enriched mode). */
  providers: KloLocalScanEnrichmentProviders | null;
  /** Optional store used to look up previously completed stages and to persist stage outcomes. */
  stateStore?: KloScanEnrichmentStateStore | null;
  /** Sync identifier stored with each stage; defaults to `context.runId`. */
  syncId?: string;
  /** Provider description folded into the input hash; defaults to `{}`. */
  providerIdentity?: Record<string, unknown>;
  /** Relationship settings; defaults to the `scan.relationships` section of the default project config. */
  relationshipSettings?: KloScanRelationshipConfig;
  /** Clock used for stage `updatedAt` timestamps; defaults to `() => new Date()`. */
  now?: () => Date;
}
|
||||
|
||||
/** Everything `runLocalScanEnrichment` returns for one run. */
export interface KloLocalScanEnrichmentResult {
  /** Schema snapshot returned by the connector's `introspect` call. */
  snapshot: KloSchemaSnapshot;
  /** Per-feature status summary; starts from the "skipped" summary and is updated as stages run. */
  summary: KloScanEnrichmentSummary;
  /** Relationship counts; all zero when relationship detection did not run. */
  relationships: KloScanRelationshipSummary;
  /** Summary of resumed, completed, and failed stages. */
  state: KloScanEnrichmentStateSummary;
  /** Warnings collected during the run, including those reported by relationship discovery. */
  warnings: KloScanWarning[];
  /** Generated descriptions per table; empty unless the description stage ran. */
  descriptionUpdates: Array<{
    /** Table the descriptions belong to. */
    table: KloTableRef;
    /** Generated table description, or `null`. */
    tableDescription: string | null;
    /** Generated description (or `null`) keyed by column name. */
    columnDescriptions: Record<string, string | null>;
  }>;
  /** Column embeddings; empty unless the embedding stage ran. */
  embeddingUpdates: KloEmbeddingUpdate[];
  /** Relationship update from discovery, or `null` when detection did not run. */
  relationshipUpdate: KloRelationshipUpdate | null;
  /** Relationship profile artifact from discovery, or `null` when detection did not run. */
  relationshipProfile: KloRelationshipProfileArtifact | null;
  /** Resolved relationship candidates from discovery, or `null` when detection did not run. */
  resolvedRelationships: KloResolvedRelationshipDiscoveryCandidate[] | null;
  /** Composite relationship candidates from discovery, or `null` when detection did not run. */
  compositeRelationships: KloCompositeRelationshipCandidate[] | null;
}
|
||||
|
||||
/**
 * Builds the dotted identifier of a table from its catalog, db, and name,
 * leaving out every part that is falsy (null, undefined, or empty).
 */
function tableId(table: KloSchemaTable): string {
  const parts: string[] = [];
  for (const part of [table.catalog, table.db, table.name]) {
    if (part) {
      parts.push(part);
    }
  }
  return parts.join('.');
}
|
||||
|
||||
/** Identifier of a column: the owning table's id followed by `.` and the column name. */
function columnId(table: KloSchemaTable, column: KloSchemaColumn): string {
  const owner = tableId(table);
  return `${owner}.${column.name}`;
}
|
||||
|
||||
/** Extracts the `{ catalog, db, name }` reference of a snapshot table. */
function tableRef(table: KloSchemaTable): KloTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
|
||||
|
||||
/** Builds a single-column relationship endpoint for the given enriched table and column. */
function endpoint(table: KloEnrichedTable, column: KloEnrichedColumn): KloRelationshipEndpoint {
  const { id: ownerId, ref } = table;
  const { id: memberId, name: memberName } = column;
  return {
    tableId: ownerId,
    columnIds: [memberId],
    table: ref,
    columns: [memberName],
  };
}
|
||||
|
||||
/**
 * Renders the identifier of a relationship as
 * `<fromTable>:(<fromColumnIds>)-><toTable>:(<toColumnIds>)`, column ids comma-separated.
 */
function relationshipId(from: KloRelationshipEndpoint, to: KloRelationshipEndpoint): string {
  const render = (side: KloRelationshipEndpoint): string => `${side.tableId}:(${side.columnIds.join(',')})`;
  const origin = render(from);
  const destination = render(to);
  return `${origin}->${destination}`;
}
|
||||
|
||||
/**
 * Tells whether `table` is the target of `foreignKey`.
 *
 * The table name must equal `toTable`. `toCatalog` and `toDb` are compared
 * only when they are not `null`; a `null` value matches any catalog/db.
 */
function targetMatchesForeignKey(table: KloEnrichedTable, foreignKey: KloSchemaForeignKey): boolean {
  const { ref } = table;
  if (ref.name !== foreignKey.toTable) {
    return false;
  }
  if (foreignKey.toCatalog !== null && ref.catalog !== foreignKey.toCatalog) {
    return false;
  }
  return foreignKey.toDb === null || ref.db === foreignKey.toDb;
}
|
||||
|
||||
/**
 * Converts the foreign keys declared in `snapshot` into enriched relationships.
 *
 * A foreign key is skipped when its source table, source column, target table,
 * or target column cannot be found among `tables`. Every emitted relationship
 * is tagged `source: 'formal'`, `many_to_one`, confidence 1. The result is
 * ordered by relationship id using `localeCompare`.
 */
function formalRelationshipsFromSnapshot(
  snapshot: KloSchemaSnapshot,
  tables: readonly KloEnrichedTable[],
): KloEnrichedRelationship[] {
  const enrichedById = new Map<string, KloEnrichedTable>();
  for (const enriched of tables) {
    enrichedById.set(enriched.id, enriched);
  }

  const collected = snapshot.tables.flatMap((declared): KloEnrichedRelationship[] => {
    const source = enrichedById.get(tableId(declared));
    if (!source) {
      return [];
    }

    const edges: KloEnrichedRelationship[] = [];
    for (const foreignKey of declared.foreignKeys) {
      const fromColumn = source.columns.find((candidate) => candidate.name === foreignKey.fromColumn);
      if (!fromColumn) {
        continue;
      }
      // The first enriched table that matches the key's target wins.
      const target = tables.find((candidate) => targetMatchesForeignKey(candidate, foreignKey));
      if (!target) {
        continue;
      }
      const toColumn = target.columns.find((candidate) => candidate.name === foreignKey.toColumn);
      if (!toColumn) {
        continue;
      }

      const from = endpoint(source, fromColumn);
      const to = endpoint(target, toColumn);
      edges.push({
        id: relationshipId(from, to),
        source: 'formal',
        from,
        to,
        relationshipType: 'many_to_one',
        confidence: 1,
        isPrimaryKeyReference: true,
      });
    }
    return edges;
  });

  collected.sort((left, right) => left.id.localeCompare(right.id));
  return collected;
}
|
||||
|
||||
/**
 * Builds the recoverable warning emitted when an enriched scan runs without
 * enrichment providers.
 *
 * @param relationshipDetection Stored verbatim in the warning metadata.
 */
function providerlessEnrichedWarning(relationshipDetection: boolean): KloScanWarning {
  const metadata = {
    skippedStages: ['descriptions', 'embeddings'],
    relationshipDetection,
  };
  const message =
    'Skipping description and embedding enrichment because scan.enrichment.mode is not configured; relationship discovery still ran.';
  return {
    code: 'scan_enrichment_backend_not_configured',
    message,
    recoverable: true,
    metadata,
  };
}
|
||||
|
||||
/**
 * Derives a deterministic pseudo-embedding from `text`.
 *
 * Component `index` starts at `index + 17` and folds each character's first
 * UTF-16 code unit into a rolling hash modulo 1009. The hash is then mapped
 * into [-1, 1) in steps of 0.01 and rounded to four decimals.
 *
 * @param text Text to hash; iterated by code point.
 * @param dimensions Number of components to produce.
 */
function hashEmbedding(text: string, dimensions: number): number[] {
  // Convert once up front; each dimension folds over the same code units.
  const codeUnits = Array.from(text, (char) => char.charCodeAt(0));
  return Array.from({ length: dimensions }, (_unused, index) => {
    const folded = codeUnits.reduce((hash, unit) => (hash * 31 + unit + index) % 1009, index + 17);
    return Number(((folded % 200) / 100 - 1).toFixed(4));
  });
}
|
||||
|
||||
/**
 * Creates enrichment providers that need no external service: a deterministic
 * LLM provider stub plus an embedding port that hashes each text.
 *
 * @param options Optional overrides; embedding dimensions default to 8 and the
 *   advertised batch size to 64.
 */
export function createDeterministicLocalScanEnrichmentProviders(
  options: DeterministicLocalScanEnrichmentProviderOptions = {},
): KloLocalScanEnrichmentProviders {
  const dimensions = options.embeddingDimensions ?? 8;
  const embedding: KloEmbeddingPort = {
    dimensions,
    maxBatchSize: options.maxBatchSize ?? 64,
    embedBatch: async (texts) => texts.map((text) => hashEmbedding(text, dimensions)),
  };
  return { llm: deterministicLlmProvider(), embedding };
}
|
||||
|
||||
/**
 * Builds the LLM provider stub used by the deterministic enrichment providers.
 *
 * Both model getters return the same placeholder model object, prompt caching
 * is reported as disabled, and tool-call repair throws because it is not
 * supported.
 */
function deterministicLlmProvider(): KloLlmProvider {
  const model = { modelId: 'deterministic-scan', provider: 'deterministic' };
  const provider: KloLlmProvider = {
    getModel: () => model as ReturnType<KloLlmProvider['getModel']>,
    getModelByName: () => model as ReturnType<KloLlmProvider['getModelByName']>,
    cacheMarker: () => undefined,
    repairToolCallHandler: () => {
      throw new Error('deterministic scan provider does not support tool-call repair');
    },
    thinkingProviderOptions: () => ({}),
    telemetryConfig: () => undefined,
    // A fresh config object is returned on every call.
    promptCachingConfig: () => ({
      enabled: false,
      systemTtl: '1h',
      toolsTtl: '1h',
      historyTtl: '5m',
      cacheSystem: true,
      cacheTools: true,
      cacheHistory: true,
      vertexFallbackTo5m: false,
    }),
    activeBackend: () => 'gateway',
  };
  return provider;
}
|
||||
|
||||
/**
 * Converts a structural schema snapshot into the enriched schema shape.
 *
 * Every table is marked enabled. A database comment, when present, becomes the
 * `db` description of its table or column. Sample values, cardinality, and the
 * parent column id are initialised to `null`. Declared foreign keys become
 * formal relationships.
 *
 * @param snapshot Snapshot to convert.
 * @param embeddingsByColumnId Embeddings keyed by column id; columns without an
 *   entry get `embedding: null`.
 */
export function snapshotToKloEnrichedSchema(
  snapshot: KloSchemaSnapshot,
  embeddingsByColumnId: ReadonlyMap<string, number[]> = new Map(),
): KloEnrichedSchema {
  const enrichTable = (table: KloSchemaTable): KloEnrichedTable => {
    const id = tableId(table);
    const ref = tableRef(table);

    const enrichColumn = (column: KloSchemaColumn): KloEnrichedColumn => {
      const columnKey = columnId(table, column);
      return {
        id: columnKey,
        tableId: id,
        tableRef: ref,
        name: column.name,
        nativeType: column.nativeType,
        normalizedType: column.normalizedType,
        dimensionType: column.dimensionType,
        nullable: column.nullable,
        primaryKey: column.primaryKey,
        parentColumnId: null,
        descriptions: column.comment ? { db: column.comment } : {},
        embedding: embeddingsByColumnId.get(columnKey) ?? null,
        sampleValues: null,
        cardinality: null,
      };
    };

    return {
      id,
      ref,
      enabled: true,
      descriptions: table.comment ? { db: table.comment } : {},
      columns: table.columns.map((column) => enrichColumn(column)),
    };
  };

  const tables = snapshot.tables.map((table) => enrichTable(table));
  const relationships = formalRelationshipsFromSnapshot(snapshot, tables);
  return { connectionId: snapshot.connectionId, tables, relationships };
}
|
||||
|
||||
/**
 * Shapes a snapshot table into the input expected by column description
 * generation.
 *
 * A column with a database comment carries that comment both as its only
 * sample value and as its `db` raw description; other columns carry just their
 * name.
 */
function descriptionTable(table: KloSchemaTable): KloDescriptionColumnTable {
  const columns = table.columns.map((column) => {
    const named = { name: column.name };
    if (!column.comment) {
      return named;
    }
    return { ...named, sampleValues: [column.comment], rawDescriptions: { db: column.comment } };
  });
  const { catalog, db, name } = table;
  return { catalog, db, name, columns };
}
|
||||
|
||||
/** Returns `maxBatchSize` when it is a positive integer, and 100 otherwise. */
function embeddingBatchSize(maxBatchSize: number): number {
  const fallback = 100;
  if (!Number.isInteger(maxBatchSize)) {
    return fallback;
  }
  return maxBatchSize > 0 ? maxBatchSize : fallback;
}
|
||||
|
||||
/**
 * Generates column and table descriptions for every table in the snapshot,
 * one table at a time and in snapshot order.
 *
 * Progress is reported as a transient update before each table and as a final
 * update once all tables are done. An empty snapshot reports completion
 * straight away and returns an empty list.
 *
 * @returns One entry per table with its table description and the column
 *   descriptions keyed by column name.
 */
async function generateDescriptions(input: {
  snapshot: KloSchemaSnapshot;
  connector: KloScanConnector;
  context: KloScanContext;
  providers: KloLocalScanEnrichmentProviders;
  progress?: KloProgressPort;
}): Promise<KloLocalScanEnrichmentResult['descriptionUpdates']> {
  const { snapshot, connector, context, providers, progress } = input;
  const generator = new KloDescriptionGenerator({
    llmProvider: providers.llm,
    settings: {
      columnMaxWords: 16,
      tableMaxWords: 24,
      dataSourceMaxWords: 32,
      concurrencyLimit: 4,
    },
  });

  const updates: KloLocalScanEnrichmentResult['descriptionUpdates'] = [];
  const totalTables = snapshot.tables.length;
  if (totalTables === 0) {
    await progress?.update(1, 'No tables to describe');
    return updates;
  }

  let position = 0;
  for (const table of snapshot.tables) {
    position += 1;
    await progress?.update(position / totalTables, `Generating descriptions ${position}/${totalTables} tables`, {
      transient: true,
    });

    // Column descriptions are requested before the table description.
    const columnResult = await generator.generateColumnDescriptions({
      connectionId: snapshot.connectionId,
      connector,
      context,
      dataSourceType: snapshot.driver,
      supportsNestedAnalysis: connector.capabilities.nestedAnalysis,
      table: descriptionTable(table),
    });
    const tableDescription = await generator.generateTableDescription({
      connectionId: snapshot.connectionId,
      connector,
      context,
      dataSourceType: snapshot.driver,
      table: {
        catalog: table.catalog,
        db: table.db,
        name: table.name,
        rawDescriptions: table.comment ? { db: table.comment } : {},
      },
    });

    updates.push({
      table: tableRef(table),
      tableDescription,
      columnDescriptions: Object.fromEntries(columnResult.columnDescriptions),
    });
  }

  await progress?.update(1, `Generated descriptions for ${totalTables} tables`);
  return updates;
}
|
||||
|
||||
/**
 * Builds one embedding per column of the snapshot.
 *
 * For each column an embedding text is assembled from the table/column names,
 * the native type, the generated (or database) descriptions, and the column's
 * outgoing foreign keys. Texts are embedded in batches no larger than the
 * embedding port's `maxBatchSize` (100 when that value is not a positive
 * integer).
 *
 * Generated descriptions are matched to snapshot tables by their full
 * identity (catalog, db, name). Matching on the bare name would let two
 * same-named tables in different schemas overwrite each other, so columns
 * would be embedded with another table's descriptions.
 *
 * @returns The embedding updates in snapshot order plus the same vectors keyed
 *   by column id.
 * @throws Error when the embedding port returns a different number of vectors
 *   than texts submitted in a batch.
 */
async function buildEmbeddings(input: {
  snapshot: KloSchemaSnapshot;
  providers: KloLocalScanEnrichmentProviders;
  descriptions: KloLocalScanEnrichmentResult['descriptionUpdates'];
  progress?: KloProgressPort;
}): Promise<{ updates: KloEmbeddingUpdate[]; byColumnId: Map<string, number[]> }> {
  // JSON-encode the tuple so a "." inside a part cannot make two identities collide,
  // and normalise a missing catalog/db to null so persisted refs still match.
  const identityKey = (ref: { catalog?: string | null; db?: string | null; name: string }): string =>
    JSON.stringify([ref.catalog ?? null, ref.db ?? null, ref.name]);

  const descriptionByTable = new Map<string, KloLocalScanEnrichmentResult['descriptionUpdates'][number]>();
  for (const item of input.descriptions) {
    descriptionByTable.set(identityKey(item.table), item);
  }
  const texts: Array<{ columnId: string; text: string }> = [];

  for (const table of input.snapshot.tables) {
    const tableDescriptions = descriptionByTable.get(identityKey(table));
    for (const column of table.columns) {
      const id = columnId(table, column);
      const text = buildKloColumnEmbeddingText({
        tableName: table.name,
        columnName: column.name,
        columnType: column.nativeType,
        // Prefer generated descriptions; fall back to the database comments.
        resolvedDescription: tableDescriptions?.columnDescriptions[column.name] ?? column.comment,
        resolvedTableDescription: tableDescriptions?.tableDescription ?? table.comment,
        sampleValues: column.comment ? [column.comment] : null,
        foreignKeys: {
          outgoing: (table.foreignKeys ?? [])
            .filter((foreignKey) => foreignKey.fromColumn === column.name)
            .map((foreignKey) => ({ toTable: foreignKey.toTable, toColumn: foreignKey.toColumn })),
          incoming: [],
        },
      });
      texts.push({ columnId: id, text });
    }
  }

  const embeddings: number[][] = [];
  const maxBatchSize = embeddingBatchSize(input.providers.embedding.maxBatchSize);
  const embeddingTexts = texts.map((item) => item.text);
  const batchCount = Math.ceil(embeddingTexts.length / maxBatchSize);
  if (batchCount === 0) {
    await input.progress?.update(1, 'No embeddings to build');
  }
  for (let offset = 0; offset < embeddingTexts.length; offset += maxBatchSize) {
    const batchIndex = Math.floor(offset / maxBatchSize) + 1;
    await input.progress?.update(batchIndex / batchCount, `Building embeddings ${batchIndex}/${batchCount} batches`, {
      transient: true,
    });
    const batch = embeddingTexts.slice(offset, offset + maxBatchSize);
    const batchEmbeddings = await input.providers.embedding.embedBatch(batch);
    // A short or long response would misalign vectors with columns below.
    if (batchEmbeddings.length !== batch.length) {
      throw new Error(`expected ${batch.length} embeddings, received ${batchEmbeddings.length}`);
    }
    embeddings.push(...batchEmbeddings);
  }

  const byColumnId = new Map<string, number[]>();
  const updates = texts.map((item, index) => {
    const embedding = embeddings[index] ?? [];
    byColumnId.set(item.columnId, embedding);
    return {
      columnId: item.columnId,
      text: item.text,
      embedding,
    };
  });
  if (batchCount > 0) {
    await input.progress?.update(1, `Built embeddings for ${updates.length} columns`);
  }
  return { updates, byColumnId };
}
|
||||
|
||||
/**
 * Runs one enrichment stage with resume support.
 *
 * If the state store already holds a completed result for this run, stage, and
 * input hash, that output is returned and the stage is recorded as both
 * resumed and completed. Otherwise `compute` runs: on success the stage is
 * recorded as completed and the output is saved; on any error (including one
 * thrown while saving) the stage is recorded as failed, the failure is saved,
 * and the error is rethrown.
 *
 * The `resumedStages`, `completedStages`, and `failedStages` arrays are
 * mutated in place. Without a state store nothing is looked up or saved.
 */
async function runEnrichmentStage<TOutput>(input: {
  stateStore: KloScanEnrichmentStateStore | null | undefined;
  runId: string;
  connectionId: string;
  syncId: string;
  mode: KloScanMode;
  stage: KloScanEnrichmentStage;
  inputHash: string;
  now: () => Date;
  resumedStages: KloScanEnrichmentStage[];
  completedStages: KloScanEnrichmentStage[];
  failedStages: KloScanEnrichmentStage[];
  compute: () => Promise<TOutput>;
}): Promise<TOutput> {
  const { stateStore, runId, stage, inputHash } = input;

  const cached = await stateStore?.findCompletedStage<TOutput>({ runId, stage, inputHash });
  if (cached) {
    input.resumedStages.push(stage);
    input.completedStages.push(stage);
    return cached.output;
  }

  // Fields shared by the completed and failed stage records.
  const record = {
    runId,
    connectionId: input.connectionId,
    syncId: input.syncId,
    mode: input.mode,
    stage,
    inputHash,
  };

  try {
    const output = await input.compute();
    input.completedStages.push(stage);
    await stateStore?.saveCompletedStage({ ...record, output, updatedAt: input.now().toISOString() });
    return output;
  } catch (error) {
    input.failedStages.push(stage);
    const errorMessage = error instanceof Error ? error.message : String(error);
    await stateStore?.saveFailedStage({ ...record, errorMessage, updatedAt: input.now().toISOString() });
    throw error;
  }
}
|
||||
|
||||
/** Indexes embedding updates by column id; a later update for the same id replaces the earlier vector. */
function embeddingsByColumnId(updates: KloEmbeddingUpdate[]): Map<string, number[]> {
  const vectors = new Map<string, number[]>();
  for (const update of updates) {
    vectors.set(update.columnId, update.embedding);
  }
  return vectors;
}
|
||||
|
||||
/**
 * Runs local scan enrichment for one connection.
 *
 * Steps, in order:
 * 1. Introspect the connector to obtain the schema snapshot.
 * 2. In `'enriched'` mode with providers: generate descriptions, then build
 *    column embeddings. Without providers a recoverable warning is recorded
 *    and both stages stay skipped.
 * 3. When relationship settings are enabled and the mode is `'relationships'`
 *    or `'enriched'` (or `detectRelationships` is set): run relationship
 *    discovery.
 *
 * Each stage goes through `runEnrichmentStage`, so a stage already completed
 * for the same run and input hash is resumed from the state store. A stage
 * failure propagates to the caller.
 */
export async function runLocalScanEnrichment(
  input: KloLocalScanEnrichmentInput,
): Promise<KloLocalScanEnrichmentResult> {
  const { connectionId, connector, context, mode, providers } = input;
  const progress = context.progress;

  await progress?.update(0, 'Loading enrichment schema snapshot');
  const snapshot = await connector.introspect(
    {
      connectionId,
      driver: connector.driver,
      mode,
      detectRelationships: input.detectRelationships,
    },
    context,
  );
  await progress?.update(0.05, `Loaded schema snapshot with ${snapshot.tables.length} tables`);

  const now = input.now ?? (() => new Date());
  const state = completedKloScanEnrichmentStateSummary();
  const syncId = input.syncId ?? context.runId;
  const relationshipSettings =
    input.relationshipSettings ?? buildDefaultKloProjectConfig(connectionId).scan.relationships;
  // One hash covers all stages of this run; a changed snapshot or setting invalidates every saved stage.
  const inputHash = computeKloScanEnrichmentInputHash({
    snapshot,
    mode,
    detectRelationships: input.detectRelationships ?? false,
    providerIdentity: input.providerIdentity ?? {},
    relationshipSettings,
  });

  // Binds the arguments every stage shares so each call site names only the stage and its work.
  const runStage = <TOutput>(stage: KloScanEnrichmentStage, compute: () => Promise<TOutput>): Promise<TOutput> =>
    runEnrichmentStage({
      stateStore: input.stateStore,
      runId: context.runId,
      connectionId,
      syncId,
      mode,
      stage,
      inputHash,
      now,
      resumedStages: state.resumedStages,
      completedStages: state.completedStages,
      failedStages: state.failedStages,
      compute,
    });

  const warnings: KloScanWarning[] = [];
  let descriptions: KloLocalScanEnrichmentResult['descriptionUpdates'] = [];
  let embeddingUpdates: KloEmbeddingUpdate[] = [];
  let schema = snapshotToKloEnrichedSchema(snapshot);
  const summary: KloScanEnrichmentSummary = { ...skippedKloScanEnrichmentSummary };
  const shouldDetectRelationships =
    relationshipSettings.enabled &&
    (mode === 'relationships' || mode === 'enriched' || (input.detectRelationships ?? false));

  if (mode === 'enriched') {
    if (!providers) {
      warnings.push(providerlessEnrichedWarning(shouldDetectRelationships));
    } else {
      const descriptionProgress = progress?.startPhase(0.45);
      descriptions = await runStage('descriptions', () =>
        generateDescriptions({
          snapshot,
          connector,
          context,
          providers,
          progress: descriptionProgress,
        }),
      );

      const embeddingProgress = progress?.startPhase(0.2);
      embeddingUpdates = await runStage('embeddings', async () => {
        const built = await buildEmbeddings({
          snapshot,
          providers,
          descriptions,
          progress: embeddingProgress,
        });
        return built.updates;
      });

      // Rebuild the schema so relationship discovery sees the column embeddings.
      schema = snapshotToKloEnrichedSchema(snapshot, embeddingsByColumnId(embeddingUpdates));
      summary.dataDictionary = connector.sampleColumn ? 'completed' : 'skipped';
      summary.tableDescriptions = 'completed';
      summary.columnDescriptions = 'completed';
      summary.embeddings = 'completed';
    }
  }

  let relationshipUpdate: KloRelationshipUpdate | null = null;
  let relationshipProfile: KloRelationshipProfileArtifact | null = null;
  let resolvedRelationships: KloResolvedRelationshipDiscoveryCandidate[] | null = null;
  let compositeRelationships: KloCompositeRelationshipCandidate[] | null = null;
  let relationships: KloScanRelationshipSummary = { accepted: 0, review: 0, rejected: 0, skipped: 0 };

  if (shouldDetectRelationships) {
    const relationshipProgress = progress?.startPhase(0.25);
    const detected = await runStage('relationships', async () => {
      await relationshipProgress?.update(0, 'Detecting relationships');
      const detection = await discoverKloRelationships({
        connectionId,
        driver: snapshot.driver,
        connector,
        schema,
        context,
        settings: relationshipSettings,
        llmProvider: providers?.llm ?? null,
      });

      await relationshipProgress?.update(
        1,
        `Relationship detection found ${detection.relationships.accepted} accepted, ${detection.relationships.review} review`,
      );
      return {
        relationshipUpdate: detection.relationshipUpdate,
        relationshipProfile: detection.profile,
        resolvedRelationships: detection.resolvedRelationships,
        compositeRelationships: detection.compositeRelationships,
        relationships: detection.relationships,
        statisticalValidation: detection.statisticalValidation,
        llmRelationshipValidation: detection.llmRelationshipValidation,
        warnings: detection.warnings,
      };
    });

    summary.deterministicRelationships = 'completed';
    summary.llmRelationshipValidation = detected.llmRelationshipValidation;
    summary.statisticalValidation = detected.statisticalValidation;
    ({ relationshipUpdate, relationshipProfile, resolvedRelationships, compositeRelationships, relationships } =
      detected);
    warnings.push(...detected.warnings);
  }

  await progress?.update(1, 'Enrichment complete');
  return {
    snapshot,
    summary,
    relationships,
    state: summarizeKloScanEnrichmentState(state),
    warnings,
    descriptionUpdates: descriptions,
    embeddingUpdates,
    relationshipUpdate,
    relationshipProfile,
    resolvedRelationships,
    compositeRelationships,
  };
}
|
||||
1494
packages/context/src/scan/local-scan.test.ts
Normal file
1494
packages/context/src/scan/local-scan.test.ts
Normal file
File diff suppressed because it is too large
Load diff
516
packages/context/src/scan/local-scan.ts
Normal file
516
packages/context/src/scan/local-scan.ts
Normal file
|
|
@ -0,0 +1,516 @@
|
|||
import type { createKloEmbeddingProvider, createKloLlmProvider } from '@klo/llm';
|
||||
import {
|
||||
createDefaultLocalIngestAdapters,
|
||||
getLocalStageOnlyIngestStatus,
|
||||
type LocalIngestRunRecord,
|
||||
runLocalStageOnlyIngest,
|
||||
type SourceAdapter,
|
||||
} from '../ingest/index.js';
|
||||
import {
|
||||
createLocalKloEmbeddingProviderFromConfig,
|
||||
createLocalKloLlmProviderFromConfig,
|
||||
KloScanEmbeddingPortAdapter,
|
||||
} from '../llm/index.js';
|
||||
import type { KloProjectLlmConfig, KloScanEnrichmentConfig, KloScanRelationshipConfig } from '../project/config.js';
|
||||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { kloLocalStateDbPath } from '../project/local-state-db.js';
|
||||
import { redactKloScanReport } from './credentials.js';
|
||||
import { completedKloScanEnrichmentStateSummary } from './enrichment-state.js';
|
||||
import { failedKloScanEnrichmentSummary, kloScanErrorMessage } from './enrichment-summary.js';
|
||||
import {
|
||||
createDeterministicLocalScanEnrichmentProviders,
|
||||
type KloLocalScanEnrichmentProviders,
|
||||
runLocalScanEnrichment,
|
||||
} from './local-enrichment.js';
|
||||
import { writeLocalScanEnrichmentArtifacts, writeLocalScanManifestShards } from './local-enrichment-artifacts.js';
|
||||
import { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
|
||||
import { SqliteLocalScanEnrichmentStateStore } from './sqlite-local-enrichment-state-store.js';
|
||||
import type {
|
||||
KloConnectionDriver,
|
||||
KloProgressPort,
|
||||
KloScanConnector,
|
||||
KloScanEnrichmentStateSummary,
|
||||
KloScanMode,
|
||||
KloScanReport,
|
||||
KloScanTrigger,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Options for running a local scan.
 *
 * NOTE(review): the function consuming these options is outside this view, so
 * member comments are limited to what the declared types show — confirm
 * semantics against the caller.
 */
export interface RunLocalScanOptions {
  /** Local project the scan runs against. */
  project: KloLocalProject;
  /** Identifier of the connection to scan. */
  connectionId: string;
  /** Requested scan mode. */
  mode?: KloScanMode;
  /** Requests relationship detection; a structural scan without it resolves no scan connector. */
  detectRelationships?: boolean;
  dryRun?: boolean;
  trigger?: KloScanTrigger;
  databaseIntrospectionUrl?: string;
  adapters?: SourceAdapter[];
  jobId?: string;
  /** Clock override. */
  now?: () => Date;
  /** Ready-made scan connector. */
  connector?: KloScanConnector;
  /** Factory producing a scan connector for a connection id; may be asynchronous. */
  createConnector?: (connectionId: string) => KloScanConnector | Promise<KloScanConnector>;
  enrichmentProviders?: KloLocalScanEnrichmentProviders | null;
  enrichmentStateStore?: SqliteLocalScanEnrichmentStateStore | null;
  progress?: KloProgressPort;
}
|
||||
|
||||
/** Result of a local scan run; by its types it always describes a finished run. */
export interface LocalScanRunResult {
  runId: string;
  /** Always the literal `'done'`. */
  status: 'done';
  /** Always `true`. */
  done: true;
  connectionId: string;
  mode: KloScanMode;
  dryRun: boolean;
  syncId: string;
  /** Scan report produced by the run. */
  report: KloScanReport;
}
|
||||
|
||||
/** Status view of a local scan run. */
export interface LocalScanStatusResponse {
  runId: string;
  /** Status value taken from the local ingest run record type. */
  status: LocalIngestRunRecord['status'];
  done: boolean;
  connectionId: string;
  mode: KloScanMode;
  dryRun: boolean;
  syncId: string;
  progress: number;
  startedAt: string;
  completedAt: string;
  /** Path of the scan report, or `null`. */
  reportPath: string | null;
  /** Warnings in the same shape as the scan report's `warnings`. */
  warnings: KloScanReport['warnings'];
}
|
||||
|
||||
/**
 * Optional overrides for local scan MCP integration.
 *
 * NOTE(review): the consumer is outside this view; confirm member semantics there.
 */
export interface LocalScanMcpOptions {
  adapters?: SourceAdapter[];
  databaseIntrospectionUrl?: string;
  /** Factory producing job ids. */
  jobIdFactory?: () => string;
  /** Clock override. */
  now?: () => Date;
  /** Factory producing a scan connector for a connection id; may be asynchronous. */
  createConnector?: (connectionId: string) => KloScanConnector | Promise<KloScanConnector>;
}
|
||||
|
||||
/** Adapter name used for ingest runs and as a path segment of the raw-sources directory. */
const LIVE_DATABASE_ADAPTER = 'live-database';
/** File name of the persisted scan report inside a sync's raw-sources directory. */
const SCAN_REPORT_FILE = 'scan-report.json';
/** Author name passed to the file store when the scan report is written. */
const LOCAL_AUTHOR = 'klo';
/** Author e-mail passed to the file store when the scan report is written. */
const LOCAL_AUTHOR_EMAIL = 'klo@example.com';
|
||||
|
||||
/**
 * Maps a configured driver name onto a supported scan driver.
 *
 * Matching is case-insensitive. `sqlite3` is folded into `sqlite`; every other
 * supported name (including `postgresql`) is returned lower-cased as-is.
 *
 * @param driver Driver name from the connection config, possibly missing.
 * @returns The normalized driver identifier.
 * @throws Error when the driver is missing or not in the supported list.
 */
function normalizeDriver(driver: string | undefined): KloConnectionDriver {
  const lowered = (driver ?? '').toLowerCase();
  switch (lowered) {
    case 'sqlite3':
      return 'sqlite';
    case 'postgres':
    case 'postgresql':
    case 'sqlite':
    case 'mysql':
    case 'clickhouse':
    case 'sqlserver':
    case 'bigquery':
    case 'snowflake':
    case 'posthog':
      return lowered;
    default:
      throw new Error(
        `Standalone klo scan supports postgres/postgresql/sqlite/mysql/clickhouse/sqlserver/bigquery/snowflake/posthog in this phase, received "${driver ?? 'unknown'}"`,
      );
  }
}
|
||||
|
||||
/**
 * Counts the paths that point at table artifacts, i.e. paths that start with
 * `tables/` and end with `.json`.
 */
function tablePathCount(paths: string[]): number {
  let tableFiles = 0;
  for (const candidate of paths) {
    if (candidate.startsWith('tables/') && candidate.endsWith('.json')) {
      tableFiles += 1;
    }
  }
  return tableFiles;
}
|
||||
|
||||
/** Builds the raw-sources directory path of one live-database sync: `raw-sources/<connection>/<adapter>/<sync>`. */
function rawSourcesDir(connectionId: string, syncId: string): string {
  const segments = ['raw-sources', connectionId, LIVE_DATABASE_ADAPTER, syncId];
  return segments.join('/');
}
|
||||
|
||||
/** Path of the scan report file inside the sync's raw-sources directory. */
function scanReportPath(connectionId: string, syncId: string): string {
  const directory = rawSourcesDir(connectionId, syncId);
  return [directory, SCAN_REPORT_FILE].join('/');
}
|
||||
|
||||
/**
 * Runtime guard for the scan mode.
 *
 * @throws Error when `mode` is not `structural`, `relationships` or `enriched`.
 */
function assertSupportedMode(mode: KloScanMode): void {
  switch (mode) {
    case 'structural':
    case 'relationships':
    case 'enriched':
      return;
    default:
      throw new Error(`Unsupported KLO scan mode: ${mode}`);
  }
}
|
||||
|
||||
/**
 * Picks the connector used for enrichment / relationship detection.
 *
 * A plain structural scan without relationship detection needs no connector
 * and resolves to `null`. Otherwise an explicit `options.connector` wins over
 * the `options.createConnector` factory.
 *
 * @throws Error when a connector is needed but neither source is provided.
 */
async function resolveScanConnector(options: RunLocalScanOptions, mode: KloScanMode): Promise<KloScanConnector | null> {
  const connectorNeeded = mode !== 'structural' || Boolean(options.detectRelationships);
  if (!connectorNeeded) {
    return null;
  }
  if (options.connector) {
    return options.connector;
  }
  if (!options.createConnector) {
    throw new Error('klo scan --enrich and --detect-relationships require a native standalone scan connector');
  }
  return options.createConnector(options.connectionId);
}
|
||||
|
||||
/**
 * Dependency overrides forwarded to the local LLM / embedding provider
 * factories by `createLocalScanEnrichmentProvidersFromConfig`.
 */
interface LocalScanEnrichmentProviderDeps {
  /** Replacement for the LLM provider factory. */
  createKloLlmProvider?: typeof createKloLlmProvider;
  /** Replacement for the embedding provider factory. */
  createKloEmbeddingProvider?: typeof createKloEmbeddingProvider;
  /** Environment variables made available to the factories (presumably instead of `process.env` — confirm in the factories). */
  env?: NodeJS.ProcessEnv;
}
|
||||
|
||||
/**
 * Builds the enrichment providers described by the project's scan config.
 *
 * - `deterministic` mode yields the deterministic provider set.
 * - `llm` mode requires an embeddings config and yields an LLM provider plus
 *   an embedding port adapter; if either provider cannot be created the
 *   result is `null`.
 * - Any other mode yields `null`.
 *
 * @param config Scan enrichment section of the project config.
 * @param llmConfig Project-level LLM configuration.
 * @param deps Optional factory/env overrides forwarded to the provider factories.
 */
export function createLocalScanEnrichmentProvidersFromConfig(
  config: KloScanEnrichmentConfig,
  llmConfig: KloProjectLlmConfig,
  deps: LocalScanEnrichmentProviderDeps = {},
): KloLocalScanEnrichmentProviders | null {
  switch (config.mode) {
    case 'deterministic':
      return createDeterministicLocalScanEnrichmentProviders();
    case 'llm':
      break;
    default:
      return null;
  }

  const embeddingsConfig = config.embeddings;
  if (!embeddingsConfig) {
    return null;
  }

  // Both providers are always requested before the availability check.
  const llm = createLocalKloLlmProviderFromConfig(llmConfig, deps);
  const embeddingProvider = createLocalKloEmbeddingProviderFromConfig(embeddingsConfig, deps);

  return llm && embeddingProvider
    ? { llm, embedding: new KloScanEmbeddingPortAdapter(embeddingProvider) }
    : null;
}
|
||||
|
||||
/**
 * Chooses the enrichment state store for a scan.
 *
 * Dry runs never get a store. Otherwise an explicitly supplied store
 * (including an explicit `null`) is used, and only an `undefined` option
 * falls back to the SQLite store at the project's local state DB path.
 */
function createLocalScanEnrichmentStateStore(options: RunLocalScanOptions): SqliteLocalScanEnrichmentStateStore | null {
  if (options.dryRun) {
    return null;
  }
  const supplied = options.enrichmentStateStore;
  return supplied !== undefined
    ? supplied
    : new SqliteLocalScanEnrichmentStateStore({ dbPath: kloLocalStateDbPath(options.project) });
}
|
||||
|
||||
/**
 * Summarises the provider-related configuration of a scan as a plain record.
 *
 * Missing values are reported as `null`; the gateway base URL is reported
 * only as a boolean. Property order is kept stable on purpose.
 */
function localScanProviderIdentity(
  config: KloScanEnrichmentConfig,
  llmConfig: KloProjectLlmConfig,
  relationships: KloScanRelationshipConfig,
): Record<string, unknown> {
  const embeddings = config.embeddings;
  const gatewayBaseUrl = llmConfig.provider.gateway?.base_url;
  const identity: Record<string, unknown> = {
    mode: config.mode,
    embeddingDimensions: embeddings?.dimensions ?? null,
    llmModel: llmConfig.models.default ?? null,
    embeddingModel: embeddings?.model ?? null,
    batchSize: embeddings?.batchSize ?? null,
    baseUrlConfigured: Boolean(gatewayBaseUrl),
    relationships,
  };
  return identity;
}
|
||||
|
||||
/**
 * Builds the baseline scan report for a finished stage-only ingest run.
 *
 * The report starts with every enrichment stage marked `'skipped'`, zeroed
 * structural sync stats and relationship counts, and empty warning /
 * capability-gap lists; later scan phases fill these in. For dry runs both
 * the raw-sources directory and the report path are `null`.
 *
 * @param input.record Ingest run record supplying ids and diff paths.
 * @param input.driver Normalized connection driver.
 * @param input.mode Effective scan mode.
 * @param input.dryRun Whether the scan is a dry run.
 * @param input.trigger Trigger recorded in the report.
 * @param input.createdAt ISO timestamp recorded as the report creation time.
 */
function reportFromIngest(input: {
  record: LocalIngestRunRecord;
  driver: KloConnectionDriver;
  mode: KloScanMode;
  dryRun: boolean;
  trigger: KloScanTrigger;
  createdAt: string;
}): KloScanReport {
  const { record, dryRun } = input;
  const reportPath = dryRun ? null : scanReportPath(record.connectionId, record.syncId);
  return {
    connectionId: record.connectionId,
    driver: input.driver,
    syncId: record.syncId,
    runId: record.runId,
    trigger: input.trigger,
    mode: input.mode,
    dryRun,
    artifactPaths: {
      rawSourcesDir: dryRun ? null : rawSourcesDir(record.connectionId, record.syncId),
      reportPath,
      manifestShards: [],
      enrichmentArtifacts: [],
    },
    // Shared with the status/report readers so the table counts cannot drift apart.
    diffSummary: scanDiffSummaryFromRecord(record),
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 0,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'skipped',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'skipped',
    },
    capabilityGaps: [],
    warnings: [],
    relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 },
    enrichmentState: completedKloScanEnrichmentStateSummary(),
    createdAt: input.createdAt,
  };
}
|
||||
|
||||
/**
 * Persists the scan report as pretty-printed JSON at its own
 * `artifactPaths.reportPath`. Does nothing when the report has no path
 * (as is the case for dry runs).
 */
async function writeScanReport(project: KloLocalProject, report: KloScanReport): Promise<void> {
  const { reportPath } = report.artifactPaths;
  if (!reportPath) {
    return;
  }
  const serialized = `${JSON.stringify(report, null, 2)}\n`;
  const commitMessage = `scan(${LIVE_DATABASE_ADAPTER}): ${report.runId} syncId=${report.syncId}`;
  await project.fileStore.writeFile(reportPath, serialized, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, commitMessage);
}
|
||||
|
||||
/**
 * Derives the report's diff summary from an ingest record's diff paths.
 * Only table-level counts are computed; the column counters are always 0 here.
 */
function scanDiffSummaryFromRecord(record: LocalIngestRunRecord): KloScanReport['diffSummary'] {
  const { added, modified, deleted, unchanged } = record.diffPaths;
  return {
    tablesAdded: tablePathCount(added),
    tablesModified: tablePathCount(modified),
    tablesDeleted: tablePathCount(deleted),
    tablesUnchanged: tablePathCount(unchanged),
    columnsAdded: 0,
    columnsModified: 0,
    columnsDeleted: 0,
  };
}
|
||||
|
||||
/**
 * True when the run has a predecessor and its diff summary reports no
 * added, modified or deleted entries.
 */
function hasNoContentChanges(record: LocalIngestRunRecord): boolean {
  if (record.previousRunId === null) {
    return false;
  }
  const { added, modified, deleted } = record.diffSummary;
  return added === 0 && modified === 0 && deleted === 0;
}
|
||||
|
||||
/**
 * Formats the progress message describing how many tables changed out of
 * the total number of tables seen, with singular/plural nouns.
 */
function scanChangeSummary(diffSummary: KloScanReport['diffSummary']): string {
  const { tablesAdded, tablesModified, tablesDeleted, tablesUnchanged } = diffSummary;
  const changed = tablesAdded + tablesModified + tablesDeleted;
  const total = changed + tablesUnchanged;
  return [
    'Semantic layer comparison found',
    `${changed}`,
    changed === 1 ? 'change' : 'changes',
    'across',
    `${total}`,
    total === 1 ? 'table' : 'tables',
  ].join(' ');
}
|
||||
|
||||
/**
 * Loads the persisted scan report of a sync.
 *
 * Any failure while reading or parsing the file is treated as "no report"
 * and yields `null`. The parsed JSON is not validated against the report shape.
 */
async function readScanReport(
  project: KloLocalProject,
  connectionId: string,
  syncId: string,
): Promise<KloScanReport | null> {
  try {
    const { content } = await project.fileStore.readFile(scanReportPath(connectionId, syncId));
    const parsed: unknown = JSON.parse(content);
    return parsed as KloScanReport;
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Runs a local KLO scan for one configured connection.
 *
 * Phases, in order:
 * 1. Validate the mode, resolve the optional connector and the connection's driver.
 * 2. Run the stage-only live-database ingest and build the baseline report.
 * 3. For connector-less, non-dry runs without content changes, reuse the
 *    artifact paths, warnings and enrichment fields of the stored report for
 *    the same sync (when its mode and dry-run flag match); such a report is
 *    not rewritten.
 * 4. Otherwise, for non-dry runs, rebuild the structural snapshot from the raw
 *    sources and write the manifest shards.
 * 5. With a connector, run enrichment and write its artifacts. An enrichment
 *    failure does not fail the scan: it is recorded as a recoverable
 *    `enrichment_failed` warning and the stage state is rebuilt from the
 *    state store when one exists.
 * 6. Redact the report, persist it (unless reused) and return the result.
 *
 * @param options Scan inputs; see {@link RunLocalScanOptions}.
 * @returns The finished run's ids, effective mode/dry-run flag and redacted report.
 * @throws Error for an unsupported mode, a missing connector when one is
 *   required, an unconfigured connection or an unsupported driver; errors from
 *   ingest, manifest writing and report writing propagate as well.
 */
export async function runLocalScan(options: RunLocalScanOptions): Promise<LocalScanRunResult> {
  const mode = options.mode ?? 'structural';
  assertSupportedMode(mode);
  const dryRun = options.dryRun ?? false;
  const { project, connectionId, progress } = options;

  await progress?.update(0.05, 'Preparing scan');
  // Non-null exactly when the scan needs enrichment or relationship detection.
  const connector = await resolveScanConnector(options, mode);

  const connection = project.config.connections[connectionId];
  if (!connection) {
    throw new Error(`Connection "${connectionId}" is not configured in klo.yaml`);
  }
  const driver = normalizeDriver(connection.driver);
  const adapters =
    options.adapters ??
    createDefaultLocalIngestAdapters(project, { databaseIntrospectionUrl: options.databaseIntrospectionUrl });

  let enrichmentProviders: KloLocalScanEnrichmentProviders | null = null;
  if (connector) {
    // An explicit `null` from the caller is respected; only `undefined` falls back to the config.
    enrichmentProviders =
      options.enrichmentProviders !== undefined
        ? options.enrichmentProviders
        : createLocalScanEnrichmentProvidersFromConfig(project.config.scan.enrichment, project.config.llm);
  }

  await progress?.update(0.15, 'Inspecting database schema');
  const record = await runLocalStageOnlyIngest({
    project,
    adapters,
    adapter: LIVE_DATABASE_ADAPTER,
    connectionId,
    trigger: 'manual_resync',
    jobId: options.jobId,
    now: options.now,
    dryRun: options.dryRun,
  });
  await progress?.update(0.55, scanChangeSummary(scanDiffSummaryFromRecord(record)));

  const draft = reportFromIngest({
    record,
    driver,
    mode,
    dryRun,
    trigger: options.trigger ?? 'cli',
    createdAt: (options.now?.() ?? new Date()).toISOString(),
  });

  // Only connector-less, persisted scans with no content changes can reuse a stored report.
  const existingReport =
    !dryRun && !connector && hasNoContentChanges(record)
      ? await readScanReport(project, record.connectionId, record.syncId)
      : null;
  let reusedExistingScanArtifacts = false;
  if (existingReport && existingReport.mode === mode && existingReport.dryRun === dryRun) {
    draft.artifactPaths = existingReport.artifactPaths;
    draft.capabilityGaps = existingReport.capabilityGaps;
    draft.warnings = existingReport.warnings;
    draft.relationships = existingReport.relationships;
    draft.enrichment = existingReport.enrichment;
    draft.enrichmentState = existingReport.enrichmentState;
    reusedExistingScanArtifacts = true;
  }

  const enrichmentStateStore = connector ? createLocalScanEnrichmentStateStore(options) : null;

  const stagedRawSourcesDir = draft.artifactPaths.rawSourcesDir;
  if (!reusedExistingScanArtifacts && !dryRun && stagedRawSourcesDir) {
    await progress?.update(0.7, 'Writing schema artifacts');
    const structuralSnapshot = await readLocalScanStructuralSnapshot({
      project,
      connectionId,
      driver,
      rawSourcesDir: stagedRawSourcesDir,
      extractedAtFallback: draft.createdAt,
    });
    const manifestArtifacts = await writeLocalScanManifestShards({
      project,
      connectionId,
      syncId: record.syncId,
      driver,
      snapshot: structuralSnapshot,
      dryRun: false,
    });
    draft.artifactPaths.manifestShards = manifestArtifacts.manifestShards;
    draft.manifestShardsWritten = manifestArtifacts.manifestShardsWritten;
  }

  if (connector) {
    const relationshipSettings = project.config.scan.relationships;
    try {
      await progress?.update(
        0.82,
        mode === 'relationships' || options.detectRelationships
          ? 'Detecting relationships'
          : 'Enriching schema metadata',
      );
      const enrichment = await runLocalScanEnrichment({
        connectionId,
        mode,
        detectRelationships: options.detectRelationships,
        connector,
        context: { runId: record.runId, progress: progress?.startPhase(0.18) },
        providers: enrichmentProviders,
        stateStore: enrichmentStateStore,
        syncId: record.syncId,
        providerIdentity: localScanProviderIdentity(
          project.config.scan.enrichment,
          project.config.llm,
          relationshipSettings,
        ),
        relationshipSettings,
        now: options.now,
      });
      const artifacts = await writeLocalScanEnrichmentArtifacts({
        project,
        connectionId,
        syncId: record.syncId,
        driver,
        enrichment,
        dryRun,
        relationshipSettings,
      });
      draft.enrichment = enrichment.summary;
      draft.relationships = enrichment.relationships;
      draft.enrichmentState = enrichment.state;
      draft.warnings.push(...enrichment.warnings);
      draft.artifactPaths.enrichmentArtifacts = artifacts.enrichmentArtifacts;
      draft.artifactPaths.manifestShards = artifacts.manifestShards;
      draft.manifestShardsWritten = artifacts.manifestShardsWritten;
    } catch (error) {
      // Enrichment is best-effort: the structural result stays valid, so downgrade to a warning.
      const message = kloScanErrorMessage(error);
      draft.enrichment = failedKloScanEnrichmentSummary(mode, options.detectRelationships ?? false);
      const stages = await enrichmentStateStore?.listRunStages(record.runId);
      if (stages) {
        // Rebuild the state summary from what the store recorded for this run;
        // every stage that did not complete is reported as failed.
        const recordedState = completedKloScanEnrichmentStateSummary();
        for (const stage of stages) {
          if (stage.status === 'completed') {
            recordedState.completedStages.push(stage.stage);
          } else {
            recordedState.failedStages.push(stage.stage);
          }
        }
        draft.enrichmentState = recordedState;
      }
      draft.warnings.push({
        code: 'enrichment_failed',
        message: `KLO scan enrichment failed after structural scan completed: ${message}`,
        recoverable: true,
        metadata: { mode, detectRelationships: options.detectRelationships ?? false },
      });
    }
  }

  const report = redactKloScanReport(draft);
  if (!reusedExistingScanArtifacts) {
    await writeScanReport(project, report);
  }
  await progress?.update(1, 'Scan completed');

  return {
    runId: record.runId,
    status: 'done',
    done: true,
    connectionId: record.connectionId,
    mode,
    dryRun,
    syncId: record.syncId,
    report,
  };
}
|
||||
|
||||
/**
 * Loads the stored scan report that belongs to an ingest run.
 *
 * Resolves to `null` when the run is unknown, was not produced by the
 * live-database adapter, or has no readable report. The returned report's
 * `runId`, `syncId` and `diffSummary` are taken from the run status rather
 * than from the stored file.
 */
export async function getLocalScanReport(project: KloLocalProject, runId: string): Promise<KloScanReport | null> {
  const status = await getLocalStageOnlyIngestStatus(project, runId);
  if (!status) {
    return null;
  }
  if (status.adapter !== LIVE_DATABASE_ADAPTER) {
    return null;
  }
  const stored = await readScanReport(project, status.connectionId, status.syncId);
  return stored
    ? {
        ...stored,
        runId: status.runId,
        syncId: status.syncId,
        diffSummary: scanDiffSummaryFromRecord(status),
      }
    : null;
}
|
||||
|
||||
/**
 * Returns the status of a live-database scan run, combined with details from
 * its stored scan report when one is readable.
 *
 * Resolves to `null` when the run is unknown or belongs to another adapter.
 * Without a readable report, `mode` falls back to `'structural'`, `dryRun` to
 * `false`, `reportPath` to `null` and `warnings` to an empty list.
 *
 * @param project Local project holding the run records and artifacts.
 * @param runId Ingest run id to look up.
 */
export async function getLocalScanStatus(
  project: KloLocalProject,
  runId: string,
): Promise<LocalScanStatusResponse | null> {
  const status = await getLocalStageOnlyIngestStatus(project, runId);
  if (!status || status.adapter !== LIVE_DATABASE_ADAPTER) {
    return null;
  }
  // Read the stored report directly: the status was already resolved and
  // validated above, and only fields that come straight from the stored file
  // are used below, so a second status lookup would add nothing.
  const report = await readScanReport(project, status.connectionId, status.syncId);
  return {
    runId: status.runId,
    status: status.status,
    done: status.done,
    connectionId: status.connectionId,
    mode: report?.mode ?? 'structural',
    dryRun: report?.dryRun ?? false,
    syncId: status.syncId,
    progress: status.progress,
    startedAt: status.startedAt,
    completedAt: status.completedAt,
    reportPath: report?.artifactPaths.reportPath ?? null,
    warnings: report?.warnings ?? [],
  };
}
|
||||
196
packages/context/src/scan/local-structural-artifacts.test.ts
Normal file
196
packages/context/src/scan/local-structural-artifacts.test.ts
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKloProject, type KloLocalProject } from '../project/index.js';
|
||||
import { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js';
|
||||
|
||||
describe('readLocalScanStructuralSnapshot', () => {
  const AUTHOR = 'klo';
  const AUTHOR_EMAIL = 'klo@example.com';

  let tempDir: string;
  let project: KloLocalProject;

  /** Seeds `value` as pretty-printed JSON (2-space indent, trailing newline). */
  async function seedJson(path: string, value: unknown, message: string): Promise<void> {
    await project.fileStore.writeFile(path, `${JSON.stringify(value, null, 2)}\n`, AUTHOR, AUTHOR_EMAIL, message);
  }

  /** Seeds pre-serialized file content verbatim. */
  async function seedRaw(path: string, content: string, message: string): Promise<void> {
    await project.fileStore.writeFile(path, content, AUTHOR, AUTHOR_EMAIL, message);
  }

  /** Non-nullable INTEGER column fixture; property order matches the persisted artifacts. */
  const integerColumn = (name: string, primaryKey: boolean, comment: string | null) => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey,
    comment,
  });

  /** Fresh copy of the orders.customer_id -> customers.id foreign key fixture. */
  const ordersToCustomers = () => ({
    fromColumn: 'customer_id',
    toCatalog: null,
    toDb: 'public',
    toTable: 'customers',
    toColumn: 'id',
    constraintName: null,
  });

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-local-structural-artifacts-'));
    project = await initKloProject({
      projectDir: join(tempDir, 'project'),
      projectName: 'warehouse',
    });
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('rebuilds a canonical snapshot from persisted live-database raw files', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-1';

    await seedJson(
      `${rawRoot}/connection.json`,
      {
        connectionId: 'warehouse',
        extractedAt: '2026-04-29T12:00:00.000Z',
        metadata: { source: 'sqlite-smoke' },
        tableCount: 2,
      },
      'Seed connection artifact',
    );
    await seedJson(
      `${rawRoot}/tables/customers.json`,
      {
        name: 'customers',
        catalog: null,
        db: 'public',
        kind: 'table',
        comment: 'Customer table',
        estimatedRows: 12,
        columns: [integerColumn('id', true, 'Customer id')],
        foreignKeys: [],
      },
      'Seed customers artifact',
    );
    await seedJson(
      `${rawRoot}/tables/orders.json`,
      {
        name: 'orders',
        catalog: null,
        db: 'public',
        kind: 'table',
        comment: null,
        estimatedRows: 20,
        columns: [integerColumn('id', true, null), integerColumn('customer_id', false, null)],
        foreignKeys: [ordersToCustomers()],
      },
      'Seed orders artifact',
    );

    const snapshot = await readLocalScanStructuralSnapshot({
      project,
      connectionId: 'warehouse',
      driver: 'sqlite',
      rawSourcesDir: rawRoot,
      extractedAtFallback: '2026-04-29T13:00:00.000Z',
    });

    expect(snapshot).toMatchObject({
      connectionId: 'warehouse',
      driver: 'sqlite',
      extractedAt: '2026-04-29T12:00:00.000Z',
      metadata: { source: 'sqlite-smoke' },
      tables: [
        {
          db: 'public',
          name: 'customers',
          comment: 'Customer table',
          columns: [integerColumn('id', true, 'Customer id')],
        },
        {
          db: 'public',
          name: 'orders',
          foreignKeys: [ordersToCustomers()],
        },
      ],
    });
  });

  it('uses the scan report timestamp when connection.json omits extractedAt', async () => {
    const rawRoot = 'raw-sources/warehouse/live-database/sync-2';

    await seedRaw(
      `${rawRoot}/connection.json`,
      '{"connectionId":"warehouse","metadata":{}}\n',
      'Seed connection artifact without extractedAt',
    );
    await seedRaw(
      `${rawRoot}/tables/orders.json`,
      '{"name":"orders","catalog":null,"db":null,"kind":"table","comment":null,"estimatedRows":null,"columns":[{"name":"id","nativeType":"integer","normalizedType":"integer","dimensionType":"number","nullable":false,"primaryKey":true,"comment":null}],"foreignKeys":[]}\n',
      'Seed orders artifact',
    );

    const snapshot = await readLocalScanStructuralSnapshot({
      project,
      connectionId: 'warehouse',
      driver: 'postgres',
      rawSourcesDir: rawRoot,
      extractedAtFallback: '2026-04-29T13:00:00.000Z',
    });

    expect(snapshot.extractedAt).toBe('2026-04-29T13:00:00.000Z');
  });
});
|
||||
125
packages/context/src/scan/local-structural-artifacts.ts
Normal file
125
packages/context/src/scan/local-structural-artifacts.ts
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import type {
|
||||
KloConnectionDriver,
|
||||
KloSchemaColumn,
|
||||
KloSchemaForeignKey,
|
||||
KloSchemaSnapshot,
|
||||
KloSchemaTable,
|
||||
} from './types.js';
|
||||
|
||||
/** Inputs of {@link readLocalScanStructuralSnapshot}. */
export interface ReadLocalScanStructuralSnapshotInput {
  /** Project whose file store holds the persisted raw sources. */
  project: KloLocalProject;
  /** Used as the snapshot's connection id when `connection.json` does not carry a string one. */
  connectionId: string;
  /** Driver recorded on the snapshot. */
  driver: KloConnectionDriver;
  /** Directory containing `connection.json` and the `tables/` folder. */
  rawSourcesDir: string;
  /** Used as `extractedAt` when `connection.json` does not carry a string timestamp. */
  extractedAtFallback: string;
}
|
||||
|
||||
/**
 * Loosely typed view of a parsed `connection.json`. Every field is `unknown`
 * because the file content is not trusted and is narrowed at the point of use.
 */
interface LiveDatabaseConnectionArtifact {
  /** Expected to be a string; otherwise the caller's connection id is used. */
  connectionId?: unknown;
  /** Expected to be a string timestamp; otherwise the fallback is used. */
  extractedAt?: unknown;
  /** Expected to be a plain object; otherwise `{}` is used. */
  metadata?: unknown;
  /** Expected to be a plain object; otherwise `{}` is used. */
  scope?: unknown;
}
|
||||
|
||||
/** Type guard for plain object-like values: objects that are neither `null` nor arrays. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
|
||||
|
||||
/** Returns `value` itself when it is a plain record, otherwise a new empty object. */
function metadataRecord(value: unknown): Record<string, unknown> {
  if (isRecord(value)) {
    return value;
  }
  return {};
}
|
||||
|
||||
/**
 * Normalizes an untrusted optional string field: `undefined` stays
 * `undefined`, strings pass through, and anything else becomes `null`.
 */
function optionalStringOrNull(value: unknown): string | null | undefined {
  switch (typeof value) {
    case 'undefined':
      return undefined;
    case 'string':
      return value;
    default:
      return null;
  }
}
|
||||
|
||||
/**
 * Validates one raw column entry of a table artifact and converts it to a
 * schema column.
 *
 * `name`, `nativeType` and `normalizedType` must be strings and
 * `dimensionType` one of `time`, `string`, `number`, `boolean`. `nullable`
 * and `primaryKey` are true only for a literal `true`; a missing or
 * non-string `comment` becomes `null`.
 *
 * @param rawColumn Untrusted parsed JSON value.
 * @param path Artifact path, used only in the error message.
 * @throws Error when the entry fails validation.
 */
function parseColumn(rawColumn: unknown, path: string): KloSchemaColumn {
  const invalid = (): Error => new Error(`Invalid KLO schema column artifact: ${path}`);
  if (!isRecord(rawColumn)) {
    throw invalid();
  }
  const { name, nativeType, normalizedType, dimensionType } = rawColumn;
  if (typeof name !== 'string' || typeof nativeType !== 'string' || typeof normalizedType !== 'string') {
    throw invalid();
  }
  if (
    dimensionType !== 'time' &&
    dimensionType !== 'string' &&
    dimensionType !== 'number' &&
    dimensionType !== 'boolean'
  ) {
    throw invalid();
  }
  return {
    name,
    nativeType,
    normalizedType,
    dimensionType,
    nullable: rawColumn.nullable === true,
    primaryKey: rawColumn.primaryKey === true,
    comment: optionalStringOrNull(rawColumn.comment) ?? null,
  };
}
|
||||
|
||||
/**
 * Validates one raw foreign-key entry of a table artifact and converts it to
 * a schema foreign key.
 *
 * `fromColumn`, `toTable` and `toColumn` must be strings; `toCatalog`, `toDb`
 * and `constraintName` become `null` when missing or not strings.
 *
 * @param rawForeignKey Untrusted parsed JSON value.
 * @param path Artifact path, used only in the error message.
 * @throws Error when the entry fails validation.
 */
function parseForeignKey(rawForeignKey: unknown, path: string): KloSchemaForeignKey {
  const invalid = (): Error => new Error(`Invalid KLO schema foreign key artifact: ${path}`);
  if (!isRecord(rawForeignKey)) {
    throw invalid();
  }
  const { fromColumn, toTable, toColumn } = rawForeignKey;
  if (typeof fromColumn !== 'string' || typeof toTable !== 'string' || typeof toColumn !== 'string') {
    throw invalid();
  }
  return {
    fromColumn,
    toCatalog: optionalStringOrNull(rawForeignKey.toCatalog) ?? null,
    toDb: optionalStringOrNull(rawForeignKey.toDb) ?? null,
    toTable,
    toColumn,
    constraintName: optionalStringOrNull(rawForeignKey.constraintName) ?? null,
  };
}
|
||||
|
||||
/**
 * Parses the JSON text of one table artifact into a schema table.
 *
 * The document must be an object with a string `name` and an array `columns`.
 * An unrecognised `kind` is treated as `'table'`; a non-numeric
 * `estimatedRows` becomes `null`; a missing or non-array `foreignKeys`
 * becomes an empty list.
 *
 * @param raw JSON text of the artifact.
 * @param path Artifact path, used in error messages.
 * @throws SyntaxError when `raw` is not valid JSON.
 * @throws Error when the table, a column or a foreign key fails validation.
 */
function parseTable(raw: string, path: string): KloSchemaTable {
  const parsed: unknown = JSON.parse(raw);
  if (!isRecord(parsed)) {
    throw new Error(`Invalid KLO schema table artifact: ${path}`);
  }
  const { name, kind, estimatedRows, columns, foreignKeys } = parsed;
  if (typeof name !== 'string' || !Array.isArray(columns)) {
    throw new Error(`Invalid KLO schema table artifact: ${path}`);
  }
  const tableKind = kind === 'view' || kind === 'external' || kind === 'event_stream' ? kind : 'table';
  const rowEstimate = typeof estimatedRows === 'number' ? estimatedRows : null;
  return {
    catalog: optionalStringOrNull(parsed.catalog) ?? null,
    db: optionalStringOrNull(parsed.db) ?? null,
    name,
    kind: tableKind,
    comment: optionalStringOrNull(parsed.comment) ?? null,
    estimatedRows: rowEstimate,
    // Columns are validated before foreign keys, so a column error surfaces first.
    columns: columns.map((column) => parseColumn(column, path)),
    foreignKeys: Array.isArray(foreignKeys)
      ? foreignKeys.map((foreignKey) => parseForeignKey(foreignKey, path))
      : [],
  };
}
|
||||
|
||||
/**
 * Rebuilds a schema snapshot from the raw files persisted by a live-database
 * ingest run.
 *
 * Reads `<rawSourcesDir>/connection.json` and every `.json` file listed under
 * `<rawSourcesDir>/tables`, in sorted path order. Connection-level fields are
 * best-effort: a missing or wrongly typed `connectionId`, `extractedAt`,
 * `scope` or `metadata` falls back to the input value or an empty object. A
 * `connection.json` whose top-level value is not an object (for example
 * `null`) is treated the same way instead of raising a TypeError.
 *
 * @param input Project, identifiers and fallbacks; see
 *   {@link ReadLocalScanStructuralSnapshotInput}.
 * @returns The reconstructed snapshot with tables ordered by artifact path.
 * @throws When a file cannot be read, `connection.json` or a table file is not
 *   valid JSON, or a table artifact fails validation.
 */
export async function readLocalScanStructuralSnapshot(
  input: ReadLocalScanStructuralSnapshotInput,
): Promise<KloSchemaSnapshot> {
  const connectionRaw = await input.project.fileStore.readFile(`${input.rawSourcesDir}/connection.json`);
  // JSON.parse may legally yield null, an array or a primitive; narrow before reading fields.
  const parsedConnection: unknown = JSON.parse(connectionRaw.content);
  const connection: LiveDatabaseConnectionArtifact = isRecord(parsedConnection) ? parsedConnection : {};
  const listedTables = await input.project.fileStore.listFiles(`${input.rawSourcesDir}/tables`);
  // Sorting makes the table order independent of the listing order.
  const tablePaths = listedTables.files.filter((path) => path.endsWith('.json')).sort();

  const tables: KloSchemaTable[] = [];
  for (const path of tablePaths) {
    const tableRaw = await input.project.fileStore.readFile(path);
    tables.push(parseTable(tableRaw.content, path));
  }

  return {
    connectionId: typeof connection.connectionId === 'string' ? connection.connectionId : input.connectionId,
    driver: input.driver,
    extractedAt: typeof connection.extractedAt === 'string' ? connection.extractedAt : input.extractedAtFallback,
    scope: isRecord(connection.scope) ? connection.scope : {},
    metadata: metadataRecord(connection.metadata),
    tables,
  };
}
|
||||
376
packages/context/src/scan/orchestrator.test.ts
Normal file
376
packages/context/src/scan/orchestrator.test.ts
Normal file
|
|
@ -0,0 +1,376 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import {
|
||||
createKloConnectorCapabilities,
|
||||
type KloScanConnector,
|
||||
type KloScanContext,
|
||||
type KloScanEnrichmentStateSummary,
|
||||
type KloScanInput,
|
||||
KloScanOrchestrator,
|
||||
type KloSchemaSnapshot,
|
||||
} from './index.js';
|
||||
|
||||
/** Builds a fresh single-table (`public.orders`) postgres snapshot fixture. */
function snapshot(): KloSchemaSnapshot {
  const idColumn: KloSchemaSnapshot['tables'][number]['columns'][number] = {
    name: 'id',
    nativeType: 'integer',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: true,
    comment: 'Order id',
  };
  const ordersTable: KloSchemaSnapshot['tables'][number] = {
    catalog: null,
    db: 'public',
    name: 'orders',
    kind: 'table',
    comment: 'Orders table',
    estimatedRows: null,
    columns: [idColumn],
    foreignKeys: [],
  };
  return {
    connectionId: 'warehouse',
    driver: 'postgres',
    extractedAt: '2026-04-29T00:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: { source: 'test' },
    tables: [ordersTable],
  };
}
|
||||
|
||||
/**
 * Builds a postgres scan connector fixture whose `introspect` is a mock
 * resolving to a fresh `snapshot()`. Capabilities default to table and
 * column sampling.
 */
function connector(
  capabilities = createKloConnectorCapabilities({ tableSampling: true, columnSampling: true }),
): KloScanConnector {
  const introspect = vi.fn(async () => snapshot());
  return { id: 'connector-1', driver: 'postgres', capabilities, introspect };
}
|
||||
|
||||
/** Builds a scan context fixture with run id `scan-run-1` and a fully mocked logger. */
function context(): KloScanContext {
  const logger = {
    debug: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
  };
  return { runId: 'scan-run-1', logger };
}
|
||||
|
||||
/** Baseline structural scan input shared by the tests; individual tests spread and override it. */
const input: KloScanInput = { connectionId: 'warehouse', driver: 'postgres', mode: 'structural' };
|
||||
|
||||
describe('KloScanOrchestrator', () => {
|
||||
it('runs structural scans through connector introspection and structural host callback', async () => {
|
||||
const scanConnector = connector();
|
||||
const scanContext = context();
|
||||
const runStructural = vi.fn(async (scanSnapshot: KloSchemaSnapshot) => ({
|
||||
result: { synced: true },
|
||||
diffSummary: { tablesAdded: scanSnapshot.tables.length, columnsAdded: 1 },
|
||||
structuralSyncStats: { tablesCreated: 1, columnsCreated: 1 },
|
||||
artifactPaths: { manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'] },
|
||||
}));
|
||||
|
||||
const result = await new KloScanOrchestrator({
|
||||
now: () => new Date('2026-04-29T00:10:00.000Z'),
|
||||
syncIdFactory: () => 'sync-1',
|
||||
}).run({
|
||||
connector: scanConnector,
|
||||
input,
|
||||
trigger: 'schema_scan',
|
||||
context: scanContext,
|
||||
runStructural,
|
||||
});
|
||||
|
||||
expect(scanConnector.introspect).toHaveBeenCalledWith(input, scanContext);
|
||||
expect(runStructural).toHaveBeenCalledWith(snapshot(), scanContext);
|
||||
expect(result.snapshot.connectionId).toBe('warehouse');
|
||||
expect(result.structural.result).toEqual({ synced: true });
|
||||
expect(result.enrichment).toBeNull();
|
||||
expect(result.report).toMatchObject({
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
syncId: 'sync-1',
|
||||
runId: 'scan-run-1',
|
||||
trigger: 'schema_scan',
|
||||
mode: 'structural',
|
||||
dryRun: false,
|
||||
diffSummary: {
|
||||
tablesAdded: 1,
|
||||
columnsAdded: 1,
|
||||
},
|
||||
structuralSyncStats: {
|
||||
tablesCreated: 1,
|
||||
columnsCreated: 1,
|
||||
},
|
||||
manifestShardsWritten: 1,
|
||||
artifactPaths: {
|
||||
manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
|
||||
},
|
||||
enrichment: {
|
||||
dataDictionary: 'skipped',
|
||||
columnDescriptions: 'skipped',
|
||||
tableDescriptions: 'skipped',
|
||||
embeddings: 'skipped',
|
||||
deterministicRelationships: 'skipped',
|
||||
llmRelationshipValidation: 'skipped',
|
||||
statisticalValidation: 'skipped',
|
||||
},
|
||||
enrichmentState: {
|
||||
resumedStages: [],
|
||||
completedStages: [],
|
||||
failedStages: [],
|
||||
},
|
||||
createdAt: '2026-04-29T00:10:00.000Z',
|
||||
});
|
||||
});
|
||||
|
||||
// Happy path for an enriched scan: both host callbacks run and their partial
// summaries are folded into the final report.
it('runs enriched scans through structural and enrichment host callbacks', async () => {
  const fullCapabilities = createKloConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    columnStats: true,
    readOnlySql: true,
  });
  const scanConnector = connector(fullCapabilities);
  const scanContext = context();

  // Structural phase reports only one counter; the rest must default.
  const runStructural = vi.fn(async () => ({
    result: { schemaId: 'schema-1' },
    structuralSyncStats: { tablesCreated: 1 },
  }));
  // Enrichment phase omits `llmRelationshipValidation` and the
  // `review`/`skipped` relationship counters on purpose.
  const runEnrichment = vi.fn(async () => ({
    result: { enriched: true },
    enrichment: {
      dataDictionary: 'completed',
      columnDescriptions: 'completed',
      tableDescriptions: 'completed',
      embeddings: 'completed',
      deterministicRelationships: 'completed',
      statisticalValidation: 'completed',
    } as const,
    relationships: { accepted: 2, rejected: 1 },
  }));

  const orchestrator = new KloScanOrchestrator({ syncIdFactory: () => 'sync-2' });
  const { enrichment, report } = await orchestrator.run({
    connector: scanConnector,
    input: { ...input, mode: 'enriched', detectRelationships: true },
    trigger: 'schema_scan',
    context: scanContext,
    runStructural,
    runEnrichment,
  });

  expect(enrichment?.result).toEqual({ enriched: true });
  expect(report.enrichment.columnDescriptions).toBe('completed');
  expect(report.relationships).toEqual({ accepted: 2, review: 0, rejected: 1, skipped: 0 });
  expect(report.capabilityGaps).toEqual([]);
  expect(report.warnings).toEqual([]);
});
|
||||
|
||||
// The enrichment callback returns unsorted stage lists with a duplicate; the
// report is expected to expose them sorted and de-duplicated.
it('reports host enrichment state summaries from enriched scan phases', async () => {
  const fullCapabilities = createKloConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    columnStats: true,
    readOnlySql: true,
  });
  const scanConnector = connector(fullCapabilities);
  const enrichmentState: Partial<KloScanEnrichmentStateSummary> = {
    resumedStages: ['relationships', 'descriptions', 'descriptions'],
    completedStages: ['embeddings', 'descriptions', 'relationships'],
    failedStages: [],
  };

  const orchestrator = new KloScanOrchestrator({ syncIdFactory: () => 'sync-state' });
  const { report } = await orchestrator.run({
    connector: scanConnector,
    input: { ...input, mode: 'enriched', detectRelationships: true },
    trigger: 'schema_scan',
    context: context(),
    runStructural: vi.fn(async () => ({ result: { synced: true } })),
    runEnrichment: vi.fn(async () => ({ result: { enriched: true }, enrichmentState })),
  });

  expect(report.enrichmentState).toEqual({
    resumedStages: ['descriptions', 'relationships'],
    completedStages: ['descriptions', 'embeddings', 'relationships'],
    failedStages: [],
  });
});
|
||||
|
||||
// A connector built from default capabilities is run in enriched mode; every
// capability the mode asks for must surface as a gap plus a recoverable warning.
it('records recoverable warnings for missing optional capabilities during enriched scans', async () => {
  const orchestrator = new KloScanOrchestrator({ syncIdFactory: () => 'sync-3' });

  const { report } = await orchestrator.run({
    connector: connector(createKloConnectorCapabilities()),
    input: { ...input, mode: 'enriched', detectRelationships: true },
    trigger: 'schema_scan',
    context: context(),
    runStructural: vi.fn(async () => ({ result: {} })),
    runEnrichment: vi.fn(async () => ({ result: {} })),
  });

  const warningCodes = report.warnings.map((warning) => warning.code);
  const allRecoverable = report.warnings.every((warning) => warning.recoverable);

  expect(report.capabilityGaps).toEqual(['tableSampling', 'columnSampling', 'columnStats', 'readOnlySql']);
  expect(warningCodes).toEqual([
    'connector_capability_missing',
    'connector_capability_missing',
    'connector_capability_missing',
    'connector_capability_missing',
  ]);
  expect(allRecoverable).toBe(true);
});
|
||||
|
||||
// Warnings returned by either host callback may carry secrets in their
// metadata; the report must expose them with the sensitive values redacted.
it('redacts structural and enrichment warning metadata before returning reports', async () => {
  // Structural warning with credentials embedded in a connection URL.
  const structuralWarning = {
    code: 'sampling_failed',
    message: 'structural warning',
    recoverable: true,
    metadata: {
      url: 'postgres://reader:secret@example.test/db', // pragma: allowlist secret
      table: 'orders',
    },
  } as const;
  // Enrichment warning with a secret nested one level down.
  const enrichmentWarning = {
    code: 'embedding_unavailable',
    message: 'enrichment warning',
    recoverable: true,
    metadata: {
      nested: {
        api_key: 'sk_test_123', // pragma: allowlist secret
        schema: 'public',
      },
    },
  } as const;

  const fullCapabilities = createKloConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    columnStats: true,
    readOnlySql: true,
  });
  const orchestrator = new KloScanOrchestrator({ syncIdFactory: () => 'sync-redacted' });

  const { report } = await orchestrator.run({
    connector: connector(fullCapabilities),
    input: { ...input, mode: 'enriched' },
    trigger: 'schema_scan',
    context: context(),
    runStructural: vi.fn(async () => ({ result: {}, warnings: [structuralWarning] })),
    runEnrichment: vi.fn(async () => ({ result: {}, warnings: [enrichmentWarning] })),
  });

  // Structural warnings come first, then enrichment warnings; only the
  // sensitive values change.
  expect(report.warnings).toEqual([
    {
      code: 'sampling_failed',
      message: 'structural warning',
      recoverable: true,
      metadata: { url: '<redacted>', table: 'orders' },
    },
    {
      code: 'embedding_unavailable',
      message: 'enrichment warning',
      recoverable: true,
      metadata: { nested: { api_key: '<redacted>', schema: 'public' } },
    },
  ]);
});
|
||||
|
||||
// A throwing enrichment callback must not discard the structural phase: the
// run still resolves, enrichment is null, and the failure becomes a warning.
it('keeps structural results when the enrichment phase fails after structural sync', async () => {
  const fullCapabilities = createKloConnectorCapabilities({
    tableSampling: true,
    columnSampling: true,
    columnStats: true,
    readOnlySql: true,
  });
  const scanConnector = connector(fullCapabilities);
  const runStructural = vi.fn(async () => ({
    result: { synced: true },
    artifactPaths: {
      rawSourcesDir: 'raw-sources/warehouse/live-database/sync-failed-enrichment',
      manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
    },
    manifestShardsWritten: 1,
  }));
  const runEnrichment = vi.fn(async () => {
    throw new Error('AI Gateway timed out');
  });
  const orchestrator = new KloScanOrchestrator({
    now: () => new Date('2026-04-29T18:00:00.000Z'),
    syncIdFactory: () => 'sync-failed-enrichment',
  });

  const { structural, enrichment, report } = await orchestrator.run({
    connector: scanConnector,
    input: { ...input, mode: 'enriched', detectRelationships: true },
    trigger: 'schema_scan',
    context: context(),
    runStructural,
    runEnrichment,
  });

  // Structural output survives untouched.
  expect(structural.result).toEqual({ synced: true });
  expect(enrichment).toBeNull();
  expect(report.artifactPaths.manifestShards).toEqual(['semantic-layer/warehouse/_schema/public.yaml']);
  expect(report.manifestShardsWritten).toBe(1);

  // Every enrichment stage is reported as failed.
  expect(report.enrichment).toEqual({
    dataDictionary: 'failed',
    tableDescriptions: 'failed',
    columnDescriptions: 'failed',
    embeddings: 'failed',
    deterministicRelationships: 'failed',
    llmRelationshipValidation: 'failed',
    statisticalValidation: 'failed',
  });

  // The thrown error is surfaced as a single recoverable warning.
  expect(report.warnings).toEqual([
    {
      code: 'enrichment_failed',
      message: 'KLO scan enrichment failed after structural scan completed: AI Gateway timed out',
      recoverable: true,
      metadata: { mode: 'enriched', detectRelationships: true },
    },
  ]);
});
|
||||
|
||||
// `dryRun` only flags the report; the structural callback is still invoked.
it('marks dry-run reports without changing host callback behavior', async () => {
  const runStructural = vi.fn(async () => ({ result: { planned: true }, manifestShardsWritten: 0 }));
  const orchestrator = new KloScanOrchestrator({ syncIdFactory: () => 'sync-4' });

  const { report } = await orchestrator.run({
    connector: connector(),
    input: { ...input, dryRun: true },
    trigger: 'cli',
    context: context(),
    runStructural,
  });

  expect(runStructural).toHaveBeenCalledTimes(1);
  expect(report.dryRun).toBe(true);
  expect(report.trigger).toBe('cli');
});
|
||||
});
|
||||
297
packages/context/src/scan/orchestrator.ts
Normal file
297
packages/context/src/scan/orchestrator.ts
Normal file
|
|
@ -0,0 +1,297 @@
|
|||
import { redactKloScanReport } from './credentials.js';
|
||||
import { completedKloScanEnrichmentStateSummary, summarizeKloScanEnrichmentState } from './enrichment-state.js';
|
||||
import {
|
||||
failedKloScanEnrichmentSummary,
|
||||
kloScanErrorMessage,
|
||||
skippedKloScanEnrichmentSummary,
|
||||
} from './enrichment-summary.js';
|
||||
import type {
|
||||
KloConnectorCapabilities,
|
||||
KloScanArtifactPaths,
|
||||
KloScanConnector,
|
||||
KloScanContext,
|
||||
KloScanDiffSummary,
|
||||
KloScanEnrichmentSummary,
|
||||
KloScanEnrichmentStateSummary,
|
||||
KloScanInput,
|
||||
KloScanRelationshipSummary,
|
||||
KloScanReport,
|
||||
KloScanTrigger,
|
||||
KloScanWarning,
|
||||
KloSchemaSnapshot,
|
||||
KloStructuralSyncStats,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Name of a connector capability that a scan can report as missing.
 * Covers every key of `KloConnectorCapabilities` except `structuralIntrospection`.
 */
type CapabilityGap = keyof Omit<KloConnectorCapabilities, 'structuralIntrospection'>;
|
||||
|
||||
/**
 * What the host's structural callback hands back to the orchestrator.
 *
 * Only `result` is required. The optional fields are partial contributions that
 * the orchestrator merges into the final scan report.
 */
export interface KloStructuralScanPhaseResult<TResult = unknown> {
  /** Host-defined payload; returned to the caller as part of the run result. */
  result: TResult;
  /** Diff counters; missing counters are taken from the zeroed defaults. */
  diffSummary?: Partial<KloScanDiffSummary>;
  /** Sync counters; missing counters are taken from the zeroed defaults. */
  structuralSyncStats?: Partial<KloStructuralSyncStats>;
  /** Explicit shard count; when absent the length of `artifactPaths.manifestShards` is used. */
  manifestShardsWritten?: number;
  /** Artifact locations produced by this phase. */
  artifactPaths?: Partial<KloScanArtifactPaths>;
  /** Relationship counters; added to the enrichment phase's counters in the report. */
  relationships?: Partial<KloScanRelationshipSummary>;
  /** Warnings raised by this phase; included in the report's warning list. */
  warnings?: KloScanWarning[];
}
|
||||
|
||||
/**
 * What the host's enrichment callback hands back to the orchestrator.
 *
 * Only `result` is required. The optional fields are partial contributions that
 * the orchestrator merges into the final scan report.
 */
export interface KloEnrichmentScanPhaseResult<TResult = unknown> {
  /** Host-defined payload; returned to the caller as part of the run result. */
  result: TResult;
  /** Per-stage enrichment statuses; stages left out fall back to the skipped summary. */
  enrichment?: Partial<KloScanEnrichmentSummary>;
  /** Resumed/completed/failed stage lists; missing lists are treated as empty. */
  enrichmentState?: Partial<KloScanEnrichmentStateSummary>;
  /** Explicit shard count; when absent the length of `artifactPaths.manifestShards` is used. */
  manifestShardsWritten?: number;
  /** Artifact locations produced by this phase. */
  artifactPaths?: Partial<KloScanArtifactPaths>;
  /** Relationship counters; added to the structural phase's counters in the report. */
  relationships?: Partial<KloScanRelationshipSummary>;
  /** Warnings raised by this phase; included in the report's warning list. */
  warnings?: KloScanWarning[];
}
|
||||
|
||||
/**
 * Everything `KloScanOrchestrator.run` needs for one scan.
 */
export interface KloScanOrchestratorRunInput<TStructuralResult = unknown, TEnrichmentResult = unknown> {
  /** Connector that is introspected and whose capabilities are checked for gaps. */
  connector: KloScanConnector;
  /** Scan request (connection, driver, mode, flags). */
  input: KloScanInput;
  /** What initiated the scan; copied into the report. */
  trigger: KloScanTrigger;
  /** Per-run context (run id, optional logger and abort signal). */
  context: KloScanContext;
  /** Explicit sync id; when omitted the orchestrator's `syncIdFactory` supplies one. */
  syncId?: string;
  /** Host callback for the structural phase; always invoked after introspection. */
  runStructural: (
    snapshot: KloSchemaSnapshot,
    context: KloScanContext,
  ) => Promise<KloStructuralScanPhaseResult<TStructuralResult>>;
  /**
   * Host callback for the enrichment phase. Invoked only when the mode is not
   * `structural` or `detectRelationships` is set; if it is needed but missing,
   * the report records a warning instead.
   */
  runEnrichment?: (
    snapshot: KloSchemaSnapshot,
    structural: KloStructuralScanPhaseResult<TStructuralResult>,
    context: KloScanContext,
  ) => Promise<KloEnrichmentScanPhaseResult<TEnrichmentResult>>;
}
|
||||
|
||||
/**
 * Outcome of `KloScanOrchestrator.run`.
 */
export interface KloScanOrchestratorRunResult<TStructuralResult = unknown, TEnrichmentResult = unknown> {
  /** Schema snapshot returned by the connector's `introspect`. */
  snapshot: KloSchemaSnapshot;
  /** Value returned by the structural callback. */
  structural: KloStructuralScanPhaseResult<TStructuralResult>;
  /** Value returned by the enrichment callback, or `null` when it did not run or threw. */
  enrichment: KloEnrichmentScanPhaseResult<TEnrichmentResult> | null;
  /** Redacted scan report assembled from both phases. */
  report: KloScanReport;
}
|
||||
|
||||
/**
 * Injection points for `KloScanOrchestrator`.
 */
export interface KloScanOrchestratorOptions {
  /** Clock used for the report's `createdAt`; defaults to the current time. */
  now?: () => Date;
  /** Produces the sync id when the run input has none; defaults to the context's run id. */
  syncIdFactory?: (input: KloScanInput, context: KloScanContext) => string;
}
|
||||
|
||||
/** Zeroed diff counters used as the baseline for a phase's partial diff summary. */
const emptyDiffSummary: KloScanDiffSummary = {
  tablesAdded: 0,
  tablesModified: 0,
  tablesDeleted: 0,
  tablesUnchanged: 0,
  columnsAdded: 0,
  columnsModified: 0,
  columnsDeleted: 0,
};

/** Zeroed sync counters used as the baseline for a phase's partial sync stats. */
const emptyStructuralSyncStats: KloStructuralSyncStats = {
  tablesCreated: 0,
  tablesUpdated: 0,
  tablesDeleted: 0,
  columnsCreated: 0,
  columnsUpdated: 0,
  columnsDeleted: 0,
};

/** Artifact paths of a scan that produced no artifacts. */
const emptyArtifactPaths: KloScanArtifactPaths = {
  rawSourcesDir: null,
  reportPath: null,
  manifestShards: [],
  enrichmentArtifacts: [],
};

/**
 * Returns a shallow copy of `overrides` without the keys whose value is `undefined`.
 *
 * A `Partial<T>` may carry explicit `undefined` values; spreading those over a
 * defaults object would replace the default with `undefined`.
 */
function withoutUndefinedValues<T extends object>(overrides?: Partial<T>): Partial<T> {
  if (!overrides) {
    return {};
  }
  const definedEntries = Object.entries(overrides).filter(([, value]) => value !== undefined);
  // `Object.fromEntries` loses the key type; the keys are a subset of `overrides`' keys.
  return Object.fromEntries(definedEntries) as Partial<T>;
}

/**
 * Completes a partial diff summary with zeroed counters.
 *
 * @param input Counters reported by the structural phase, if any.
 * @returns A new summary; counters that are missing or `undefined` are `0`.
 */
function mergeDiffSummary(input?: Partial<KloScanDiffSummary>): KloScanDiffSummary {
  return { ...emptyDiffSummary, ...withoutUndefinedValues(input) };
}

/**
 * Completes partial structural sync stats with zeroed counters.
 *
 * @param input Counters reported by the structural phase, if any.
 * @returns A new stats object; counters that are missing or `undefined` are `0`.
 */
function mergeStructuralSyncStats(input?: Partial<KloStructuralSyncStats>): KloStructuralSyncStats {
  return { ...emptyStructuralSyncStats, ...withoutUndefinedValues(input) };
}

/**
 * Completes a partial enrichment summary using `skippedKloScanEnrichmentSummary`
 * as the baseline.
 *
 * @param input Stage statuses reported by the enrichment phase, if any.
 * @returns A new summary; stages that are missing or `undefined` keep the baseline value.
 */
function mergeEnrichmentSummary(input?: Partial<KloScanEnrichmentSummary>): KloScanEnrichmentSummary {
  return { ...skippedKloScanEnrichmentSummary, ...withoutUndefinedValues(input) };
}
|
||||
|
||||
/**
 * Builds the report's enrichment-state summary.
 *
 * With no input the result of `completedKloScanEnrichmentStateSummary()` is
 * returned. Otherwise the three stage lists (each defaulting to an empty list)
 * are passed through `summarizeKloScanEnrichmentState`.
 *
 * @param input Stage lists reported by the enrichment phase, if any.
 */
function mergeEnrichmentState(input?: Partial<KloScanEnrichmentStateSummary>): KloScanEnrichmentStateSummary {
  if (!input) {
    return completedKloScanEnrichmentStateSummary();
  }

  const resumedStages = input.resumedStages ?? [];
  const completedStages = input.completedStages ?? [];
  const failedStages = input.failedStages ?? [];
  return summarizeKloScanEnrichmentState({ resumedStages, completedStages, failedStages });
}
|
||||
|
||||
/**
 * Combines the artifact paths reported by the structural and enrichment phases.
 *
 * - `manifestShards` and `enrichmentArtifacts` are concatenated, structural first.
 * - `rawSourcesDir` and `reportPath` take the enrichment value when it is set,
 *   otherwise the structural value, otherwise the empty default. An explicit
 *   `null`/`undefined` from the enrichment phase therefore no longer erases a
 *   path that the structural phase reported.
 *
 * @param structural Paths from the structural phase, if any.
 * @param enrichment Paths from the enrichment phase, if any.
 * @returns A new, fully populated artifact-path object.
 */
function mergeArtifactPaths(
  structural?: Partial<KloScanArtifactPaths>,
  enrichment?: Partial<KloScanArtifactPaths>,
): KloScanArtifactPaths {
  return {
    ...emptyArtifactPaths,
    ...structural,
    ...enrichment,
    // Resolve the scalar paths explicitly: the spreads above would let an
    // explicit null/undefined from a later phase overwrite an earlier value.
    rawSourcesDir: enrichment?.rawSourcesDir ?? structural?.rawSourcesDir ?? emptyArtifactPaths.rawSourcesDir,
    reportPath: enrichment?.reportPath ?? structural?.reportPath ?? emptyArtifactPaths.reportPath,
    manifestShards: [...(structural?.manifestShards ?? []), ...(enrichment?.manifestShards ?? [])],
    enrichmentArtifacts: [...(structural?.enrichmentArtifacts ?? []), ...(enrichment?.enrichmentArtifacts ?? [])],
  };
}
|
||||
|
||||
/**
 * Adds up the relationship counters of the structural and enrichment phases.
 *
 * @param structural Counters from the structural phase, if any.
 * @param enrichment Counters from the enrichment phase, if any.
 * @returns Per-status totals; a counter missing from a phase counts as `0`.
 */
function mergeRelationshipSummary(
  structural?: Partial<KloScanRelationshipSummary>,
  enrichment?: Partial<KloScanRelationshipSummary>,
): KloScanRelationshipSummary {
  const total = (status: 'accepted' | 'review' | 'rejected' | 'skipped'): number => {
    const fromStructural = structural?.[status] ?? 0;
    const fromEnrichment = enrichment?.[status] ?? 0;
    return fromStructural + fromEnrichment;
  };

  return {
    accepted: total('accepted'),
    review: total('review'),
    rejected: total('rejected'),
    skipped: total('skipped'),
  };
}
|
||||
|
||||
/**
 * Number of manifest shards a phase wrote.
 *
 * @param phase Phase result (only the two fields below are read).
 * @returns The explicit `manifestShardsWritten` when present (including `0`),
 *          otherwise the length of `artifactPaths.manifestShards`, otherwise `0`.
 */
function manifestShardsWritten(phase: {
  manifestShardsWritten?: number;
  artifactPaths?: Partial<KloScanArtifactPaths>;
}): number {
  const { manifestShardsWritten: explicitCount, artifactPaths } = phase;
  if (explicitCount != null) {
    return explicitCount;
  }

  const listedShards = artifactPaths?.manifestShards;
  return listedShards?.length ?? 0;
}
|
||||
|
||||
/**
 * Lists the optional connector capabilities a scan would like to use.
 *
 * - `enriched` mode asks for table sampling, column sampling, column stats and read-only SQL.
 * - `relationships` mode, or `detectRelationships` in any mode, asks for column stats and read-only SQL.
 *
 * @param mode Scan mode.
 * @param detectRelationships Whether relationship detection was requested.
 * @returns Capability names without duplicates, in first-requested order.
 */
function requiredCapabilities(mode: KloScanInput['mode'], detectRelationships: boolean | undefined): CapabilityGap[] {
  const requested: CapabilityGap[] = [];

  if (mode === 'enriched') {
    requested.push('tableSampling', 'columnSampling', 'columnStats', 'readOnlySql');
  }

  const wantsRelationships = mode === 'relationships' || detectRelationships;
  if (wantsRelationships) {
    requested.push('columnStats', 'readOnlySql');
  }

  // A Set keeps the first occurrence of each name, which drops the overlap
  // between the two groups while preserving order.
  return [...new Set(requested)];
}
|
||||
|
||||
/**
 * Determines which capabilities requested by the scan the connector lacks.
 *
 * @param capabilities Capability flags advertised by the connector.
 * @param input Scan request; a missing `mode` is treated as `structural`.
 * @returns The requested capabilities whose flag is falsy, in requested order.
 */
function capabilityGaps(capabilities: KloConnectorCapabilities, input: KloScanInput): CapabilityGap[] {
  const mode = input.mode ?? 'structural';
  const requested = requiredCapabilities(mode, input.detectRelationships);
  return requested.filter((capability) => !capabilities[capability]);
}
|
||||
|
||||
/**
 * Turns each missing capability into a recoverable scan warning.
 *
 * @param gaps Names of the missing capabilities.
 * @returns One `connector_capability_missing` warning per gap, in the same order.
 */
function warningsForCapabilityGaps(gaps: CapabilityGap[]): KloScanWarning[] {
  const warnings: KloScanWarning[] = [];
  for (const capability of gaps) {
    warnings.push({
      code: 'connector_capability_missing',
      message: `KLO scan connector is missing optional capability: ${capability}`,
      recoverable: true,
      metadata: { capability },
    });
  }
  return warnings;
}
|
||||
|
||||
/**
 * Stops the scan when the context's abort signal has fired.
 *
 * @param context Scan context; a context without a signal never aborts.
 * @throws Error with message `KLO scan aborted` when `context.signal.aborted` is truthy.
 */
function assertNotAborted(context: KloScanContext): void {
  const signal = context.signal;
  if (!signal || !signal.aborted) {
    return;
  }
  throw new Error('KLO scan aborted');
}
|
||||
|
||||
/**
 * Runs one KLO scan end to end: introspects the connector, invokes the host's
 * structural callback, optionally invokes the host's enrichment callback, and
 * assembles a redacted scan report from both phases.
 */
export class KloScanOrchestrator {
  private readonly now: () => Date;
  private readonly syncIdFactory: (input: KloScanInput, context: KloScanContext) => string;

  /**
   * @param options Optional clock and sync-id factory. Defaults: the current
   *                time, and the scan context's `runId`.
   */
  constructor(options: KloScanOrchestratorOptions = {}) {
    this.now = options.now ?? (() => new Date());
    this.syncIdFactory = options.syncIdFactory ?? ((_scanInput, scanContext) => scanContext.runId);
  }

  /**
   * Executes the scan described by `input`.
   *
   * The enrichment callback is attempted only when the mode is not `structural`
   * or `detectRelationships` is set. If it throws, or is needed but was not
   * supplied, the run still resolves: `enrichment` is `null`, the report's
   * enrichment summary comes from `failedKloScanEnrichmentSummary`, and a
   * recoverable warning is recorded.
   *
   * @param input Connector, scan request, context and host callbacks.
   * @returns The snapshot, both phase results and the redacted report.
   * @throws Error `KLO scan aborted` if the abort signal is set before
   *         introspection, before the structural phase, or before the enrichment
   *         phase. Errors from `introspect` and `runStructural` propagate unchanged.
   */
  async run<TStructuralResult = unknown, TEnrichmentResult = unknown>(
    input: KloScanOrchestratorRunInput<TStructuralResult, TEnrichmentResult>,
  ): Promise<KloScanOrchestratorRunResult<TStructuralResult, TEnrichmentResult>> {
    const { connector, context, trigger, input: scanInput } = input;
    const mode = scanInput.mode ?? 'structural';
    const syncId = input.syncId ?? this.syncIdFactory(scanInput, context);
    const gaps = capabilityGaps(connector.capabilities, scanInput);
    // Capability warnings lead the list; phase failures are appended below.
    const warnings = warningsForCapabilityGaps(gaps);

    context.logger?.info('Starting KLO scan', {
      connectionId: scanInput.connectionId,
      connectorId: connector.id,
      mode,
      trigger,
    });

    assertNotAborted(context);
    const snapshot = await connector.introspect(scanInput, context);

    assertNotAborted(context);
    const structural = await input.runStructural(snapshot, context);

    let enrichment: KloEnrichmentScanPhaseResult<TEnrichmentResult> | null = null;
    let failedEnrichment: KloScanEnrichmentSummary | null = null;
    const enrichmentRequested = mode !== 'structural' || Boolean(scanInput.detectRelationships);

    if (enrichmentRequested) {
      if (!input.runEnrichment) {
        // Enrichment was asked for but the host supplied no callback.
        const detectRelationships = scanInput.detectRelationships ?? false;
        failedEnrichment = failedKloScanEnrichmentSummary(mode, detectRelationships);
        warnings.push({
          code: 'connector_capability_missing',
          message: 'KLO scan requested enrichment or relationship detection, but no enrichment phase was provided',
          recoverable: true,
          metadata: { mode, detectRelationships },
        });
      } else {
        assertNotAborted(context);
        try {
          enrichment = await input.runEnrichment(snapshot, structural, context);
        } catch (error) {
          // Structural work already succeeded, so downgrade the failure to a warning.
          const message = kloScanErrorMessage(error);
          const detectRelationships = scanInput.detectRelationships ?? false;
          failedEnrichment = failedKloScanEnrichmentSummary(mode, detectRelationships);
          warnings.push({
            code: 'enrichment_failed',
            message: `KLO scan enrichment failed after structural scan completed: ${message}`,
            recoverable: true,
            metadata: { mode, detectRelationships },
          });
          context.logger?.warn('KLO scan enrichment failed after structural scan completed', {
            connectionId: scanInput.connectionId,
            runId: context.runId,
            mode,
            error: message,
          });
        }
      }
    }

    let manifestShardCount = manifestShardsWritten(structural);
    if (enrichment) {
      manifestShardCount += manifestShardsWritten(enrichment);
    }

    const report: KloScanReport = redactKloScanReport({
      connectionId: scanInput.connectionId,
      driver: scanInput.driver,
      syncId,
      runId: context.runId,
      trigger,
      mode,
      dryRun: scanInput.dryRun ?? false,
      artifactPaths: mergeArtifactPaths(structural.artifactPaths, enrichment?.artifactPaths),
      diffSummary: mergeDiffSummary(structural.diffSummary),
      manifestShardsWritten: manifestShardCount,
      structuralSyncStats: mergeStructuralSyncStats(structural.structuralSyncStats),
      enrichment: mergeEnrichmentSummary(enrichment?.enrichment ?? failedEnrichment ?? undefined),
      capabilityGaps: gaps,
      // Order: orchestrator warnings, then structural, then enrichment.
      warnings: [...warnings, ...(structural.warnings ?? []), ...(enrichment?.warnings ?? [])],
      relationships: mergeRelationshipSummary(structural.relationships, enrichment?.relationships),
      enrichmentState: mergeEnrichmentState(enrichment?.enrichmentState),
      createdAt: this.now().toISOString(),
    });

    context.logger?.info('Completed KLO scan', {
      connectionId: report.connectionId,
      runId: report.runId,
      syncId: report.syncId,
      warnings: report.warnings.length,
    });

    return { snapshot, structural, enrichment, report };
  }
}
|
||||
310
packages/context/src/scan/relationship-artifacts.test.ts
Normal file
310
packages/context/src/scan/relationship-artifacts.test.ts
Normal file
|
|
@ -0,0 +1,310 @@
|
|||
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { dirname, join } from 'node:path';
|
||||
import { runLocalStageOnlyIngest, type SourceAdapter } from '../ingest/index.js';
|
||||
import { initKloProject, loadKloProject } from '../project/index.js';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { readLocalScanRelationshipArtifacts } from './relationship-artifacts.js';
|
||||
import type { KloRelationshipArtifact, KloRelationshipDiagnosticsArtifact } from './relationship-diagnostics.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type { KloScanReport } from './types.js';
|
||||
|
||||
/**
 * Writes a UTF-8 file below the project directory, creating any missing parent
 * directories first.
 *
 * @param projectDir Root directory of the test project.
 * @param relativePath File location relative to `projectDir`.
 * @param content Text to write.
 */
async function writeProjectFile(projectDir: string, relativePath: string, content: string): Promise<void> {
  const target = join(projectDir, relativePath);
  const parentDir = dirname(target);
  await mkdir(parentDir, { recursive: true });
  await writeFile(target, content, 'utf-8');
}
|
||||
|
||||
/**
 * Writes `klo.yaml` in the project directory with a single read-only SQLite
 * connection named `warehouse` and the `live-database` ingest adapter.
 *
 * @param projectDir Root directory of the test project.
 */
async function writeWarehouseConfig(projectDir: string): Promise<void> {
  const configPath = join(projectDir, 'klo.yaml');
  const configLines = [
    'project: warehouse',
    'connections:',
    '  warehouse:',
    '    driver: sqlite',
    '    path: warehouse.db',
    '    readonly: true',
    'ingest:',
    '  adapters:',
    '    - live-database',
    '', // yields the trailing newline once joined
  ];
  await writeFile(configPath, configLines.join('\n'), 'utf-8');
}
|
||||
|
||||
/**
 * Builds a stub `live-database` source adapter for the tests.
 *
 * - `fetch` stages a connection file, an empty foreign-key file and one `orders` table file.
 * - `detect` rewrites the connection file and always reports a match.
 * - `chunk` returns a single work unit covering the `orders` table file.
 */
function liveDatabaseAdapter(): SourceAdapter {
  const connectionJson = '{"connectionId":"warehouse"}\n';

  return {
    source: 'live-database',
    skillNames: ['live_database_ingest'],
    async fetch(_pullConfig, stagedDir) {
      const tablesDir = join(stagedDir, 'tables');
      await mkdir(tablesDir, { recursive: true });
      await writeFile(join(stagedDir, 'connection.json'), connectionJson, 'utf-8');
      await writeFile(join(stagedDir, 'foreign-keys.json'), '{"foreignKeys":[]}\n', 'utf-8');
      const ordersTableJson =
        '{"name":"orders","db":"public","columns":[{"name":"id","type":"integer","nullable":false,"primaryKey":true}]}\n';
      await writeFile(join(stagedDir, 'tables', 'orders.json'), ordersTableJson, 'utf-8');
    },
    async detect(stagedDir) {
      await writeFile(join(stagedDir, 'connection.json'), connectionJson, 'utf-8');
      return true;
    },
    async chunk() {
      const ordersUnit = {
        unitKey: 'live-database-public-orders',
        rawFiles: ['tables/orders.json'],
        dependencyPaths: ['connection.json', 'foreign-keys.json'],
        peerFileIndex: [],
      };
      return { workUnits: [ordersUnit] };
    },
  };
}
|
||||
|
||||
/**
 * Prepares a `warehouse` project in `projectDir` and runs a stage-only
 * `live-database` ingest against it with a fixed clock.
 *
 * @param projectDir Directory in which the project is initialised.
 * @param runId Passed to the ingest as its `jobId`.
 * @returns The loaded project.
 */
async function createLiveDatabaseRun(projectDir: string, runId: string) {
  await initKloProject({ projectDir, projectName: 'warehouse' });
  await writeWarehouseConfig(projectDir);
  const project = await loadKloProject({ projectDir });

  // Pinned timestamp so the run's time-derived output is deterministic.
  const fixedNow = (): Date => new Date('2026-05-07T10:00:00.000Z');
  await runLocalStageOnlyIngest({
    project,
    adapters: [liveDatabaseAdapter()],
    adapter: 'live-database',
    connectionId: 'warehouse',
    jobId: runId,
    now: fixedNow,
  });

  return project;
}
|
||||
|
||||
/**
 * Builds a scan-report fixture for a `relationships` scan of the `warehouse`
 * SQLite connection.
 *
 * @param enrichmentArtifacts Paths stored in `artifactPaths.enrichmentArtifacts`.
 * @param syncId Sync id; also determines `rawSourcesDir` and `reportPath`.
 * @returns A complete report with one relationship in review and one rejected.
 */
function scanReport(enrichmentArtifacts: string[], syncId = '2026-05-07-100000-scan-run-review'): KloScanReport {
  const rawSourcesDir = `raw-sources/warehouse/live-database/${syncId}`;
  const reportPath = `raw-sources/warehouse/live-database/${syncId}/scan-report.json`;

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    syncId,
    runId: 'scan-run-review',
    trigger: 'cli',
    mode: 'relationships',
    dryRun: false,
    artifactPaths: { rawSourcesDir, reportPath, manifestShards: [], enrichmentArtifacts },
    // Nothing changed structurally: two tables, both unchanged.
    diffSummary: {
      tablesAdded: 0,
      tablesModified: 0,
      tablesDeleted: 0,
      tablesUnchanged: 2,
      columnsAdded: 0,
      columnsModified: 0,
      columnsDeleted: 0,
    },
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 0,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    // Only deterministic relationship detection ran.
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'skipped',
    },
    capabilityGaps: [],
    warnings: [],
    relationships: { accepted: 0, review: 1, rejected: 1, skipped: 0 },
    enrichmentState: { resumedStages: [], completedStages: ['relationships'], failedStages: [] },
    createdAt: '2026-05-07T10:00:00.000Z',
  };
}
|
||||
|
||||
/**
 * Relationship artifact fixture: nothing accepted or skipped, one candidate
 * awaiting review (`orders.customer_id -> customers.id`) and one rejected
 * candidate (`orders.note_id -> notes.id`).
 */
const relationshipArtifact: KloRelationshipArtifact = {
  connectionId: 'warehouse',
  accepted: [],
  review: [
    {
      // Held for review because validation was unavailable.
      id: 'orders:orders.customer_id->customers:customers.id',
      status: 'review',
      source: 'deterministic_name',
      from: {
        tableId: 'orders',
        columnIds: ['orders.customer_id'],
        table: { catalog: null, db: 'public', name: 'orders' },
        columns: ['customer_id'],
      },
      to: {
        tableId: 'customers',
        columnIds: ['customers.id'],
        table: { catalog: null, db: 'public', name: 'customers' },
        columns: ['id'],
      },
      relationshipType: 'many_to_one',
      confidence: 0.62,
      pkScore: 0.91,
      fkScore: 0.62,
      score: 0.62,
      evidence: { sources: ['table_suffix'] },
      validation: { status: 'unavailable' },
      graph: { reasons: ['validation_unavailable_review_only'] },
      reasons: ['validation_unavailable_review_only', 'fk_score_review'],
    },
  ],
  rejected: [
    {
      // Rejected after failed validation.
      id: 'orders:orders.note_id->notes:notes.id',
      status: 'rejected',
      source: 'deterministic_name',
      from: {
        tableId: 'orders',
        columnIds: ['orders.note_id'],
        table: { catalog: null, db: 'public', name: 'orders' },
        columns: ['note_id'],
      },
      to: {
        tableId: 'notes',
        columnIds: ['notes.id'],
        table: { catalog: null, db: 'public', name: 'notes' },
        columns: ['id'],
      },
      relationshipType: 'many_to_one',
      confidence: 0.2,
      pkScore: 0.4,
      fkScore: 0.2,
      score: 0.2,
      evidence: { sources: ['exact_column_match'] },
      validation: { status: 'failed' },
      graph: { reasons: ['low_source_coverage'] },
      reasons: ['low_source_coverage'],
    },
  ],
  skipped: [],
};
|
||||
|
||||
/**
 * Diagnostics artifact fixture for a run in which validation was unavailable
 * and no relationship was accepted.
 */
const diagnosticsArtifact: KloRelationshipDiagnosticsArtifact = {
  connectionId: 'warehouse',
  generatedAt: '2026-05-07T10:00:00.000Z',
  summary: { accepted: 0, review: 1, rejected: 1, skipped: 0 },
  noAcceptedReason: 'relationship candidates require review before manifest writes',
  candidateCountsBySource: { deterministic_name: 2 },
  validation: { available: false, sqlAvailable: false, queryCount: 0 },
  thresholds: { acceptThreshold: 0.85, reviewThreshold: 0.55 },
  policy: {
    validationRequiredForManifest: true,
    maxCandidatesPerColumn: 25,
    profileSampleRows: 10000,
    validationConcurrency: 4,
  },
  warnings: [],
  profileWarnings: ['KLO scan connector cannot run read-only SQL relationship validation'],
};
|
||||
|
||||
/**
 * Profile artifact fixture for a connector without read-only SQL: no tables
 * or columns profiled, no queries issued, one warning.
 */
const profileArtifact: KloRelationshipProfileArtifact = {
  connectionId: 'warehouse',
  driver: 'sqlite',
  sqlAvailable: false,
  tables: [],
  columns: {},
  queryCount: 0,
  warnings: ['KLO scan connector cannot run read-only SQL relationship validation'],
};
|
||||
|
||||
// Exercises readLocalScanRelationshipArtifacts against real temporary projects.
// Each test creates its own temp directory and removes it in `finally`.
describe('local scan relationship artifact reader', () => {
  /** Serialises a fixture the way the tests store it on disk: pretty JSON plus a newline. */
  const toJsonFile = (value: unknown): string => `${JSON.stringify(value, null, 2)}\n`;

  it('loads relationship, diagnostics, and profile artifacts for a scan run', async () => {
    const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-artifacts-'));
    try {
      const project = await createLiveDatabaseRun(projectDir, 'scan-run-review');
      const syncId = '2026-05-07-100000-scan-run-review';
      const relationshipsPath = `raw-sources/warehouse/live-database/${syncId}/enrichment/relationships.json`;
      const profilePath = `raw-sources/warehouse/live-database/${syncId}/enrichment/relationship-profile.json`;
      const diagnosticsPath = `raw-sources/warehouse/live-database/${syncId}/enrichment/relationship-diagnostics.json`;

      // The report references all three artifacts; then each file is written.
      const report = scanReport([relationshipsPath, profilePath, diagnosticsPath], syncId);
      await writeProjectFile(projectDir, report.artifactPaths.reportPath ?? '', toJsonFile(report));
      await writeProjectFile(projectDir, relationshipsPath, toJsonFile(relationshipArtifact));
      await writeProjectFile(projectDir, diagnosticsPath, toJsonFile(diagnosticsArtifact));
      await writeProjectFile(projectDir, profilePath, toJsonFile(profileArtifact));

      const result = await readLocalScanRelationshipArtifacts(project, 'scan-run-review');

      expect(result).toMatchObject({
        runId: 'scan-run-review',
        connectionId: 'warehouse',
        syncId,
        paths: {
          relationships: relationshipsPath,
          diagnostics: diagnosticsPath,
          profile: profilePath,
        },
      });
      expect(result?.relationships.review[0]).toMatchObject({
        id: 'orders:orders.customer_id->customers:customers.id',
        status: 'review',
        reasons: ['validation_unavailable_review_only', 'fk_score_review'],
      });
      expect(result?.diagnostics?.noAcceptedReason).toBe('relationship candidates require review before manifest writes');
      expect(result?.profile?.sqlAvailable).toBe(false);
    } finally {
      await rm(projectDir, { recursive: true, force: true });
    }
  });

  it('returns null when the scan run has no report', async () => {
    const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-artifacts-missing-run-'));
    try {
      await initKloProject({ projectDir, projectName: 'warehouse' });
      const project = await loadKloProject({ projectDir });

      const lookup = readLocalScanRelationshipArtifacts(project, 'missing-run');

      await expect(lookup).resolves.toBeNull();
    } finally {
      await rm(projectDir, { recursive: true, force: true });
    }
  });

  it('throws a focused error when a scan report does not reference relationships.json', async () => {
    const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-artifacts-missing-artifact-'));
    try {
      const project = await createLiveDatabaseRun(projectDir, 'scan-run-review');
      // The report exists but lists no enrichment artifacts at all.
      const report = scanReport([]);
      await writeProjectFile(projectDir, report.artifactPaths.reportPath ?? '', toJsonFile(report));

      const lookup = readLocalScanRelationshipArtifacts(project, 'scan-run-review');

      await expect(lookup).rejects.toThrow('Scan report "scan-run-review" does not reference relationships.json');
    } finally {
      await rm(projectDir, { recursive: true, force: true });
    }
  });
});
|
||||
75
packages/context/src/scan/relationship-artifacts.ts
Normal file
75
packages/context/src/scan/relationship-artifacts.ts
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { getLocalScanReport } from './local-scan.js';
|
||||
import type { KloRelationshipArtifact, KloRelationshipDiagnosticsArtifact } from './relationship-diagnostics.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type { KloScanReport } from './types.js';
|
||||
|
||||
/**
 * Status labels accepted for relationship artifact entries.
 *
 * NOTE(review): `'all'` is presumably a filter wildcard rather than a status
 * stored on an entry; confirm against the callers of this type.
 */
export type KloRelationshipArtifactStatus =
  | 'accepted'
  | 'review'
  | 'rejected'
  | 'skipped'
  | 'all';
|
||||
|
||||
/** Relationship artifacts loaded for one local scan run. */
export interface ReadLocalScanRelationshipArtifactsResult {
  /** Run id the artifacts were requested for. */
  runId: string;
  /** Connection id copied from the scan report. */
  connectionId: string;
  /** Sync id copied from the scan report. */
  syncId: string;
  /** The scan report the artifact paths were resolved from. */
  report: KloScanReport;
  /** Parsed contents of the required `relationships.json` artifact. */
  relationships: KloRelationshipArtifact;
  /** Parsed diagnostics artifact, or `null` when it is not referenced or could not be read. */
  diagnostics: KloRelationshipDiagnosticsArtifact | null;
  /** Parsed profile artifact, or `null` when it is not referenced or could not be read. */
  profile: KloRelationshipProfileArtifact | null;
  /** Paths of the artifacts as listed in the scan report. */
  paths: {
    relationships: string;
    /** `null` when the report does not reference the diagnostics artifact. */
    diagnostics: string | null;
    /** `null` when the report does not reference the profile artifact. */
    profile: string | null;
  };
}
|
||||
|
||||
/**
 * Finds the first enrichment artifact path in the report that ends with
 * `/enrichment/<fileName>`.
 *
 * @param report - Scan report whose `artifactPaths.enrichmentArtifacts` is searched.
 * @param fileName - Artifact file name, e.g. `relationships.json`.
 * @returns The matching path, or `null` when the report lists no such artifact.
 */
function findArtifactPath(report: KloScanReport, fileName: string): string | null {
  const wantedSuffix = `/enrichment/${fileName}`;
  for (const candidate of report.artifactPaths.enrichmentArtifacts) {
    if (candidate.endsWith(wantedSuffix)) {
      return candidate;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Reads a file through the project's file store and parses its content as JSON.
 *
 * The parsed value is cast to `T` without validation, so callers must trust the
 * artifact's shape. Rejects when the read fails or the content is not valid JSON.
 *
 * @param project - Project whose `fileStore` performs the read.
 * @param path - Path of the artifact inside the project.
 */
async function readJsonArtifact<T>(project: KloLocalProject, path: string): Promise<T> {
  const { content } = await project.fileStore.readFile(path);
  const parsed: unknown = JSON.parse(content);
  return parsed as T;
}
|
||||
|
||||
/**
 * Best-effort variant of `readJsonArtifact`.
 *
 * @param project - Project whose file store performs the read.
 * @param path - Artifact path; a falsy value (`null` or an empty string) means "not referenced".
 * @returns The parsed artifact, or `null` when no path was given or the read/parse failed.
 */
async function readOptionalJsonArtifact<T>(project: KloLocalProject, path: string | null): Promise<T | null> {
  let artifact: T | null = null;
  if (path) {
    try {
      artifact = await readJsonArtifact<T>(project, path);
    } catch {
      // Optional artifact: any read or parse failure is reported as "absent".
      artifact = null;
    }
  }
  return artifact;
}
|
||||
|
||||
/**
 * Loads the relationship artifacts referenced by a local scan run's report.
 *
 * @param project - Local project to read the report and artifacts from.
 * @param runId - Scan run whose report should be inspected.
 * @returns `null` when the run has no report; otherwise the report together with
 *   the parsed `relationships.json` and the optional diagnostics and profile
 *   artifacts (each `null` when not referenced or unreadable).
 * @throws Error when the report exists but does not reference `relationships.json`,
 *   or when `relationships.json` cannot be read or parsed.
 */
export async function readLocalScanRelationshipArtifacts(
  project: KloLocalProject,
  runId: string,
): Promise<ReadLocalScanRelationshipArtifactsResult | null> {
  const report = await getLocalScanReport(project, runId);
  if (!report) {
    return null;
  }

  const relationshipsPath = findArtifactPath(report, 'relationships.json');
  if (!relationshipsPath) {
    throw new Error(`Scan report "${runId}" does not reference relationships.json`);
  }

  const diagnosticsPath = findArtifactPath(report, 'relationship-diagnostics.json');
  const profilePath = findArtifactPath(report, 'relationship-profile.json');

  // The three artifacts are independent files, so read them concurrently instead
  // of one after another. The optional readers resolve to null on failure and
  // never reject, so the only rejection that can surface is relationships.json.
  const [relationships, diagnostics, profile] = await Promise.all([
    readJsonArtifact<KloRelationshipArtifact>(project, relationshipsPath),
    readOptionalJsonArtifact<KloRelationshipDiagnosticsArtifact>(project, diagnosticsPath),
    readOptionalJsonArtifact<KloRelationshipProfileArtifact>(project, profilePath),
  ]);

  return {
    runId,
    connectionId: report.connectionId,
    syncId: report.syncId,
    report,
    relationships,
    diagnostics,
    profile,
    paths: {
      relationships: relationshipsPath,
      diagnostics: diagnosticsPath,
      profile: profilePath,
    },
  };
}
|
||||
451
packages/context/src/scan/relationship-benchmark-report.test.ts
Normal file
451
packages/context/src/scan/relationship-benchmark-report.test.ts
Normal file
|
|
@ -0,0 +1,451 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
buildKloRelationshipBenchmarkReport,
|
||||
formatKloRelationshipBenchmarkReportMarkdown,
|
||||
} from './relationship-benchmark-report.js';
|
||||
import type {
|
||||
KloRelationshipBenchmarkCaseResult,
|
||||
KloRelationshipBenchmarkFixture,
|
||||
KloRelationshipBenchmarkSuiteResult,
|
||||
} from './relationship-benchmarks.js';
|
||||
|
||||
/**
 * Overrides accepted by the `caseResult` test builder: every top-level field of
 * a benchmark case result is optional, and `metrics` may itself be partial.
 */
type CaseResultOverrides = Partial<Omit<KloRelationshipBenchmarkCaseResult, 'metrics'>> & {
  metrics?: Partial<KloRelationshipBenchmarkCaseResult['metrics']>;
};
|
||||
|
||||
/**
 * Test builder for a benchmark case result.
 *
 * Fields missing from `overrides` fall back to a baseline case for the
 * `demo_b2b_no_declared_constraints` fixture; `metrics` overrides are merged on
 * top of the baseline metrics field by field.
 */
function caseResult(overrides: CaseResultOverrides = {}): KloRelationshipBenchmarkCaseResult {
  const {
    fixtureId,
    mode,
    metrics,
    expected,
    predicted,
    falsePositives,
    falseNegatives,
    skippedComposite,
    validationBlocked,
  } = overrides;

  const baselineMetrics = {
    pkPrecision: 1,
    pkRecall: 0.5,
    pkF1: 0.6666666666666666,
    fkPrecision: 1,
    fkRecall: 1,
    fkF1: 1,
    acceptedFalsePositiveCount: 0,
    reviewRecall: 0,
    acceptedOrReviewRecall: 1,
    runtimeSeconds: 0.012345,
    sqlQueries: 14,
    llmCalls: 0,
  };

  return {
    fixtureId: fixtureId ?? 'demo_b2b_no_declared_constraints',
    mode: mode ?? 'declared_pks_and_declared_fks_removed',
    // Spreading an undefined `metrics` contributes nothing, leaving the baseline intact.
    metrics: { ...baselineMetrics, ...metrics },
    expected: expected ?? {
      pk: ['accounts.(id)', 'users.(id)'],
      fk: ['users.(account_id)->accounts.(id)'],
    },
    predicted: predicted ?? {
      pk: ['accounts.(id)'],
      fk: ['users.(account_id)->accounts.(id)'],
      acceptedFk: ['users.(account_id)->accounts.(id)'],
      reviewFk: [],
    },
    falsePositives: falsePositives ?? { pk: [], fk: [] },
    falseNegatives: falseNegatives ?? { pk: ['users.(id)'], fk: [] },
    skippedComposite: skippedComposite ?? { pk: [], fk: [] },
    validationBlocked: validationBlocked ?? false,
  };
}
|
||||
|
||||
/**
 * Test builder for a benchmark fixture.
 *
 * Fields missing from `overrides` fall back to a synthetic smoke-tier SQLite
 * fixture. `thresholdEligible` and `validationBudget` have no default and are
 * passed through as given (possibly `undefined`).
 */
function fixture(overrides: Partial<KloRelationshipBenchmarkFixture> = {}): KloRelationshipBenchmarkFixture {
  const {
    id,
    name,
    tier,
    origin,
    thresholdEligible,
    validationBudget,
    snapshot,
    expected,
    defaultModes,
    dataPath,
    columnEmbeddings,
  } = overrides;

  return {
    id: id ?? 'demo_b2b_no_declared_constraints',
    name: name ?? 'Packaged B2B demo with declared PK and FK metadata masked',
    tier: tier ?? 'smoke',
    origin: origin ?? 'synthetic',
    thresholdEligible,
    validationBudget,
    snapshot: snapshot ?? {
      connectionId: 'demo_b2b',
      driver: 'sqlite',
      extractedAt: '2026-05-07T00:00:00.000Z',
      scope: {},
      metadata: {},
      tables: [],
    },
    expected: expected ?? { expectedPks: [], expectedLinks: [] },
    defaultModes: defaultModes ?? ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
    dataPath: dataPath ?? '/tmp/demo.sqlite',
    columnEmbeddings: columnEmbeddings ?? {},
  };
}
|
||||
|
||||
describe('relationship benchmark report', () => {
|
||||
// One fixture, three requested modes: one produced a normal result, one was
// validation-blocked, and one has no result because the fixture's defaultModes
// do not include it.
it('classifies run, validation-blocked, and not-run benchmark cases', () => {
  const completedCase = caseResult();
  const blockedCase = caseResult({
    mode: 'validation_disabled',
    validationBlocked: true,
    metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
    predicted: {
      pk: ['accounts.(id)'],
      fk: ['users.(account_id)->accounts.(id)'],
      acceptedFk: [],
      reviewFk: ['users.(account_id)->accounts.(id)'],
    },
  });
  const suiteResult: KloRelationshipBenchmarkSuiteResult = {
    cases: [completedCase, blockedCase],
    validationBlockedCases: ['demo_b2b_no_declared_constraints:validation_disabled'],
    aggregate: {
      caseCount: 2,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 0.5,
      meanFkRecall: 0.5,
      meanAcceptedOrReviewRecall: 1,
    },
  };

  const built = buildKloRelationshipBenchmarkReport({
    fixtures: [fixture()],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed', 'validation_disabled', 'profiling_disabled'],
  });

  expect(built.headline).toEqual({
    caseCount: 2,
    headlineCaseCount: 1,
    headlinePkRecall: 0.5,
    headlineFkRecall: 1,
    headlineAcceptedOrReviewRecall: 1,
    acceptedFalsePositiveCount: 0,
    validationBlockedCount: 1,
  });

  const statusLabels = built.cases.map((entry) => `${entry.fixtureId}:${entry.mode}:${entry.status}`);
  expect(statusLabels).toEqual([
    'demo_b2b_no_declared_constraints:declared_pks_and_declared_fks_removed:run',
    'demo_b2b_no_declared_constraints:validation_disabled:validation_blocked',
    'demo_b2b_no_declared_constraints:profiling_disabled:not_run',
  ]);
  expect(built.cases[2]?.reason).toBe('mode not selected by fixture defaultModes');
});
|
||||
|
||||
// A fixture with a validation budget and one review-only FK must report the
// "validation_unattempted" reason on the case and in the Markdown output.
it('surfaces validation budget review candidates in the report reason', () => {
  const acceptedLink = 'fact_activity_000.(entity_00_key)->dim_entity_00.(entity_00_key)';
  const reviewLink = 'fact_activity_001.(entity_00_key)->dim_entity_00.(entity_00_key)';
  const suiteResult: KloRelationshipBenchmarkSuiteResult = {
    cases: [
      caseResult({
        fixtureId: 'scale_stress_no_declared_constraints',
        metrics: { fkRecall: 0.5, acceptedOrReviewRecall: 1 },
        predicted: {
          pk: ['dim_entity_00.(entity_00_key)'],
          fk: [acceptedLink, reviewLink],
          acceptedFk: [acceptedLink],
          reviewFk: [reviewLink],
        },
      }),
    ],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 0,
      headlinePkRecall: 1,
      headlineFkRecall: 0.5,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 1,
      meanFkRecall: 0.5,
      meanAcceptedOrReviewRecall: 1,
    },
  };
  const budgetedFixture = fixture({
    id: 'scale_stress_no_declared_constraints',
    name: 'Scale stress fixture',
    tier: 'row_bearing',
    validationBudget: 800,
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const built = buildKloRelationshipBenchmarkReport({
    fixtures: [budgetedFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed'],
  });

  expect(built.cases[0]?.reason).toBe('review candidate validation reasons: validation_unattempted (1)');
  expect(formatKloRelationshipBenchmarkReportMarkdown(built)).toContain('validation_unattempted');
});
|
||||
|
||||
// Only the product fixture's unblocked default-mode case is expected to be
// tuning eligible; the smoke fixture stays ineligible even though it is marked
// thresholdEligible.
it('uses benchmark suite eligibility for product and smoke report rows', () => {
  const suiteResult: KloRelationshipBenchmarkSuiteResult = {
    cases: [
      caseResult({ fixtureId: 'product_curated' }),
      caseResult({
        fixtureId: 'product_curated',
        mode: 'validation_disabled',
        validationBlocked: true,
        metrics: { fkRecall: 0, acceptedOrReviewRecall: 1, sqlQueries: 0 },
      }),
      caseResult({ fixtureId: 'smoke_even_if_marked' }),
    ],
    validationBlockedCases: ['product_curated:validation_disabled'],
    aggregate: {
      caseCount: 3,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 0.5,
      meanFkRecall: 0.6666666666666666,
      meanAcceptedOrReviewRecall: 1,
    },
  };
  const productFixture = fixture({
    id: 'product_curated',
    name: 'Curated product fixture',
    tier: 'product',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
  });
  const smokeFixture = fixture({
    id: 'smoke_even_if_marked',
    name: 'Marked smoke fixture',
    tier: 'smoke',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const built = buildKloRelationshipBenchmarkReport({
    fixtures: [productFixture, smokeFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed', 'validation_disabled'],
  });

  const eligibilityLabels = built.cases.map(
    (entry) => `${entry.fixtureId}:${entry.mode}:${entry.tuningEligible}`,
  );
  expect(eligibilityLabels).toEqual([
    'product_curated:declared_pks_and_declared_fks_removed:true',
    'product_curated:validation_disabled:false',
    'smoke_even_if_marked:declared_pks_and_declared_fks_removed:false',
    'smoke_even_if_marked:validation_disabled:false',
  ]);
  expect(formatKloRelationshipBenchmarkReportMarkdown(built)).toContain(
    '| product_curated | product | declared_pks_and_declared_fks_removed | run | yes |',
  );
});
|
||||
|
||||
// Checks the title, the table row of the single case, and the false-negative
// bullet lines of the rendered Markdown.
it('formats a compact Markdown report with false negatives and blocked modes', () => {
  const suiteResult: KloRelationshipBenchmarkSuiteResult = {
    cases: [
      caseResult({
        metrics: { fkRecall: 0, acceptedOrReviewRecall: 0 },
        falseNegatives: { pk: ['users.(id)'], fk: ['users.(account_id)->accounts.(id)'] },
      }),
    ],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 0,
      headlineAcceptedOrReviewRecall: 0,
      meanPkRecall: 0.5,
      meanFkRecall: 0,
      meanAcceptedOrReviewRecall: 0,
    },
  };

  const built = buildKloRelationshipBenchmarkReport({
    fixtures: [fixture()],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed'],
  });
  const rendered = formatKloRelationshipBenchmarkReportMarkdown(built);

  const expectedSnippets = [
    '# KLO Relationship Discovery Benchmark Evidence',
    '| demo_b2b_no_declared_constraints | smoke | declared_pks_and_declared_fks_removed | run | no | 0.500 | 0.000 | 0.000 | 0 |',
    '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(id)',
    '- `demo_b2b_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: users.(account_id)->accounts.(id)',
  ];
  for (const snippet of expectedSnippets) {
    expect(rendered).toContain(snippet);
  }
});
|
||||
|
||||
// The default-mode case has no misses, so the headline FK section must read
// "- none"; the embeddings_disabled case's misses must still be listed.
it('keeps headline failures separate from non-headline failure details', () => {
  const cleanHeadlineCase = caseResult({
    fixtureId: 'product_curated',
    falseNegatives: { pk: [], fk: [] },
    metrics: { pkRecall: 1, fkRecall: 1, acceptedOrReviewRecall: 1 },
  });
  const degradedCase = caseResult({
    fixtureId: 'product_curated',
    mode: 'embeddings_disabled',
    falseNegatives: {
      pk: ['customers.(id)'],
      fk: ['orders.(buyer_ref)->customers.(id)'],
    },
    metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
  });
  const suiteResult: KloRelationshipBenchmarkSuiteResult = {
    cases: [cleanHeadlineCase, degradedCase],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 2,
      headlineCaseCount: 1,
      headlinePkRecall: 1,
      headlineFkRecall: 1,
      headlineAcceptedOrReviewRecall: 1,
      meanPkRecall: 0.75,
      meanFkRecall: 0.5,
      meanAcceptedOrReviewRecall: 0.5,
    },
  };
  const productFixture = fixture({
    id: 'product_curated',
    name: 'Curated product fixture',
    tier: 'product',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed', 'embeddings_disabled'],
  });

  const built = buildKloRelationshipBenchmarkReport({
    fixtures: [productFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed', 'embeddings_disabled'],
  });
  const rendered = formatKloRelationshipBenchmarkReportMarkdown(built);

  const expectedSnippets = [
    '## Failure Details',
    '### Headline False Negative FKs\n\n- none',
    '- `product_curated` / `embeddings_disabled` / `run`: orders.(buyer_ref)->customers.(id)',
    '- `product_curated` / `embeddings_disabled` / `run`: customers.(id)',
  ];
  for (const snippet of expectedSnippets) {
    expect(rendered).toContain(snippet);
  }
});
|
||||
|
||||
// A threshold-eligible fixture with one PK miss and one FK miss must show both
// counts and both bullet lines in the "Headline Failure Context" section.
it('formats headline failure context from remaining headline false negatives', () => {
  const suiteResult: KloRelationshipBenchmarkSuiteResult = {
    cases: [
      caseResult({
        fixtureId: 'public_headline_fixture',
        metrics: { pkRecall: 0.5, fkRecall: 0, acceptedOrReviewRecall: 0 },
        falseNegatives: {
          pk: ['parent_table.(opaque_key)'],
          fk: ['child_table.(parent_table_id)->parent_table.(opaque_key)'],
        },
      }),
    ],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 1,
      headlinePkRecall: 0.5,
      headlineFkRecall: 0,
      headlineAcceptedOrReviewRecall: 0,
      meanPkRecall: 0.5,
      meanFkRecall: 0,
      meanAcceptedOrReviewRecall: 0,
    },
  };
  const headlineFixture = fixture({
    id: 'public_headline_fixture',
    name: 'Public headline fixture',
    tier: 'row_bearing',
    thresholdEligible: true,
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const built = buildKloRelationshipBenchmarkReport({
    fixtures: [headlineFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed'],
  });
  const rendered = formatKloRelationshipBenchmarkReportMarkdown(built);

  const expectedSnippets = [
    '## Headline Failure Context',
    '- Remaining headline false-negative PKs: 1',
    '- Remaining headline false-negative FKs: 1',
    '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: parent_table.(opaque_key)',
    '- `public_headline_fixture` / `declared_pks_and_declared_fks_removed` / `run`: child_table.(parent_table_id)->parent_table.(opaque_key)',
  ];
  for (const snippet of expectedSnippets) {
    expect(rendered).toContain(snippet);
  }
});
|
||||
|
||||
// Composite keys that were skipped must be carried onto the report case and
// rendered in their own section, while still appearing as headline false negatives.
it('formats skipped composite ground truth separately from false-negative details', () => {
  const compositePk = 'order_lines.(order_id,line_number)';
  const compositeFk = 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)';
  const suiteResult: KloRelationshipBenchmarkSuiteResult = {
    cases: [
      caseResult({
        fixtureId: 'composite_keys_no_declared_constraints',
        metrics: { pkRecall: 0, fkRecall: 0, acceptedOrReviewRecall: 0 },
        expected: { pk: [compositePk], fk: [compositeFk] },
        predicted: { pk: [], fk: [], acceptedFk: [], reviewFk: [] },
        falseNegatives: { pk: [compositePk], fk: [compositeFk] },
        skippedComposite: { pk: [compositePk], fk: [compositeFk] },
      }),
    ],
    validationBlockedCases: [],
    aggregate: {
      caseCount: 1,
      headlineCaseCount: 1,
      headlinePkRecall: 0,
      headlineFkRecall: 0,
      headlineAcceptedOrReviewRecall: 0,
      meanPkRecall: 0,
      meanFkRecall: 0,
      meanAcceptedOrReviewRecall: 0,
    },
  };
  const compositeFixture = fixture({
    id: 'composite_keys_no_declared_constraints',
    name: 'Composite key fixture with no declared constraints',
    tier: 'row_bearing',
    defaultModes: ['declared_pks_and_declared_fks_removed'],
  });

  const built = buildKloRelationshipBenchmarkReport({
    fixtures: [compositeFixture],
    suite: suiteResult,
    modes: ['declared_pks_and_declared_fks_removed'],
  });

  expect(built.cases[0]?.skippedComposite).toEqual({
    pk: [compositePk],
    fk: [compositeFk],
  });

  const rendered = formatKloRelationshipBenchmarkReportMarkdown(built);
  const expectedSnippets = [
    '## Composite Ground Truth Skips',
    '### Skipped Composite PKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_lines.(order_id,line_number)',
    '### Skipped Composite FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
    '### Headline False Negative FKs\n\n- `composite_keys_no_declared_constraints` / `declared_pks_and_declared_fks_removed` / `run`: order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)',
  ];
  for (const snippet of expectedSnippets) {
    expect(rendered).toContain(snippet);
  }
});
|
||||
});
|
||||
363
packages/context/src/scan/relationship-benchmark-report.ts
Normal file
363
packages/context/src/scan/relationship-benchmark-report.ts
Normal file
|
|
@ -0,0 +1,363 @@
|
|||
import { isKloRelationshipBenchmarkTuningEligible } from './relationship-benchmarks.js';
|
||||
import type {
|
||||
KloRelationshipBenchmarkCaseResult,
|
||||
KloRelationshipBenchmarkFixture,
|
||||
KloRelationshipBenchmarkMode,
|
||||
KloRelationshipBenchmarkSuiteResult,
|
||||
} from './relationship-benchmarks.js';
|
||||
|
||||
/**
 * Outcome of one fixture/mode combination in the report: a normal run, a run
 * whose result was flagged as validation-blocked, or no result at all.
 */
export type KloRelationshipBenchmarkReportCaseStatus =
  | 'run'
  | 'validation_blocked'
  | 'not_run';
|
||||
|
||||
/** One row of the benchmark report: a single fixture evaluated in a single mode. */
export interface KloRelationshipBenchmarkReportCase {
  fixtureId: string;
  fixtureName: string;
  /** The fixture's tier, copied verbatim. */
  tier: string;
  mode: KloRelationshipBenchmarkMode;
  status: KloRelationshipBenchmarkReportCaseStatus;
  /** Explanation shown in the report's "Reason" column, or `null` when there is none. */
  reason: string | null;
  /** Whether the case is grouped with the headline cases in the failure sections. */
  tuningEligible: boolean;
  /** Metrics copied from the case result; every value is `null` for `not_run` cases. */
  metrics: {
    pkRecall: number | null;
    fkRecall: number | null;
    acceptedOrReviewRecall: number | null;
    acceptedFalsePositiveCount: number | null;
    sqlQueries: number | null;
    llmCalls: number | null;
    runtimeSeconds: number | null;
  };
  falsePositives: {
    pk: string[];
    fk: string[];
  };
  falseNegatives: {
    pk: string[];
    fk: string[];
  };
  /** Entries listed under "Composite Ground Truth Skips" in the Markdown report. */
  skippedComposite: {
    pk: string[];
    fk: string[];
  };
}
|
||||
|
||||
/** Benchmark report: headline numbers plus one case per fixture/mode combination. */
export interface KloRelationshipBenchmarkReport {
  /** Timestamp string rendered on the "Generated:" line of the Markdown report. */
  generatedAt: string;
  /** Summary numbers rendered in the "Headline" section. */
  headline: {
    caseCount: number;
    headlineCaseCount: number;
    headlinePkRecall: number;
    headlineFkRecall: number;
    headlineAcceptedOrReviewRecall: number;
    acceptedFalsePositiveCount: number;
    validationBlockedCount: number;
  };
  cases: KloRelationshipBenchmarkReportCase[];
}
|
||||
|
||||
/** Builds the `<fixtureId>:<mode>` lookup key used to match results to fixtures. */
function key(fixtureId: string, mode: KloRelationshipBenchmarkMode): string {
  const separator = ':';
  return `${fixtureId}${separator}${mode}`;
}
|
||||
|
||||
/** Formats a metric with three decimals, or `-` when the metric is missing (`null`). */
function fixed(value: number | null): string {
  if (value === null) {
    return '-';
  }
  return value.toFixed(3);
}
|
||||
|
||||
/**
 * Derives the "Reason" text for a case that produced a benchmark result.
 *
 * A validation-blocked result takes precedence. Otherwise a reason is reported
 * only when the fixture declares a validation budget and the result still has
 * review-only FK predictions.
 *
 * @returns The reason text, or `null` when neither condition applies.
 */
function reportCaseReason(input: {
  fixture: KloRelationshipBenchmarkFixture;
  result: KloRelationshipBenchmarkCaseResult;
}): string | null {
  const { fixture, result } = input;

  if (result.validationBlocked) {
    return 'validation unavailable for this benchmark mode';
  }
  if (fixture.validationBudget === undefined) {
    return null;
  }

  const reviewCount = result.predicted.reviewFk.length;
  return reviewCount > 0
    ? `review candidate validation reasons: validation_unattempted (${reviewCount})`
    : null;
}
|
||||
|
||||
/**
 * Converts a benchmark case result into a report case.
 *
 * Identity fields come from the fixture and the requested mode. The status is
 * `validation_blocked` when the result is flagged as such and `run` otherwise.
 * The failure lists are passed through by reference, not copied.
 */
function reportCaseFromResult(input: {
  fixture: KloRelationshipBenchmarkFixture;
  mode: KloRelationshipBenchmarkMode;
  result: KloRelationshipBenchmarkCaseResult;
}): KloRelationshipBenchmarkReportCase {
  const { fixture, mode, result } = input;
  const { metrics, validationBlocked } = result;

  const status: KloRelationshipBenchmarkReportCaseStatus = validationBlocked ? 'validation_blocked' : 'run';
  const reason = reportCaseReason({ fixture, result });
  const tuningEligible = isKloRelationshipBenchmarkTuningEligible({ fixture, mode, validationBlocked });

  return {
    fixtureId: fixture.id,
    fixtureName: fixture.name,
    tier: fixture.tier,
    mode,
    status,
    reason,
    tuningEligible,
    // Only the metrics that the report renders or exposes are carried over.
    metrics: {
      pkRecall: metrics.pkRecall,
      fkRecall: metrics.fkRecall,
      acceptedOrReviewRecall: metrics.acceptedOrReviewRecall,
      acceptedFalsePositiveCount: metrics.acceptedFalsePositiveCount,
      sqlQueries: metrics.sqlQueries,
      llmCalls: metrics.llmCalls,
      runtimeSeconds: metrics.runtimeSeconds,
    },
    falsePositives: result.falsePositives,
    falseNegatives: result.falseNegatives,
    skippedComposite: result.skippedComposite,
  };
}
|
||||
|
||||
/**
 * Builds the placeholder report case for a fixture/mode combination that has no
 * benchmark result: status `not_run`, never tuning eligible, all metrics `null`
 * and every failure list empty.
 */
function notRunCase(input: {
  fixture: KloRelationshipBenchmarkFixture;
  mode: KloRelationshipBenchmarkMode;
  reason: string;
}): KloRelationshipBenchmarkReportCase {
  const { fixture, mode, reason } = input;
  // A fresh pair per field so the three lists never share array instances.
  const emptyFindings = (): { pk: string[]; fk: string[] } => ({ pk: [], fk: [] });

  return {
    fixtureId: fixture.id,
    fixtureName: fixture.name,
    tier: fixture.tier,
    mode,
    status: 'not_run',
    reason,
    tuningEligible: false,
    metrics: {
      pkRecall: null,
      fkRecall: null,
      acceptedOrReviewRecall: null,
      acceptedFalsePositiveCount: null,
      sqlQueries: null,
      llmCalls: null,
      runtimeSeconds: null,
    },
    falsePositives: emptyFindings(),
    falseNegatives: emptyFindings(),
    skippedComposite: emptyFindings(),
  };
}
|
||||
|
||||
/**
 * Builds the benchmark report for the given fixtures, suite result and modes.
 *
 * One report case is emitted per fixture/mode pair, fixtures in the outer order
 * and modes in the inner order. Pairs without a suite result become `not_run`
 * cases whose reason says whether the fixture's `defaultModes` included the mode.
 * Suite results whose fixture or mode was not requested are not listed as cases,
 * but they still count towards the summed accepted false positives.
 *
 * @param input.generatedAt - Timestamp to stamp on the report; defaults to the
 *   current time as an ISO string.
 */
export function buildKloRelationshipBenchmarkReport(input: {
  fixtures: readonly KloRelationshipBenchmarkFixture[];
  suite: KloRelationshipBenchmarkSuiteResult;
  modes: readonly KloRelationshipBenchmarkMode[];
  generatedAt?: string;
}): KloRelationshipBenchmarkReport {
  const { fixtures, suite, modes } = input;

  // When two results share a key, the later one wins.
  const resultsByKey = new Map<string, KloRelationshipBenchmarkCaseResult>();
  for (const result of suite.cases) {
    resultsByKey.set(key(result.fixtureId, result.mode), result);
  }

  const cases = fixtures.flatMap((fixture) => {
    const selectedModes = new Set(fixture.defaultModes);
    return modes.map((mode): KloRelationshipBenchmarkReportCase => {
      const result = resultsByKey.get(key(fixture.id, mode));
      if (result) {
        return reportCaseFromResult({ fixture, mode, result });
      }
      const reason = selectedModes.has(mode)
        ? 'mode produced no benchmark result'
        : 'mode not selected by fixture defaultModes';
      return notRunCase({ fixture, mode, reason });
    });
  });

  let acceptedFalsePositiveCount = 0;
  for (const result of suite.cases) {
    acceptedFalsePositiveCount += result.metrics.acceptedFalsePositiveCount;
  }

  const { aggregate } = suite;
  return {
    generatedAt: input.generatedAt ?? new Date().toISOString(),
    headline: {
      caseCount: aggregate.caseCount,
      headlineCaseCount: aggregate.headlineCaseCount,
      headlinePkRecall: aggregate.headlinePkRecall,
      headlineFkRecall: aggregate.headlineFkRecall,
      headlineAcceptedOrReviewRecall: aggregate.headlineAcceptedOrReviewRecall,
      acceptedFalsePositiveCount,
      validationBlockedCount: suite.validationBlockedCases.length,
    },
    cases,
  };
}
|
||||
|
||||
/** Picks the list of failure entries (e.g. false-negative PKs) to report for a case. */
type KloRelationshipBenchmarkFailureSelector = (item: KloRelationshipBenchmarkReportCase) => readonly string[];
|
||||
|
||||
/**
 * Renders one Markdown bullet per selected failure entry across all cases.
 *
 * Bullets are ordered by comparing `fixtureId:mode:status:value` with
 * `localeCompare`, so the output order does not depend on the order of `cases`.
 */
function sortedFailureLines(input: {
  cases: readonly KloRelationshipBenchmarkReportCase[];
  select: KloRelationshipBenchmarkFailureSelector;
}): string[] {
  const entries: Array<{ sortKey: string; line: string }> = [];

  for (const item of input.cases) {
    for (const value of input.select(item)) {
      entries.push({
        sortKey: `${item.fixtureId}:${item.mode}:${item.status}:${value}`,
        line: `- \`${item.fixtureId}\` / \`${item.mode}\` / \`${item.status}\`: ${value}`,
      });
    }
  }

  entries.sort((left, right) => left.sortKey.localeCompare(right.sortKey));
  return entries.map((entry) => entry.line);
}
|
||||
|
||||
/**
 * Renders a `###` subsection: a blank line, the heading, a blank line, then the
 * sorted failure bullets, or a single `- none` bullet when there are none.
 */
function failureBlock(input: {
  title: string;
  cases: readonly KloRelationshipBenchmarkReportCase[];
  select: KloRelationshipBenchmarkFailureSelector;
}): string[] {
  const { title, cases, select } = input;
  const bullets = sortedFailureLines({ cases, select });
  const body = bullets.length === 0 ? ['- none'] : bullets;
  return ['', `### ${title}`, '', ...body];
}
|
||||
|
||||
/**
 * Renders the "Headline Failure Context" section: the counts of false-negative
 * PKs and FKs among tuning-eligible cases, followed by the corresponding bullets.
 */
function headlineFailureContextBlocks(report: KloRelationshipBenchmarkReport): string[] {
  const eligibleCases = report.cases.filter((item) => item.tuningEligible);
  const pkMisses = sortedFailureLines({
    cases: eligibleCases,
    select: (item) => item.falseNegatives.pk,
  });
  const fkMisses = sortedFailureLines({
    cases: eligibleCases,
    select: (item) => item.falseNegatives.fk,
  });
  // An empty list is rendered as an explicit "- none" bullet.
  const orNone = (bullets: string[]): string[] => (bullets.length > 0 ? bullets : ['- none']);

  const summary = [
    '',
    '## Headline Failure Context',
    '',
    'Remaining headline misses after this run are listed here so recall gains and still-open algorithmic gaps are visible in the regenerated evidence report.',
    '',
    `- Remaining headline false-negative PKs: ${pkMisses.length}`,
    `- Remaining headline false-negative FKs: ${fkMisses.length}`,
    '',
  ];

  return [
    ...summary,
    '### Remaining Headline False Negative PKs',
    '',
    ...orNone(pkMisses),
    '',
    '### Remaining Headline False Negative FKs',
    '',
    ...orNone(fkMisses),
  ];
}
|
||||
|
||||
/**
 * Renders the "Failure Details" section: false positives and false negatives,
 * first for tuning-eligible ("Headline") cases and then for all other cases.
 */
function failureDetailBlocks(report: KloRelationshipBenchmarkReport): string[] {
  const headlineCases = report.cases.filter((item) => item.tuningEligible);
  const otherCases = report.cases.filter((item) => !item.tuningEligible);

  // Subsections in the order they appear in the rendered report.
  const subsections: Array<{
    title: string;
    cases: readonly KloRelationshipBenchmarkReportCase[];
    select: KloRelationshipBenchmarkFailureSelector;
  }> = [
    { title: 'Headline False Positive PKs', cases: headlineCases, select: (item) => item.falsePositives.pk },
    { title: 'Headline False Positive FKs', cases: headlineCases, select: (item) => item.falsePositives.fk },
    { title: 'Headline False Negative PKs', cases: headlineCases, select: (item) => item.falseNegatives.pk },
    { title: 'Headline False Negative FKs', cases: headlineCases, select: (item) => item.falseNegatives.fk },
    { title: 'Other False Positive PKs', cases: otherCases, select: (item) => item.falsePositives.pk },
    { title: 'Other False Positive FKs', cases: otherCases, select: (item) => item.falsePositives.fk },
    { title: 'Other False Negative PKs', cases: otherCases, select: (item) => item.falseNegatives.pk },
    { title: 'Other False Negative FKs', cases: otherCases, select: (item) => item.falseNegatives.fk },
  ];

  return ['', '## Failure Details', ...subsections.flatMap((subsection) => failureBlock(subsection))];
}
|
||||
|
||||
/**
 * Renders the "Composite Ground Truth Skips" section from the skipped composite
 * PKs and FKs of tuning-eligible cases only.
 */
function compositeSkipBlocks(report: KloRelationshipBenchmarkReport): string[] {
  const eligibleCases = report.cases.filter((item) => item.tuningEligible);

  const skippedPks = failureBlock({
    title: 'Skipped Composite PKs',
    cases: eligibleCases,
    select: (item) => item.skippedComposite.pk,
  });
  const skippedFks = failureBlock({
    title: 'Skipped Composite FKs',
    cases: eligibleCases,
    select: (item) => item.skippedComposite.fk,
  });

  return ['', '## Composite Ground Truth Skips', ...skippedPks, ...skippedFks];
}
|
||||
|
||||
/**
 * Renders a benchmark report as Markdown: title, generation timestamp, headline
 * summary, one table row per case, then the headline failure context, failure
 * details and composite skip sections.
 *
 * Cell values are inserted as-is (no escaping of `|`). The returned text ends
 * with a blank line followed by a newline.
 */
export function formatKloRelationshipBenchmarkReportMarkdown(report: KloRelationshipBenchmarkReport): string {
  const { headline } = report;

  const preamble = [
    '# KLO Relationship Discovery Benchmark Evidence',
    '',
    `Generated: ${report.generatedAt}`,
    '',
    '## Headline',
    '',
    `- Cases run: ${headline.caseCount}`,
    `- Headline cases: ${headline.headlineCaseCount}`,
    `- Headline PK recall: ${fixed(headline.headlinePkRecall)}`,
    `- Headline FK recall: ${fixed(headline.headlineFkRecall)}`,
    `- Headline accepted-or-review recall: ${fixed(headline.headlineAcceptedOrReviewRecall)}`,
    `- Accepted false positives: ${headline.acceptedFalsePositiveCount}`,
    `- Validation-blocked cases: ${headline.validationBlockedCount}`,
    '',
    '## Cases',
    '',
    '| Fixture | Tier | Mode | Status | Tuning Eligible | PK Recall | FK Recall | Accepted+Review Recall | Accepted FP | Reason |',
    '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | --- |',
  ];

  const tableRows = report.cases.map((item) => {
    const cells = [
      item.fixtureId,
      item.tier,
      item.mode,
      item.status,
      item.tuningEligible ? 'yes' : 'no',
      fixed(item.metrics.pkRecall),
      fixed(item.metrics.fkRecall),
      fixed(item.metrics.acceptedOrReviewRecall),
      String(item.metrics.acceptedFalsePositiveCount ?? '-'),
      // A missing reason leaves the last cell empty.
      item.reason ?? '',
    ];
    return `| ${cells.join(' | ')} |`;
  });

  const document = [
    ...preamble,
    ...tableRows,
    ...headlineFailureContextBlocks(report),
    ...failureDetailBlocks(report),
    ...compositeSkipBlocks(report),
    '',
  ];

  return `${document.join('\n')}\n`;
}
|
||||
1269
packages/context/src/scan/relationship-benchmarks.test.ts
Normal file
1269
packages/context/src/scan/relationship-benchmarks.test.ts
Normal file
File diff suppressed because it is too large
Load diff
902
packages/context/src/scan/relationship-benchmarks.ts
Normal file
902
packages/context/src/scan/relationship-benchmarks.ts
Normal file
|
|
@ -0,0 +1,902 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import { mkdtemp, readdir, readFile, stat, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { gunzipSync } from 'node:zlib';
|
||||
import Database from 'better-sqlite3';
|
||||
import YAML from 'yaml';
|
||||
import { z } from 'zod';
|
||||
import type { KloEnrichedRelationship, KloEnrichedSchema, KloRelationshipType } from './enrichment-types.js';
|
||||
import { snapshotToKloEnrichedSchema } from './local-enrichment.js';
|
||||
import type { KloRelationshipDiscoveryCandidate } from './relationship-candidates.js';
|
||||
import {
|
||||
generateKloRelationshipDiscoveryCandidates,
|
||||
mergeKloRelationshipDiscoveryCandidates,
|
||||
} from './relationship-candidates.js';
|
||||
import type { KloLlmProvider } from '@klo/llm';
|
||||
import { proposeKloRelationshipCandidatesWithLlm } from './relationship-llm-proposal.js';
|
||||
import {
|
||||
discoverKloCompositeRelationships,
|
||||
type KloCompositePrimaryKeyCandidate,
|
||||
type KloCompositeRelationshipCandidate,
|
||||
} from './relationship-composite-candidates.js';
|
||||
import { emptyKloRelationshipProfileArtifact } from './relationship-diagnostics.js';
|
||||
import { collectKloFormalMetadataRelationships } from './relationship-formal-metadata.js';
|
||||
import { resolveKloRelationshipGraph } from './relationship-graph-resolver.js';
|
||||
import { type KloRelationshipReadOnlyExecutor, profileKloRelationshipSchema } from './relationship-profiling.js';
|
||||
import type { KloRelationshipValidationBudget } from './relationship-budget.js';
|
||||
import type { KloRelationshipFixtureOrigin } from './relationship-scoring.js';
|
||||
import { validateKloRelationshipDiscoveryCandidates } from './relationship-validation.js';
|
||||
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanContext, KloSchemaSnapshot } from './types.js';
|
||||
|
||||
/**
 * Every mode a relationship benchmark case can run in.
 *
 * As applied by `maskKloRelationshipBenchmarkSnapshot` and the detectors in this file:
 * - `metadata_present` keeps declared PKs and FKs in the snapshot.
 * - `declared_fks_removed` / `declared_pks_removed` strip only that kind of declaration.
 * - `declared_pks_and_declared_fks_removed` strips both.
 * - `llm_disabled`, `profiling_disabled`, `validation_disabled` and `embeddings_disabled`
 *   also strip both, and additionally switch off the named pipeline stage.
 */
export const KLO_RELATIONSHIP_BENCHMARK_MODES = [
  'metadata_present',
  'declared_fks_removed',
  'declared_pks_removed',
  'declared_pks_and_declared_fks_removed',
  'llm_disabled',
  'profiling_disabled',
  'validation_disabled',
  'embeddings_disabled',
] as const;

/** Union of the literal mode names listed in {@link KLO_RELATIONSHIP_BENCHMARK_MODES}. */
export type KloRelationshipBenchmarkMode = (typeof KLO_RELATIONSHIP_BENCHMARK_MODES)[number];
|
||||
|
||||
/**
 * Fixture tiers accepted in `fixture.yaml`.
 *
 * `isKloRelationshipBenchmarkTuningEligible` never treats `smoke` or `schema_only`
 * fixtures as tuning eligible, and treats `unit` and `row_bearing` as eligible by default.
 */
export const KLO_RELATIONSHIP_BENCHMARK_TIERS = [
  'unit',
  'row_bearing',
  'schema_only',
  'smoke',
  'product',
] as const;

/** Union of the literal tier names listed in {@link KLO_RELATIONSHIP_BENCHMARK_TIERS}. */
export type KloRelationshipBenchmarkTier = (typeof KLO_RELATIONSHIP_BENCHMARK_TIERS)[number];
|
||||
|
||||
/** Resolution status attached to a detected primary key or link. */
export type KloRelationshipBenchmarkStatus =
  | 'accepted'
  | 'review'
  | 'rejected';
|
||||
|
||||
/** Ground-truth primary key of one table, as listed in `expected-links.yaml`. */
export interface KloRelationshipBenchmarkExpectedPk {
  /** Name of the table that owns the key. */
  table: string;
  /** Key columns; more than one entry denotes a composite key. */
  columns: string[];
}
|
||||
|
||||
/** Ground-truth relationship between two tables, as listed in `expected-links.yaml`. */
export interface KloRelationshipBenchmarkExpectedLink {
  /** Table on the referencing side. */
  fromTable: string;
  /** Referencing columns in `fromTable`. */
  fromColumns: string[];
  /** Table on the referenced side. */
  toTable: string;
  /** Referenced columns in `toTable`. */
  toColumns: string[];
  /** Cardinality of the relationship. */
  relationship: KloRelationshipType;
}
|
||||
|
||||
/** Complete ground truth of a fixture: its expected primary keys and links. */
export interface KloRelationshipBenchmarkExpectedLinks {
  /** Primary keys the detector is expected to find. */
  expectedPks: KloRelationshipBenchmarkExpectedPk[];
  /** Relationships the detector is expected to find. */
  expectedLinks: KloRelationshipBenchmarkExpectedLink[];
}
|
||||
|
||||
/** A benchmark fixture as assembled by `loadKloRelationshipBenchmarkFixture`. */
export interface KloRelationshipBenchmarkFixture {
  /** Fixture identifier from `fixture.yaml`. */
  id: string;
  /** Human-readable fixture name from `fixture.yaml`. */
  name: string;
  /** Fixture tier; `fixture.yaml` falls back to `unit` when it is omitted. */
  tier: KloRelationshipBenchmarkTier;
  /** Where the fixture data comes from. */
  origin: KloRelationshipFixtureOrigin;
  /** Optional explicit answer consulted by `isKloRelationshipBenchmarkTuningEligible`. */
  thresholdEligible?: boolean;
  /** Optional validation budget declared by the fixture. */
  validationBudget?: KloRelationshipValidationBudget;
  /** Schema snapshot parsed from `snapshot.json`. */
  snapshot: KloSchemaSnapshot;
  /** Ground truth parsed from `expected-links.yaml`. */
  expected: KloRelationshipBenchmarkExpectedLinks;
  /** Modes listed for the fixture in `fixture.yaml`. */
  defaultModes: KloRelationshipBenchmarkMode[];
  /** Path of the fixture's SQLite data file, or `null` when the fixture ships none. */
  dataPath: string | null;
  /** Numeric vectors read from `column-embeddings.json`, keyed by column id. */
  columnEmbeddings: Record<string, number[]>;
}
|
||||
|
||||
/** A primary key reported by a detector. */
export interface KloRelationshipBenchmarkDetectedPk {
  /** Table the key belongs to. */
  table: string;
  /** Columns forming the key. */
  columns: string[];
  /** Score the detector assigned to the key. */
  score: number;
  /** Resolution status of the key. */
  status: KloRelationshipBenchmarkStatus;
}
|
||||
|
||||
/** A relationship reported by a detector. */
export interface KloRelationshipBenchmarkDetectedLink {
  /** Table on the referencing side. */
  fromTable: string;
  /** Referencing columns in `fromTable`. */
  fromColumns: string[];
  /** Table on the referenced side. */
  toTable: string;
  /** Referenced columns in `toTable`. */
  toColumns: string[];
  /** Cardinality of the relationship. */
  relationship: KloRelationshipType;
  /** Score the detector assigned to the link. */
  score: number;
  /** Resolution status; scoring counts only `accepted` and `review` links. */
  status: KloRelationshipBenchmarkStatus;
  /** Label of the stage or evidence source that produced the link. */
  source: string;
}
|
||||
|
||||
/** Everything a detector reports for one fixture/mode run. */
export interface KloRelationshipBenchmarkDetectorResult {
  /** Detected primary keys. */
  pks: KloRelationshipBenchmarkDetectedPk[];
  /** Detected relationships. */
  links: KloRelationshipBenchmarkDetectedLink[];
  /** Whether the run could not validate candidates against data. */
  validationBlocked: boolean;
  /** Number of SQL queries issued during the run. */
  sqlQueries: number;
  /** Number of LLM calls made during the run. */
  llmCalls: number;
  /** Wall-clock duration of the run, in seconds. */
  runtimeSeconds: number;
}
|
||||
|
||||
/** Input handed to a detector for one fixture/mode run. */
export interface KloRelationshipBenchmarkDetectorInput {
  /** Identifier of the fixture being run. */
  fixtureId: string;
  /** Mode the case runs in. */
  mode: KloRelationshipBenchmarkMode;
  /** Schema snapshot for the run. */
  snapshot: KloSchemaSnapshot;
  /** Enriched schema the detection stages operate on. */
  schema: KloEnrichedSchema;
  /** Path of the SQLite data file, or `null` when no row data is available. */
  dataPath: string | null;
  /** Optional validation budget; the detectors in this file treat `undefined` like `'all'`. */
  validationBudget?: KloRelationshipValidationBudget;
}
|
||||
|
||||
/** Strategy object the benchmark uses to discover keys and relationships. */
export interface KloRelationshipBenchmarkDetector {
  /**
   * Runs detection for one fixture/mode combination.
   *
   * @param input - Schema, mode and data location for the run.
   * @returns The detected keys and links together with run counters.
   */
  detect(input: KloRelationshipBenchmarkDetectorInput): Promise<KloRelationshipBenchmarkDetectorResult>;
}
|
||||
|
||||
/**
 * Quality and cost figures for one scored case, as filled in by `scoreBenchmarkCase`.
 * Ratios whose denominator is zero are reported as 1.
 */
export interface KloRelationshipBenchmarkMetrics {
  /** Share of predicted PKs that are expected. */
  pkPrecision: number;
  /** Share of expected PKs that were predicted. */
  pkRecall: number;
  /** Harmonic mean of `pkPrecision` and `pkRecall`. */
  pkF1: number;
  /** Share of accepted links that are expected. */
  fkPrecision: number;
  /** Share of expected links that were accepted. */
  fkRecall: number;
  /** Harmonic mean of `fkPrecision` and `fkRecall`. */
  fkF1: number;
  /** Number of accepted links that are not expected. */
  acceptedFalsePositiveCount: number;
  /** Share of expected links found among review-status links. */
  reviewRecall: number;
  /** Share of expected links found among accepted or review-status links. */
  acceptedOrReviewRecall: number;
  /** Detector runtime in seconds. */
  runtimeSeconds: number;
  /** SQL queries reported by the detector. */
  sqlQueries: number;
  /** LLM calls reported by the detector. */
  llmCalls: number;
}
|
||||
|
||||
/**
 * Scored outcome of one fixture/mode run. All `pk` / `fk` lists hold the string
 * keys produced by `pkKey` and `fkKey`.
 */
export interface KloRelationshipBenchmarkCaseResult {
  /** Identifier of the fixture that was run. */
  fixtureId: string;
  /** Mode the case ran in. */
  mode: KloRelationshipBenchmarkMode;
  /** Precision, recall and cost figures for the case. */
  metrics: KloRelationshipBenchmarkMetrics;
  /** Ground-truth keys. */
  expected: {
    pk: string[];
    fk: string[];
  };
  /** Keys reported by the detector; `fk` covers every status. */
  predicted: {
    pk: string[];
    fk: string[];
    /** Links with status `accepted`. */
    acceptedFk: string[];
    /** Links with status `review`. */
    reviewFk: string[];
  };
  /** Predicted PKs and accepted links that are not in the ground truth. */
  falsePositives: {
    pk: string[];
    fk: string[];
  };
  /** Expected PKs that were not predicted, and expected links neither accepted nor in review. */
  falseNegatives: {
    pk: string[];
    fk: string[];
  };
  /** Expected multi-column keys and links the detector missed. */
  skippedComposite: {
    pk: string[];
    fk: string[];
  };
  /** Copied from the detector result. */
  validationBlocked: boolean;
}
|
||||
|
||||
/** Outcome of a whole benchmark suite run. */
export interface KloRelationshipBenchmarkSuiteResult {
  /** One scored result per fixture/mode case. */
  cases: KloRelationshipBenchmarkCaseResult[];
  /** Labels of the cases whose validation was blocked. */
  validationBlockedCases: string[];
  /** Suite-wide roll-up of the per-case recall figures. */
  aggregate: {
    /** Number of cases in the suite. */
    caseCount: number;
    /** Number of cases counted towards the headline figures. */
    headlineCaseCount: number;
    /** PK recall over the headline cases. */
    headlinePkRecall: number;
    /** FK recall over the headline cases. */
    headlineFkRecall: number;
    /** Accepted-or-review recall over the headline cases. */
    headlineAcceptedOrReviewRecall: number;
    /** Mean PK recall. */
    meanPkRecall: number;
    /** Mean FK recall. */
    meanFkRecall: number;
    /** Mean accepted-or-review recall. */
    meanAcceptedOrReviewRecall: number;
  };
}
|
||||
|
||||
/**
 * Read-only query executor backed by a fixture's SQLite file. It counts every
 * query it is asked to run so the benchmark can report SQL cost.
 */
class KloRelationshipBenchmarkSqliteExecutor implements KloRelationshipReadOnlyExecutor {
  private readonly db: Database.Database;
  /** Number of `executeReadOnly` calls so far; incremented before the statement is prepared. */
  queryCount = 0;

  /**
   * @param dataPath - Path of an existing SQLite file; it is opened read-only.
   */
  constructor(dataPath: string) {
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /**
   * Runs `input.sql` and returns the rows in positional form.
   *
   * Headers are taken from the keys of the first row, so a query that returns
   * no rows yields an empty header list.
   *
   * @param input - Query to run; only `sql` is used.
   * @param _ctx - Scan context; unused.
   */
  async executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;

    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const firstRecord = records[0];
    const headers = firstRecord == null ? [] : Object.keys(firstRecord);
    const positionalRows = records.map((record) => headers.map((header) => record[header]));

    return {
      headers,
      rows: positionalRows,
      totalRows: records.length,
      rowCount: records.length,
    };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Reads a fixture file as UTF-8 text, falling back to a gzip-compressed sibling.
 *
 * @param fixtureDir - Directory that holds the fixture files.
 * @param fileName - File name inside `fixtureDir`.
 * @returns Contents of `fileName`, or the decompressed contents of `fileName.gz` when the plain file does not exist.
 * @throws Any read error other than `ENOENT` on the plain file, and any error while reading or decompressing the `.gz` file.
 */
async function fixtureText(fixtureDir: string, fileName: string): Promise<string> {
  const plainPath = join(fixtureDir, fileName);

  let plainText: string | undefined;
  try {
    plainText = await readFile(plainPath, 'utf-8');
  } catch (error) {
    const missing = (error as NodeJS.ErrnoException).code === 'ENOENT';
    if (!missing) {
      throw error;
    }
  }
  if (plainText !== undefined) {
    return plainText;
  }

  // The plain file is absent: fall back to the compressed variant.
  const gzipped = await readFile(`${plainPath}.gz`);
  return gunzipSync(gzipped).toString('utf-8');
}
|
||||
|
||||
/**
 * Locates the SQLite data file of a fixture.
 *
 * Resolution order:
 * 1. `data.sqlite` exists: its path is returned if it is a regular file, otherwise `null`.
 * 2. `data.sqlite.gz` is a regular file: it is decompressed into a freshly created
 *    temporary directory and the extracted path is returned.
 * 3. Otherwise `null`.
 *
 * @param fixtureDir - Directory that holds the fixture files.
 * @returns Path of a usable SQLite file, or `null` when the fixture has no data file.
 * @throws Any file-system error other than `ENOENT`.
 */
async function fixtureDataPath(fixtureDir: string): Promise<string | null> {
  const isMissing = (error: unknown): boolean => (error as NodeJS.ErrnoException).code === 'ENOENT';
  const plainPath = join(fixtureDir, 'data.sqlite');

  try {
    const plainStat = await stat(plainPath);
    if (plainStat.isFile()) {
      return plainPath;
    }
    return null;
  } catch (error) {
    if (!isMissing(error)) {
      throw error;
    }
  }

  const gzipPath = `${plainPath}.gz`;
  try {
    const gzipStat = await stat(gzipPath);
    if (!gzipStat.isFile()) {
      return null;
    }
    // The digest only labels the temp directory; mkdtemp appends its own random suffix.
    const dirDigest = createHash('sha256').update(fixtureDir).digest('hex').slice(0, 16);
    const scratchDir = await mkdtemp(join(tmpdir(), `klo-relationship-benchmark-${dirDigest}-`));
    const target = join(scratchDir, 'data.sqlite');
    const sqliteBytes = gunzipSync(await readFile(gzipPath));
    await writeFile(target, sqliteBytes);
    return target;
  } catch (error) {
    if (isMissing(error)) {
      return null;
    }
    throw error;
  }
}
|
||||
|
||||
/**
 * Loads the optional `column-embeddings.json` file of a fixture.
 *
 * Entries whose value is not an array consisting solely of numbers are dropped.
 * Any failure (missing file, unreadable file, invalid JSON) yields an empty map.
 *
 * @param fixtureDir - Directory that holds the fixture files.
 * @returns Embedding vectors keyed by column id; `{}` when none could be loaded.
 */
async function fixtureColumnEmbeddings(fixtureDir: string): Promise<Record<string, number[]>> {
  const isNumberVector = (value: unknown): value is number[] =>
    Array.isArray(value) && value.every((item) => typeof item === 'number');

  const embeddingsPath = join(fixtureDir, 'column-embeddings.json');
  try {
    const text = await readFile(embeddingsPath, 'utf-8');
    const decoded = JSON.parse(text) as Record<string, unknown>;
    const vectors = Object.entries(decoded).filter((entry): entry is [string, number[]] =>
      isNumberVector(entry[1]),
    );
    return Object.fromEntries(vectors);
  } catch {
    // Embeddings are optional: treat every failure as "no embeddings".
    return {};
  }
}
|
||||
|
||||
/** Accepts any benchmark mode name. */
const modeSchema = z.enum(KLO_RELATIONSHIP_BENCHMARK_MODES);

/** Accepts any benchmark tier name. */
const tierSchema = z.enum(KLO_RELATIONSHIP_BENCHMARK_TIERS);

/** Accepts the fixture origins allowed in `fixture.yaml`. */
const originSchema = z.enum(['synthetic', 'public', 'customer']);

/** Accepts the literal `'all'` or a non-negative integer. */
const validationBudgetSchema = z.union([
  z.literal('all'),
  z.number().int().nonnegative(),
]);
|
||||
|
||||
/**
 * Shape of `fixture.yaml`. `id` and `name` must be non-empty, `tier` falls back
 * to `unit`, and at least one default mode is required.
 */
const fixtureConfigSchema = z.object({
  id: z.string().min(1),
  name: z.string().min(1),
  tier: tierSchema.default('unit'),
  origin: originSchema,
  thresholdEligible: z.boolean().optional(),
  validationBudget: validationBudgetSchema.optional(),
  defaultModes: z.array(modeSchema).min(1),
});
|
||||
|
||||
/**
 * Shape of `expected-links.yaml`. Table and column names must be non-empty and
 * every column list needs at least one entry.
 */
const expectedLinksSchema = z.object({
  // Ground-truth primary keys.
  expectedPks: z.array(
    z.object({
      table: z.string().min(1),
      columns: z.array(z.string().min(1)).min(1),
    }),
  ),
  // Ground-truth relationships.
  expectedLinks: z.array(
    z.object({
      fromTable: z.string().min(1),
      fromColumns: z.array(z.string().min(1)).min(1),
      toTable: z.string().min(1),
      toColumns: z.array(z.string().min(1)).min(1),
      relationship: z.enum(['many_to_one', 'one_to_many', 'one_to_one']),
    }),
  ),
});
|
||||
|
||||
/**
 * Removes duplicates from `values` and sorts the remainder with `localeCompare`.
 *
 * @param values - Strings to deduplicate; the iterable is consumed once.
 * @returns A new sorted array without duplicates.
 */
function sortedUnique(values: Iterable<string>): string[] {
  const distinct = [...new Set(values)];
  distinct.sort((a, b) => a.localeCompare(b));
  return distinct;
}
|
||||
|
||||
/**
 * Formats a column list as a parenthesised, comma-separated tuple, e.g. `(a,b)`.
 *
 * @param columns - Column names in key order.
 * @returns The tuple text; `()` for an empty list.
 */
function tupleKey(columns: readonly string[]): string {
  const joined = columns.join(',');
  return '(' + joined + ')';
}
|
||||
|
||||
/**
 * Builds the comparison key of a primary key, in the form `table.(col1,col2)`.
 *
 * @param pk - Table name and key columns.
 * @returns The string key used to match expected and predicted PKs.
 */
function pkKey(pk: Pick<KloRelationshipBenchmarkExpectedPk, 'table' | 'columns'>): string {
  const { table, columns } = pk;
  return `${table}.${tupleKey(columns)}`;
}
|
||||
|
||||
/**
 * Builds the comparison key of a link, in the form `from.(cols)->to.(cols)`.
 * The relationship type is not part of the key.
 *
 * @param link - Endpoints of the link.
 * @returns The string key used to match expected and predicted links.
 */
function fkKey(
  link: Pick<KloRelationshipBenchmarkExpectedLink, 'fromTable' | 'fromColumns' | 'toTable' | 'toColumns'>,
): string {
  const source = `${link.fromTable}.${tupleKey(link.fromColumns)}`;
  const target = `${link.toTable}.${tupleKey(link.toColumns)}`;
  return `${source}->${target}`;
}
|
||||
|
||||
/**
 * Comparison key of a detected link; identical to the `fkKey` of its endpoints.
 *
 * @param link - Detected link.
 * @returns The string key of the link.
 */
function relationshipKey(link: KloRelationshipBenchmarkDetectedLink): string {
  const key = fkKey(link);
  return key;
}
|
||||
|
||||
/**
 * Converts an enriched relationship into a benchmark link with status `accepted`.
 *
 * @param candidate - Relationship to convert.
 * @returns The benchmark link; `score` is the relationship's confidence.
 */
function relationshipToBenchmarkLink(candidate: KloEnrichedRelationship): KloRelationshipBenchmarkDetectedLink {
  const { from, to, relationshipType, confidence, source } = candidate;
  return {
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
    relationship: relationshipType,
    score: confidence,
    status: 'accepted',
    source,
  };
}
|
||||
|
||||
/**
 * Converts a discovery candidate into a benchmark link with status `review`.
 *
 * @param candidate - Candidate fields needed for the conversion.
 * @returns The benchmark link; `score` is the candidate's confidence.
 */
function broadCandidateToBenchmarkLink(
  candidate: Pick<KloRelationshipDiscoveryCandidate, 'confidence' | 'from' | 'relationshipType' | 'source' | 'to'>,
): KloRelationshipBenchmarkDetectedLink {
  const { from, to, relationshipType, confidence, source } = candidate;
  return {
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
    relationship: relationshipType,
    score: confidence,
    status: 'review',
    source,
  };
}
|
||||
|
||||
/**
 * Converts a composite primary-key candidate into a benchmark PK, keeping its
 * score and status.
 *
 * @param candidate - Composite key candidate.
 * @returns The benchmark PK.
 */
function compositePkToBenchmarkPk(candidate: KloCompositePrimaryKeyCandidate): KloRelationshipBenchmarkDetectedPk {
  const { table, columns, score, status } = candidate;
  return {
    table: table.name,
    columns,
    score,
    status,
  };
}
|
||||
|
||||
/**
 * Converts a composite relationship candidate into a benchmark link, keeping
 * the candidate's own status.
 *
 * @param candidate - Composite relationship candidate.
 * @returns The benchmark link; `score` is the candidate's confidence.
 */
function compositeRelationshipToBenchmarkLink(
  candidate: KloCompositeRelationshipCandidate,
): KloRelationshipBenchmarkDetectedLink {
  const { from, to, relationshipType, confidence, status, source } = candidate;
  return {
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
    relationship: relationshipType,
    score: confidence,
    status,
    source,
  };
}
|
||||
|
||||
/**
 * Divides `numerator` by `denominator`, treating an empty denominator as a perfect score.
 *
 * @returns `1` when `denominator` is zero, otherwise the quotient.
 */
function ratio(numerator: number, denominator: number): number {
  if (denominator === 0) {
    return 1;
  }
  return numerator / denominator;
}
|
||||
|
||||
/**
 * Harmonic mean of precision and recall.
 *
 * @returns `0` when `precision + recall` is zero, otherwise `2PR / (P + R)`.
 */
function f1(precision: number, recall: number): number {
  const total = precision + recall;
  if (total === 0) {
    return 0;
  }
  return (2 * precision * recall) / total;
}
|
||||
|
||||
/**
 * Items of `left` that do not occur in `right`.
 *
 * @returns A new array that keeps the order (and any duplicates) of `left`.
 */
function difference(left: readonly string[], right: readonly string[]): string[] {
  const excluded = new Set(right);
  const remaining: string[] = [];
  for (const item of left) {
    if (!excluded.has(item)) {
      remaining.push(item);
    }
  }
  return remaining;
}
|
||||
|
||||
/**
 * Counts the items of `left` that also occur in `right`.
 *
 * @returns The number of matching entries in `left`; duplicates in `left` are each counted.
 */
function intersectionSize(left: readonly string[], right: readonly string[]): number {
  const lookup = new Set(right);
  let shared = 0;
  for (const item of left) {
    if (lookup.has(item)) {
      shared += 1;
    }
  }
  return shared;
}
|
||||
|
||||
/**
 * Keys of the expected primary keys that span more than one column.
 *
 * @param expected - Ground truth of a fixture.
 * @returns Sorted, de-duplicated PK keys.
 */
function compositePkKeys(expected: KloRelationshipBenchmarkExpectedLinks): string[] {
  const multiColumnPks = expected.expectedPks.filter((pk) => pk.columns.length > 1);
  return sortedUnique(multiColumnPks.map((pk) => pkKey(pk)));
}
|
||||
|
||||
/**
 * Keys of the expected links that have more than one column on either side.
 *
 * @param expected - Ground truth of a fixture.
 * @returns Sorted, de-duplicated link keys.
 */
function compositeFkKeys(expected: KloRelationshipBenchmarkExpectedLinks): string[] {
  const multiColumnLinks = expected.expectedLinks.filter(
    (link) => link.fromColumns.length > 1 || link.toColumns.length > 1,
  );
  return sortedUnique(multiColumnLinks.map((link) => fkKey(link)));
}
|
||||
|
||||
/**
 * Keys of all expected primary keys. Despite the name, no filtering by column
 * count happens here, so composite keys are included.
 *
 * @param expected - Ground truth of a fixture.
 * @returns Sorted, de-duplicated PK keys.
 */
function scalarExpectedPkKeys(expected: KloRelationshipBenchmarkExpectedLinks): string[] {
  const keys = expected.expectedPks.map((pk) => pkKey(pk));
  return sortedUnique(keys);
}
|
||||
|
||||
/**
 * Keys of all expected links. Despite the name, no filtering by column count
 * happens here, so composite links are included.
 *
 * @param expected - Ground truth of a fixture.
 * @returns Sorted, de-duplicated link keys.
 */
function scalarExpectedFkKeys(expected: KloRelationshipBenchmarkExpectedLinks): string[] {
  const keys = expected.expectedLinks.map((link) => fkKey(link));
  return sortedUnique(keys);
}
|
||||
|
||||
/**
 * Scores one detector run against the fixture's ground truth.
 *
 * PK figures compare every predicted PK with the expected PKs. FK precision,
 * recall and false positives are based on `accepted` links only; review-status
 * links count towards `reviewRecall` and `acceptedOrReviewRecall`, and prevent a
 * link from being reported as a false negative.
 *
 * @param input - Fixture id, mode, ground truth and detector output.
 * @returns Metrics plus the key lists behind them.
 */
function scoreBenchmarkCase(input: {
  fixtureId: string;
  mode: KloRelationshipBenchmarkMode;
  expected: KloRelationshipBenchmarkExpectedLinks;
  detected: KloRelationshipBenchmarkDetectorResult;
}): KloRelationshipBenchmarkCaseResult {
  const { expected, detected } = input;

  // Sorted, de-duplicated keys of the detected links with the given status.
  const linkKeysWithStatus = (status: KloRelationshipBenchmarkStatus): string[] =>
    sortedUnique(detected.links.filter((link) => link.status === status).map((link) => relationshipKey(link)));

  const expectedPk = scalarExpectedPkKeys(expected);
  const expectedFk = scalarExpectedFkKeys(expected);
  const predictedPk = sortedUnique(detected.pks.map((pk) => pkKey(pk)));
  const predictedFk = sortedUnique(detected.links.map((link) => relationshipKey(link)));
  const acceptedFk = linkKeysWithStatus('accepted');
  const reviewFk = linkKeysWithStatus('review');
  const acceptedOrReviewFk = sortedUnique([...acceptedFk, ...reviewFk]);

  const pkHits = intersectionSize(predictedPk, expectedPk);
  const acceptedFkHits = intersectionSize(acceptedFk, expectedFk);
  const acceptedOrReviewFkHits = intersectionSize(acceptedOrReviewFk, expectedFk);
  const reviewFkHits = intersectionSize(reviewFk, expectedFk);

  const pkPrecision = ratio(pkHits, predictedPk.length);
  const pkRecall = ratio(pkHits, expectedPk.length);
  const fkPrecision = ratio(acceptedFkHits, acceptedFk.length);
  const fkRecall = ratio(acceptedFkHits, expectedFk.length);

  const acceptedFalsePositiveFk = difference(acceptedFk, expectedFk);

  const metrics: KloRelationshipBenchmarkMetrics = {
    pkPrecision,
    pkRecall,
    pkF1: f1(pkPrecision, pkRecall),
    fkPrecision,
    fkRecall,
    fkF1: f1(fkPrecision, fkRecall),
    acceptedFalsePositiveCount: acceptedFalsePositiveFk.length,
    reviewRecall: ratio(reviewFkHits, expectedFk.length),
    acceptedOrReviewRecall: ratio(acceptedOrReviewFkHits, expectedFk.length),
    runtimeSeconds: detected.runtimeSeconds,
    sqlQueries: detected.sqlQueries,
    llmCalls: detected.llmCalls,
  };

  return {
    fixtureId: input.fixtureId,
    mode: input.mode,
    metrics,
    expected: { pk: expectedPk, fk: expectedFk },
    predicted: { pk: predictedPk, fk: predictedFk, acceptedFk, reviewFk },
    falsePositives: {
      pk: difference(predictedPk, expectedPk),
      fk: acceptedFalsePositiveFk,
    },
    falseNegatives: {
      pk: difference(expectedPk, predictedPk),
      fk: difference(expectedFk, acceptedOrReviewFk),
    },
    skippedComposite: {
      pk: difference(compositePkKeys(expected), predictedPk),
      fk: difference(compositeFkKeys(expected), acceptedOrReviewFk),
    },
    validationBlocked: detected.validationBlocked,
  };
}
|
||||
|
||||
/**
 * Returns a copy of `snapshot` with declared keys hidden according to `mode`.
 *
 * - `declared_pks_removed` sets every column's `primaryKey` to `false`.
 * - `declared_fks_removed` empties every table's `foreignKeys`.
 * - `declared_pks_and_declared_fks_removed`, `llm_disabled`, `profiling_disabled`,
 *   `validation_disabled` and `embeddings_disabled` do both.
 * - Any other mode keeps the declarations.
 *
 * The input is not mutated: `scope`, `metadata`, tables, columns and foreign keys
 * are shallow-copied.
 *
 * @param snapshot - Snapshot to mask.
 * @param mode - Benchmark mode that decides what is hidden.
 * @returns The masked copy.
 */
export function maskKloRelationshipBenchmarkSnapshot(
  snapshot: KloSchemaSnapshot,
  mode: KloRelationshipBenchmarkMode,
): KloSchemaSnapshot {
  const discoveryModes: readonly KloRelationshipBenchmarkMode[] = [
    'declared_pks_and_declared_fks_removed',
    'llm_disabled',
    'profiling_disabled',
    'validation_disabled',
    'embeddings_disabled',
  ];
  const hidesAllDeclaredKeys = discoveryModes.includes(mode);
  const removePks = hidesAllDeclaredKeys || mode === 'declared_pks_removed';
  const removeFks = hidesAllDeclaredKeys || mode === 'declared_fks_removed';

  return {
    ...snapshot,
    scope: { ...snapshot.scope },
    metadata: { ...snapshot.metadata },
    tables: snapshot.tables.map((table) => {
      const columns = table.columns.map((column) => ({
        ...column,
        primaryKey: removePks ? false : column.primaryKey,
      }));
      const foreignKeys = removeFks ? [] : table.foreignKeys.map((foreignKey) => ({ ...foreignKey }));
      return { ...table, columns, foreignKeys };
    }),
  };
}
|
||||
|
||||
/**
 * Decides whether a benchmark case is tuning eligible.
 *
 * A case qualifies only when validation was not blocked and the mode is
 * `declared_pks_and_declared_fks_removed`. `smoke` and `schema_only` fixtures
 * never qualify. Otherwise an explicit `thresholdEligible` on the fixture wins;
 * without one, `unit` and `row_bearing` fixtures qualify.
 *
 * @param input - Fixture tier/override, mode, and whether validation was blocked.
 * @returns `true` when the case is tuning eligible.
 */
export function isKloRelationshipBenchmarkTuningEligible(input: {
  fixture: Pick<KloRelationshipBenchmarkFixture, 'tier' | 'thresholdEligible'>;
  mode: KloRelationshipBenchmarkMode;
  validationBlocked: boolean;
}): boolean {
  const { fixture, mode, validationBlocked } = input;

  const runsHeadlineMode = mode === 'declared_pks_and_declared_fks_removed';
  if (!runsHeadlineMode || validationBlocked) {
    return false;
  }

  switch (fixture.tier) {
    case 'smoke':
    case 'schema_only':
      // These tiers are excluded even when the fixture opts in explicitly.
      return false;
    default:
      break;
  }

  if (fixture.thresholdEligible !== undefined) {
    return fixture.thresholdEligible;
  }

  return fixture.tier === 'unit' || fixture.tier === 'row_bearing';
}
|
||||
|
||||
/**
 * Runs `run` and closes `executor` afterwards, whether `run` resolved or threw,
 * so the SQLite handle is not left open when a detection stage fails.
 */
async function closeKloRelationshipBenchmarkExecutorAfter<T>(
  executor: KloRelationshipBenchmarkSqliteExecutor | null,
  run: () => Promise<T>,
): Promise<T> {
  try {
    return await run();
  } finally {
    executor?.close();
  }
}

/**
 * Shared detection pipeline behind both exported detectors.
 *
 * Stages, in order: collect formal-metadata relationships, profile the schema,
 * generate broad candidates, optionally ask the LLM for more candidates,
 * validate, run composite discovery, then resolve the relationship graph.
 *
 * Mode handling:
 * - `profiling_disabled`: no executor is opened and an empty profile artifact is used.
 * - `validation_disabled`: validation gets no executor and composite discovery is skipped.
 * - `embeddings_disabled`: candidate generation runs with `useEmbeddings: false`.
 * - `llm_disabled`: the LLM proposal stage is skipped.
 *
 * A numeric validation budget is reduced by the number of profiling queries;
 * composite discovery only runs when the budget is `'all'` (or unset).
 *
 * @param input - Fixture/mode input of the run.
 * @param llmProvider - Provider for the LLM proposal stage, or `null` to run without that stage.
 * @returns Detected PKs and links plus run counters.
 */
async function runKloRelationshipBenchmarkDetection(
  input: KloRelationshipBenchmarkDetectorInput,
  llmProvider: KloLlmProvider | null,
): Promise<KloRelationshipBenchmarkDetectorResult> {
  const startedAt = performance.now();
  const formalMetadata = collectKloFormalMetadataRelationships(input.schema);
  const formalLinks = formalMetadata.accepted.map((relationship) => relationshipToBenchmarkLink(relationship));
  const acceptedKeys = new Set(formalLinks.map(fkKey));

  const sqliteDataAvailable = Boolean(input.dataPath && input.snapshot.driver === 'sqlite');
  const profilingExecutor =
    sqliteDataAvailable && input.mode !== 'profiling_disabled'
      ? new KloRelationshipBenchmarkSqliteExecutor(input.dataPath as string)
      : null;
  const validationExecutor = profilingExecutor && input.mode !== 'validation_disabled' ? profilingExecutor : null;

  // Everything that may query the SQLite file runs inside this callback so the
  // handle is closed even when a stage throws.
  const evidence = await closeKloRelationshipBenchmarkExecutorAfter(profilingExecutor, async () => {
    const profiles =
      input.mode === 'profiling_disabled'
        ? emptyKloRelationshipProfileArtifact({
            connectionId: input.snapshot.connectionId,
            driver: input.snapshot.driver,
            reason: 'relationship_benchmark_profiling_disabled',
          })
        : await profileKloRelationshipSchema({
            connectionId: input.snapshot.connectionId,
            driver: input.snapshot.driver,
            schema: input.schema,
            executor: profilingExecutor,
            ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:profile` },
          });
    const broadRelationshipCandidates = generateKloRelationshipDiscoveryCandidates(input.schema, {
      profiles,
      useEmbeddings: input.mode !== 'embeddings_disabled',
    });
    const llmProposalResult =
      llmProvider === null || input.mode === 'llm_disabled'
        ? { candidates: [], warnings: [], llmCalls: 0, summary: 'skipped' as const }
        : await proposeKloRelationshipCandidatesWithLlm({
            connectionId: input.snapshot.connectionId,
            schema: input.schema,
            profile: profiles,
            llmProvider,
          });
    // Without an LLM provider the broad candidates go to validation untouched;
    // with one they are merged with the (possibly empty) LLM proposals.
    const candidates =
      llmProvider === null
        ? broadRelationshipCandidates
        : mergeKloRelationshipDiscoveryCandidates([...broadRelationshipCandidates, ...llmProposalResult.candidates]);
    const validationBudget =
      input.validationBudget === 'all' || input.validationBudget === undefined
        ? 'all'
        : Math.max(0, input.validationBudget - profiles.queryCount);
    const validatedBroadCandidates = await validateKloRelationshipDiscoveryCandidates({
      connectionId: input.snapshot.connectionId,
      driver: input.snapshot.driver,
      candidates,
      profiles,
      executor: validationExecutor,
      ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:validate` },
      tableCount: input.schema.tables.length,
      settings: {
        validationBudget,
      },
    });
    const compositeDetection =
      validationBudget === 'all' &&
      validationExecutor &&
      input.mode !== 'profiling_disabled' &&
      input.mode !== 'validation_disabled'
        ? await discoverKloCompositeRelationships({
            connectionId: input.snapshot.connectionId,
            driver: input.snapshot.driver,
            schema: input.schema,
            profiles,
            executor: validationExecutor,
            ctx: { runId: `relationship-benchmark:${input.fixtureId}:${input.mode}:composite` },
          })
        : { primaryKeys: [], relationships: [], queryCount: 0, warnings: [] };

    return {
      profiles,
      broadRelationshipCandidates,
      validatedBroadCandidates,
      compositeDetection,
      llmCalls: llmProposalResult.llmCalls,
    };
  });
  const { profiles, broadRelationshipCandidates, validatedBroadCandidates, compositeDetection } = evidence;

  const graph = resolveKloRelationshipGraph({
    schema: input.schema,
    profiles,
    candidates: validatedBroadCandidates,
  });

  // Graph links with the given status, minus links already covered by formal metadata.
  const graphLinksWithStatus = (status: 'accepted' | 'review'): KloRelationshipBenchmarkDetectedLink[] =>
    graph.relationships
      .filter((candidate) => candidate.status === status)
      .map((candidate) => ({
        ...broadCandidateToBenchmarkLink(candidate),
        score: candidate.fkScore,
        status,
      }))
      .filter((candidate) => !acceptedKeys.has(fkKey(candidate)));

  const acceptedBroadCandidates = graphLinksWithStatus('accepted');
  const reviewCandidates = graphLinksWithStatus('review');

  const resolvedPks = graph.pks
    .filter((pk) => pk.status !== 'rejected')
    .map((pk) => ({
      table: pk.table,
      columns: pk.columns,
      score: pk.pkScore,
      status: pk.status,
    }));
  const compositePks = compositeDetection.primaryKeys.map(compositePkToBenchmarkPk);
  // Later entries win on key collisions, so a composite PK replaces a resolved PK with the same key.
  const allPksByKey = new Map([...resolvedPks, ...compositePks].map((candidate) => [pkKey(candidate), candidate]));
  const pks = sortedUnique(allPksByKey.keys()).flatMap((key) => {
    const candidate = allPksByKey.get(key);
    return candidate ? [candidate] : [];
  });

  return {
    pks,
    links: [
      ...formalLinks,
      ...acceptedBroadCandidates,
      ...reviewCandidates,
      ...compositeDetection.relationships
        .map(compositeRelationshipToBenchmarkLink)
        .filter((candidate) => !acceptedKeys.has(fkKey(candidate))),
    ],
    validationBlocked:
      input.mode === 'validation_disabled' ||
      input.mode === 'profiling_disabled' ||
      (input.dataPath !== null && broadRelationshipCandidates.length > 0 && !profiles.sqlAvailable),
    // queryCount is a plain field, so it stays readable after the executor is closed.
    sqlQueries: profilingExecutor?.queryCount ?? profiles.queryCount,
    llmCalls: evidence.llmCalls,
    runtimeSeconds: Number(((performance.now() - startedAt) / 1000).toFixed(6)),
  };
}

/**
 * Creates a detector that adds LLM-proposed candidates to the broad candidates
 * before validation. The LLM stage is skipped in `llm_disabled` mode.
 *
 * @param llmProvider - Provider used for the LLM proposal stage.
 * @returns A detector whose results report the LLM calls made.
 */
export function kloRelationshipBenchmarkDetectorWithLlm(
  llmProvider: KloLlmProvider,
): KloRelationshipBenchmarkDetector {
  return {
    detect: (input) => runKloRelationshipBenchmarkDetection(input, llmProvider),
  };
}

/**
 * Creates the detector that runs the pipeline without any LLM stage.
 *
 * @returns A detector whose results always report `llmCalls: 0`.
 */
export function currentKloRelationshipBenchmarkDetector(): KloRelationshipBenchmarkDetector {
  return {
    detect: (input) => runKloRelationshipBenchmarkDetection(input, null),
  };
}
|
||||
|
||||
/**
 * Loads a single relationship-benchmark fixture from a directory.
 *
 * Reads `fixture.yaml`, `snapshot.json` and `expected-links.yaml` concurrently, runs the two YAML
 * documents through their schemas, and combines them with the fixture's data path and column
 * embeddings.
 *
 * NOTE(review): `snapshot.json` is only `JSON.parse`d and cast to `KloSchemaSnapshot`; unlike the
 * YAML files it is not schema-validated here.
 *
 * @param fixtureDir - Directory containing the fixture files.
 * @returns The parsed fixture config merged with its snapshot, expected links, data path and embeddings.
 * @throws Whatever the file readers, `YAML.parse`, `JSON.parse` or the schema `parse` calls throw.
 */
export async function loadKloRelationshipBenchmarkFixture(
  fixtureDir: string,
): Promise<KloRelationshipBenchmarkFixture> {
  const [configYaml, snapshotJson, expectedLinksYaml] = await Promise.all([
    fixtureText(fixtureDir, 'fixture.yaml'),
    fixtureText(fixtureDir, 'snapshot.json'),
    fixtureText(fixtureDir, 'expected-links.yaml'),
  ]);

  // Parse in a fixed order (config, expected links, snapshot) so the first malformed file
  // reported for a broken fixture is always the same one.
  const config = fixtureConfigSchema.parse(YAML.parse(configYaml));
  const expectedLinks = expectedLinksSchema.parse(YAML.parse(expectedLinksYaml));
  const parsedSnapshot = JSON.parse(snapshotJson) as KloSchemaSnapshot;

  // The auxiliary lookups run one after the other, only once all parsing has succeeded.
  const dataPath = await fixtureDataPath(fixtureDir);
  const columnEmbeddings = await fixtureColumnEmbeddings(fixtureDir);

  return {
    ...config,
    snapshot: parsedSnapshot,
    expected: expectedLinks,
    dataPath,
    columnEmbeddings,
  };
}
|
||||
|
||||
/**
 * Loads every fixture found directly under a fixture root directory.
 *
 * Each immediate sub-directory is treated as one fixture; non-directory entries are ignored.
 * Directories are ordered with `localeCompare` on their joined path so the result order is stable.
 *
 * @param fixtureRoot - Directory whose immediate sub-directories are fixtures.
 * @returns The loaded fixtures, in sorted directory order.
 * @throws If the root cannot be listed or any single fixture fails to load.
 */
export async function loadKloRelationshipBenchmarkFixtures(
  fixtureRoot: string,
): Promise<KloRelationshipBenchmarkFixture[]> {
  const directoryPaths: string[] = [];
  for (const dirent of await readdir(fixtureRoot, { withFileTypes: true })) {
    if (dirent.isDirectory()) {
      directoryPaths.push(join(fixtureRoot, dirent.name));
    }
  }
  directoryPaths.sort((a, b) => a.localeCompare(b));

  // Fixtures are independent of each other, so they are loaded concurrently.
  const pendingFixtures = directoryPaths.map((directoryPath) => loadKloRelationshipBenchmarkFixture(directoryPath));
  return Promise.all(pendingFixtures);
}
|
||||
|
||||
/**
 * Runs one fixture under one benchmark mode and scores what the detector found.
 *
 * The fixture snapshot is masked for the requested mode and converted to an enriched schema.
 * When the mode is `'embeddings_disabled'` an empty embedding map is used instead of the
 * fixture's column embeddings.
 *
 * @param input.fixture - Fixture supplying the snapshot, embeddings, expected links, data path and validation budget.
 * @param input.mode - Benchmark mode; applied to the snapshot and forwarded to the detector.
 * @param input.detector - Optional detector override; the current default detector is created when omitted.
 * @returns The scored case result produced by `scoreBenchmarkCase`.
 */
export async function runKloRelationshipBenchmarkCase(input: {
  fixture: KloRelationshipBenchmarkFixture;
  mode: KloRelationshipBenchmarkMode;
  detector?: KloRelationshipBenchmarkDetector;
}): Promise<KloRelationshipBenchmarkCaseResult> {
  const { fixture, mode } = input;

  const maskedSnapshot = maskKloRelationshipBenchmarkSnapshot(fixture.snapshot, mode);
  const columnEmbeddings =
    mode !== 'embeddings_disabled'
      ? new Map(Object.entries(fixture.columnEmbeddings))
      : new Map<string, number[]>();
  const enrichedSchema = snapshotToKloEnrichedSchema(maskedSnapshot, columnEmbeddings);

  // The default detector is only constructed when the caller did not supply one.
  const detector = input.detector ?? currentKloRelationshipBenchmarkDetector();
  const detected = await detector.detect({
    fixtureId: fixture.id,
    mode,
    snapshot: maskedSnapshot,
    schema: enrichedSchema,
    dataPath: fixture.dataPath,
    validationBudget: fixture.validationBudget,
  });

  return scoreBenchmarkCase({
    fixtureId: fixture.id,
    mode,
    expected: fixture.expected,
    detected,
  });
}
|
||||
|
||||
/**
 * Runs every fixture in each of its default modes and aggregates recall metrics.
 *
 * Cases are executed strictly one at a time, in fixture order and then mode order. Two sets of
 * averages are reported: `mean*` over every case, and `headline*` over only those cases whose
 * fixture is known and for which `isKloRelationshipBenchmarkTuningEligible` accepts the
 * fixture/mode/validation-blocked combination.
 *
 * @param input.fixtures - Fixtures to run; each contributes one case per entry in `defaultModes`.
 * @param input.detector - Optional detector override forwarded to every case.
 * @returns All case results, the `fixtureId:mode` labels of validation-blocked cases, and the aggregates.
 */
export async function runKloRelationshipBenchmarkSuite(input: {
  fixtures: KloRelationshipBenchmarkFixture[];
  detector?: KloRelationshipBenchmarkDetector;
}): Promise<KloRelationshipBenchmarkSuiteResult> {
  const { fixtures, detector } = input;

  const cases: KloRelationshipBenchmarkCaseResult[] = [];
  for (const fixture of fixtures) {
    for (const mode of fixture.defaultModes) {
      const outcome = await runKloRelationshipBenchmarkCase({ fixture, mode, detector });
      cases.push(outcome);
    }
  }

  // Later fixtures win when two share an id, matching Map construction from entry pairs.
  const fixtureById = new Map(fixtures.map((fixture) => [fixture.id, fixture] as const));

  const headlineCases = cases.filter((item) => {
    const fixture = fixtureById.get(item.fixtureId);
    if (!fixture) {
      return false;
    }
    return isKloRelationshipBenchmarkTuningEligible({
      fixture,
      mode: item.mode,
      validationBlocked: item.validationBlocked,
    });
  });

  const validationBlockedCases: string[] = [];
  for (const item of cases) {
    if (item.validationBlocked) {
      validationBlockedCases.push(`${item.fixtureId}:${item.mode}`);
    }
  }

  /** Averages one metric over a pool of cases. */
  const averageOver = (
    pool: KloRelationshipBenchmarkCaseResult[],
    pick: (item: KloRelationshipBenchmarkCaseResult) => number,
  ): number => mean(pool.map(pick));

  return {
    cases,
    validationBlockedCases,
    aggregate: {
      caseCount: cases.length,
      headlineCaseCount: headlineCases.length,
      headlinePkRecall: averageOver(headlineCases, (item) => item.metrics.pkRecall),
      headlineFkRecall: averageOver(headlineCases, (item) => item.metrics.fkRecall),
      headlineAcceptedOrReviewRecall: averageOver(headlineCases, (item) => item.metrics.acceptedOrReviewRecall),
      meanPkRecall: averageOver(cases, (item) => item.metrics.pkRecall),
      meanFkRecall: averageOver(cases, (item) => item.metrics.fkRecall),
      meanAcceptedOrReviewRecall: averageOver(cases, (item) => item.metrics.acceptedOrReviewRecall),
    },
  };
}
|
||||
|
||||
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param values - Numbers to average.
 * @returns The average, or `0` for an empty list (so callers never see `NaN` from 0/0).
 */
function mean(values: number[]): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }

  let total = 0;
  for (const value of values) {
    total += value;
  }
  return total / count;
}
|
||||
86
packages/context/src/scan/relationship-budget.test.ts
Normal file
86
packages/context/src/scan/relationship-budget.test.ts
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { applyKloRelationshipValidationBudget, defaultKloRelationshipValidationBudget } from './relationship-budget.js';
|
||||
|
||||
/** Minimal candidate shape used to exercise the generic budget helper. */
interface Candidate {
  id: string;
  confidence: number;
}

describe('relationship validation budget', () => {
  /** Scores a candidate by its confidence; shared by every budgeting case below. */
  const byConfidence = (candidate: Candidate): number => candidate.confidence;

  /** Reduces budgeted entries to their candidate ids so assertions stay compact. */
  const idsOf = (entries: ReadonlyArray<{ candidate: Candidate }>): string[] =>
    entries.map(({ candidate }) => candidate.id);

  it('computes the default validation budget from table count', () => {
    // Each pair is [tableCount, expected budget].
    const expectations = [
      [0, 0],
      [3, 6],
      [400, 800],
      [900, 1000],
      [-4, 0],
      [3.8, 6],
    ] as const;

    for (const [tableCount, expectedBudget] of expectations) {
      expect(defaultKloRelationshipValidationBudget(tableCount)).toBe(expectedBudget);
    }
  });

  it('splits candidates by descending score with stable tie ordering', () => {
    const candidates: Candidate[] = [
      { id: 'first', confidence: 0.8 },
      { id: 'second', confidence: 0.9 },
      { id: 'third', confidence: 0.9 },
      { id: 'fourth', confidence: 0.2 },
    ];

    const { effectiveBudget, toValidate, deferred } = applyKloRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 100,
      budget: 2,
      score: byConfidence,
    });

    expect(effectiveBudget).toBe(2);
    expect(idsOf(toValidate)).toEqual(['second', 'third']);
    expect(idsOf(deferred)).toEqual(['first', 'fourth']);
    expect(toValidate.map((entry) => entry.originalIndex)).toEqual([1, 2]);
  });

  it('uses the default budget when the budget is omitted', () => {
    const candidates: Candidate[] = [];
    for (let index = 0; index < 8; index += 1) {
      candidates.push({ id: `candidate-${index}`, confidence: 1 - index / 10 });
    }

    const { effectiveBudget, toValidate, deferred } = applyKloRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 2,
      score: byConfidence,
    });

    expect(effectiveBudget).toBe(4);
    expect(toValidate).toHaveLength(4);
    expect(deferred).toHaveLength(4);
  });

  it('treats budget zero as disabling SQL validation', () => {
    const candidates: Candidate[] = [
      { id: 'first', confidence: 1 },
      { id: 'second', confidence: 0.5 },
    ];

    const { effectiveBudget, toValidate, deferred } = applyKloRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 10,
      budget: 0,
      score: byConfidence,
    });

    expect(effectiveBudget).toBe(0);
    expect(toValidate).toEqual([]);
    expect(idsOf(deferred)).toEqual(['first', 'second']);
  });

  it('treats budget all as validating every candidate', () => {
    const candidates: Candidate[] = [
      { id: 'first', confidence: 0.1 },
      { id: 'second', confidence: 0.9 },
    ];

    const { effectiveBudget, toValidate, deferred } = applyKloRelationshipValidationBudget<Candidate>({
      candidates,
      tableCount: 1,
      budget: 'all',
      score: byConfidence,
    });

    expect(effectiveBudget).toBe('all');
    // 'all' keeps the caller's order rather than ranking by score.
    expect(idsOf(toValidate)).toEqual(['first', 'second']);
    expect(deferred).toEqual([]);
  });
});
|
||||
60
packages/context/src/scan/relationship-budget.ts
Normal file
60
packages/context/src/scan/relationship-budget.ts
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
/**
 * Requested cap on how many relationship candidates get validated.
 *
 * - a number caps the count (floored, negatives clamp to 0),
 * - `'all'` validates every candidate,
 * - `undefined` falls back to {@link defaultKloRelationshipValidationBudget}.
 */
export type KloRelationshipValidationBudget = number | 'all' | undefined;

/** A candidate paired with its position in the input list and its computed score. */
export interface KloRelationshipBudgetedCandidate<TCandidate> {
  candidate: TCandidate;
  /** Zero-based index of the candidate in the input `candidates` array. */
  originalIndex: number;
  /** Value returned by the caller-supplied `score` function for this candidate. */
  score: number;
}

/** Outcome of splitting candidates into "validate now" and "deferred". */
export interface KloRelationshipValidationBudgetResult<TCandidate> {
  /** The cap that was actually applied: `'all'`, or a non-negative whole number (possibly `Infinity`). */
  effectiveBudget: number | 'all';
  /** Candidates selected for validation. */
  toValidate: KloRelationshipBudgetedCandidate<TCandidate>[];
  /** Candidates that did not fit in the budget. */
  deferred: KloRelationshipBudgetedCandidate<TCandidate>[];
}

/** Input for {@link applyKloRelationshipValidationBudget}. */
export interface ApplyKloRelationshipValidationBudgetInput<TCandidate> {
  candidates: readonly TCandidate[];
  /** Only used to derive the default budget when `budget` is omitted. */
  tableCount: number;
  budget?: KloRelationshipValidationBudget;
  /** Higher scores are validated first. */
  score: (candidate: TCandidate) => number;
}

/**
 * Default validation budget: two validations per table, capped at 1000.
 *
 * @param tableCount - Number of tables; fractional values are floored, negative or non-finite values count as 0.
 * @returns A whole number in the range 0..1000.
 */
export function defaultKloRelationshipValidationBudget(tableCount: number): number {
  const safeTableCount = Number.isFinite(tableCount) ? Math.max(0, Math.floor(tableCount)) : 0;
  return Math.min(2 * safeTableCount, 1000);
}

/**
 * Orders budgeted candidates by descending score, breaking ties by original position.
 *
 * Uses comparisons instead of subtraction so the result is never NaN: NaN scores sort after every
 * real score, and equal infinities are treated as a tie.
 */
function compareBudgetedCandidates(
  left: KloRelationshipBudgetedCandidate<unknown>,
  right: KloRelationshipBudgetedCandidate<unknown>,
): number {
  const leftIsNaN = Number.isNaN(left.score);
  const rightIsNaN = Number.isNaN(right.score);
  if (leftIsNaN !== rightIsNaN) {
    return leftIsNaN ? 1 : -1;
  }
  if (!leftIsNaN && left.score !== right.score) {
    return left.score < right.score ? 1 : -1;
  }
  return left.originalIndex - right.originalIndex;
}

/**
 * Splits candidates into those to validate now and those deferred by the budget.
 *
 * - `budget: 'all'` returns every candidate in input order with nothing deferred.
 * - A numeric budget (or the table-count default when omitted) is floored and clamped to `>= 0`;
 *   a NaN budget is treated as 0. Candidates are ranked by descending score with ties kept in
 *   input order, the top `budget` go to `toValidate`, and the rest to `deferred`.
 *
 * `input.score` is invoked exactly once per candidate. The `candidates` array is not mutated.
 *
 * @param input - Candidates, scoring function, table count and optional budget.
 * @returns The effective budget and the two candidate partitions.
 */
export function applyKloRelationshipValidationBudget<TCandidate>(
  input: ApplyKloRelationshipValidationBudgetInput<TCandidate>,
): KloRelationshipValidationBudgetResult<TCandidate> {
  // Score once up front; both the 'all' branch and the ranking reuse these entries.
  const scored: KloRelationshipBudgetedCandidate<TCandidate>[] = input.candidates.map(
    (candidate, originalIndex) => ({
      candidate,
      originalIndex,
      score: input.score(candidate),
    }),
  );

  if (input.budget === 'all') {
    // No ranking needed: everything is validated, in the caller's order.
    return {
      effectiveBudget: 'all',
      toValidate: scored,
      deferred: [],
    };
  }

  const requestedBudget = input.budget ?? defaultKloRelationshipValidationBudget(input.tableCount);
  // NaN would otherwise survive Math.max/Math.floor and be reported as the effective budget.
  const safeBudget = Number.isNaN(requestedBudget) ? 0 : Math.max(0, Math.floor(requestedBudget));

  // `scored` is a fresh local array, so sorting it in place cannot affect the caller's data.
  scored.sort(compareBudgetedCandidates);

  return {
    effectiveBudget: safeBudget,
    toValidate: scored.slice(0, safeBudget),
    deferred: scored.slice(safeBudget),
  };
}
|
||||
881
packages/context/src/scan/relationship-candidates.test.ts
Normal file
881
packages/context/src/scan/relationship-candidates.test.ts
Normal file
|
|
@ -0,0 +1,881 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
|
||||
import { normalizeKloRelationshipName } from './relationship-name-similarity.js';
|
||||
import {
|
||||
generateKloRelationshipDiscoveryCandidates,
|
||||
inferKloRelationshipTargetPks,
|
||||
mergeKloRelationshipDiscoveryCandidates,
|
||||
} from './relationship-candidates.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
|
||||
/**
 * Test helper: builds an enriched column with integer/number defaults.
 *
 * Any field present in `options` (and not null/undefined) overrides its default. When no
 * `tableRef` is given, one is synthesised in the `public` db using `tableId` as the table name.
 *
 * @param tableId - Owning table id; also the fallback table name for the default `tableRef`.
 * @param id - Column id.
 * @param name - Column name.
 * @param options - Optional field overrides.
 */
function column(
  tableId: string,
  id: string,
  name: string,
  options: Partial<KloEnrichedColumn> = {},
): KloEnrichedColumn {
  const {
    tableRef,
    nativeType,
    normalizedType,
    dimensionType,
    nullable,
    primaryKey,
    parentColumnId,
    descriptions,
    embedding,
    sampleValues,
    cardinality,
  } = options;

  return {
    id,
    tableId,
    tableRef: tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType: nativeType ?? 'INTEGER',
    normalizedType: normalizedType ?? 'integer',
    dimensionType: dimensionType ?? 'number',
    nullable: nullable ?? true,
    primaryKey: primaryKey ?? false,
    parentColumnId: parentColumnId ?? null,
    descriptions: descriptions ?? {},
    embedding: embedding ?? null,
    sampleValues: sampleValues ?? null,
    cardinality: cardinality ?? null,
  };
}
|
||||
|
||||
/**
 * Test helper: builds an enabled enriched table in the `public` db.
 *
 * Every supplied column is copied and re-homed onto this table: its `tableId` becomes `id` and
 * its `tableRef` becomes this table's ref (the same object instance the table itself exposes).
 * The input columns are not mutated.
 *
 * @param id - Table id.
 * @param name - Table name used in the ref.
 * @param columns - Columns to attach.
 */
function table(id: string, name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const ownedColumns = columns.map((source) => ({ ...source, tableId: id, tableRef: ref }));

  return {
    id,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Test helper: wraps tables in an enriched schema for the `warehouse` connection,
 * with no pre-declared relationships.
 *
 * @param tables - Tables to include, used as-is (not copied).
 */
function schema(tables: KloEnrichedTable[]): KloEnrichedSchema {
  const enriched: KloEnrichedSchema = {
    connectionId: 'warehouse',
    tables,
    relationships: [],
  };
  return enriched;
}
|
||||
|
||||
/**
 * Test fixture: a SQL-available sqlite profile artifact for the plan-code tables.
 *
 * Covers three four-row tables (`stg_plans`, `mart_account_segments`, `stg_plan_segment_mapping`).
 * Every profiled column is TEXT, has no nulls and is fully distinct (4 of 4). The plan-code
 * columns all share the same four sample values, while `created_at`, `email` and `is_deleted`
 * on `stg_plans` carry their own samples.
 *
 * Each call returns freshly allocated objects and arrays.
 */
function planCodeProfiles(): KloRelationshipProfileArtifact {
  const ROW_COUNT = 4;

  /** Table reference in the `public` db with no catalog. */
  const tableRef = (name: string) => ({ catalog: null, db: 'public', name });

  /** Fresh copy of the plan-code sample values, so no two columns share an array. */
  const planCodes = (): string[] => ['basic', 'enterprise', 'free', 'pro'];

  /** Profile of a fully distinct, never-null TEXT column. */
  const distinctTextProfile = (
    tableName: string,
    columnName: string,
    sampleValues: string[],
    minTextLength: number,
    maxTextLength: number,
  ) => ({
    table: tableRef(tableName),
    column: columnName,
    nativeType: 'TEXT' as const,
    normalizedType: 'text' as const,
    rowCount: ROW_COUNT,
    nullCount: 0,
    distinctCount: ROW_COUNT,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues,
    minTextLength,
    maxTextLength,
  });

  /** Profile of a column holding the four plan codes (lengths 4..10). */
  const planCodeProfile = (tableName: string, columnName: string) =>
    distinctTextProfile(tableName, columnName, planCodes(), 4, 10);

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [
      { table: tableRef('stg_plans'), rowCount: ROW_COUNT },
      { table: tableRef('mart_account_segments'), rowCount: ROW_COUNT },
      { table: tableRef('stg_plan_segment_mapping'), rowCount: ROW_COUNT },
    ],
    warnings: [],
    columns: {
      'stg_plans.plan_code': planCodeProfile('stg_plans', 'plan_code'),
      'stg_plans.created_at': distinctTextProfile(
        'stg_plans',
        'created_at',
        ['2026-05-01', '2026-05-02', '2026-05-03', '2026-05-04'],
        10,
        10,
      ),
      'stg_plans.email': distinctTextProfile(
        'stg_plans',
        'email',
        ['a@example.test', 'b@example.test', 'c@example.test', 'd@example.test'],
        14,
        14,
      ),
      'stg_plans.is_deleted': distinctTextProfile(
        'stg_plans',
        'is_deleted',
        ['deleted-a', 'deleted-b', 'deleted-c', 'deleted-d'],
        9,
        9,
      ),
      'mart_account_segments.current_plan_code': planCodeProfile('mart_account_segments', 'current_plan_code'),
      'mart_account_segments.normalized_plan_code': planCodeProfile('mart_account_segments', 'normalized_plan_code'),
      'stg_plan_segment_mapping.canonical_plan_code': planCodeProfile(
        'stg_plan_segment_mapping',
        'canonical_plan_code',
      ),
      'stg_plans.canonical_plan_code': planCodeProfile('stg_plans', 'canonical_plan_code'),
    },
  };
}
|
||||
|
||||
describe('relationship discovery candidates', () => {
|
||||
it('normalizes warehouse prefixes and emits review candidates without declared primary keys', () => {
|
||||
const accounts = table('accounts-id', 'dim_accounts', [
|
||||
column('accounts-id', 'accounts-id-col', 'id', { primaryKey: false }),
|
||||
column('accounts-id', 'accounts-name-col', 'account_name', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const invoices = table('invoices-id', 'fct_invoices', [
|
||||
column('invoices-id', 'invoice-id-col', 'id', { primaryKey: false }),
|
||||
column('invoices-id', 'account-id-col', 'account_id', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([accounts, invoices]));
|
||||
|
||||
expect(candidates).toHaveLength(1);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
from: { tableId: 'invoices-id', columnIds: ['account-id-col'], columns: ['account_id'] },
|
||||
to: { tableId: 'accounts-id', columnIds: ['accounts-id-col'], columns: ['id'] },
|
||||
relationshipType: 'many_to_one',
|
||||
status: 'review',
|
||||
source: 'normalized_table_match',
|
||||
evidence: {
|
||||
sourceColumnBase: 'account',
|
||||
targetTableBase: 'account',
|
||||
targetKeyScore: 0.92,
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.8);
|
||||
expect(candidates[0]?.evidence.signalVector).toMatchObject({
|
||||
nameSimilarity: 0.92,
|
||||
typeCompatibility: 1,
|
||||
valueOverlap: 0,
|
||||
embeddingSimilarity: 0,
|
||||
profileUniqueness: 0.92,
|
||||
});
|
||||
expect(candidates[0]?.evidence.scoreBreakdown?.score).toBe(candidates[0]?.confidence);
|
||||
expect(candidates[0]?.evidence.scoreBreakdown?.contributions.nameSimilarity).toBeGreaterThan(0);
|
||||
expect(candidates[0]?.evidence.reasons).toEqual(
|
||||
expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like']),
|
||||
);
|
||||
});
|
||||
|
||||
it('generates candidates for PascalCase ID columns without declared keys', () => {
|
||||
const artists = table('artist-id', 'Artist', [
|
||||
column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
|
||||
column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const albums = table('album-id', 'Album', [
|
||||
column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
|
||||
column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([artists, albums]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['Album.ArtistId->Artist.ArtistId']);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'normalized_table_match',
|
||||
evidence: {
|
||||
sourceColumnBase: 'artist',
|
||||
targetTableBase: 'artist',
|
||||
targetColumnBase: 'artist_id',
|
||||
targetKeyScore: 0.9,
|
||||
reasons: expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like']),
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.9);
|
||||
});
|
||||
|
||||
it('uses the locality cap before scanning parent tables', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
|
||||
const invoices = table('invoices-id', 'invoices', [
|
||||
column('invoices-id', 'invoice-id-col', 'id'),
|
||||
column('invoices-id', 'account-id-col', 'account_id'),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([accounts, invoices]), {
|
||||
maxCandidateParentTables: 0,
|
||||
});
|
||||
|
||||
expect(candidates).toEqual([]);
|
||||
});
|
||||
|
||||
it('keeps the nearest parent when the locality cap is one', () => {
|
||||
const artists = table('artist-id', 'Artist', [
|
||||
column('artist-id', 'artist-id-col', 'ArtistId', { primaryKey: false }),
|
||||
column('artist-id', 'artist-name-col', 'Name', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const albums = table('album-id', 'Album', [
|
||||
column('album-id', 'album-id-col', 'AlbumId', { primaryKey: false }),
|
||||
column('album-id', 'artist-id-fk-col', 'ArtistId', { primaryKey: false }),
|
||||
]);
|
||||
const fillerTables = Array.from({ length: 25 }, (_, index) =>
|
||||
table(`filler-${index}`, `WarehouseFiller${index}`, [
|
||||
column(`filler-${index}`, `filler-${index}-id`, 'WarehouseFillerId', { primaryKey: false }),
|
||||
]),
|
||||
);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([albums, ...fillerTables, artists]), {
|
||||
maxCandidateParentTables: 1,
|
||||
});
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['Album.ArtistId->Artist.ArtistId']);
|
||||
});
|
||||
|
||||
it('uses final table tokens from dotted parent table names', () => {
|
||||
const customers = table('customer-id', 'SalesLT.Customer', [
|
||||
column('customer-id', 'customer-id-col', 'CustomerID', { primaryKey: false }),
|
||||
column('customer-id', 'customer-name-col', 'CustomerName', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const orders = table('order-id', 'SalesLT.SalesOrderHeader', [
|
||||
column('order-id', 'order-id-col', 'SalesOrderID', { primaryKey: false }),
|
||||
column('order-id', 'customer-id-fk-col', 'CustomerID', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customers, orders]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['SalesLT.SalesOrderHeader.CustomerID->SalesLT.Customer.CustomerID']);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
evidence: {
|
||||
sourceColumnBase: 'customer',
|
||||
targetTableBase: 'sales_lt_customer',
|
||||
targetColumnBase: 'customer_id',
|
||||
targetKeyScore: 0.9,
|
||||
reasons: expect.arrayContaining(['foreign_key_suffix', 'inflection', 'target_key_like']),
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('emits lower-confidence parent-table-name candidates when the target key name differs from the table name', () => {
|
||||
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
|
||||
column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', { primaryKey: true }),
|
||||
column('customer-account-id', 'account-name-col', 'AccountName', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const subscriptions = table('subscriptions-id', 'fct_subscriptions', [
|
||||
column('subscriptions-id', 'subscription-id-col', 'SubscriptionID', { primaryKey: false }),
|
||||
column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customerAccounts, subscriptions]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID']);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'parent_table_name_match',
|
||||
relationshipType: 'many_to_one',
|
||||
status: 'review',
|
||||
evidence: {
|
||||
sourceColumnBase: 'customer_account',
|
||||
targetTableBase: 'crm_customer_account',
|
||||
targetColumnBase: 'business_entity_id',
|
||||
targetKeyScore: 1,
|
||||
nameScore: 0.82,
|
||||
reasons: expect.arrayContaining(['foreign_key_suffix', 'parent_table_name_match', 'target_key_like']),
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.evidence.signalVector).toMatchObject({
|
||||
nameSimilarity: 0.82,
|
||||
typeCompatibility: 1,
|
||||
});
|
||||
expect(candidates[0]?.evidence.scoreBreakdown?.score).toBe(candidates[0]?.confidence);
|
||||
});
|
||||
|
||||
it('does not emit parent-table-name candidates when the target key type is incompatible', () => {
|
||||
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
|
||||
column('customer-account-id', 'business-entity-id-col', 'BusinessEntityID', {
|
||||
primaryKey: true,
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const subscriptions = table('subscriptions-id', 'fct_subscriptions', [
|
||||
column('subscriptions-id', 'customer-account-id-col', 'CustomerAccountID', {
|
||||
primaryKey: false,
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customerAccounts, subscriptions]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).not.toContain('fct_subscriptions.CustomerAccountID->crm.CustomerAccount.BusinessEntityID');
|
||||
});
|
||||
|
||||
it('does not use parent-table-name matching to create same-table same-column self-links', () => {
|
||||
const customerAccounts = table('customer-account-id', 'crm.CustomerAccount', [
|
||||
column('customer-account-id', 'customer-account-id-col', 'CustomerAccountID', { primaryKey: false }),
|
||||
column('customer-account-id', 'account-name-col', 'AccountName', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customerAccounts]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).not.toContain('crm.CustomerAccount.CustomerAccountID->crm.CustomerAccount.CustomerAccountID');
|
||||
});
|
||||
|
||||
it('uses profile evidence to generate natural-key candidates without id-like target names', () => {
|
||||
const countries = table('countries-id', 'dim_countries', [
|
||||
column('countries-id', 'countries-code-col', 'iso_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('countries-id', 'countries-name-col', 'name', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const accounts = table('accounts-id', 'fct_accounts', [
|
||||
column('accounts-id', 'account-id-col', 'id', { primaryKey: false }),
|
||||
column('accounts-id', 'country-code-col', 'country_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const profiles = {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'sqlite',
|
||||
sqlAvailable: true,
|
||||
queryCount: 0,
|
||||
tables: [],
|
||||
warnings: [],
|
||||
columns: {
|
||||
'dim_countries.iso_code': {
|
||||
table: { catalog: null, db: 'public', name: 'dim_countries' },
|
||||
column: 'iso_code',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['DE', 'FR', 'US'],
|
||||
minTextLength: 2,
|
||||
maxTextLength: 2,
|
||||
},
|
||||
'fct_accounts.country_code': {
|
||||
table: { catalog: null, db: 'public', name: 'fct_accounts' },
|
||||
column: 'country_code',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
rowCount: 4,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 0.75,
|
||||
nullRate: 0,
|
||||
sampleValues: ['FR', 'US'],
|
||||
minTextLength: 2,
|
||||
maxTextLength: 2,
|
||||
},
|
||||
},
|
||||
} satisfies KloRelationshipProfileArtifact;
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([countries, accounts]), { profiles });
|
||||
|
||||
expect(candidates).toHaveLength(1);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'profile_match',
|
||||
from: { tableId: 'accounts-id', columnIds: ['country-code-col'], columns: ['country_code'] },
|
||||
to: { tableId: 'countries-id', columnIds: ['countries-code-col'], columns: ['iso_code'] },
|
||||
evidence: {
|
||||
sourceColumnBase: 'country',
|
||||
targetTableBase: 'country',
|
||||
targetColumnBase: 'iso_code',
|
||||
targetKeyScore: 0.86,
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.78);
|
||||
expect(candidates[0]?.evidence.reasons).toEqual(
|
||||
expect.arrayContaining([
|
||||
'foreign_key_code_suffix',
|
||||
'normalized_table_name',
|
||||
'profile_unique_target',
|
||||
'profile_sample_overlap',
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('drops same-table same-column self-links using ordered endpoint equality', () => {
|
||||
const accounts = table('accounts-id', 'stg_accounts', [
|
||||
column('accounts-id', 'accounts-account-id-col', 'account_id', { primaryKey: false }),
|
||||
column('accounts-id', 'accounts-name-col', 'account_name', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([accounts]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).not.toContain('stg_accounts.account_id->stg_accounts.account_id');
|
||||
});
|
||||
|
||||
it('keeps legitimate same-table different-column self-references', () => {
|
||||
const employees = table('employees-id', 'employees', [
|
||||
column('employees-id', 'employees-id-col', 'id', { primaryKey: false }),
|
||||
column('employees-id', 'employees-parent-id-col', 'parent_id', { primaryKey: false }),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([employees]));
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toContain('employees.parent_id->employees.id');
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'self_reference',
|
||||
evidence: {
|
||||
reasons: expect.arrayContaining(['self_reference']),
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('emits column_suffix_match candidates for relationship-key-shaped trailing target columns', () => {
|
||||
const plans = table('plans-id', 'stg_plans', [
|
||||
column('plans-id', 'plans-plan-code-col', 'plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('plans-id', 'plans-canonical-plan-code-col', 'canonical_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('plans-id', 'plans-created-at-col', 'created_at', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('plans-id', 'plans-email-col', 'email', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('plans-id', 'plans-is-deleted-col', 'is_deleted', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const accountSegments = table('account-segments-id', 'mart_account_segments', [
|
||||
column('account-segments-id', 'current-plan-code-col', 'current_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('account-segments-id', 'normalized-plan-code-col', 'normalized_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('account-segments-id', 'source-created-at-col', 'source_created_at', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('account-segments-id', 'billing-email-col', 'billing_email', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
column('account-segments-id', 'source-is-deleted-col', 'source_is_deleted', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const mapping = table('mapping-id', 'stg_plan_segment_mapping', [
|
||||
column('mapping-id', 'mapping-canonical-plan-code-col', 'canonical_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([plans, accountSegments, mapping]), {
|
||||
profiles: planCodeProfiles(),
|
||||
});
|
||||
const candidateKeys = candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
);
|
||||
|
||||
expect(candidateKeys).toEqual([
|
||||
'mart_account_segments.current_plan_code->stg_plans.plan_code',
|
||||
'mart_account_segments.normalized_plan_code->stg_plans.plan_code',
|
||||
'stg_plan_segment_mapping.canonical_plan_code->stg_plans.plan_code',
|
||||
'stg_plans.canonical_plan_code->stg_plans.plan_code',
|
||||
]);
|
||||
expect(candidates).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
source: 'column_suffix_match',
|
||||
confidence: expect.any(Number),
|
||||
evidence: expect.objectContaining({
|
||||
nameScore: 0.78,
|
||||
targetKeyScore: 0.86,
|
||||
reasons: expect.arrayContaining(['column_suffix_match', 'profile_unique_target']),
|
||||
}),
|
||||
}),
|
||||
]),
|
||||
);
|
||||
expect(candidateKeys).not.toContain('mart_account_segments.source_created_at->stg_plans.created_at');
|
||||
expect(candidateKeys).not.toContain('mart_account_segments.billing_email->stg_plans.email');
|
||||
expect(candidateKeys).not.toContain('mart_account_segments.source_is_deleted->stg_plans.is_deleted');
|
||||
const suffixCandidate = candidates.find(
|
||||
(candidate) => candidate.from.table.name === 'mart_account_segments' && candidate.from.columns[0] === 'current_plan_code',
|
||||
);
|
||||
expect(suffixCandidate?.confidence).toBe(suffixCandidate?.evidence.scoreBreakdown?.score);
|
||||
expect(suffixCandidate?.evidence.signalVector).toMatchObject({
|
||||
nameSimilarity: 0.78,
|
||||
typeCompatibility: 1,
|
||||
valueOverlap: 1,
|
||||
profileUniqueness: 1,
|
||||
profileNullRate: 1,
|
||||
});
|
||||
});
|
||||
|
||||
it('does not suffix-match bare single-token targets or incompatible target types', () => {
|
||||
const users = table('users-id', 'users', [
|
||||
column('users-id', 'users-id-col', 'id', { primaryKey: false }),
|
||||
column('users-id', 'users-account-id-col', 'account_id', { primaryKey: false }),
|
||||
]);
|
||||
const plans = table('plans-id', 'plans', [
|
||||
column('plans-id', 'plans-plan-code-col', 'plan_code', {
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
}),
|
||||
]);
|
||||
const accounts = table('accounts-id', 'accounts', [
|
||||
column('accounts-id', 'current-plan-code-col', 'current_plan_code', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
}),
|
||||
]);
|
||||
const profiles = {
|
||||
...planCodeProfiles(),
|
||||
columns: {
|
||||
...planCodeProfiles().columns,
|
||||
'users.id': {
|
||||
table: { catalog: null, db: 'public', name: 'users' },
|
||||
column: 'id',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 2,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['1', '2'],
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
},
|
||||
'plans.plan_code': {
|
||||
table: { catalog: null, db: 'public', name: 'plans' },
|
||||
column: 'plan_code',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 2,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['1', '2'],
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
},
|
||||
},
|
||||
} satisfies KloRelationshipProfileArtifact;
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([users, plans, accounts]), { profiles });
|
||||
const candidateKeys = candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
);
|
||||
|
||||
expect(candidateKeys).not.toContain('users.account_id->users.id');
|
||||
expect(candidateKeys).not.toContain('accounts.current_plan_code->plans.plan_code');
|
||||
});
|
||||
|
||||
it('uses column embeddings as a recall source for non-standard source names', () => {
|
||||
const customers = table('customers-id', 'customers', [
|
||||
column('customers-id', 'customers-id-col', 'id', {
|
||||
primaryKey: false,
|
||||
embedding: [1, 0, 0],
|
||||
}),
|
||||
column('customers-id', 'customers-name-col', 'name', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
embedding: [0, 1, 0],
|
||||
}),
|
||||
]);
|
||||
const orders = table('orders-id', 'orders', [
|
||||
column('orders-id', 'orders-id-col', 'id', {
|
||||
primaryKey: false,
|
||||
embedding: [0, 0, 1],
|
||||
}),
|
||||
column('orders-id', 'buyer-ref-col', 'buyer_ref', {
|
||||
primaryKey: false,
|
||||
embedding: [0.995, 0.005, 0],
|
||||
}),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([customers, orders]), {
|
||||
embeddingSimilarityThreshold: 0.95,
|
||||
});
|
||||
|
||||
expect(candidates).toHaveLength(1);
|
||||
expect(candidates[0]).toMatchObject({
|
||||
source: 'embedding_similarity',
|
||||
from: { tableId: 'orders-id', columnIds: ['buyer-ref-col'], columns: ['buyer_ref'] },
|
||||
to: { tableId: 'customers-id', columnIds: ['customers-id-col'], columns: ['id'] },
|
||||
relationshipType: 'many_to_one',
|
||||
status: 'review',
|
||||
evidence: {
|
||||
sourceColumnBase: 'buyer_ref',
|
||||
targetTableBase: 'customer',
|
||||
targetColumnBase: 'id',
|
||||
targetKeyScore: 0.92,
|
||||
embeddingSimilarity: expect.any(Number),
|
||||
},
|
||||
});
|
||||
expect(candidates[0]?.confidence).toBeGreaterThanOrEqual(0.9);
|
||||
expect(candidates[0]?.evidence.reasons).toEqual(
|
||||
expect.arrayContaining(['embedding_similarity', 'target_key_like']),
|
||||
);
|
||||
});
|
||||
|
||||
it('singularizes names and caps candidates per source column deterministically', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
|
||||
const archivedAccounts = table('archived-accounts-id', 'accounts_archive', [
|
||||
column('archived-accounts-id', 'archived-accounts-id-col', 'id'),
|
||||
]);
|
||||
const events = table('events-id', 'product_events', [
|
||||
column('events-id', 'event-id-col', 'id'),
|
||||
column('events-id', 'account-id-col', 'account_id'),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([events, archivedAccounts, accounts]), {
|
||||
maxCandidatesPerColumn: 1,
|
||||
});
|
||||
|
||||
expect(
|
||||
candidates.map(
|
||||
(candidate) =>
|
||||
`${candidate.from.table.name}.${candidate.from.columns[0]}->${candidate.to.table.name}.${candidate.to.columns[0]}`,
|
||||
),
|
||||
).toEqual(['product_events.account_id->accounts.id']);
|
||||
});
|
||||
|
||||
it('infers target primary-key candidates from incoming review links', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
|
||||
const users = table('users-id', 'users', [column('users-id', 'users-id-col', 'id')]);
|
||||
const events = table('events-id', 'product_events', [
|
||||
column('events-id', 'event-id-col', 'id'),
|
||||
column('events-id', 'account-id-col', 'account_id'),
|
||||
column('events-id', 'user-id-col', 'user_id'),
|
||||
]);
|
||||
|
||||
const candidates = generateKloRelationshipDiscoveryCandidates(schema([accounts, users, events]));
|
||||
const inferredPks = inferKloRelationshipTargetPks(candidates);
|
||||
|
||||
expect(inferredPks).toEqual([
|
||||
{
|
||||
table: 'accounts',
|
||||
columns: ['id'],
|
||||
score: expect.any(Number),
|
||||
status: 'review',
|
||||
incomingCandidateCount: 1,
|
||||
},
|
||||
{
|
||||
table: 'users',
|
||||
columns: ['id'],
|
||||
score: expect.any(Number),
|
||||
status: 'review',
|
||||
incomingCandidateCount: 1,
|
||||
},
|
||||
]);
|
||||
expect(inferredPks.every((pk) => pk.score >= 0.8)).toBe(true);
|
||||
});
|
||||
|
||||
it('does not generate candidates from primary-key source columns or incompatible target types', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [
|
||||
column('accounts-id', 'accounts-id-col', 'id', { nativeType: 'TEXT', normalizedType: 'text' }),
|
||||
]);
|
||||
const invoices = table('invoices-id', 'invoices', [
|
||||
column('invoices-id', 'invoice-id-col', 'id', { primaryKey: true }),
|
||||
column('invoices-id', 'account-id-col', 'account_id', { nativeType: 'INTEGER', normalizedType: 'integer' }),
|
||||
]);
|
||||
|
||||
expect(generateKloRelationshipDiscoveryCandidates(schema([accounts, invoices]))).toEqual([]);
|
||||
});
|
||||
|
||||
it('normalizes layer prefixes, punctuation, plural forms, and non-plural trailing s words', () => {
|
||||
expect(normalizeKloRelationshipName('mart__Sales_Accounts')).toMatchObject({
|
||||
normalized: 'sales_accounts',
|
||||
singular: 'sales_account',
|
||||
tokens: ['sales', 'accounts'],
|
||||
});
|
||||
expect(normalizeKloRelationshipName('dim_users')).toMatchObject({
|
||||
normalized: 'users',
|
||||
singular: 'user',
|
||||
tokens: ['users'],
|
||||
});
|
||||
expect(normalizeKloRelationshipName('Address')).toMatchObject({
|
||||
normalized: 'address',
|
||||
singular: 'address',
|
||||
plural: 'addresses',
|
||||
tokens: ['address'],
|
||||
});
|
||||
});
|
||||
|
||||
it('merges duplicate deterministic and LLM proposal candidates without losing LLM rationale', () => {
|
||||
const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'accounts-id-col', 'id')]);
|
||||
const invoices = table('invoices-id', 'invoices', [column('invoices-id', 'account-id-col', 'account_id')]);
|
||||
const [deterministic] = generateKloRelationshipDiscoveryCandidates(schema([accounts, invoices]));
|
||||
if (!deterministic) {
|
||||
throw new Error('Expected deterministic relationship candidate');
|
||||
}
|
||||
const llmCandidate = {
|
||||
...deterministic,
|
||||
confidence: 0.99,
|
||||
source: 'llm_proposal' as const,
|
||||
evidence: {
|
||||
...deterministic.evidence,
|
||||
reasons: ['llm_proposal', 'llm_pk_proposal'],
|
||||
llmConfidence: 0.89,
|
||||
llmRationale: 'Invoices point at the owning account dimension.',
|
||||
},
|
||||
};
|
||||
|
||||
const merged = mergeKloRelationshipDiscoveryCandidates([deterministic, llmCandidate]);
|
||||
|
||||
expect(merged).toHaveLength(1);
|
||||
expect(merged[0]).toMatchObject({
|
||||
id: deterministic.id,
|
||||
source: 'normalized_table_match',
|
||||
confidence: 0.99,
|
||||
evidence: {
|
||||
llmConfidence: 0.89,
|
||||
llmRationale: 'Invoices point at the owning account dimension.',
|
||||
},
|
||||
});
|
||||
expect(merged[0]?.evidence.reasons).toEqual(
|
||||
expect.arrayContaining(['foreign_key_suffix', 'normalized_table_name', 'target_key_like', 'llm_proposal']),
|
||||
);
|
||||
});
|
||||
});
|
||||
756
packages/context/src/scan/relationship-candidates.ts
Normal file
756
packages/context/src/scan/relationship-candidates.ts
Normal file
|
|
@ -0,0 +1,756 @@
|
|||
import type {
|
||||
KloEnrichedColumn,
|
||||
KloEnrichedSchema,
|
||||
KloEnrichedTable,
|
||||
KloRelationshipEndpoint,
|
||||
KloRelationshipType,
|
||||
} from './enrichment-types.js';
|
||||
import { localCandidateTables } from './relationship-locality.js';
|
||||
import {
|
||||
normalizeKloRelationshipName,
|
||||
pluralizeKloRelationshipToken,
|
||||
singularizeKloRelationshipToken,
|
||||
} from './relationship-name-similarity.js';
|
||||
export type { KloRelationshipNormalizedName } from './relationship-name-similarity.js';
|
||||
export { normalizeKloRelationshipName } from './relationship-name-similarity.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import {
|
||||
scoreKloRelationshipCandidate,
|
||||
type KloRelationshipScoreBreakdown,
|
||||
type KloRelationshipSignalVector,
|
||||
} from './relationship-scoring.js';
|
||||
|
||||
/**
 * Label describing which discovery strategy produced a relationship candidate.
 *
 * `sourceForEvidence` in this file derives every value except `'llm_proposal'`
 * from a candidate's evidence reasons; `'llm_proposal'` is checked explicitly
 * when duplicate candidates are merged.
 */
export type KloRelationshipDiscoveryCandidateSource =
|
||||
| 'exact_column_match'
|
||||
| 'normalized_table_match'
|
||||
| 'parent_table_name_match'
|
||||
| 'inflection'
|
||||
| 'self_reference'
|
||||
| 'profile_match'
|
||||
| 'column_suffix_match'
|
||||
| 'embedding_similarity'
|
||||
| 'llm_proposal';
|
||||
|
||||
/** Lifecycle status of a discovery candidate; `'review'` is the only value declared here. */
export type KloRelationshipDiscoveryCandidateStatus = 'review';
|
||||
|
||||
/**
 * Supporting evidence recorded alongside a relationship candidate.
 *
 * NOTE(review): the exact meaning of the `*Base` fields is set where candidates
 * are created, which is only partly visible here — confirm against `createCandidate`.
 */
export interface KloRelationshipDiscoveryCandidateEvidence {
|
||||
sourceColumnBase: string;
|
||||
targetTableBase: string;
|
||||
targetColumnBase: string;
|
||||
/** Key-likeness score of the target column (see `targetKeyEvidence`). */
targetKeyScore: number;
|
||||
nameScore: number;
|
||||
/** Reason tags such as `'target_key_like'` or `'profile_unique_target'`; `sourceForEvidence` maps these to a source label. */
reasons: string[];
|
||||
signalVector?: KloRelationshipSignalVector;
|
||||
scoreBreakdown?: KloRelationshipScoreBreakdown;
|
||||
embeddingSimilarity?: number;
|
||||
/** LLM-supplied confidence; carried over from either side when duplicates are merged. */
llmConfidence?: number;
|
||||
/** LLM-supplied rationale; carried over from either side when duplicates are merged. */
llmRationale?: string;
|
||||
}
|
||||
|
||||
/**
 * A proposed relationship between a source endpoint (`from`) and a target
 * endpoint (`to`), together with its confidence and supporting evidence.
 */
export interface KloRelationshipDiscoveryCandidate {
|
||||
id: string;
|
||||
/** Referencing (child) side of the relationship. */
from: KloRelationshipEndpoint;
|
||||
/** Referenced (parent) side of the relationship. */
to: KloRelationshipEndpoint;
|
||||
relationshipType: KloRelationshipType;
|
||||
/** When duplicates are merged, the higher of the two confidences is kept. */
confidence: number;
|
||||
source: KloRelationshipDiscoveryCandidateSource;
|
||||
status: KloRelationshipDiscoveryCandidateStatus;
|
||||
evidence: KloRelationshipDiscoveryCandidateEvidence;
|
||||
}
|
||||
|
||||
/**
 * Tuning knobs for relationship candidate generation. All fields are optional.
 *
 * NOTE(review): only the default for `maxCandidateParentTables` is visible in
 * this part of the file; defaults for the other options should be confirmed
 * against the generator.
 */
export interface KloRelationshipDiscoveryCandidateOptions {
|
||||
maxCandidatesPerColumn?: number;
|
||||
/** Upper bound on parent tables considered per source column; defaults to 20, and a value <= 0 yields no parent tables. */
maxCandidateParentTables?: number;
|
||||
maxEmbeddingCandidatesPerColumn?: number;
|
||||
minConfidence?: number;
|
||||
embeddingSimilarityThreshold?: number;
|
||||
useEmbeddings?: boolean;
|
||||
/** Optional column/table profile artifact used for uniqueness, null-rate and sample-overlap signals. */
profiles?: KloRelationshipProfileArtifact;
|
||||
}
|
||||
|
||||
/**
 * A primary-key guess for a target table, inferred from the relationship
 * candidates that point at it.
 */
export interface KloRelationshipInferredTargetPk {
|
||||
/** Name of the target table. */
table: string;
|
||||
/** Column names forming the inferred key. */
columns: string[];
|
||||
score: number;
|
||||
status: 'review';
|
||||
/** Number of candidates pointing at this table/columns pair. */
incomingCandidateCount: number;
|
||||
}
|
||||
|
||||
/** Result of parsing a source column name as a foreign-key-like reference (see `sourceColumnReference`). */
interface KloRelationshipSourceColumnReference {
|
||||
/** Column name with its reference suffix removed. */
base: string;
|
||||
/** Reason tag for the matched suffix, e.g. `'foreign_key_suffix'`. */
reason: string;
|
||||
}
|
||||
|
||||
/** Key-likeness verdict for a target column (see `targetKeyEvidence`); a score of 0 comes with no reasons. */
interface KloRelationshipTargetKeyEvidence {
|
||||
score: number;
|
||||
reasons: string[];
|
||||
}
|
||||
|
||||
/** Normalized type names that are treated as mutually compatible integer types. */
const INTEGER_TYPES = new Set([
  'integer',
  'int',
  'bigint',
  'smallint',
  'tinyint',
  'int4',
  'int8',
  'number',
]);

/** Normalized type names that are treated as mutually compatible string types. */
const STRING_TYPES = new Set([
  'text',
  'varchar',
  'character varying',
  'char',
  'character',
  'string',
]);

/** Normalized type names that are treated as mutually compatible UUID types. */
const UUID_TYPES = new Set(['uuid', 'uniqueidentifier']);

/** Column names that are always considered as possible references back into their own table. */
const SELF_REFERENCE_NAMES = new Set([
  'parent_id',
  'manager_id',
  'reported_to_id',
  'supervisor_id',
  'reports_to_id',
]);

/** Source-column suffixes that mark a reference, in the order they are tried, with the reason tag each one yields. */
const REFERENCE_SUFFIXES: Array<{ suffix: string; reason: string }> = [
  { suffix: '_id', reason: 'foreign_key_suffix' },
  { suffix: '_key', reason: 'foreign_key_key_suffix' },
  { suffix: '_code', reason: 'foreign_key_code_suffix' },
  { suffix: '_uuid', reason: 'foreign_key_uuid_suffix' },
];

/** Suffixes a target column must end with to count as relationship-key-shaped. */
const RELATIONSHIP_KEY_TARGET_SUFFIXES = ['_id', '_key', '_code', '_uuid'] as const;
|
||||
|
||||
/**
 * Reports whether a column's name is shaped like a relationship key: its
 * normalized name has at least two tokens and ends with one of
 * `RELATIONSHIP_KEY_TARGET_SUFFIXES`.
 */
function isRelationshipKeyShapedTarget(column: KloEnrichedColumn): boolean {
  const { tokens, normalized: name } = normalizeKloRelationshipName(column.name);
  if (tokens.length < 2) {
    return false;
  }
  for (const suffix of RELATIONSHIP_KEY_TARGET_SUFFIXES) {
    if (name.endsWith(suffix)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Reports whether the normalized source column name ends with
 * `_<normalized target column name>`. Identical names and empty target
 * names never match.
 */
function columnSuffixMatchesTarget(input: { fromColumn: KloEnrichedColumn; toColumn: KloEnrichedColumn }): boolean {
  const sourceName = normalizeKloRelationshipName(input.fromColumn.name).normalized;
  const targetName = normalizeKloRelationshipName(input.toColumn.name).normalized;
  if (targetName.length === 0 || sourceName === targetName) {
    return false;
  }
  return sourceName.endsWith(`_${targetName}`);
}
|
||||
|
||||
/**
 * Produces a lower-cased base type name for a column, preferring
 * `normalizedType` and falling back to `nativeType`.
 *
 * Any parenthesised length/precision suffix is removed. The remainder is
 * trimmed again so that spellings such as `VARCHAR (255)` or `numeric (10, 2)`
 * yield `varchar` / `numeric` rather than a name with trailing whitespace,
 * which would miss every type-family lookup and equality check.
 *
 * @param column - Column whose type should be normalized.
 * @returns The base type name, or an empty string when no type is recorded.
 */
function normalizeType(column: KloEnrichedColumn): string {
  const rawType = (column.normalizedType || column.nativeType || '').toLowerCase().trim();
  const parenIndex = rawType.indexOf('(');
  if (parenIndex === -1) {
    return rawType;
  }
  // Trim after cutting: whitespace between the name and "(" must not survive.
  return rawType.slice(0, parenIndex).trim();
}
|
||||
|
||||
/**
 * Reports whether two columns have join-compatible types: either their
 * normalized type names are identical, or both fall into the same family
 * (integer, string, or UUID).
 */
function typesCompatible(left: KloEnrichedColumn, right: KloEnrichedColumn): boolean {
  const a = normalizeType(left);
  const b = normalizeType(right);
  if (a === b) {
    return true;
  }
  const families = [INTEGER_TYPES, STRING_TYPES, UUID_TYPES];
  return families.some((family) => family.has(a) && family.has(b));
}
|
||||
|
||||
/**
 * Computes the cosine similarity of two vectors.
 *
 * Returns 0 when either vector is missing or empty, when the lengths differ,
 * or when either vector has zero magnitude.
 */
function cosineSimilarity(left: readonly number[] | null, right: readonly number[] | null): number {
  if (!left || !right) {
    return 0;
  }
  const size = left.length;
  if (size === 0 || size !== right.length) {
    return 0;
  }

  let dotProduct = 0;
  let leftSquares = 0;
  let rightSquares = 0;
  let position = 0;
  while (position < size) {
    const a = left[position] ?? 0;
    const b = right[position] ?? 0;
    dotProduct += a * b;
    leftSquares += a * a;
    rightSquares += b * b;
    position += 1;
  }

  // A zero-magnitude vector has no direction, so similarity is defined as 0.
  if (leftSquares === 0 || rightSquares === 0) {
    return 0;
  }
  return dotProduct / (Math.sqrt(leftSquares) * Math.sqrt(rightSquares));
}
|
||||
|
||||
/** Reports whether the column carries a non-empty embedding array. */
function hasUsableEmbedding(column: KloEnrichedColumn): boolean {
  const { embedding } = column;
  if (!Array.isArray(embedding)) {
    return false;
  }
  return embedding.length > 0;
}
|
||||
|
||||
/**
 * Interprets a column name as a reference to another entity.
 *
 * Names listed in `SELF_REFERENCE_NAMES` are handled first: the trailing `_id`
 * is dropped and the reason is `'foreign_key_suffix'`. Otherwise each entry of
 * `REFERENCE_SUFFIXES` is tried in order; the first suffix that leaves a base
 * longer than one character wins, and that base is singularized.
 *
 * @returns The reference base and reason tag, or `null` when the name does not
 * look like a reference.
 */
function sourceColumnReference(column: KloEnrichedColumn): KloRelationshipSourceColumnReference | null {
  const name = normalizeKloRelationshipName(column.name).normalized;
  if (SELF_REFERENCE_NAMES.has(name)) {
    return { base: name.replace(/_id$/u, ''), reason: 'foreign_key_suffix' };
  }

  for (const { suffix, reason } of REFERENCE_SUFFIXES) {
    if (name.endsWith(suffix)) {
      const base = name.slice(0, name.length - suffix.length);
      if (base.length > 1) {
        return { base: singularizeKloRelationshipToken(base), reason };
      }
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Adds the normalized, singular and plural forms of `name` to `aliases`,
 * skipping any form that is empty. Mutates `aliases` in place.
 */
function addNormalizedTableAlias(aliases: Set<string>, name: string): void {
  const { normalized, singular, plural } = normalizeKloRelationshipName(name);
  for (const form of [normalized, singular, plural]) {
    if (form.length > 0) {
      aliases.add(form);
    }
  }
}
|
||||
|
||||
/**
 * Builds the set of names a table may be referred to by: the normalized,
 * singular and plural forms of its name and, for multi-token names, the last
 * token together with that token's singular and plural forms.
 */
function tableAliases(table: KloEnrichedTable): Set<string> {
  const { normalized, singular, plural, tokens } = normalizeKloRelationshipName(table.ref.name);
  const aliases = new Set([normalized, singular, plural]);

  const trailingToken = tokens.length > 1 ? tokens[tokens.length - 1] : undefined;
  if (trailingToken) {
    const trailingSingular = singularizeKloRelationshipToken(trailingToken);
    aliases.add(trailingToken);
    aliases.add(trailingSingular);
    aliases.add(pluralizeKloRelationshipToken(trailingSingular));
  }
  return aliases;
}
|
||||
|
||||
/**
 * Returns the last run of letters/digits in the table name, splitting on any
 * other character. Falls back to the full name when no such run exists.
 */
function finalTableNamePart(table: KloEnrichedTable): string {
  const fullName = table.ref.name;
  const segments = fullName.split(/[^\p{L}\p{N}]+/u).filter((segment) => segment.length > 0);
  const lastSegment = segments.pop();
  return lastSegment ?? fullName;
}
|
||||
|
||||
/**
 * Returns the table's aliases extended with the normalized forms of the final
 * letter/digit segment of its name.
 */
function parentTableNameAliases(table: KloEnrichedTable): Set<string> {
  const combined = tableAliases(table);
  const lastPart = finalTableNamePart(table);
  addNormalizedTableAlias(combined, lastPart);
  return combined;
}
|
||||
|
||||
/**
 * Scores how key-like a column is from its declaration and name alone.
 *
 * Declared primary key: 1. Named `id`: 0.92. Named `<table alias>_id`: 0.9.
 * Named `<table alias>_key`: 0.82. Named `key` or `uuid`: 0.74. Otherwise 0.
 */
function targetKeyScore(table: KloEnrichedTable, column: KloEnrichedColumn): number {
  const name = normalizeKloRelationshipName(column.name).normalized;
  if (column.primaryKey) {
    return 1;
  }
  if (name === 'id') {
    return 0.92;
  }

  const aliases = parentTableNameAliases(table);
  // True when the name is exactly "<alias><suffix>" for some alias of the table.
  const isAliasWithSuffix = (suffix: string): boolean =>
    name.endsWith(suffix) && aliases.has(name.slice(0, name.length - suffix.length));

  if (isAliasWithSuffix('_id')) {
    return 0.9;
  }
  if (isAliasWithSuffix('_key')) {
    return 0.82;
  }
  return name === 'key' || name === 'uuid' ? 0.74 : 0;
}
|
||||
|
||||
/**
 * Looks up a column profile by the `<tableName>.<columnName>` key.
 *
 * @returns The profile entry, or `null` when no profiles were supplied or the
 * key is absent.
 */
function profileColumn(
  profiles: KloRelationshipProfileArtifact | undefined,
  tableName: string,
  columnName: string,
) {
  if (!profiles) {
    return null;
  }
  const key = `${tableName}.${columnName}`;
  return profiles.columns[key] ?? null;
}
|
||||
|
||||
/**
 * Returns the fraction of the source column's sample values that also appear
 * (case-insensitively) among the target column's sample values.
 *
 * Returns 0 when either profile is missing or either sample list is empty.
 */
function profileSampleOverlap(input: {
  profiles: KloRelationshipProfileArtifact | undefined;
  fromTable: KloEnrichedTable;
  fromColumn: KloEnrichedColumn;
  toTable: KloEnrichedTable;
  toColumn: KloEnrichedColumn;
}): number {
  const sourceProfile = profileColumn(input.profiles, input.fromTable.ref.name, input.fromColumn.name);
  const targetProfile = profileColumn(input.profiles, input.toTable.ref.name, input.toColumn.name);
  if (!sourceProfile || !targetProfile) {
    return 0;
  }
  if (sourceProfile.sampleValues.length === 0 || targetProfile.sampleValues.length === 0) {
    return 0;
  }

  const targetLookup = new Set(targetProfile.sampleValues.map((value) => value.toLowerCase()));
  let matched = 0;
  for (const value of sourceProfile.sampleValues) {
    if (targetLookup.has(value.toLowerCase())) {
      matched += 1;
    }
  }
  return matched / sourceProfile.sampleValues.length;
}
|
||||
|
||||
/**
 * Returns the profiled row count of the first table entry whose name equals
 * `tableName`, or `null` when profiles, the table, or its row count is missing.
 */
function tableProfileRowCount(profiles: KloRelationshipProfileArtifact | undefined, tableName: string): number | null {
  if (!profiles) {
    return null;
  }
  const entry = profiles.tables.find((candidate) => candidate.table.name === tableName);
  return entry?.rowCount ?? null;
}
|
||||
|
||||
/**
 * Returns a prior for how plausible a link between two tables is, based on
 * their profiled row counts.
 *
 * Same table: 0.72. Row counts unknown or non-positive: 0.5. Target/source
 * row ratio within [0.05, 20]: 0.7. Otherwise 0.4.
 */
function structuralPriorScore(input: {
  profiles: KloRelationshipProfileArtifact | undefined;
  fromTable: KloEnrichedTable;
  toTable: KloEnrichedTable;
}): number {
  const { profiles, fromTable, toTable } = input;
  if (fromTable.id === toTable.id) {
    return 0.72;
  }

  const childRows = tableProfileRowCount(profiles, fromTable.ref.name);
  const parentRows = tableProfileRowCount(profiles, toTable.ref.name);
  if (childRows === null || parentRows === null) {
    return 0.5;
  }
  if (childRows <= 0 || parentRows <= 0) {
    return 0.5;
  }

  const ratio = parentRows / childRows;
  return ratio >= 0.05 && ratio <= 20 ? 0.7 : 0.4;
}
|
||||
|
||||
/**
 * Assembles the signal vector for one source/target column pair.
 *
 * Name score, value overlap and embedding similarity (0 when absent) are
 * passed through. Target uniqueness comes from the target column's profile,
 * falling back to the supplied target-key score. `profileNullRate` holds the
 * source column's non-null fraction, or 0.5 when the source has no profile.
 */
function candidateSignalVector(input: {
  profiles: KloRelationshipProfileArtifact | undefined;
  fromTable: KloEnrichedTable;
  fromColumn: KloEnrichedColumn;
  toTable: KloEnrichedTable;
  toColumn: KloEnrichedColumn;
  targetKeyScore: number;
  nameScore: number;
  valueOverlap: number;
  embeddingSimilarity?: number;
}): KloRelationshipSignalVector {
  const { profiles, fromTable, fromColumn, toTable, toColumn } = input;
  const childProfile = profileColumn(profiles, fromTable.ref.name, fromColumn.name);
  const parentProfile = profileColumn(profiles, toTable.ref.name, toColumn.name);
  const typeCompatibility = typesCompatible(fromColumn, toColumn) ? 1 : 0;
  const structuralPrior = structuralPriorScore({ profiles, fromTable, toTable });

  return {
    nameSimilarity: input.nameScore,
    typeCompatibility,
    valueOverlap: input.valueOverlap,
    embeddingSimilarity: input.embeddingSimilarity ?? 0,
    profileUniqueness: parentProfile?.uniquenessRatio ?? input.targetKeyScore,
    profileNullRate: childProfile ? 1 - childProfile.nullRate : 0.5,
    structuralPrior,
  };
}
|
||||
|
||||
/**
 * Selects the parent tables to consider for one source column.
 *
 * The ranked list comes from `localCandidateTables`, limited by
 * `options.maxCandidateParentTables` (default 20; a limit <= 0 yields an empty
 * list). When the column is one of `SELF_REFERENCE_NAMES` and its own table
 * did not make the ranked list, the own table is placed first and the ranked
 * list is cut to leave room for it.
 */
function candidateParentTables(input: {
  tables: readonly KloEnrichedTable[];
  fromTable: KloEnrichedTable;
  fromColumn: KloEnrichedColumn;
  options: KloRelationshipDiscoveryCandidateOptions;
}): KloEnrichedTable[] {
  const { tables, fromTable, fromColumn, options } = input;
  const limit = options.maxCandidateParentTables ?? 20;
  if (limit <= 0) {
    return [];
  }

  const ranked = localCandidateTables({
    childTable: fromTable,
    childColumn: fromColumn,
    parentTables: tables,
    maxParentTables: limit,
  }).map(({ table }) => table);

  const columnName = normalizeKloRelationshipName(fromColumn.name).normalized;
  const isSelfReferenceColumn = SELF_REFERENCE_NAMES.has(columnName);
  const ownTableAlreadyRanked = ranked.some((table) => table.id === fromTable.id);
  if (!isSelfReferenceColumn || ownTableAlreadyRanked) {
    return ranked;
  }

  // The own table is known to be absent from `ranked` here, so only the cut-off is needed.
  const remainingSlots = Math.max(0, limit - 1);
  return [fromTable, ...ranked.slice(0, remainingSlots)];
}
|
||||
|
||||
/**
 * Decides how key-like a target column is, using name/declaration rules first
 * and profile statistics as a fallback.
 *
 * A positive `targetKeyScore` is returned with reason `'target_key_like'`.
 * Otherwise the column needs a profile with uniqueness ratio >= 0.98 and null
 * rate <= 0.05; it then scores 0.86 when named `code`/`key` or ending in
 * `_code`/`_key`, and 0.78 otherwise, with reason `'profile_unique_target'`.
 * Anything else scores 0 with no reasons.
 */
function targetKeyEvidence(
  table: KloEnrichedTable,
  column: KloEnrichedColumn,
  profiles: KloRelationshipProfileArtifact | undefined,
): KloRelationshipTargetKeyEvidence {
  const nameBasedScore = targetKeyScore(table, column);
  if (nameBasedScore > 0) {
    return { score: nameBasedScore, reasons: ['target_key_like'] };
  }

  const stats = profileColumn(profiles, table.ref.name, column.name);
  if (!stats || stats.uniquenessRatio < 0.98 || stats.nullRate > 0.05) {
    return { score: 0, reasons: [] };
  }

  const name = normalizeKloRelationshipName(column.name).normalized;
  const namedLikeCodeOrKey = ['code', 'key'].some((stem) => name === stem || name.endsWith(`_${stem}`));
  return { score: namedLikeCodeOrKey ? 0.86 : 0.78, reasons: ['profile_unique_target'] };
}
|
||||
|
||||
/** Builds a single-column relationship endpoint from a table and one of its columns. */
function endpoint(table: KloEnrichedTable, column: KloEnrichedColumn): KloRelationshipEndpoint {
  const { id: tableId, ref } = table;
  const { id: columnId, name: columnName } = column;
  return { tableId, columnIds: [columnId], table: ref, columns: [columnName] };
}
|
||||
|
||||
/**
 * Builds the identifier for a relationship in the form
 * `<fromTableId>:(<fromColumnIds>)-><toTableId>:(<toColumnIds>)`, with column
 * ids joined by commas.
 */
function relationshipId(from: KloRelationshipEndpoint, to: KloRelationshipEndpoint): string {
  const fromPart = `${from.tableId}:(${from.columnIds.join(',')})`;
  const toPart = `${to.tableId}:(${to.columnIds.join(',')})`;
  return `${fromPart}->${toPart}`;
}
|
||||
|
||||
/**
 * Reports whether two endpoints list the same column ids and the same column
 * names in the same order. Endpoints with differing lengths never match.
 */
function endpointsHaveSameOrderedColumns(left: KloRelationshipEndpoint, right: KloRelationshipEndpoint): boolean {
  if (left.columnIds.length !== right.columnIds.length) {
    return false;
  }
  if (left.columns.length !== right.columns.length) {
    return false;
  }
  const hasMismatch = left.columnIds.some(
    (columnId, position) => columnId !== right.columnIds[position] || left.columns[position] !== right.columns[position],
  );
  return !hasMismatch;
}
|
||||
|
||||
/**
 * Reports whether a candidate links a table to itself through exactly the same
 * ordered columns on both sides.
 */
function isDegenerateSameColumnSelfLink(candidate: Pick<KloRelationshipDiscoveryCandidate, 'from' | 'to'>): boolean {
  const { from, to } = candidate;
  if (from.tableId !== to.tableId) {
    return false;
  }
  return endpointsHaveSameOrderedColumns(from, to);
}
|
||||
|
||||
/**
 * Returns the first column name of an endpoint.
 *
 * @throws Error when the endpoint has no first column (or it is an empty string).
 */
function singleRelationshipColumn(endpointValue: KloRelationshipEndpoint): string {
  const [firstColumn] = endpointValue.columns;
  if (firstColumn) {
    return firstColumn;
  }
  throw new Error(`Expected relationship endpoint ${endpointValue.table.name} to contain one column`);
}
|
||||
|
||||
/**
 * Builds the `<fromTable>.<fromColumn>-><toTable>.<toColumn>` key for a
 * candidate, using each endpoint's first column.
 */
function candidateSortKey(candidate: KloRelationshipDiscoveryCandidate): string {
  const describeEndpoint = (value: KloRelationshipEndpoint): string =>
    `${value.table.name}.${singleRelationshipColumn(value)}`;
  const fromPart = describeEndpoint(candidate.from);
  const toPart = describeEndpoint(candidate.to);
  return `${fromPart}->${toPart}`;
}
|
||||
|
||||
/**
 * Returns the distinct reasons in first-seen order, dropping entries that are
 * empty or whitespace-only. Kept entries are not trimmed.
 */
function uniqueReasons(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value.trim().length > 0) {
      seen.add(value);
    }
  }
  return [...seen];
}
|
||||
|
||||
/**
 * Merges two candidates that describe the same relationship.
 *
 * `left` is kept as the base unless it is an LLM proposal and `right` has a
 * strictly higher confidence, in which case `right` becomes the base. The
 * result takes the higher confidence, fills `llmConfidence`/`llmRationale`
 * from the other candidate when the base lacks them, and unions the reasons
 * (base reasons first).
 */
function mergeCandidateEvidence(
  left: KloRelationshipDiscoveryCandidate,
  right: KloRelationshipDiscoveryCandidate,
): KloRelationshipDiscoveryCandidate {
  const rightReplacesLlmLeft = left.source === 'llm_proposal' && right.confidence > left.confidence;
  const [base, other] = rightReplacesLlmLeft ? [right, left] : [left, right];

  const reasons = uniqueReasons([...base.evidence.reasons, ...other.evidence.reasons]);
  const evidence = {
    ...base.evidence,
    llmConfidence: base.evidence.llmConfidence ?? other.evidence.llmConfidence,
    llmRationale: base.evidence.llmRationale ?? other.evidence.llmRationale,
    reasons,
  };

  return {
    ...base,
    confidence: Math.max(left.confidence, right.confidence),
    evidence,
  };
}
|
||||
|
||||
/**
 * Maps a list of evidence reasons to the single candidate source label.
 *
 * Rules are checked in precedence order and the first rule with any matching
 * reason wins. When nothing matches, the source is `normalized_table_match`.
 */
function sourceForEvidence(reasons: string[]): KloRelationshipDiscoveryCandidateSource {
  const precedence: ReadonlyArray<readonly [readonly string[], KloRelationshipDiscoveryCandidateSource]> = [
    [['self_reference'], 'self_reference'],
    [['embedding_similarity'], 'embedding_similarity'],
    [['column_suffix_match'], 'column_suffix_match'],
    [['parent_table_name_match'], 'parent_table_name_match'],
    [['profile_sample_overlap', 'profile_unique_target'], 'profile_match'],
    [['normalized_table_name'], 'normalized_table_match'],
    [['exact_column_name'], 'exact_column_match'],
    [['inflection'], 'inflection'],
  ];

  for (const [triggers, source] of precedence) {
    if (triggers.some((trigger) => reasons.includes(trigger))) {
      return source;
    }
  }
  return 'normalized_table_match';
}
|
||||
|
||||
/**
 * Assembles a single-column relationship candidate from already-computed
 * matcher inputs.
 *
 * The candidate's confidence is the score produced by
 * `scoreKloRelationshipCandidate` over the signal vector; its source label is
 * derived from `reasons`. Every candidate is created as `many_to_one` with
 * status `review`. `embeddingSimilarity` is recorded (rounded to 3 decimals)
 * only when it was supplied.
 */
function createCandidate(input: {
  fromTable: KloEnrichedTable;
  fromColumn: KloEnrichedColumn;
  toTable: KloEnrichedTable;
  toColumn: KloEnrichedColumn;
  sourceBase: string;
  targetBase: string;
  targetKeyScore: number;
  nameScore: number;
  reasons: string[];
  profiles: KloRelationshipProfileArtifact | undefined;
  valueOverlap: number;
  embeddingSimilarity?: number;
}): KloRelationshipDiscoveryCandidate {
  const {
    fromTable,
    fromColumn,
    toTable,
    toColumn,
    profiles,
    targetKeyScore,
    nameScore,
    valueOverlap,
    embeddingSimilarity,
    reasons,
  } = input;

  const from = endpoint(fromTable, fromColumn);
  const to = endpoint(toTable, toColumn);
  const signalVector = candidateSignalVector({
    profiles,
    fromTable,
    fromColumn,
    toTable,
    toColumn,
    targetKeyScore,
    nameScore,
    valueOverlap,
    embeddingSimilarity,
  });
  const scoreBreakdown = scoreKloRelationshipCandidate(signalVector);

  const id = relationshipId(from, to);
  const source = sourceForEvidence(reasons);
  const evidence: KloRelationshipDiscoveryCandidate['evidence'] = {
    sourceColumnBase: input.sourceBase,
    targetTableBase: input.targetBase,
    targetColumnBase: normalizeKloRelationshipName(toColumn.name).normalized,
    targetKeyScore,
    nameScore,
    reasons,
    signalVector,
    scoreBreakdown,
  };
  if (embeddingSimilarity !== undefined) {
    // Appended last so the key order matches the evidence layout used elsewhere.
    evidence.embeddingSimilarity = Number(embeddingSimilarity.toFixed(3));
  }

  return {
    id,
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: scoreBreakdown.score,
    source,
    status: 'review',
    evidence,
  };
}
|
||||
|
||||
/**
 * Proposes relationship candidates from column-embedding similarity.
 *
 * For every non-primary-key column with a usable embedding, each column of
 * every other candidate parent table is considered when it has a usable
 * embedding, a compatible type, non-zero target-key evidence, and a cosine
 * similarity at or above the threshold (default 0.92). Candidates below the
 * minimum confidence (default 0.72) are dropped, and at most
 * `maxEmbeddingCandidatesPerColumn` (falling back to `maxCandidatesPerColumn`,
 * then 25) are kept per source column, best first.
 *
 * Returns an empty list when `options.useEmbeddings` is explicitly `false`.
 */
function generateKloEmbeddingRelationshipCandidates(
  schema: KloEnrichedSchema,
  options: KloRelationshipDiscoveryCandidateOptions,
): KloRelationshipDiscoveryCandidate[] {
  if (options.useEmbeddings === false) {
    return [];
  }

  const similarityFloor = options.embeddingSimilarityThreshold ?? 0.92;
  const perColumnLimit = options.maxEmbeddingCandidatesPerColumn ?? options.maxCandidatesPerColumn ?? 25;
  const confidenceFloor = options.minConfidence ?? 0.72;
  const activeTables = schema.tables.filter((table) => table.enabled);

  // Highest confidence first; the sort key breaks ties deterministically.
  const byConfidenceThenKey = (
    left: KloRelationshipDiscoveryCandidate,
    right: KloRelationshipDiscoveryCandidate,
  ): number => {
    const delta = right.confidence - left.confidence;
    return delta || candidateSortKey(left).localeCompare(candidateSortKey(right));
  };

  const found: KloRelationshipDiscoveryCandidate[] = [];

  for (const fromTable of activeTables) {
    for (const fromColumn of fromTable.columns) {
      if (fromColumn.primaryKey || !hasUsableEmbedding(fromColumn)) {
        continue;
      }

      const ranked: KloRelationshipDiscoveryCandidate[] = [];
      const parents = candidateParentTables({ tables: activeTables, fromTable, fromColumn, options });
      for (const toTable of parents) {
        // Embedding matches never link a table to itself.
        if (toTable.id === fromTable.id) {
          continue;
        }

        for (const toColumn of toTable.columns) {
          if (!hasUsableEmbedding(toColumn) || !typesCompatible(fromColumn, toColumn)) {
            continue;
          }

          const keyEvidence = targetKeyEvidence(toTable, toColumn, options.profiles);
          if (keyEvidence.score === 0) {
            continue;
          }

          const similarity = cosineSimilarity(fromColumn.embedding, toColumn.embedding);
          if (similarity < similarityFloor) {
            continue;
          }

          const sourceBase = normalizeKloRelationshipName(fromColumn.name).normalized;
          const targetBase = normalizeKloRelationshipName(toTable.ref.name).singular;
          const valueOverlap = profileSampleOverlap({
            profiles: options.profiles,
            fromTable,
            fromColumn,
            toTable,
            toColumn,
          });
          const candidate = createCandidate({
            fromTable,
            fromColumn,
            toTable,
            toColumn,
            sourceBase,
            targetBase,
            targetKeyScore: keyEvidence.score,
            // The embedding similarity doubles as the name score for these candidates.
            nameScore: similarity,
            reasons: ['embedding_similarity', ...keyEvidence.reasons],
            profiles: options.profiles,
            valueOverlap,
            embeddingSimilarity: similarity,
          });

          if (candidate.confidence >= confidenceFloor && !isDegenerateSameColumnSelfLink(candidate)) {
            ranked.push(candidate);
          }
        }
      }

      ranked.sort(byConfidenceThenKey);
      found.push(...ranked.slice(0, perColumnLimit));
    }
  }

  return found;
}
|
||||
|
||||
/**
 * Generates single-column relationship candidates for a schema.
 *
 * Each non-primary-key column that yields a source reference is compared with
 * the key-like, type-compatible columns of its candidate parent tables. A pair
 * qualifies through exactly one of three matchers:
 *  - strict table match: the source base is a strict alias of another table,
 *    or the column is a recognised self-reference name on its own table;
 *  - parent-table-name match: the source base is a looser parent alias of
 *    another table;
 *  - column-suffix match: only tried when neither table-level matcher applies.
 *
 * Name-based matches whose target key evidence comes from profile uniqueness
 * are dropped when the profiled samples do not overlap at all. Candidates
 * below `minConfidence` (default 0.72) are discarded, at most
 * `maxCandidatesPerColumn` (default 25) are kept per source column, and
 * embedding-based candidates are appended afterwards. Duplicated ids keep the
 * highest-confidence candidate, and the result is sorted by confidence
 * (descending) and then by sort key.
 */
export function generateKloRelationshipDiscoveryCandidates(
  schema: KloEnrichedSchema,
  options: KloRelationshipDiscoveryCandidateOptions = {},
): KloRelationshipDiscoveryCandidate[] {
  const perColumnLimit = options.maxCandidatesPerColumn ?? 25;
  const confidenceFloor = options.minConfidence ?? 0.72;
  const activeTables = schema.tables.filter((table) => table.enabled);

  // Highest confidence first; the sort key breaks ties deterministically.
  const byConfidenceThenKey = (
    left: KloRelationshipDiscoveryCandidate,
    right: KloRelationshipDiscoveryCandidate,
  ): number => {
    const delta = right.confidence - left.confidence;
    return delta || candidateSortKey(left).localeCompare(candidateSortKey(right));
  };

  const collected: KloRelationshipDiscoveryCandidate[] = [];

  for (const fromTable of activeTables) {
    for (const fromColumn of fromTable.columns) {
      if (fromColumn.primaryKey) {
        continue;
      }
      const sourceReference = sourceColumnReference(fromColumn);
      if (!sourceReference) {
        continue;
      }
      const sourceBase = sourceReference.base;

      const ranked: KloRelationshipDiscoveryCandidate[] = [];
      const parents = candidateParentTables({ tables: activeTables, fromTable, fromColumn, options });
      for (const toTable of parents) {
        const strictAliases = tableAliases(toTable);
        const parentAliases = parentTableNameAliases(toTable);
        const targetBase = normalizeKloRelationshipName(toTable.ref.name).singular;
        const sameTable = fromTable.id === toTable.id;
        const nameMatchesTarget = strictAliases.has(sourceBase);
        const parentTableNameMatcher = !sameTable && !nameMatchesTarget && parentAliases.has(sourceBase);
        const selfReference =
          sameTable && SELF_REFERENCE_NAMES.has(normalizeKloRelationshipName(fromColumn.name).normalized);
        const strictTableMatcher = (!sameTable && nameMatchesTarget) || selfReference;
        const tableLevelMatch = strictTableMatcher || parentTableNameMatcher;

        for (const toColumn of toTable.columns) {
          const keyEvidence = targetKeyEvidence(toTable, toColumn, options.profiles);
          if (keyEvidence.score === 0 || !typesCompatible(fromColumn, toColumn)) {
            continue;
          }

          // The suffix matcher is a fallback: it is only evaluated when no table-level matcher fired.
          const suffixMatcher =
            !tableLevelMatch &&
            columnSuffixMatchesTarget({ fromColumn, toColumn }) &&
            isRelationshipKeyShapedTarget(toColumn);
          if (!tableLevelMatch && !suffixMatcher) {
            continue;
          }

          const overlap = profileSampleOverlap({
            profiles: options.profiles,
            fromTable,
            fromColumn,
            toTable,
            toColumn,
          });
          // A name-only match onto a profile-derived unique column needs at least some sample overlap.
          if (tableLevelMatch && keyEvidence.reasons.includes('profile_unique_target') && overlap === 0) {
            continue;
          }

          const leadReason = suffixMatcher ? 'column_suffix_match' : sourceReference.reason;
          const reasons = [leadReason, ...keyEvidence.reasons];
          if (overlap > 0) {
            reasons.push('profile_sample_overlap');
          }

          let nameScore = suffixMatcher ? 0.78 : 0.88;
          if (parentTableNameMatcher) {
            reasons.push('parent_table_name_match');
            nameScore = 0.82;
          } else if (selfReference) {
            reasons.push('self_reference');
            nameScore = 0.82;
          } else if (!suffixMatcher && targetBase === sourceBase) {
            reasons.push('normalized_table_name');
            nameScore = 0.92;
          } else if (!suffixMatcher && nameMatchesTarget) {
            reasons.push('inflection');
            nameScore = 0.88;
          }

          const sameNormalizedColumnName =
            !suffixMatcher &&
            !parentTableNameMatcher &&
            normalizeKloRelationshipName(fromColumn.name).normalized ===
              normalizeKloRelationshipName(toColumn.name).normalized;
          if (sameNormalizedColumnName) {
            reasons.push('exact_column_name');
            nameScore = Math.max(nameScore, 0.9);
          }

          const candidate = createCandidate({
            fromTable,
            fromColumn,
            toTable,
            toColumn,
            sourceBase,
            targetBase,
            targetKeyScore: keyEvidence.score,
            nameScore,
            reasons,
            profiles: options.profiles,
            valueOverlap: overlap,
          });
          if (candidate.confidence >= confidenceFloor && !isDegenerateSameColumnSelfLink(candidate)) {
            ranked.push(candidate);
          }
        }
      }

      ranked.sort(byConfidenceThenKey);
      collected.push(...ranked.slice(0, perColumnLimit));
    }
  }

  collected.push(...generateKloEmbeddingRelationshipCandidates(schema, options));

  // Collapse duplicates by id, keeping the first candidate with the highest confidence.
  const bestById = new Map<string, KloRelationshipDiscoveryCandidate>();
  for (const candidate of collected) {
    const current = bestById.get(candidate.id);
    if (!current || candidate.confidence > current.confidence) {
      bestById.set(candidate.id, candidate);
    }
  }

  const deduplicated = Array.from(bestById.values());
  deduplicated.sort(byConfidenceThenKey);
  return deduplicated;
}
|
||||
|
||||
/**
 * Folds candidates that share an id into one entry via
 * `mergeCandidateEvidence`, then returns the merged set ordered by sort key.
 */
export function mergeKloRelationshipDiscoveryCandidates(
  candidates: readonly KloRelationshipDiscoveryCandidate[],
): KloRelationshipDiscoveryCandidate[] {
  const merged = new Map<string, KloRelationshipDiscoveryCandidate>();
  for (const candidate of candidates) {
    const prior = merged.get(candidate.id);
    merged.set(candidate.id, prior ? mergeCandidateEvidence(prior, candidate) : candidate);
  }

  const ordered = [...merged.values()];
  ordered.sort((left, right) => candidateSortKey(left).localeCompare(candidateSortKey(right)));
  return ordered;
}
|
||||
|
||||
/**
 * Infers likely primary-key columns from the targets of relationship candidates.
 *
 * Candidates are grouped by `table.column` of their `to` endpoint. Each group
 * yields one `review` entry whose score is the best incoming confidence,
 * capped at 0.95 and rounded to 3 decimals, together with the number of
 * incoming candidates. The result is ordered by table name, then column name.
 *
 * The best score is tracked as a running maximum rather than spreading a
 * per-target score array into `Math.max`, so a very large candidate list
 * cannot exceed the engine's call-argument limit.
 *
 * @throws Error when a candidate's target endpoint has no column
 *   (propagated from `singleRelationshipColumn`).
 */
export function inferKloRelationshipTargetPks(
  candidates: readonly KloRelationshipDiscoveryCandidate[],
): KloRelationshipInferredTargetPk[] {
  const incoming = new Map<string, { table: string; column: string; bestScore: number; count: number }>();
  for (const candidate of candidates) {
    const toColumn = singleRelationshipColumn(candidate.to);
    const key = `${candidate.to.table.name}.${toColumn}`;
    const item = incoming.get(key);
    if (item) {
      item.bestScore = Math.max(item.bestScore, candidate.confidence);
      item.count += 1;
    } else {
      incoming.set(key, {
        table: candidate.to.table.name,
        column: toColumn,
        bestScore: candidate.confidence,
        count: 1,
      });
    }
  }

  return Array.from(incoming.values())
    .sort((left, right) => left.table.localeCompare(right.table) || left.column.localeCompare(right.column))
    .map((item) => ({
      table: item.table,
      columns: [item.column],
      score: Number(Math.min(0.95, item.bestScore).toFixed(3)),
      status: 'review' as const,
      incomingCandidateCount: item.count,
    }));
}
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { snapshotToKloEnrichedSchema } from './local-enrichment.js';
|
||||
import { loadKloRelationshipBenchmarkFixture, maskKloRelationshipBenchmarkSnapshot } from './relationship-benchmarks.js';
|
||||
import { discoverKloCompositeRelationships } from './relationship-composite-candidates.js';
|
||||
import { profileKloRelationshipSchema, type KloRelationshipReadOnlyExecutor } from './relationship-profiling.js';
|
||||
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanContext } from './types.js';
|
||||
|
||||
/**
 * Minimal read-only executor for tests, backed by a SQLite file opened with
 * better-sqlite3 in read-only mode.
 */
class TestSqliteExecutor implements KloRelationshipReadOnlyExecutor {
  private readonly db: Database.Database;

  constructor(dataPath: string) {
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /**
   * Runs `input.sql` and converts the object rows into header/row arrays.
   * Headers are taken from the keys of the first row, so an empty result set
   * produces no headers.
   */
  async executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const headers = Object.keys(records[0] ?? {});
    const rows = records.map((record) => headers.map((header) => record[header]));
    const count = records.length;
    return { headers, rows, totalRows: count, rowCount: count };
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
describe('composite relationship discovery detector', () => {
  it('infers composite primary keys and validates composite foreign keys from row evidence', async () => {
    const fixtureRoot = new URL('../../test/fixtures/relationship-benchmarks/', import.meta.url);
    const fixture = await loadKloRelationshipBenchmarkFixture(
      join(fixtureRoot.pathname, 'composite_keys_no_declared_constraints'),
    );
    // Remove declared PKs and FKs so discovery has to rely on row evidence alone.
    const snapshot = maskKloRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
    const schema = snapshotToKloEnrichedSchema(snapshot, new Map());
    const executor = new TestSqliteExecutor(fixture.dataPath ?? '');

    // Close the SQLite handle even when profiling, discovery, or an assertion throws.
    try {
      const profiles = await profileKloRelationshipSchema({
        connectionId: snapshot.connectionId,
        driver: snapshot.driver,
        schema,
        executor,
        ctx: { runId: 'test:composite-profile' },
      });

      const result = await discoverKloCompositeRelationships({
        connectionId: snapshot.connectionId,
        driver: snapshot.driver,
        schema,
        profiles,
        executor,
        ctx: { runId: 'test:composite-detect' },
      });

      expect(result.primaryKeys.map((item) => `${item.table.name}.(${item.columns.join(',')})`)).toEqual([
        'order_line_allocations.(order_id,line_number,warehouse_code)',
        'order_lines.(order_id,line_number)',
      ]);
      expect(
        result.relationships.map(
          (item) =>
            `${item.from.table.name}.(${item.from.columns.join(',')})->${item.to.table.name}.(${item.to.columns.join(',')})`,
        ),
      ).toEqual(['order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)']);
      expect(result.relationships[0]).toMatchObject({
        relationshipType: 'many_to_one',
        status: 'accepted',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          reasons: ['composite_validation_passed'],
        },
      });
      expect(result.queryCount).toBeGreaterThan(0);
    } finally {
      executor.close();
    }
  });
});
|
||||
622
packages/context/src/scan/relationship-composite-candidates.ts
Normal file
622
packages/context/src/scan/relationship-composite-candidates.ts
Normal file
|
|
@ -0,0 +1,622 @@
|
|||
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable, KloRelationshipType } from './enrichment-types.js';
|
||||
import {
|
||||
formatKloRelationshipTableRef,
|
||||
quoteKloRelationshipIdentifier,
|
||||
type KloRelationshipProfileArtifact,
|
||||
type KloRelationshipReadOnlyExecutor,
|
||||
} from './relationship-profiling.js';
|
||||
import type { KloConnectionDriver, KloQueryResult, KloScanContext, KloTableRef } from './types.js';
|
||||
|
||||
/** Review state attached to composite primary-key and relationship candidates. */
export type KloCompositeRelationshipStatus = 'accepted' | 'review' | 'rejected';
|
||||
|
||||
/**
 * One side of a composite relationship: a table plus an ordered tuple of its
 * columns. `columnIds` and `columns` are parallel arrays in the same order.
 */
export interface KloCompositeRelationshipTupleEndpoint {
  /** Id of the enriched table. */
  tableId: string;
  /** Ids of the tuple's columns, in tuple order. */
  columnIds: string[];
  /** Reference to the table. */
  table: KloTableRef;
  /** Names of the tuple's columns, in tuple order. */
  columns: string[];
}
|
||||
|
||||
/** A column tuple proposed as a composite primary key for a table. */
export interface KloCompositePrimaryKeyCandidate {
  /** Tuple key of the form `table.(col1,col2,...)`. */
  id: string;
  tableId: string;
  table: KloTableRef;
  /** Column names of the key, in tuple order. */
  columns: string[];
  /** Column ids, parallel to `columns`. */
  columnIds: string[];
  /** Score derived from tuple uniqueness and null rate. */
  score: number;
  status: KloCompositeRelationshipStatus;
  evidence: {
    /** Profiled row count of the table. */
    rowCount: number;
    /** Number of distinct, fully non-null tuples found in the table. */
    distinctCount: number;
    /** `distinctCount / rowCount`. */
    uniquenessRatio: number;
    /** Highest profiled null rate among the key's columns. */
    nullRate: number;
    reasons: string[];
  };
}
|
||||
|
||||
/**
 * Row-level evidence gathered while validating a composite relationship.
 * The distinct/overlap/violation counts are read from the coverage query.
 */
export interface KloCompositeRelationshipValidationEvidence {
  targetUniqueness: number;
  /** Share of distinct child tuples that have a matching parent tuple. */
  sourceCoverage: number;
  /** Distinct child tuples with no matching parent tuple. */
  violationCount: number;
  violationRatio: number;
  /** Distinct non-null child tuples inspected (the query caps this set). */
  childDistinct: number;
  /** Distinct non-null parent tuples. */
  parentDistinct: number;
  /** Distinct child tuples that matched a parent tuple. */
  overlap: number;
  reasons: string[];
}
|
||||
|
||||
/** A multi-column relationship proposed between two column tuples. */
export interface KloCompositeRelationshipCandidate {
  id: string;
  /** Child (referencing) tuple. */
  from: KloCompositeRelationshipTupleEndpoint;
  /** Parent (referenced) tuple. */
  to: KloCompositeRelationshipTupleEndpoint;
  relationshipType: KloRelationshipType;
  confidence: number;
  status: KloCompositeRelationshipStatus;
  /** Composite candidates always carry this single source label. */
  source: 'composite_profile_match';
  validation: KloCompositeRelationshipValidationEvidence;
}
|
||||
|
||||
/**
 * Input for composite relationship discovery.
 *
 * NOTE(review): the optional tuning fields presumably fall back to the
 * module's `DEFAULT_*` constants; confirm against the discovery entry point.
 */
export interface DiscoverKloCompositeRelationshipsInput {
  connectionId: string;
  /** Driver used to pick SQL dialect details (quoting, TOP vs LIMIT). */
  driver: KloConnectionDriver;
  schema: KloEnrichedSchema;
  /** Table and column profiles (row counts, null rates, uniqueness). */
  profiles: KloRelationshipProfileArtifact;
  /** Read-only query executor; may be `null`. */
  executor: KloRelationshipReadOnlyExecutor | null;
  ctx: KloScanContext;
  /** Largest tuple width to try. */
  maxCompositeWidth?: number;
  /** Cap on key-like columns considered per table. */
  maxColumnsPerTable?: number;
  /** Minimum distinct-tuple ratio for a tuple to count as a primary key. */
  minPrimaryKeyUniqueness?: number;
  /** Minimum source coverage for a relationship to be accepted. */
  minSourceCoverage?: number;
  /** Maximum violation ratio for a relationship to be accepted. */
  maxViolationRatio?: number;
}
|
||||
|
||||
/** Output of composite relationship discovery. */
export interface DiscoverKloCompositeRelationshipsResult {
  /** Composite primary-key candidates that were detected. */
  primaryKeys: KloCompositePrimaryKeyCandidate[];
  /** Composite relationship candidates that were validated. */
  relationships: KloCompositeRelationshipCandidate[];
  /** Number of read-only queries executed. */
  queryCount: number;
  warnings: string[];
}
|
||||
|
||||
/** Name fragments that mark a column (or a table-name part) as key-like. */
const KEY_NAME_PARTS = new Set(['id', 'key', 'code', 'number', 'num', 'line', 'warehouse', 'account', 'order']);
/** Default widest column tuple to try. */
const DEFAULT_MAX_COMPOSITE_WIDTH = 3;
/** Default cap on key-like columns considered per table. */
const DEFAULT_MAX_COLUMNS_PER_TABLE = 8;
/** Default minimum uniqueness ratio for primary keys and accepted relationship targets. */
const DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS = 0.98;
/** Default minimum source coverage for an accepted relationship. */
const DEFAULT_MIN_SOURCE_COVERAGE = 0.9;
/** Default maximum violation ratio for an accepted relationship. */
const DEFAULT_MAX_VIOLATION_RATIO = 0.01;
|
||||
|
||||
/** Returns the schema's tables whose `enabled` flag is truthy, in original order. */
function enabledTables(schema: KloEnrichedSchema): KloEnrichedTable[] {
  const active: KloEnrichedTable[] = [];
  for (const table of schema.tables) {
    if (table.enabled) {
      active.push(table);
    }
  }
  return active;
}
|
||||
|
||||
/**
 * Looks up the profiled row count of the first table profile with the given
 * name. Returns 0 when no profile matches or the match has no row count.
 */
function tableRowCount(profiles: KloRelationshipProfileArtifact, tableName: string): number {
  for (const item of profiles.tables) {
    if (item.table.name === tableName) {
      return item.rowCount ?? 0;
    }
  }
  return 0;
}
|
||||
|
||||
/** Builds the `table.column` key used to index column profiles. */
function profileKey(tableName: string, columnName: string): string {
  return [tableName, columnName].join('.');
}
|
||||
|
||||
/**
 * Returns the profiled null rate of a column. An unprofiled column is treated
 * as fully null (rate 1).
 */
function profileNullRate(profiles: KloRelationshipProfileArtifact, tableName: string, columnName: string): number {
  const profile = profiles.columns[profileKey(tableName, columnName)];
  return profile?.nullRate ?? 1;
}
|
||||
|
||||
/**
 * Lower-cases a name, collapses each run of characters outside `a-z0-9` into
 * one underscore, and strips leading and trailing underscores.
 */
function normalizedColumnName(name: string): string {
  const lowered = name.toLowerCase();
  const underscored = lowered.replace(/[^a-z0-9]+/gu, '_');
  return underscored.replace(/^_+|_+$/gu, '');
}
|
||||
|
||||
/**
 * Scores a column name as key-like: 1 when any underscore-separated part of
 * the normalized name is in `KEY_NAME_PARTS`, otherwise 0.
 */
function columnNameScore(column: KloEnrichedColumn): number {
  const parts = normalizedColumnName(column.name).split('_').filter(Boolean);
  const looksLikeKey = parts.some((part) => KEY_NAME_PARTS.has(part));
  return looksLikeKey ? 1 : 0;
}
|
||||
|
||||
/** Splits a name into its non-empty, normalized, underscore-separated parts. */
function nameParts(name: string): string[] {
  const normalized = normalizedColumnName(name);
  return normalized.split('_').filter(Boolean);
}
|
||||
|
||||
/** Returns the parts of a table name that are listed in `KEY_NAME_PARTS`. */
function keyLikeTableNameParts(tableName: string): Set<string> {
  const keyLike = new Set<string>();
  for (const part of nameParts(tableName)) {
    if (KEY_NAME_PARTS.has(part)) {
      keyLike.add(part);
    }
  }
  return keyLike;
}
|
||||
|
||||
/**
 * Checks that every key-like part of the table name also occurs as a name
 * part of at least one column in the tuple. A table name with no key-like
 * parts imposes no requirement.
 */
function tupleCoversTableNameKeyParts(tableName: string, columns: readonly KloEnrichedColumn[]): boolean {
  const required = keyLikeTableNameParts(tableName);
  if (required.size === 0) {
    return true;
  }

  const available = new Set<string>();
  for (const column of columns) {
    for (const part of nameParts(column.name)) {
      available.add(part);
    }
  }

  for (const part of required) {
    if (!available.has(part)) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Selects the columns of a table that may take part in a composite key.
 *
 * A column qualifies when it is not a time or boolean dimension, has a
 * profile with a null rate of at most 0.02, and has a key-like name. The
 * result is ordered by name score (descending), then by original column
 * position, and truncated to `maxColumnsPerTable`.
 */
function candidateKeyColumns(input: {
  table: KloEnrichedTable;
  profiles: KloRelationshipProfileArtifact;
  maxColumnsPerTable: number;
}): KloEnrichedColumn[] {
  const eligible: Array<{ column: KloEnrichedColumn; index: number }> = [];

  input.table.columns.forEach((column, index) => {
    if (column.dimensionType === 'time' || column.dimensionType === 'boolean') {
      return;
    }
    const profile = input.profiles.columns[profileKey(input.table.ref.name, column.name)];
    if (profile && profile.nullRate <= 0.02 && columnNameScore(column) > 0) {
      eligible.push({ column, index });
    }
  });

  eligible.sort(
    (left, right) => columnNameScore(right.column) - columnNameScore(left.column) || left.index - right.index,
  );

  return eligible.slice(0, input.maxColumnsPerTable).map((entry) => entry.column);
}
|
||||
|
||||
/**
 * Reports whether the table already has a convincing single-column key: a
 * key-like-named column that is neither a time nor a boolean dimension, whose
 * profile shows a null rate of at most 0.02 and a uniqueness ratio of at
 * least `minPrimaryKeyUniqueness`.
 */
function hasStrongSingleColumnKey(input: {
  table: KloEnrichedTable;
  profiles: KloRelationshipProfileArtifact;
  minPrimaryKeyUniqueness: number;
}): boolean {
  for (const column of input.table.columns) {
    if (column.dimensionType === 'time' || column.dimensionType === 'boolean' || columnNameScore(column) === 0) {
      continue;
    }
    const profile = input.profiles.columns[profileKey(input.table.ref.name, column.name)];
    if (profile && profile.nullRate <= 0.02 && profile.uniquenessRatio >= input.minPrimaryKeyUniqueness) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Enumerates all `width`-element combinations of `values`, preserving the
 * input order within each combination and ordering the combinations by their
 * leading element.
 *
 * A non-positive width yields one empty combination; a width larger than the
 * input yields none.
 */
function combinations<T>(values: readonly T[], width: number): T[][] {
  if (width <= 0) {
    return [[]];
  }
  if (values.length < width) {
    return [];
  }

  const picks: T[][] = [];
  for (const [position, head] of values.entries()) {
    const tails = combinations(values.slice(position + 1), width - 1);
    for (const tail of tails) {
      picks.push([head, ...tail]);
    }
  }
  return picks;
}
|
||||
|
||||
/** Formats a table and column tuple as `table.(col1,col2,...)`. */
function tupleKey(tableName: string, columns: readonly string[]): string {
  const columnList = columns.join(',');
  return `${tableName}.(${columnList})`;
}
|
||||
|
||||
/** Formats a composite relationship as `<from tuple key>-><to tuple key>`. */
function relationshipKey(input: {
  fromTable: string;
  fromColumns: readonly string[];
  toTable: string;
  toColumns: readonly string[];
}): string {
  const source = tupleKey(input.fromTable, input.fromColumns);
  const target = tupleKey(input.toTable, input.toColumns);
  return `${source}->${target}`;
}
|
||||
|
||||
/**
 * Builds a tuple endpoint for `table` from an ordered list of its columns,
 * keeping column ids and names as parallel arrays.
 */
function tupleEndpoint(table: KloEnrichedTable, columns: readonly KloEnrichedColumn[]): KloCompositeRelationshipTupleEndpoint {
  const columnIds: string[] = [];
  const columnNames: string[] = [];
  for (const column of columns) {
    columnIds.push(column.id);
    columnNames.push(column.name);
  }
  return { tableId: table.id, columnIds, table: table.ref, columns: columnNames };
}
|
||||
|
||||
/** Returns the first row of a query result, or an empty array when there is none. */
function row(result: KloQueryResult): unknown[] {
  const [first] = result.rows;
  return first ?? [];
}
|
||||
|
||||
/**
 * Reads a numeric cell from the first row of a query result.
 *
 * The column is located by case-insensitive header match. Numbers are
 * returned as-is, and bigints and non-blank strings are converted with
 * `Number`. The result is 0 when the header or row is missing, when the cell
 * holds another type (for example `null` from an empty aggregate), or when a
 * string does not parse to a finite number. The last case keeps `NaN` out of
 * the ratio comparisons that consume these values.
 */
function numberAt(result: KloQueryResult, header: string): number {
  const wanted = header.toLowerCase();
  const index = result.headers.findIndex((candidate) => candidate.toLowerCase() === wanted);
  if (index < 0) {
    return 0;
  }

  const value = row(result)[index];
  if (typeof value === 'number') {
    return value;
  }
  if (typeof value === 'bigint') {
    return Number(value);
  }
  if (typeof value === 'string' && value.trim() !== '') {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : 0;
  }
  return 0;
}
|
||||
|
||||
/**
 * Returns the ` TOP (n)` row-limit fragment for SQL Server and an empty
 * string for every other driver. `n` is floored and clamped to at least 1.
 */
function topSql(driver: KloConnectionDriver, limit: number): string {
  if (driver !== 'sqlserver') {
    return '';
  }
  const rows = Math.max(1, Math.floor(limit));
  return ` TOP (${rows})`;
}
|
||||
|
||||
/**
 * Returns the ` LIMIT n` row-limit fragment for every driver except SQL
 * Server, which gets an empty string. `n` is floored and clamped to at
 * least 1.
 */
function limitSql(driver: KloConnectionDriver, limit: number): string {
  if (driver !== 'sqlserver') {
    const rows = Math.max(1, Math.floor(limit));
    return ` LIMIT ${rows}`;
  }
  return '';
}
|
||||
|
||||
/**
 * Builds a select list that aliases each quoted column positionally as
 * `c0`, `c1`, ... so tuples from different tables can be joined by position.
 */
function aliasedTupleSelect(driver: KloConnectionDriver, columns: readonly string[]): string {
  const selections: string[] = [];
  columns.forEach((column, index) => {
    selections.push(`${quoteKloRelationshipIdentifier(driver, column)} AS c${index}`);
  });
  return selections.join(', ');
}
|
||||
|
||||
/** Builds a predicate requiring every listed column to be non-null. */
function nonNullPredicate(driver: KloConnectionDriver, columns: readonly string[]): string {
  const checks = columns.map((column) => {
    const quoted = quoteKloRelationshipIdentifier(driver, column);
    return `${quoted} IS NOT NULL`;
  });
  return checks.join(' AND ');
}
|
||||
|
||||
/**
 * Builds the join condition equating the positional aliases `c0..c(n-1)` of
 * `child_values` and `parent_values`.
 */
function tupleEquality(columns: number): string {
  const clauseAt = (_: unknown, position: number): string =>
    `child_values.c${position} = parent_values.c${position}`;
  const clauses = Array.from({ length: columns }, clauseAt);
  return clauses.join(' AND ');
}
|
||||
|
||||
/**
 * Builds a query that counts the distinct, fully non-null value tuples of the
 * given columns in a table. The count is returned as `distinct_count`.
 */
function buildTupleDistinctSql(input: {
  driver: KloConnectionDriver;
  table: KloTableRef;
  columns: readonly string[];
}): string {
  const { driver, table, columns } = input;
  const tableSql = formatKloRelationshipTableRef(driver, table);
  const selectList = aliasedTupleSelect(driver, columns);
  const filter = nonNullPredicate(driver, columns);

  const statement: string[] = [];
  statement.push('WITH tuple_values AS (');
  statement.push(`SELECT DISTINCT ${selectList} FROM ${tableSql}`);
  statement.push(`WHERE ${filter}`);
  statement.push(')');
  statement.push('SELECT COUNT(*) AS distinct_count FROM tuple_values');
  return statement.join(' ');
}
|
||||
|
||||
/**
 * Builds the coverage query for a composite relationship.
 *
 * The query takes up to `maxDistinctSourceValues` distinct non-null child
 * tuples, left-joins them positionally onto the distinct non-null parent
 * tuples, and returns `child_distinct`, `parent_distinct`, `overlap` (child
 * tuples with a parent match) and `violation_count` (child tuples without).
 * The row cap is expressed with TOP on SQL Server and LIMIT elsewhere.
 */
function buildCompositeCoverageSql(input: {
  driver: KloConnectionDriver;
  childTable: KloTableRef;
  childColumns: readonly string[];
  parentTable: KloTableRef;
  parentColumns: readonly string[];
  maxDistinctSourceValues: number;
}): string {
  const { driver, childColumns, parentColumns, maxDistinctSourceValues } = input;
  const childTableSql = formatKloRelationshipTableRef(driver, input.childTable);
  const parentTableSql = formatKloRelationshipTableRef(driver, input.parentTable);
  const top = topSql(driver, maxDistinctSourceValues);
  const limit = limitSql(driver, maxDistinctSourceValues);

  const childCte = [
    `SELECT DISTINCT${top} ${aliasedTupleSelect(driver, childColumns)} FROM ${childTableSql}`,
    `WHERE ${nonNullPredicate(driver, childColumns)}${limit}`,
  ];
  const parentCte = [
    `SELECT DISTINCT ${aliasedTupleSelect(driver, parentColumns)} FROM ${parentTableSql}`,
    `WHERE ${nonNullPredicate(driver, parentColumns)}`,
  ];
  // After the LEFT JOIN, a NULL parent c0 marks a child tuple with no parent match.
  const summary = [
    'SELECT',
    '(SELECT COUNT(*) FROM child_values) AS child_distinct,',
    '(SELECT COUNT(*) FROM parent_values) AS parent_distinct,',
    'SUM(CASE WHEN parent_values.c0 IS NOT NULL THEN 1 ELSE 0 END) AS overlap,',
    'SUM(CASE WHEN parent_values.c0 IS NULL THEN 1 ELSE 0 END) AS violation_count',
    'FROM child_values',
    `LEFT JOIN parent_values ON ${tupleEquality(childColumns.length)}`,
  ];

  return ['WITH child_values AS (', ...childCte, '), parent_values AS (', ...parentCte, ')', ...summary].join(' ');
}
|
||||
|
||||
/**
 * Classifies a validated composite relationship.
 *
 * - `accepted`: target uniqueness, source coverage and violation ratio all
 *   meet their thresholds.
 * - `review`: not accepted, but source coverage is at least 0.55.
 * - `rejected`: everything else.
 *
 * `minTargetUniqueness` is optional and defaults to
 * `DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS`. It lets a caller that runs with a
 * custom primary-key uniqueness threshold apply the same threshold here
 * instead of the hard-wired default.
 */
function relationshipStatus(input: {
  targetUniqueness: number;
  sourceCoverage: number;
  violationRatio: number;
  minSourceCoverage: number;
  maxViolationRatio: number;
  minTargetUniqueness?: number;
}): KloCompositeRelationshipStatus {
  const minTargetUniqueness = input.minTargetUniqueness ?? DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS;
  const meetsAcceptanceThresholds =
    input.targetUniqueness >= minTargetUniqueness &&
    input.sourceCoverage >= input.minSourceCoverage &&
    input.violationRatio <= input.maxViolationRatio;
  if (meetsAcceptanceThresholds) {
    return 'accepted';
  }
  // Partial coverage is kept for human review rather than discarded outright.
  if (input.sourceCoverage >= 0.55) {
    return 'review';
  }
  return 'rejected';
}
|
||||
|
||||
/**
 * Reports whether an already-accepted key on the same table is a strict
 * subset of `columns`. A wider tuple that contains an accepted key adds
 * nothing and can be skipped.
 */
function hasAcceptedSubset(
  accepted: readonly KloCompositePrimaryKeyCandidate[],
  tableName: string,
  columns: readonly string[],
): boolean {
  const wanted = new Set(columns);
  for (const candidate of accepted) {
    if (candidate.table.name !== tableName || candidate.columns.length >= columns.length) {
      continue;
    }
    if (candidate.columns.every((column) => wanted.has(column))) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Searches one table for composite primary keys.
 *
 * Nothing is attempted when the table has no profiled rows or already has a
 * strong single-column key. Otherwise every combination of key-like columns,
 * from width 2 up to `maxCompositeWidth`, is tested with one distinct-count
 * query. Combinations are skipped when they do not cover the key-like parts
 * of the table name or when they contain an already-accepted narrower key. A
 * tuple is accepted when `distinctCount / rowCount` reaches
 * `minPrimaryKeyUniqueness`.
 *
 * @returns The accepted keys sorted by tuple key, plus the number of queries run.
 */
async function detectCompositePrimaryKeys(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  table: KloEnrichedTable;
  profiles: KloRelationshipProfileArtifact;
  executor: KloRelationshipReadOnlyExecutor;
  ctx: KloScanContext;
  maxCompositeWidth: number;
  maxColumnsPerTable: number;
  minPrimaryKeyUniqueness: number;
}): Promise<{ primaryKeys: KloCompositePrimaryKeyCandidate[]; queryCount: number }> {
  const { table, profiles, minPrimaryKeyUniqueness } = input;

  const rowCount = tableRowCount(profiles, table.ref.name);
  if (rowCount === 0 || hasStrongSingleColumnKey({ table, profiles, minPrimaryKeyUniqueness })) {
    return { primaryKeys: [], queryCount: 0 };
  }

  const keyColumns = candidateKeyColumns({ table, profiles, maxColumnsPerTable: input.maxColumnsPerTable });
  const primaryKeys: KloCompositePrimaryKeyCandidate[] = [];
  let queryCount = 0;

  // Narrow tuples first, so an accepted key suppresses its wider supersets.
  for (let width = 2; width <= input.maxCompositeWidth; width += 1) {
    for (const columnTuple of combinations(keyColumns, width)) {
      const columnNames = columnTuple.map((column) => column.name);
      if (!tupleCoversTableNameKeyParts(table.ref.name, columnTuple)) {
        continue;
      }
      if (hasAcceptedSubset(primaryKeys, table.ref.name, columnNames)) {
        continue;
      }

      const sql = buildTupleDistinctSql({ driver: input.driver, table: table.ref, columns: columnNames });
      const result = await input.executor.executeReadOnly(
        { connectionId: input.connectionId, sql, maxRows: 1 },
        input.ctx,
      );
      queryCount += 1;

      const distinctCount = numberAt(result, 'distinct_count');
      // rowCount is non-zero here because of the early return above.
      const uniquenessRatio = distinctCount / rowCount;
      if (uniquenessRatio < minPrimaryKeyUniqueness) {
        continue;
      }

      const nullRates = columnNames.map((columnName) => profileNullRate(profiles, table.ref.name, columnName));
      const nullRate = Math.max(...nullRates);
      const rawScore = 0.72 + uniquenessRatio * 0.22 + (1 - nullRate) * 0.06;

      primaryKeys.push({
        id: tupleKey(table.ref.name, columnNames),
        tableId: table.id,
        table: table.ref,
        columns: columnNames,
        columnIds: columnTuple.map((column) => column.id),
        score: Number(Math.min(0.99, rawScore).toFixed(3)),
        status: 'accepted',
        evidence: {
          rowCount,
          distinctCount,
          uniquenessRatio,
          nullRate,
          reasons: ['composite_unique_tuple', 'not_null_profile'],
        },
      });
    }
  }

  primaryKeys.sort((left, right) =>
    tupleKey(left.table.name, left.columns).localeCompare(tupleKey(right.table.name, right.columns)),
  );
  return { primaryKeys, queryCount };
}
|
||||
|
||||
/**
 * Builds a name -> column lookup for one table.
 *
 * When two columns share a name the later one wins, while the key keeps the
 * position of its first occurrence in the map's iteration order.
 */
function columnsByName(table: KloEnrichedTable): Map<string, KloEnrichedColumn> {
  const lookup = new Map<string, KloEnrichedColumn>();
  for (const column of table.columns) {
    lookup.set(column.name, column);
  }
  return lookup;
}
|
||||
|
||||
/**
 * Checks that two column tuples can be compared position by position.
 *
 * The tuples must have the same length, and each source column must have the
 * same `dimensionType` as the target column at the same index.
 */
function compatibleTuple(sourceColumns: readonly KloEnrichedColumn[], targetColumns: readonly KloEnrichedColumn[]): boolean {
  if (sourceColumns.length !== targetColumns.length) {
    return false;
  }
  // Compatible exactly when no position is missing a target or disagrees on type.
  const hasMismatch = sourceColumns.some((source, position) => {
    const target = targetColumns[position];
    return !target || source.dimensionType !== target.dimensionType;
  });
  return !hasMismatch;
}
|
||||
|
||||
/**
 * Validates one candidate composite foreign key with a single read-only query.
 *
 * The query compares the source (child) column tuple against the target
 * (parent) key tuple and yields `child_distinct`, `parent_distinct`, `overlap`
 * and `violation_count`. From those this derives:
 * - `sourceCoverage`: `overlap / child_distinct`, or 0 when the child has no distinct tuples;
 * - `violationRatio`: `violation_count / child_distinct`, or 1 when the child has no distinct tuples.
 *
 * `relationshipStatus` turns the metrics and thresholds into the final status.
 *
 * @returns the candidate (status, confidence, validation evidence) and the
 *   number of queries issued, which is always 1.
 */
async function validateCompositeRelationship(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  sourceTable: KloEnrichedTable;
  sourceColumns: readonly KloEnrichedColumn[];
  targetKey: KloCompositePrimaryKeyCandidate;
  targetTable: KloEnrichedTable;
  targetColumns: readonly KloEnrichedColumn[];
  executor: KloRelationshipReadOnlyExecutor;
  ctx: KloScanContext;
  minSourceCoverage: number;
  maxViolationRatio: number;
}): Promise<{ relationship: KloCompositeRelationshipCandidate; queryCount: number }> {
  const { sourceTable, sourceColumns, targetKey, targetTable, targetColumns, minSourceCoverage, maxViolationRatio } =
    input;

  const coverageSql = buildCompositeCoverageSql({
    driver: input.driver,
    childTable: sourceTable.ref,
    childColumns: sourceColumns.map((column) => column.name),
    parentTable: targetTable.ref,
    parentColumns: targetColumns.map((column) => column.name),
    maxDistinctSourceValues: 10000,
  });
  const row = await input.executor.executeReadOnly(
    { connectionId: input.connectionId, sql: coverageSql, maxRows: 1 },
    input.ctx,
  );

  const childDistinct = numberAt(row, 'child_distinct');
  const parentDistinct = numberAt(row, 'parent_distinct');
  const overlap = numberAt(row, 'overlap');
  const violationCount = numberAt(row, 'violation_count');

  // An empty child side has nothing to cover: zero coverage, full violation.
  const childIsEmpty = childDistinct === 0;
  const sourceCoverage = childIsEmpty ? 0 : overlap / childDistinct;
  const violationRatio = childIsEmpty ? 1 : violationCount / childDistinct;
  const targetUniqueness = targetKey.evidence.uniquenessRatio;

  const status = relationshipStatus({
    targetUniqueness,
    sourceCoverage,
    violationRatio,
    minSourceCoverage,
    maxViolationRatio,
  });

  let reasons: string[];
  if (status === 'accepted') {
    reasons = ['composite_validation_passed'];
  } else {
    reasons = ['composite_validation_failed'];
    if (sourceCoverage < minSourceCoverage) {
      reasons.push('low_source_coverage');
    }
    if (violationRatio > maxViolationRatio) {
      reasons.push('excessive_violations');
    }
  }

  const from = tupleEndpoint(sourceTable, sourceColumns);
  // The target endpoint is taken from the detected key itself.
  const to = {
    tableId: targetKey.tableId,
    columnIds: targetKey.columnIds,
    table: targetKey.table,
    columns: targetKey.columns,
  };
  const id = relationshipKey({
    fromTable: from.table.name,
    fromColumns: from.columns,
    toTable: to.table.name,
    toColumns: to.columns,
  });

  return {
    queryCount: 1,
    relationship: {
      id,
      from,
      to,
      relationshipType: 'many_to_one',
      confidence: status === 'accepted' ? 0.95 : 0.62,
      status,
      source: 'composite_profile_match',
      validation: {
        targetUniqueness,
        sourceCoverage,
        violationCount,
        violationRatio,
        childDistinct,
        parentDistinct,
        overlap,
        reasons,
      },
    },
  };
}
|
||||
|
||||
/**
 * Resolves `columnNames` against `table`, keeping the requested order.
 * Returns `null` if any name is not a column of the table.
 */
function pickColumnsInOrder(table: KloEnrichedTable, columnNames: readonly string[]): KloEnrichedColumn[] | null {
  const byName = columnsByName(table);
  const picked: KloEnrichedColumn[] = [];
  for (const columnName of columnNames) {
    const column = byName.get(columnName);
    if (!column) {
      return null;
    }
    picked.push(column);
  }
  return picked;
}

/**
 * Discovers composite primary keys and the composite relationships that
 * reference them.
 *
 * Steps:
 * 1. If there is no executor, or the profiles report SQL as unavailable,
 *    return an empty result carrying the
 *    `composite_relationship_validation_unavailable` warning.
 * 2. Detect composite primary key candidates for every enabled table.
 * 3. For each detected key, check every *other* enabled table that has
 *    identically named, type-compatible columns and validate the pair with one
 *    coverage query.
 * 4. Keep every relationship whose status is not `rejected`.
 *
 * Queries run sequentially. Both result lists are sorted by `id`.
 *
 * @param input connection, schema, profiles, executor and optional threshold
 *   overrides (unset overrides fall back to the module defaults).
 * @returns detected keys, surviving relationships, the total number of
 *   queries issued, and warnings.
 */
export async function discoverKloCompositeRelationships(
  input: DiscoverKloCompositeRelationshipsInput,
): Promise<DiscoverKloCompositeRelationshipsResult> {
  if (!input.executor || !input.profiles.sqlAvailable) {
    return {
      primaryKeys: [],
      relationships: [],
      queryCount: 0,
      warnings: ['composite_relationship_validation_unavailable'],
    };
  }

  const settings = {
    maxCompositeWidth: input.maxCompositeWidth ?? DEFAULT_MAX_COMPOSITE_WIDTH,
    maxColumnsPerTable: input.maxColumnsPerTable ?? DEFAULT_MAX_COLUMNS_PER_TABLE,
    minPrimaryKeyUniqueness: input.minPrimaryKeyUniqueness ?? DEFAULT_MIN_PRIMARY_KEY_UNIQUENESS,
    minSourceCoverage: input.minSourceCoverage ?? DEFAULT_MIN_SOURCE_COVERAGE,
    maxViolationRatio: input.maxViolationRatio ?? DEFAULT_MAX_VIOLATION_RATIO,
  };
  const tables = enabledTables(input.schema);
  // Keyed by table id, not bare name: two enabled tables can share a name
  // across catalogs/schemas, and a name-keyed map would resolve a key to
  // whichever same-named table was inserted last.
  const tableById = new Map(tables.map((table) => [table.id, table]));
  const primaryKeys: KloCompositePrimaryKeyCandidate[] = [];
  let queryCount = 0;

  for (const table of tables) {
    const detected = await detectCompositePrimaryKeys({
      connectionId: input.connectionId,
      driver: input.driver,
      table,
      profiles: input.profiles,
      executor: input.executor,
      ctx: input.ctx,
      maxCompositeWidth: settings.maxCompositeWidth,
      maxColumnsPerTable: settings.maxColumnsPerTable,
      minPrimaryKeyUniqueness: settings.minPrimaryKeyUniqueness,
    });
    primaryKeys.push(...detected.primaryKeys);
    queryCount += detected.queryCount;
  }

  const relationships: KloCompositeRelationshipCandidate[] = [];
  for (const targetKey of primaryKeys) {
    const targetTable = tableById.get(targetKey.tableId);
    if (!targetTable) {
      continue;
    }
    const targetColumns = pickColumnsInOrder(targetTable, targetKey.columns);
    if (!targetColumns) {
      continue;
    }

    for (const sourceTable of tables) {
      if (sourceTable.id === targetTable.id) {
        continue;
      }
      // A source table qualifies only if it has every key column by the same name.
      const sourceColumns = pickColumnsInOrder(sourceTable, targetKey.columns);
      if (!sourceColumns || !compatibleTuple(sourceColumns, targetColumns)) {
        continue;
      }

      const validated = await validateCompositeRelationship({
        connectionId: input.connectionId,
        driver: input.driver,
        sourceTable,
        sourceColumns,
        targetKey,
        targetTable,
        targetColumns,
        executor: input.executor,
        ctx: input.ctx,
        minSourceCoverage: settings.minSourceCoverage,
        maxViolationRatio: settings.maxViolationRatio,
      });
      queryCount += validated.queryCount;
      if (validated.relationship.status !== 'rejected') {
        relationships.push(validated.relationship);
      }
    }
  }

  return {
    primaryKeys: primaryKeys.sort((left, right) => left.id.localeCompare(right.id)),
    relationships: relationships.sort((left, right) => left.id.localeCompare(right.id)),
    queryCount,
    warnings: [],
  };
}
|
||||
373
packages/context/src/scan/relationship-diagnostics.test.ts
Normal file
373
packages/context/src/scan/relationship-diagnostics.test.ts
Normal file
|
|
@ -0,0 +1,373 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KloEnrichedRelationship, KloRelationshipEndpoint } from './enrichment-types.js';
|
||||
import type { KloResolvedRelationshipDiscoveryCandidate } from './relationship-graph-resolver.js';
|
||||
import {
|
||||
buildKloRelationshipArtifacts,
|
||||
buildKloRelationshipDiagnostics,
|
||||
emptyKloRelationshipProfileArtifact,
|
||||
} from './relationship-diagnostics.js';
|
||||
|
||||
/**
 * Test fixture: a single-column endpoint where the table name doubles as the
 * table id and the column id is `<table>.<column>`.
 */
function endpoint(table: string, column: string): KloRelationshipEndpoint {
  const columnId = `${table}.${column}`;
  const tableRef = { catalog: null, db: null, name: table };
  return { tableId: table, columnIds: [columnId], table: tableRef, columns: [column] };
}
|
||||
|
||||
/**
 * Test fixture: an inferred many-to-one relationship between two
 * single-column endpoints. Confidence falls back to 0.92 when not given.
 */
function enrichedRelationship(input: {
  id: string;
  fromTable: string;
  fromColumn: string;
  toTable: string;
  toColumn: string;
  confidence?: number;
}): KloEnrichedRelationship {
  const { id, fromTable, fromColumn, toTable, toColumn, confidence } = input;
  const relationship: KloEnrichedRelationship = {
    id,
    source: 'inferred',
    from: endpoint(fromTable, fromColumn),
    to: endpoint(toTable, toColumn),
    relationshipType: 'many_to_one',
    confidence: confidence ?? 0.92,
    isPrimaryKeyReference: true,
  };
  return relationship;
}
|
||||
|
||||
/**
 * Test fixture: a graph-resolved `orders.customer_id -> customers.id` candidate.
 *
 * - `source` defaults to `normalized_table_match`; `llm_proposal` switches the
 *   evidence block to the LLM variant (confidence and rationale included).
 * - A `rejected` status produces failing validation numbers (coverage 0.2,
 *   8 violations, ratio 0.8, overlap 2); other statuses produce a clean pass.
 * - `pkScore` / `fkScore` default to 0.97 / 0.94.
 */
function resolvedRelationship(input: {
  id: string;
  status: 'accepted' | 'review' | 'rejected';
  source?: 'normalized_table_match' | 'exact_column_match' | 'inflection' | 'self_reference' | 'llm_proposal';
  fkScore?: number;
  pkScore?: number;
  validationReasons?: string[];
  graphReasons?: string[];
}): KloResolvedRelationshipDiscoveryCandidate {
  const isRejected = input.status === 'rejected';
  const pkScore = input.pkScore ?? 0.97;
  const fkScore = input.fkScore ?? 0.94;

  const evidence: KloResolvedRelationshipDiscoveryCandidate['evidence'] =
    input.source === 'llm_proposal'
      ? {
          sourceColumnBase: 'buyer',
          targetTableBase: 'customer',
          targetColumnBase: 'id',
          targetKeyScore: 0.88,
          nameScore: 0.45,
          reasons: ['llm_proposal', 'llm_pk_proposal'],
          llmConfidence: 0.89,
          llmRationale: 'Buyer reference values align with customer identifiers.',
        }
      : {
          sourceColumnBase: 'customer',
          targetTableBase: 'customer',
          targetColumnBase: 'id',
          targetKeyScore: 0.9,
          nameScore: 0.85,
          reasons: ['table_name_matches_source_column'],
        };

  return {
    id: input.id,
    from: endpoint('orders', 'customer_id'),
    to: endpoint('customers', 'id'),
    relationshipType: 'many_to_one',
    confidence: 0.88,
    source: input.source ?? 'normalized_table_match',
    status: input.status,
    evidence,
    score: 0.91,
    validation: {
      targetUniqueness: 1,
      sourceCoverage: isRejected ? 0.2 : 1,
      violationCount: isRejected ? 8 : 0,
      violationRatio: isRejected ? 0.8 : 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 10,
      parentDistinct: 10,
      overlap: isRejected ? 2 : 10,
      checkedValues: 10,
      reasons: input.validationReasons ?? ['validation_passed'],
    },
    pkScore,
    fkScore,
    graph: {
      targetPkScore: pkScore,
      incomingCandidateCount: 1,
      conflictRank: 1,
      reasons: input.graphReasons ?? ['target_pk_score_passed', 'fk_score_passed'],
    },
  };
}
|
||||
|
||||
describe('relationship diagnostics artifacts', () => {
|
||||
it('groups graph-resolved relationships and preserves evidence reasons', () => {
  // One candidate per status; the accepted one uses the LLM evidence variant.
  const acceptedCandidate = resolvedRelationship({ id: 'accepted-edge', status: 'accepted', source: 'llm_proposal' });
  const reviewCandidate = resolvedRelationship({
    id: 'review-edge',
    status: 'review',
    validationReasons: ['validation_unavailable'],
    graphReasons: ['validation_unavailable_review_only', 'fk_score_review'],
  });
  const rejectedCandidate = resolvedRelationship({
    id: 'rejected-edge',
    status: 'rejected',
    validationReasons: ['low_source_coverage'],
    graphReasons: ['fk_score_rejected'],
  });

  const { accepted, review, rejected } = buildKloRelationshipArtifacts({
    connectionId: 'warehouse',
    resolvedRelationships: [acceptedCandidate, reviewCandidate, rejectedCandidate],
  });

  expect(accepted).toHaveLength(1);
  expect(accepted[0]).toMatchObject({
    source: 'llm_proposal',
    evidence: {
      llmConfidence: 0.89,
      llmRationale: 'Buyer reference values align with customer identifiers.',
    },
    reasons: expect.arrayContaining(['llm_proposal', 'llm_pk_proposal']),
  });
  expect(review).toHaveLength(1);
  expect(rejected).toHaveLength(1);
  expect(review[0]).toMatchObject({
    id: 'review-edge',
    status: 'review',
    source: 'normalized_table_match',
    fkScore: 0.94,
    reasons: expect.arrayContaining(['validation_unavailable', 'validation_unavailable_review_only']),
  });
  // Rejected edges merge evidence, validation and graph reasons.
  expect(rejected[0]?.reasons).toEqual(
    expect.arrayContaining(['table_name_matches_source_column', 'low_source_coverage', 'fk_score_rejected']),
  );
});
|
||||
|
||||
it('adapts legacy relationship updates into the richer artifact shape', () => {
  const acceptedUpdate = enrichedRelationship({
    id: 'orders-customer',
    fromTable: 'orders',
    fromColumn: 'customer_id',
    toTable: 'customers',
    toColumn: 'id',
  });
  const rejectedUpdate = enrichedRelationship({
    id: 'orders-account',
    fromTable: 'orders',
    fromColumn: 'account_id',
    toTable: 'accounts',
    toColumn: 'id',
    confidence: 0.4,
  });

  const { accepted, rejected, skipped } = buildKloRelationshipArtifacts({
    connectionId: 'warehouse',
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [acceptedUpdate],
      rejected: [rejectedUpdate],
      skipped: [{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }],
    },
  });

  expect(accepted[0]).toMatchObject({
    id: 'orders-customer',
    status: 'accepted',
    source: 'inferred',
    reasons: ['accepted_relationship_update'],
  });
  expect(rejected[0]).toMatchObject({
    id: 'orders-account',
    status: 'rejected',
    reasons: ['rejected_relationship_update'],
  });
  // Skipped entries pass through unchanged.
  expect(skipped).toEqual([{ relationshipId: 'orders-region', reason: 'validation_port_unavailable' }]);
});
|
||||
|
||||
it('deduplicates resolved and formal relationship update artifacts by edge id', () => {
  // The same edge id arrives twice: once graph-resolved, once as a formal update.
  const sharedId = 'orders:orders.account_id->accounts:accounts.id';

  const resolvedCandidate: KloResolvedRelationshipDiscoveryCandidate = {
    id: sharedId,
    from: endpoint('orders', 'account_id'),
    to: endpoint('accounts', 'id'),
    relationshipType: 'many_to_one',
    source: 'normalized_table_match',
    status: 'accepted',
    confidence: 0.92,
    score: 0.9,
    pkScore: 0.92,
    fkScore: 0.9,
    evidence: {
      sourceColumnBase: 'account',
      targetTableBase: 'account',
      targetColumnBase: 'id',
      targetKeyScore: 0.92,
      nameScore: 0.92,
      reasons: ['foreign_key_suffix'],
    },
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 2,
      parentDistinct: 2,
      overlap: 2,
      checkedValues: 2,
      reasons: ['validation_passed'],
    },
    graph: {
      targetPkScore: 0.92,
      incomingCandidateCount: 1,
      conflictRank: 1,
      reasons: ['fk_score_passed'],
    },
  };
  const formalUpdate: KloEnrichedRelationship = {
    id: sharedId,
    source: 'formal',
    from: endpoint('orders', 'account_id'),
    to: endpoint('accounts', 'id'),
    relationshipType: 'many_to_one',
    confidence: 1,
    isPrimaryKeyReference: true,
  };

  const { accepted } = buildKloRelationshipArtifacts({
    connectionId: 'warehouse',
    resolvedRelationships: [resolvedCandidate],
    relationshipUpdate: {
      connectionId: 'warehouse',
      accepted: [formalUpdate],
      rejected: [],
      skipped: [],
    },
  });

  // The graph-resolved edge is the one that survives.
  expect(accepted).toHaveLength(1);
  expect(accepted[0]).toMatchObject({
    id: sharedId,
    source: 'normalized_table_match',
    reasons: expect.arrayContaining(['foreign_key_suffix', 'validation_passed', 'fk_score_passed']),
  });
});
|
||||
|
||||
it('explains validation-unavailable review candidates', () => {
  const reviewOnlyCandidate = resolvedRelationship({
    id: 'review-edge',
    status: 'review',
    validationReasons: ['validation_unavailable'],
    graphReasons: ['validation_unavailable_review_only'],
  });
  const artifacts = buildKloRelationshipArtifacts({
    connectionId: 'warehouse',
    resolvedRelationships: [reviewOnlyCandidate],
  });
  const profile = emptyKloRelationshipProfileArtifact({
    connectionId: 'warehouse',
    driver: 'sqlite',
    reason: 'read_only_sql_unavailable',
  });

  const { summary, noAcceptedReason, candidateCountsBySource, validation, profileWarnings, warnings } =
    buildKloRelationshipDiagnostics({
      connectionId: 'warehouse',
      generatedAt: '2026-05-07T12:00:00.000Z',
      artifacts,
      profile,
      warnings: [
        {
          code: 'connector_capability_missing',
          message: 'KLO scan connector cannot run standalone statistical relationship validation',
          recoverable: true,
          metadata: { capability: 'readOnlySql' },
        },
      ],
      thresholds: { acceptThreshold: 0.85, reviewThreshold: 0.55 },
    });

  expect(summary).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
  expect(noAcceptedReason).toBe('validation unavailable; review candidates written');
  expect(candidateCountsBySource).toEqual({ normalized_table_match: 1 });
  expect(validation).toEqual({
    available: false,
    sqlAvailable: false,
    queryCount: 0,
  });
  expect(profileWarnings).toEqual(['read_only_sql_unavailable']);
  expect(warnings[0]).toMatchObject({ code: 'connector_capability_missing' });
});
|
||||
|
||||
it('explains empty relationship output as a no-candidate outcome', () => {
  // No inputs at all: every bucket stays empty.
  const artifacts = buildKloRelationshipArtifacts({ connectionId: 'warehouse' });
  const profile = emptyKloRelationshipProfileArtifact({
    connectionId: 'warehouse',
    driver: 'sqlite',
    reason: 'relationship_profiling_not_run',
  });

  const { summary, noAcceptedReason, candidateCountsBySource } = buildKloRelationshipDiagnostics({
    connectionId: 'warehouse',
    generatedAt: '2026-05-07T12:00:00.000Z',
    artifacts,
    profile,
  });

  expect(summary).toEqual({ accepted: 0, review: 0, rejected: 0, skipped: 0 });
  expect(noAcceptedReason).toBe('no candidate pairs passed type compatibility');
  expect(candidateCountsBySource).toEqual({});
});
|
||||
|
||||
it('records composite relationship endpoints in relationship artifacts', () => {
  const compositeId = 'order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)';
  const sourceColumnIds = ['order_line_allocations.order_id', 'order_line_allocations.line_number'];
  const targetColumnIds = ['order_lines.order_id', 'order_lines.line_number'];
  const keyColumns = ['order_id', 'line_number'];

  const { accepted } = buildKloRelationshipArtifacts({
    connectionId: 'warehouse',
    compositeRelationships: [
      {
        id: compositeId,
        source: 'composite_profile_match',
        status: 'accepted',
        from: {
          tableId: 'order_line_allocations',
          columnIds: [...sourceColumnIds],
          table: { catalog: null, db: null, name: 'order_line_allocations' },
          columns: [...keyColumns],
        },
        to: {
          tableId: 'order_lines',
          columnIds: [...targetColumnIds],
          table: { catalog: null, db: null, name: 'order_lines' },
          columns: [...keyColumns],
        },
        relationshipType: 'many_to_one',
        confidence: 0.95,
        validation: {
          targetUniqueness: 1,
          sourceCoverage: 1,
          violationCount: 0,
          violationRatio: 0,
          childDistinct: 2,
          parentDistinct: 2,
          overlap: 2,
          reasons: ['composite_validation_passed'],
        },
      },
    ],
  });

  // Both multi-column endpoints must survive intact on the accepted edge.
  expect(accepted).toEqual([
    expect.objectContaining({
      id: compositeId,
      source: 'composite_profile_match',
      from: expect.objectContaining({ columnIds: sourceColumnIds, columns: keyColumns }),
      to: expect.objectContaining({ columnIds: targetColumnIds, columns: keyColumns }),
      reasons: ['composite_validation_passed'],
      validation: expect.objectContaining({ sourceCoverage: 1 }),
    }),
  ]);
});
|
||||
});
|
||||
364
packages/context/src/scan/relationship-diagnostics.ts
Normal file
364
packages/context/src/scan/relationship-diagnostics.ts
Normal file
|
|
@ -0,0 +1,364 @@
|
|||
import type {
|
||||
KloEnrichedRelationship,
|
||||
KloRelationshipEndpoint,
|
||||
KloRelationshipType,
|
||||
KloRelationshipUpdate,
|
||||
} from './enrichment-types.js';
|
||||
import type {
|
||||
KloResolvedRelationshipDiscoveryCandidate,
|
||||
KloResolvedRelationshipStatus,
|
||||
} from './relationship-graph-resolver.js';
|
||||
import type { KloCompositeRelationshipCandidate } from './relationship-composite-candidates.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type { KloConnectionDriver, KloScanWarning } from './types.js';
|
||||
|
||||
/**
 * One side of a relationship edge as written to the relationship artifact.
 * Supports single- and multi-column endpoints.
 */
export interface KloRelationshipArtifactEndpoint {
  /** Identifier of the table this endpoint belongs to. */
  tableId: string;
  /** Identifiers of the participating columns. */
  columnIds: string[];
  /** Table reference (catalog / db / name). */
  table: {
    catalog: string | null;
    db: string | null;
    name: string;
  };
  /** Names of the participating columns. */
  columns: string[];
}
|
||||
|
||||
/**
 * A relationship edge in the artifact, whichever pipeline produced it
 * (legacy relationship update, graph resolver, or composite discovery).
 *
 * Score and detail fields are `null` when the producing pipeline does not
 * supply them.
 */
export interface KloRelationshipArtifactEdge {
  id: string;
  status: KloResolvedRelationshipStatus;
  /** Name of the strategy or origin that produced the edge. */
  source: string;
  from: KloRelationshipArtifactEndpoint;
  to: KloRelationshipArtifactEndpoint;
  relationshipType: KloRelationshipType;
  confidence: number;
  pkScore: number | null;
  fkScore: number | null;
  score: number | null;
  /** Producer-specific evidence payload. */
  evidence: unknown | null;
  /** Producer-specific validation payload. */
  validation: unknown | null;
  /** Producer-specific graph-resolution payload. */
  graph: unknown | null;
  /** Reason codes explaining the edge's status. */
  reasons: string[];
}
|
||||
|
||||
/** Relationship edges for one connection, bucketed by status, plus skipped entries. */
export interface KloRelationshipArtifact {
  connectionId: string;
  accepted: KloRelationshipArtifactEdge[];
  review: KloRelationshipArtifactEdge[];
  rejected: KloRelationshipArtifactEdge[];
  /** Skipped entries, in the shape used by `KloRelationshipUpdate`. */
  skipped: KloRelationshipUpdate['skipped'];
}
|
||||
|
||||
/** Number of artifact entries in each bucket. */
export interface KloRelationshipDiagnosticsSummary {
  accepted: number;
  review: number;
  rejected: number;
  skipped: number;
}
|
||||
|
||||
/** Validation section of the diagnostics artifact. */
export interface KloRelationshipDiagnosticsValidation {
  available: boolean;
  sqlAvailable: boolean;
  queryCount: number;
}
|
||||
|
||||
/** Score thresholds reported in the diagnostics artifact. */
export interface KloRelationshipDiagnosticsThresholds {
  acceptThreshold: number;
  reviewThreshold: number;
}
|
||||
|
||||
/**
 * Policy settings reported in the diagnostics artifact.
 *
 * NOTE(review): nothing in this module's visible code reads these fields to
 * change behavior; presumably they mirror settings enforced elsewhere in the
 * scan pipeline. Confirm against the callers.
 */
export interface KloRelationshipDiagnosticsPolicy {
  validationRequiredForManifest: boolean;
  maxCandidatesPerColumn: number;
  profileSampleRows: number;
  validationConcurrency: number;
}
|
||||
|
||||
/** Diagnostics document describing the outcome of relationship discovery for one connection. */
export interface KloRelationshipDiagnosticsArtifact {
  connectionId: string;
  /** Timestamp string; an ISO-8601 value is generated when the caller supplies none. */
  generatedAt: string;
  summary: KloRelationshipDiagnosticsSummary;
  /** Why nothing was accepted, or `null` when at least one edge was accepted. */
  noAcceptedReason: string | null;
  /** Edge counts keyed by edge `source`. */
  candidateCountsBySource: Record<string, number>;
  validation: KloRelationshipDiagnosticsValidation;
  thresholds: KloRelationshipDiagnosticsThresholds;
  policy: KloRelationshipDiagnosticsPolicy;
  warnings: KloScanWarning[];
  profileWarnings: string[];
}
|
||||
|
||||
/**
 * Input for `buildKloRelationshipArtifacts`. All three relationship sources
 * are optional and are merged into one artifact.
 */
export interface BuildKloRelationshipArtifactsInput {
  connectionId: string;
  /** Legacy accepted / rejected / skipped update. */
  relationshipUpdate?: KloRelationshipUpdate | null;
  /** Graph-resolved single-column candidates. */
  resolvedRelationships?: readonly KloResolvedRelationshipDiscoveryCandidate[];
  /** Composite (multi-column) candidates. */
  compositeRelationships?: readonly KloCompositeRelationshipCandidate[];
}
|
||||
|
||||
/** Input for `buildKloRelationshipDiagnostics`. */
export interface BuildKloRelationshipDiagnosticsInput {
  connectionId: string;
  artifacts: KloRelationshipArtifact;
  profile: KloRelationshipProfileArtifact;
  warnings?: readonly KloScanWarning[];
  /** Overrides merged over the default thresholds. */
  thresholds?: Partial<KloRelationshipDiagnosticsThresholds>;
  /** Overrides merged over the default policy. */
  policy?: Partial<KloRelationshipDiagnosticsPolicy>;
  /** Timestamp to record; the current time is used when omitted. */
  generatedAt?: string;
}
|
||||
|
||||
/** Input for `emptyKloRelationshipProfileArtifact`. */
export interface EmptyKloRelationshipProfileArtifactInput {
  connectionId: string;
  driver: KloConnectionDriver;
  /** Recorded as the sole warning on the empty profile. */
  reason: string;
}
|
||||
|
||||
/** Thresholds reported in diagnostics when the caller overrides none. */
const DEFAULT_THRESHOLDS: KloRelationshipDiagnosticsThresholds = { acceptThreshold: 0.85, reviewThreshold: 0.55 };
|
||||
|
||||
/** Policy values reported in diagnostics when the caller overrides none. */
const DEFAULT_POLICY: KloRelationshipDiagnosticsPolicy = {
  validationRequiredForManifest: true,
  maxCandidatesPerColumn: 25,
  profileSampleRows: 10_000,
  validationConcurrency: 4,
};
|
||||
|
||||
/**
 * Projects a relationship endpoint onto the artifact endpoint shape.
 *
 * Only `catalog`, `db` and `name` of the table reference are carried over.
 * The `columnIds` and `columns` arrays are passed through by reference.
 */
function endpointArtifact(endpoint: KloRelationshipEndpoint): KloRelationshipArtifactEndpoint {
  const { tableId, columnIds, table, columns } = endpoint;
  const tableRef = { catalog: table.catalog, db: table.db, name: table.name };
  return { tableId, columnIds, table: tableRef, columns };
}
|
||||
|
||||
/**
 * Returns the distinct reason codes in first-seen order, dropping entries that
 * are empty or whitespace-only. Kept values are not trimmed.
 */
function uniqueReasons(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value.trim().length > 0) {
      seen.add(value);
    }
  }
  return [...seen];
}
|
||||
|
||||
/**
 * Adapts a legacy relationship-update entry to the artifact edge shape.
 *
 * Update entries have no PK/FK scores or graph data, so those fields are
 * `null` and `score` mirrors the confidence. Entries whose source is `formal`
 * get placeholder evidence/validation objects marking them as formal metadata.
 *
 * @param relationship the update entry to adapt.
 * @param status the bucket the entry came from; also selects the reason code.
 */
function relationshipUpdateEdge(
  relationship: KloEnrichedRelationship,
  status: 'accepted' | 'rejected',
): KloRelationshipArtifactEdge {
  const isFormal = relationship.source === 'formal';

  let reason: string;
  if (status !== 'accepted') {
    reason = 'rejected_relationship_update';
  } else if (isFormal) {
    reason = 'formal_metadata_accepted';
  } else {
    reason = 'accepted_relationship_update';
  }

  const { confidence } = relationship;
  return {
    id: relationship.id,
    status,
    source: relationship.source,
    from: endpointArtifact(relationship.from),
    to: endpointArtifact(relationship.to),
    relationshipType: relationship.relationshipType,
    confidence,
    pkScore: null,
    fkScore: null,
    score: confidence,
    evidence: isFormal ? { source: 'formal_metadata' } : null,
    validation: isFormal ? { status: 'formal_metadata' } : null,
    graph: null,
    reasons: [reason],
  };
}
|
||||
|
||||
/**
 * Adapts a graph-resolved candidate to the artifact edge shape.
 *
 * Scores and detail payloads are carried over unchanged. `reasons` is the
 * de-duplicated concatenation of the evidence, validation and graph reasons,
 * in that order.
 */
function resolvedEdge(candidate: KloResolvedRelationshipDiscoveryCandidate): KloRelationshipArtifactEdge {
  const { evidence, validation, graph } = candidate;
  const mergedReasons = uniqueReasons([...evidence.reasons, ...validation.reasons, ...graph.reasons]);
  const from = endpointArtifact(candidate.from);
  const to = endpointArtifact(candidate.to);

  return {
    id: candidate.id,
    status: candidate.status,
    source: candidate.source,
    from,
    to,
    relationshipType: candidate.relationshipType,
    confidence: candidate.confidence,
    pkScore: candidate.pkScore,
    fkScore: candidate.fkScore,
    score: candidate.score,
    evidence,
    validation,
    graph,
    reasons: mergedReasons,
  };
}
|
||||
|
||||
/**
 * Projects a composite candidate endpoint onto the artifact endpoint shape.
 *
 * Only `catalog`, `db` and `name` of the table reference are carried over.
 * The `columnIds` and `columns` arrays are passed through by reference.
 */
function compositeEndpointArtifact(endpoint: KloCompositeRelationshipCandidate['from']): KloRelationshipArtifactEndpoint {
  const { tableId, columnIds, table, columns } = endpoint;
  const tableRef = { catalog: table.catalog, db: table.db, name: table.name };
  return { tableId, columnIds, table: tableRef, columns };
}
|
||||
|
||||
/**
 * Adapts a composite relationship candidate to the artifact edge shape.
 *
 * Composite candidates carry one confidence value, reused for both `fkScore`
 * and `score`. There is no PK score or graph data (`null`). Evidence records
 * only the candidate's source, and `reasons` are the de-duplicated validation
 * reasons.
 */
function compositeEdge(candidate: KloCompositeRelationshipCandidate): KloRelationshipArtifactEdge {
  const { confidence, source, validation } = candidate;
  const from = compositeEndpointArtifact(candidate.from);
  const to = compositeEndpointArtifact(candidate.to);

  return {
    id: candidate.id,
    status: candidate.status,
    source,
    from,
    to,
    relationshipType: candidate.relationshipType,
    confidence,
    pkScore: null,
    fkScore: confidence,
    score: confidence,
    evidence: { source },
    validation,
    graph: null,
    reasons: uniqueReasons(validation.reasons),
  };
}
|
||||
|
||||
/** Creates an artifact for `connectionId` with fresh, empty buckets. */
function emptyArtifacts(connectionId: string): KloRelationshipArtifact {
  const blank: KloRelationshipArtifact = { connectionId, accepted: [], review: [], rejected: [], skipped: [] };
  return blank;
}
|
||||
|
||||
/**
 * Appends `edge` to `edges` unless an edge with the same `id` is already
 * there; the first edge for a given id wins. Mutates `edges` in place.
 */
function pushUniqueEdge(edges: KloRelationshipArtifactEdge[], edge: KloRelationshipArtifactEdge): void {
  for (const existing of edges) {
    if (existing.id === edge.id) {
      return;
    }
  }
  edges.push(edge);
}
|
||||
|
||||
/**
 * Merges every available relationship source into one status-bucketed artifact.
 *
 * Sources are applied in a fixed order: graph-resolved candidates, then
 * composite candidates, then the legacy relationship update. Within a bucket
 * the first edge seen for an id wins, so richer resolved or composite edges
 * take precedence over a legacy update entry with the same id.
 *
 * Edge buckets are sorted by `id` and skipped entries by `relationshipId`.
 */
export function buildKloRelationshipArtifacts(input: BuildKloRelationshipArtifactsInput): KloRelationshipArtifact {
  const { connectionId, accepted, review, rejected, skipped } = emptyArtifacts(input.connectionId);

  // Files an edge into the bucket named by its own status; anything that is
  // neither accepted nor review is treated as rejected.
  const fileByStatus = (edge: KloRelationshipArtifactEdge): void => {
    switch (edge.status) {
      case 'accepted':
        pushUniqueEdge(accepted, edge);
        break;
      case 'review':
        pushUniqueEdge(review, edge);
        break;
      default:
        pushUniqueEdge(rejected, edge);
    }
  };

  for (const candidate of input.resolvedRelationships ?? []) {
    fileByStatus(resolvedEdge(candidate));
  }
  for (const candidate of input.compositeRelationships ?? []) {
    fileByStatus(compositeEdge(candidate));
  }

  const update = input.relationshipUpdate;
  if (update) {
    for (const relationship of update.accepted) {
      pushUniqueEdge(accepted, relationshipUpdateEdge(relationship, 'accepted'));
    }
    for (const relationship of update.rejected) {
      pushUniqueEdge(rejected, relationshipUpdateEdge(relationship, 'rejected'));
    }
    skipped.push(...update.skipped);
  }

  const byEdgeId = (left: KloRelationshipArtifactEdge, right: KloRelationshipArtifactEdge): number =>
    left.id.localeCompare(right.id);

  return {
    connectionId,
    accepted: accepted.sort(byEdgeId),
    review: review.sort(byEdgeId),
    rejected: rejected.sort(byEdgeId),
    skipped: [...skipped].sort((left, right) => left.relationshipId.localeCompare(right.relationshipId)),
  };
}
|
||||
|
||||
/** Returns a new array holding every edge: accepted first, then review, then rejected. */
function allEdges(artifacts: KloRelationshipArtifact): KloRelationshipArtifactEdge[] {
  const { accepted, review, rejected } = artifacts;
  return accepted.concat(review, rejected);
}
|
||||
|
||||
/**
 * Counts edges per `source` across the accepted, review and rejected buckets.
 *
 * Counting is done in a `Map` rather than a plain object. `source` is an
 * arbitrary string, and with a plain object a value such as `toString` or
 * `constructor` would read the inherited `Object.prototype` member instead of
 * `undefined`, turning the count into a garbage string.
 *
 * @returns a plain object whose keys are in `localeCompare` order.
 */
function candidateCountsBySource(artifacts: KloRelationshipArtifact): Record<string, number> {
  const counts = new Map<string, number>();
  for (const bucket of [artifacts.accepted, artifacts.review, artifacts.rejected]) {
    for (const edge of bucket) {
      counts.set(edge.source, (counts.get(edge.source) ?? 0) + 1);
    }
  }
  const sortedEntries = [...counts].sort(([left], [right]) => left.localeCompare(right));
  return Object.fromEntries(sortedEntries);
}
|
||||
|
||||
/** Reports whether any accepted, review or rejected edge carries the given reason code. */
function hasReason(artifacts: KloRelationshipArtifact, reason: string): boolean {
  for (const edge of allEdges(artifacts)) {
    if (edge.reasons.includes(reason)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Explains why no relationship was accepted.
 *
 * @returns `null` when at least one edge was accepted; otherwise a
 * human-readable reason chosen from the review/rejected lists and the
 * profile's SQL availability.
 */
function noAcceptedReason(input: {
  artifacts: KloRelationshipArtifact;
  profile: KloRelationshipProfileArtifact;
}): string | null {
  const { artifacts, profile } = input;

  if (artifacts.accepted.length > 0) {
    return null;
  }

  if (artifacts.review.length > 0) {
    // Review candidates exist: distinguish "validation could not run" from
    // "validation ran but a human still has to decide".
    const validationUnavailable =
      !profile.sqlAvailable ||
      hasReason(artifacts, 'validation_unavailable') ||
      hasReason(artifacts, 'validation_unavailable_review_only');
    return validationUnavailable
      ? 'validation unavailable; review candidates written'
      : 'relationship candidates require review before manifest writes';
  }

  return artifacts.rejected.length > 0
    ? 'all candidate pairs were rejected'
    : 'no candidate pairs passed type compatibility';
}
|
||||
|
||||
/**
 * Builds a placeholder profile artifact: SQL marked unavailable, zero
 * queries, no tables or columns, and `input.reason` as the only warning.
 */
export function emptyKloRelationshipProfileArtifact(
  input: EmptyKloRelationshipProfileArtifactInput,
): KloRelationshipProfileArtifact {
  const { connectionId, driver, reason } = input;
  const artifact: KloRelationshipProfileArtifact = {
    connectionId,
    driver,
    sqlAvailable: false,
    queryCount: 0,
    tables: [],
    columns: {},
    warnings: [reason],
  };
  return artifact;
}
|
||||
|
||||
/**
 * Assembles the diagnostics artifact for a relationship run.
 *
 * Caller-supplied thresholds and policy are layered over the module
 * defaults, per-status counts are taken from the artifact lists, and
 * `generatedAt` falls back to the current time in ISO format when the caller
 * does not provide one. Warning arrays are copied rather than shared.
 */
export function buildKloRelationshipDiagnostics(
  input: BuildKloRelationshipDiagnosticsInput,
): KloRelationshipDiagnosticsArtifact {
  const { artifacts, profile } = input;

  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
  const policy = { ...DEFAULT_POLICY, ...input.policy };

  const summary: KloRelationshipDiagnosticsSummary = {
    accepted: artifacts.accepted.length,
    review: artifacts.review.length,
    rejected: artifacts.rejected.length,
    skipped: artifacts.skipped.length,
  };

  const generatedAt = input.generatedAt ?? new Date().toISOString();
  const reason = noAcceptedReason({ artifacts, profile });
  const countsBySource = candidateCountsBySource(artifacts);

  return {
    connectionId: input.connectionId,
    generatedAt,
    summary,
    noAcceptedReason: reason,
    candidateCountsBySource: countsBySource,
    validation: {
      // Both flags are populated from the same profile field.
      available: profile.sqlAvailable,
      sqlAvailable: profile.sqlAvailable,
      queryCount: profile.queryCount,
    },
    thresholds,
    policy,
    warnings: Array.from(input.warnings ?? []),
    profileWarnings: Array.from(profile.warnings),
  };
}
|
||||
699
packages/context/src/scan/relationship-discovery.test.ts
Normal file
699
packages/context/src/scan/relationship-discovery.test.ts
Normal file
|
|
@ -0,0 +1,699 @@
|
|||
import type { KloLlmProvider } from '@klo/llm';
|
||||
import Database from 'better-sqlite3';
|
||||
import { afterEach, describe, expect, it, vi } from 'vitest';
|
||||
import { buildDefaultKloProjectConfig } from '../project/config.js';
|
||||
import { snapshotToKloEnrichedSchema } from './local-enrichment.js';
|
||||
import {
|
||||
loadKloRelationshipBenchmarkFixture,
|
||||
maskKloRelationshipBenchmarkSnapshot,
|
||||
} from './relationship-benchmarks.js';
|
||||
import { discoverKloRelationships } from './relationship-discovery.js';
|
||||
import { createKloConnectorCapabilities } from './types.js';
|
||||
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanConnector, KloScanContext, KloSchemaSnapshot } from './types.js';
|
||||
|
||||
/**
 * Test double that runs read-only queries against an in-memory SQLite
 * database and counts how many queries were executed.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /**
   * Runs `input.sql` and returns the rows in tabular form. Headers are taken
   * from the keys of the first row, so an empty result has no headers.
   */
  executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;

    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    const headers = Object.keys(records[0] ?? {});

    const tabular: unknown[][] = [];
    for (const record of records) {
      tabular.push(headers.map((header) => record[header]));
    }

    const total = records.length;
    return Promise.resolve({
      headers,
      rows: tabular,
      totalRows: total,
      rowCount: total,
    });
  }

  /** Closes the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Baseline fixture: `accounts(id, name)` and `orders(id, account_id)` with no
 * declared primary keys or foreign keys. A fresh object graph is built on
 * every call.
 */
function snapshot(): KloSchemaSnapshot {
  type Table = KloSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const textColumn = (name: string): Column => ({
    name,
    nativeType: 'TEXT',
    normalizedType: 'text',
    dimensionType: 'string',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const table = (name: string, estimatedRows: number, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      table('accounts', 2, [integerColumn('id'), textColumn('name')]),
      table('orders', 3, [integerColumn('id'), integerColumn('account_id')]),
    ],
  };
}
|
||||
|
||||
/**
 * Variant of the baseline fixture with formal metadata: `accounts.id` is
 * flagged as a primary key and `orders.account_id` declares a foreign key to
 * `accounts.id`. Other tables are passed through unchanged.
 */
function declaredForeignKeySnapshot(): KloSchemaSnapshot {
  const base = snapshot();

  const tables = base.tables.map((table): KloSchemaSnapshot['tables'][number] => {
    if (table.name === 'accounts') {
      return {
        ...table,
        columns: table.columns.map((column) => (column.name === 'id' ? { ...column, primaryKey: true } : column)),
      };
    }
    if (table.name === 'orders') {
      return {
        ...table,
        foreignKeys: [
          {
            fromColumn: 'account_id',
            toCatalog: null,
            toDb: null,
            toTable: 'accounts',
            toColumn: 'id',
            constraintName: 'orders_account_id_fkey',
          },
        ],
      };
    }
    return table;
  });

  return { ...base, tables };
}
|
||||
|
||||
/**
 * Fixture with a text natural key: `dim_countries(iso_code, name)` and
 * `fct_accounts(id, country_code)`, with no declared keys or constraints.
 */
function naturalKeySnapshot(): KloSchemaSnapshot {
  type Table = KloSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  const textColumn = (name: string): Column => ({
    name,
    nativeType: 'TEXT',
    normalizedType: 'text',
    dimensionType: 'string',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const table = (name: string, estimatedRows: number, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      table('dim_countries', 3, [textColumn('iso_code'), textColumn('name')]),
      table('fct_accounts', 4, [integerColumn('id'), textColumn('country_code')]),
    ],
  };
}
|
||||
|
||||
/**
 * Builds a SQLite scan connector for tests. Passing `null` yields a connector
 * without read-only SQL or column statistics and with `executeReadOnly`
 * left undefined; table and column sampling are always disabled.
 */
function connector(executor: InMemorySqliteExecutor | null): KloScanConnector {
  const sqlEnabled = executor !== null;
  const capabilities = createKloConnectorCapabilities({
    readOnlySql: sqlEnabled,
    columnStats: sqlEnabled,
    tableSampling: false,
    columnSampling: false,
  });

  return {
    id: 'sqlite:test',
    driver: 'sqlite',
    capabilities,
    introspect: async () => snapshot(),
    executeReadOnly: executor ? executor.executeReadOnly.bind(executor) : undefined,
  };
}
|
||||
|
||||
/**
 * Creates a stub LLM provider whose methods are all vitest mocks. Model
 * lookups return one shared placeholder model and prompt caching is
 * reported as disabled.
 */
function llmProvider(): KloLlmProvider {
  const stubModel = { modelId: 'claude-sonnet-4-6', provider: 'anthropic' };

  // Returns a new config object on every invocation.
  const disabledPromptCaching = () =>
    ({
      enabled: false,
      systemTtl: '1h',
      toolsTtl: '1h',
      historyTtl: '5m',
      cacheSystem: true,
      cacheTools: true,
      cacheHistory: true,
      vertexFallbackTo5m: false,
    }) as ReturnType<KloLlmProvider['promptCachingConfig']>;

  const provider: KloLlmProvider = {
    getModel: vi.fn(() => stubModel as ReturnType<KloLlmProvider['getModel']>),
    getModelByName: vi.fn(() => stubModel as ReturnType<KloLlmProvider['getModelByName']>),
    cacheMarker: vi.fn(),
    repairToolCallHandler: vi.fn(),
    thinkingProviderOptions: vi.fn(() => ({})),
    telemetryConfig: vi.fn(() => undefined),
    promptCachingConfig: vi.fn(disabledPromptCaching),
    activeBackend: vi.fn(() => 'anthropic' as ReturnType<KloLlmProvider['activeBackend']>),
  };
  return provider;
}
|
||||
|
||||
/** Returns the relationship-scan settings from the default project config for the `warehouse` connection. */
function relationshipSettings() {
  const { scan } = buildDefaultKloProjectConfig('warehouse');
  return scan.relationships;
}
|
||||
|
||||
/**
 * Fixture where the referencing column name gives no lexical hint:
 * `customers(id)` and `orders(id, buyer_ref)`, all integer columns, with no
 * declared keys or constraints.
 */
function llmOnlyRelationshipSnapshot(): KloSchemaSnapshot {
  type Table = KloSchemaSnapshot['tables'][number];
  type Column = Table['columns'][number];

  const integerColumn = (name: string): Column => ({
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  });

  const table = (name: string, columns: Column[]): Table => ({
    catalog: null,
    db: null,
    name,
    kind: 'table',
    comment: null,
    estimatedRows: 2,
    foreignKeys: [],
    columns,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    extractedAt: '2026-05-07T00:00:00.000Z',
    scope: {},
    metadata: {},
    tables: [
      table('customers', [integerColumn('id')]),
      table('orders', [integerColumn('id'), integerColumn('buyer_ref')]),
    ],
  };
}
|
||||
|
||||
describe('production relationship discovery', () => {
  // Suite-level handle so the afterEach hook can release the in-memory
  // database even when a test fails part-way through.
  let executor: InMemorySqliteExecutor | null = null;

  afterEach(() => {
    executor?.close();
    executor = null;
  });

  it('accepts a validated relationship without declared PK or FK metadata', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
      INSERT INTO orders (id, account_id) VALUES (10, 1), (11, 1), (12, 2);
    `);

    const result = await discoverKloRelationships({
      connectionId: 'warehouse',
      driver: 'sqlite',
      connector: connector(executor),
      schema: snapshotToKloEnrichedSchema(snapshot()),
      context: { runId: 'relationship-run-1' },
      settings: relationshipSettings(),
    });

    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.statisticalValidation).toBe('completed');
    expect(result.profile.sqlAvailable).toBe(true);
    expect(result.profile.queryCount).toBeGreaterThan(0);
    expect(result.relationshipUpdate.accepted).toEqual([
      expect.objectContaining({
        from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
        to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
        relationshipType: 'many_to_one',
        source: 'inferred',
        isPrimaryKeyReference: true,
      }),
    ]);
    expect(result.resolvedRelationships[0]).toMatchObject({
      status: 'accepted',
      validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
      graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
    });
  });

  it('accepts a profile-driven natural-key relationship without declared metadata', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE dim_countries (iso_code TEXT NOT NULL, name TEXT NOT NULL);
      CREATE TABLE fct_accounts (id INTEGER NOT NULL, country_code TEXT NOT NULL);
      INSERT INTO dim_countries (iso_code, name) VALUES ('US', 'United States'), ('FR', 'France'), ('DE', 'Germany');
      INSERT INTO fct_accounts (id, country_code) VALUES (1, 'US'), (2, 'FR'), (3, 'US'), (4, 'DE');
    `);

    const schema = naturalKeySnapshot();
    const result = await discoverKloRelationships({
      connectionId: 'warehouse',
      driver: 'sqlite',
      connector: {
        ...connector(executor),
        introspect: async () => schema,
      },
      schema: snapshotToKloEnrichedSchema(schema),
      context: { runId: 'natural-key-relationship-run' },
      settings: relationshipSettings(),
    });

    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate.accepted).toEqual([
      expect.objectContaining({
        from: expect.objectContaining({ table: expect.objectContaining({ name: 'fct_accounts' }), columns: ['country_code'] }),
        to: expect.objectContaining({ table: expect.objectContaining({ name: 'dim_countries' }), columns: ['iso_code'] }),
        relationshipType: 'many_to_one',
        source: 'inferred',
        isPrimaryKeyReference: true,
      }),
    ]);
    expect(result.resolvedRelationships[0]).toMatchObject({
      source: 'profile_match',
      status: 'accepted',
      validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
      graph: expect.objectContaining({ reasons: expect.arrayContaining(['fk_score_passed']) }),
    });
  });

  it('accepts an embedding-driven relationship without declared metadata or LLM proposals', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE customers (id INTEGER NOT NULL, name TEXT NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, buyer_ref INTEGER NOT NULL);
      INSERT INTO customers (id, name) VALUES (1, 'Acme'), (2, 'Orbit'), (3, 'Globex');
      INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2), (12, 2), (13, 3);
    `);

    const sourceSnapshot = llmOnlyRelationshipSnapshot();
    const schema = snapshotToKloEnrichedSchema(
      sourceSnapshot,
      new Map([
        ['customers.id', [1, 0, 0]],
        ['customers.name', [0, 1, 0]],
        ['orders.id', [0, 0, 1]],
        ['orders.buyer_ref', [0.995, 0.005, 0]],
      ]),
    );

    const result = await discoverKloRelationships({
      connectionId: 'warehouse',
      driver: 'sqlite',
      connector: {
        ...connector(executor),
        introspect: async () => sourceSnapshot,
      },
      schema,
      context: { runId: 'embedding-relationship-run' },
      settings: {
        ...relationshipSettings(),
        llmProposals: false,
      },
    });

    expect(result.llmRelationshipValidation).toBe('skipped');
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate.accepted[0]).toMatchObject({
      from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
      to: { table: { name: 'customers' }, columns: ['id'] },
    });
    expect(result.resolvedRelationships[0]).toMatchObject({
      source: 'embedding_similarity',
      status: 'accepted',
      validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_passed']) }),
      evidence: expect.objectContaining({
        reasons: expect.arrayContaining(['embedding_similarity', 'target_key_like']),
        embeddingSimilarity: expect.any(Number),
      }),
    });
  });

  it('keeps candidates review-only when read-only SQL is unavailable', async () => {
    const result = await discoverKloRelationships({
      connectionId: 'warehouse',
      driver: 'sqlite',
      connector: connector(null),
      schema: snapshotToKloEnrichedSchema(snapshot()),
      context: { runId: 'relationship-run-no-sql' },
      settings: relationshipSettings(),
    });

    expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
    expect(result.statisticalValidation).toBe('skipped');
    expect(result.relationshipUpdate.accepted).toEqual([]);
    expect(result.resolvedRelationships[0]).toMatchObject({
      status: 'review',
      validation: expect.objectContaining({ reasons: expect.arrayContaining(['validation_unavailable']) }),
    });
    expect(result.warnings).toContainEqual({
      code: 'connector_capability_missing',
      message: 'KLO scan connector cannot run read-only SQL relationship validation',
      recoverable: true,
      metadata: { capability: 'readOnlySql' },
    });
  });

  it('accepts formal metadata relationships when read-only SQL is unavailable', async () => {
    const sourceSnapshot = declaredForeignKeySnapshot();
    const result = await discoverKloRelationships({
      connectionId: 'warehouse',
      driver: 'sqlite',
      connector: connector(null),
      schema: snapshotToKloEnrichedSchema(sourceSnapshot),
      context: { runId: 'formal-metadata-no-sql' },
      settings: relationshipSettings(),
    });

    expect(result.statisticalValidation).toBe('skipped');
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.resolvedRelationships).toEqual([]);
    expect(result.relationshipUpdate.accepted).toEqual([
      expect.objectContaining({
        id: 'orders:(orders.account_id)->accounts:(accounts.id)',
        source: 'formal',
        confidence: 1,
        from: expect.objectContaining({ table: expect.objectContaining({ name: 'orders' }), columns: ['account_id'] }),
        to: expect.objectContaining({ table: expect.objectContaining({ name: 'accounts' }), columns: ['id'] }),
      }),
    ]);
    expect(result.relationshipUpdate.rejected).toEqual([]);
    expect(result.relationshipUpdate.skipped).toEqual([]);
  });

  it('accepts LLM-only relationship proposals only after SQL validation and graph resolution pass', async () => {
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE customers (id INTEGER);
      CREATE TABLE orders (id INTEGER, buyer_ref INTEGER);
      INSERT INTO customers (id) VALUES (1), (2);
      INSERT INTO orders (id, buyer_ref) VALUES (10, 1), (11, 2);
    `);
    const generateText = vi.fn(async () => ({
      output: {
        pkCandidates: [{ table: 'customers', column: 'id', confidence: 0.91, rationale: 'Unique customer key.' }],
        fkCandidates: [
          {
            fromTable: 'orders',
            fromColumn: 'buyer_ref',
            toTable: 'customers',
            toColumn: 'id',
            confidence: 0.89,
            rationale: 'Buyer reference values align with customer identifiers.',
          },
        ],
      },
    }));

    const result = await discoverKloRelationships({
      connectionId: 'warehouse',
      driver: 'sqlite',
      connector: connector(executor),
      schema: snapshotToKloEnrichedSchema(llmOnlyRelationshipSnapshot()),
      context: { runId: 'llm-relationship-orchestrator' },
      settings: relationshipSettings(),
      llmProvider: llmProvider(),
      generateText,
    });

    expect(result.llmRelationshipValidation).toBe('completed');
    expect(result.relationships).toEqual({ accepted: 1, review: 0, rejected: 0, skipped: 0 });
    expect(result.resolvedRelationships[0]).toMatchObject({
      source: 'llm_proposal',
      status: 'accepted',
      evidence: {
        llmRationale: 'Buyer reference values align with customer identifiers.',
      },
    });
    expect(result.relationshipUpdate.accepted[0]).toMatchObject({
      from: { table: { name: 'orders' }, columns: ['buyer_ref'] },
      to: { table: { name: 'customers' }, columns: ['id'] },
    });
  });

  it('uses configured acceptance thresholds when resolving graph relationships', async () => {
    // Assign to the suite-level handle (no shadowing local) so afterEach
    // closes the database even if an assertion below throws.
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
      INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
    `);

    const settings = {
      ...buildDefaultKloProjectConfig('warehouse').scan.relationships,
      acceptThreshold: 0.99,
      reviewThreshold: 0.55,
    };

    const result = await discoverKloRelationships({
      connectionId: 'warehouse',
      driver: 'sqlite',
      connector: connector(executor),
      schema: snapshotToKloEnrichedSchema(snapshot()),
      context: { runId: 'configured-thresholds' },
      settings,
    });

    expect(result.relationships).toEqual({ accepted: 0, review: 1, rejected: 0, skipped: 0 });
    expect(result.relationshipUpdate.accepted).toEqual([]);
    expect(result.resolvedRelationships[0]).toMatchObject({
      status: 'review',
      graph: { reasons: expect.arrayContaining(['fk_score_review']) },
    });
  });

  it('passes maxCandidatesPerColumn into broad deterministic candidate generation', async () => {
    // Assign to the suite-level handle (no shadowing local) so afterEach
    // closes the database even if an assertion below throws.
    executor = new InMemorySqliteExecutor();
    executor.db.exec(`
      CREATE TABLE accounts (id INTEGER NOT NULL, name TEXT NOT NULL);
      CREATE TABLE account_archive (id INTEGER NOT NULL, name TEXT NOT NULL);
      CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
      INSERT INTO accounts VALUES (1, 'Acme'), (2, 'Orbit');
      INSERT INTO account_archive VALUES (99, 'Archive');
      INSERT INTO orders VALUES (10, 1), (11, 1), (12, 2);
    `);

    const richSnapshot = snapshot();
    richSnapshot.tables.splice(1, 0, {
      catalog: null,
      db: null,
      name: 'account_archive',
      kind: 'table',
      comment: null,
      estimatedRows: 1,
      foreignKeys: [],
      columns: [
        {
          name: 'id',
          nativeType: 'INTEGER',
          normalizedType: 'integer',
          dimensionType: 'number',
          nullable: false,
          primaryKey: false,
          comment: null,
        },
        {
          name: 'name',
          nativeType: 'TEXT',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
          primaryKey: false,
          comment: null,
        },
      ],
    });

    const result = await discoverKloRelationships({
      connectionId: 'warehouse',
      driver: 'sqlite',
      connector: {
        ...connector(executor),
        introspect: async () => richSnapshot,
      },
      schema: snapshotToKloEnrichedSchema(richSnapshot),
      context: { runId: 'candidate-cap' },
      settings: {
        ...buildDefaultKloProjectConfig('warehouse').scan.relationships,
        maxCandidatesPerColumn: 1,
      },
    });

    const sourceTargets = result.resolvedRelationships
      .filter((relationship) => relationship.from.columns[0] === 'account_id')
      .map((relationship) => `${relationship.to.table.name}.${relationship.to.columns[0]}`);
    expect(sourceTargets).toHaveLength(1);
    expect(sourceTargets).toEqual(['accounts.id']);
  });

  it('accepts SQL-validated composite relationships in production relationship-discovery detection', async () => {
    const fixtureRoot = new URL(
      '../../test/fixtures/relationship-benchmarks/composite_keys_no_declared_constraints',
      import.meta.url,
    );
    const fixture = await loadKloRelationshipBenchmarkFixture(fixtureRoot.pathname);
    const maskedSnapshot = maskKloRelationshipBenchmarkSnapshot(fixture.snapshot, 'declared_pks_and_declared_fks_removed');
    const database = new Database(fixture.dataPath ?? '', { readonly: true, fileMustExist: true });

    // The fixture database is not tracked by afterEach, so release it in a
    // finally block: a rejected discovery call or a failed assertion must not
    // leave the file handle open.
    try {
      const testConnector: KloScanConnector = {
        id: 'sqlite:composite',
        driver: 'sqlite',
        capabilities: createKloConnectorCapabilities({
          readOnlySql: true,
          columnStats: true,
          tableSampling: false,
          columnSampling: false,
        }),
        introspect: async () => maskedSnapshot,
        executeReadOnly: async (input) => {
          const rows = database.prepare(input.sql).all() as Record<string, unknown>[];
          const headers = Object.keys(rows[0] ?? {});
          return {
            headers,
            rows: rows.map((row) => headers.map((header) => row[header])),
            totalRows: rows.length,
            rowCount: rows.length,
          };
        },
      };

      const result = await discoverKloRelationships({
        connectionId: maskedSnapshot.connectionId,
        driver: maskedSnapshot.driver,
        connector: testConnector,
        schema: snapshotToKloEnrichedSchema(maskedSnapshot, new Map()),
        context: { runId: 'test:production-composite' },
        settings: relationshipSettings(),
      });

      expect(
        result.relationshipUpdate.accepted.map(
          (relationship) =>
            `${relationship.from.table.name}.(${relationship.from.columns.join(',')})->${relationship.to.table.name}.(${relationship.to.columns.join(',')})`,
        ),
      ).toContain('order_line_allocations.(order_id,line_number)->order_lines.(order_id,line_number)');
      expect(result.relationships.accepted).toBeGreaterThanOrEqual(1);
      expect(result.compositeRelationships.map((relationship) => relationship.status)).toContain('accepted');
    } finally {
      database.close();
    }
  });
});
|
||||
338
packages/context/src/scan/relationship-discovery.ts
Normal file
338
packages/context/src/scan/relationship-discovery.ts
Normal file
|
|
@ -0,0 +1,338 @@
|
|||
import type { KloLlmProvider } from '@klo/llm';
|
||||
import type { KloScanRelationshipConfig } from '../project/config.js';
|
||||
import type { KloEnrichedRelationship, KloEnrichedSchema, KloRelationshipUpdate } from './enrichment-types.js';
|
||||
import {
|
||||
generateKloRelationshipDiscoveryCandidates,
|
||||
type KloRelationshipDiscoveryCandidate,
|
||||
mergeKloRelationshipDiscoveryCandidates,
|
||||
} from './relationship-candidates.js';
|
||||
import {
|
||||
discoverKloCompositeRelationships,
|
||||
type KloCompositeRelationshipCandidate,
|
||||
} from './relationship-composite-candidates.js';
|
||||
import { collectKloFormalMetadataRelationships } from './relationship-formal-metadata.js';
|
||||
import {
|
||||
type KloResolvedRelationshipDiscoveryCandidate,
|
||||
resolveKloRelationshipGraph,
|
||||
} from './relationship-graph-resolver.js';
|
||||
import {
|
||||
type KloRelationshipLlmProposalGenerateText,
|
||||
proposeKloRelationshipCandidatesWithLlm,
|
||||
} from './relationship-llm-proposal.js';
|
||||
import {
|
||||
createKloRelationshipProfileCache,
|
||||
type KloRelationshipProfileArtifact,
|
||||
type KloRelationshipReadOnlyExecutor,
|
||||
profileKloRelationshipSchema,
|
||||
} from './relationship-profiling.js';
|
||||
import { validateKloRelationshipDiscoveryCandidates } from './relationship-validation.js';
|
||||
import type {
|
||||
KloConnectionDriver,
|
||||
KloScanConnector,
|
||||
KloScanContext,
|
||||
KloScanEnrichmentSummary,
|
||||
KloScanRelationshipSummary,
|
||||
KloScanWarning,
|
||||
} from './types.js';
|
||||
|
||||
/** Arguments accepted by relationship discovery. */
export interface DiscoverKloRelationshipsInput {
  /** Identifier of the connection being scanned. */
  connectionId: string;
  /** Driver of the connection being scanned. */
  driver: KloConnectionDriver;
  /** Scan connector for the connection. */
  connector: KloScanConnector;
  /** Enriched schema to discover relationships in. */
  schema: KloEnrichedSchema;
  /** Scan context for this run. */
  context: KloScanContext;
  /** Relationship-scan configuration. */
  settings: KloScanRelationshipConfig;
  /** Optional LLM provider; may be omitted or `null`. */
  llmProvider?: KloLlmProvider | null;
  /** Optional text-generation function for LLM relationship proposals. */
  generateText?: KloRelationshipLlmProposalGenerateText;
}
|
||||
|
||||
/** Everything produced by one relationship-discovery run. */
export interface DiscoverKloRelationshipsResult {
  /** Relationship update produced by the run. */
  relationshipUpdate: KloRelationshipUpdate;
  /** Relationship summary counts for the run. */
  relationships: KloScanRelationshipSummary;
  /** Relationship profile artifact for the run. */
  profile: KloRelationshipProfileArtifact;
  /** Resolved relationship candidates. */
  resolvedRelationships: KloResolvedRelationshipDiscoveryCandidate[];
  /** Composite relationship candidates. */
  compositeRelationships: KloCompositeRelationshipCandidate[];
  /** Status of the statistical validation stage. */
  statisticalValidation: KloScanEnrichmentSummary['statisticalValidation'];
  /** Status of the LLM relationship validation stage. */
  llmRelationshipValidation: KloScanEnrichmentSummary['llmRelationshipValidation'];
  /** Warnings collected during the run. */
  warnings: KloScanWarning[];
}
|
||||
|
||||
/**
 * Converts a resolved single-column candidate into an enriched relationship
 * marked as `inferred`. The FK score becomes the confidence, and the target
 * counts as a primary-key reference once its PK score is at least 0.78.
 */
function relationshipFromResolved(candidate: KloResolvedRelationshipDiscoveryCandidate): KloEnrichedRelationship {
  const { id, from, to, relationshipType, fkScore, pkScore } = candidate;
  const relationship: KloEnrichedRelationship = {
    id,
    source: 'inferred',
    from,
    to,
    relationshipType,
    confidence: fkScore,
    isPrimaryKeyReference: pkScore >= 0.78,
  };
  return relationship;
}
|
||||
|
||||
/**
 * Converts a composite (multi-column) candidate into an enriched relationship
 * marked as `inferred`. Only the table/column identity fields of each
 * endpoint are copied, and the relationship is flagged as a primary-key
 * reference exactly when the candidate's status is `accepted`.
 */
function relationshipFromComposite(candidate: KloCompositeRelationshipCandidate): KloEnrichedRelationship {
  const { id, relationshipType, confidence, status, from: fromSide, to: toSide } = candidate;

  const from = {
    tableId: fromSide.tableId,
    columnIds: fromSide.columnIds,
    table: fromSide.table,
    columns: fromSide.columns,
  };
  const to = {
    tableId: toSide.tableId,
    columnIds: toSide.columnIds,
    table: toSide.table,
    columns: toSide.columns,
  };

  return {
    id,
    source: 'inferred',
    from,
    to,
    relationshipType,
    confidence,
    isPrimaryKeyReference: status === 'accepted',
  };
}
|
||||
|
||||
/**
 * Builds a relationship identifier of the form
 * `<fromTableId>:(<fromColumnIds>)-><toTableId>:(<toColumnIds>)`, with column
 * ids joined by commas.
 */
function relationshipId(input: Pick<KloEnrichedRelationship, 'from' | 'to'>): string {
  const { from, to } = input;
  const source = `${from.tableId}:(${from.columnIds.join(',')})`;
  const target = `${to.tableId}:(${to.columnIds.join(',')})`;
  return [source, target].join('->');
}
|
||||
|
||||
/**
 * Converts accepted resolved candidates into enriched relationships, leaving
 * out any candidate whose id is already present in `formalIds`.
 */
function nonFormalAcceptedRelationships(input: {
  formalIds: ReadonlySet<string>;
  resolvedRelationships: readonly KloResolvedRelationshipDiscoveryCandidate[];
}): KloEnrichedRelationship[] {
  const { formalIds, resolvedRelationships } = input;
  const relationships: KloEnrichedRelationship[] = [];
  for (const candidate of resolvedRelationships) {
    if (candidate.status !== 'accepted' || formalIds.has(candidate.id)) {
      continue;
    }
    relationships.push(relationshipFromResolved(candidate));
  }
  return relationships;
}
|
||||
|
||||
/**
 * Tallies resolved candidates by status. Statuses other than `accepted`,
 * `review` and `rejected` are not counted, and `skipped` is always 0.
 */
function relationshipSummary(
  resolvedRelationships: readonly KloResolvedRelationshipDiscoveryCandidate[],
): KloScanRelationshipSummary {
  const summary: KloScanRelationshipSummary = { accepted: 0, review: 0, rejected: 0, skipped: 0 };
  for (const candidate of resolvedRelationships) {
    if (candidate.status === 'accepted') {
      summary.accepted += 1;
    } else if (candidate.status === 'review') {
      summary.review += 1;
    } else if (candidate.status === 'rejected') {
      summary.rejected += 1;
    }
  }
  return summary;
}
|
||||
|
||||
/**
 * Counts composite relationship candidates per status.
 * `skipped` is always reported as 0 here.
 */
function compositeSummary(relationships: readonly KloCompositeRelationshipCandidate[]): KloScanRelationshipSummary {
  const countWithStatus = (status: KloCompositeRelationshipCandidate['status']): number =>
    relationships.filter((candidate) => candidate.status === status).length;

  const accepted = countWithStatus('accepted');
  const review = countWithStatus('review');
  const rejected = countWithStatus('rejected');

  return { accepted, review, rejected, skipped: 0 };
}
|
||||
|
||||
/**
 * Runs composite relationship detection when a read-only executor exists and
 * the profile reports SQL as available; otherwise returns an empty list.
 *
 * Warnings reported by the detector, and any error it throws, are appended to
 * `input.warnings` as recoverable `relationship_validation_failed` entries.
 * A thrown error never propagates: the function returns an empty list instead.
 */
async function detectCompositeRelationships(input: {
  connectionId: string;
  driver: DiscoverKloRelationshipsInput['driver'];
  schema: KloEnrichedSchema;
  profile: KloRelationshipProfileArtifact;
  executor: KloRelationshipReadOnlyExecutor | null;
  context: DiscoverKloRelationshipsInput['context'];
  warnings: KloScanWarning[];
}): Promise<KloCompositeRelationshipCandidate[]> {
  const { executor, profile, warnings } = input;
  if (!executor || !profile.sqlAvailable) {
    return [];
  }

  // Every warning produced here has the same code, recoverability and source tag.
  const recordWarning = (message: string): void => {
    warnings.push({
      code: 'relationship_validation_failed',
      message,
      recoverable: true,
      metadata: { source: 'composite_relationship_detection' },
    });
  };

  try {
    const detection = await discoverKloCompositeRelationships({
      connectionId: input.connectionId,
      driver: input.driver,
      schema: input.schema,
      profiles: profile,
      executor,
      ctx: input.context,
    });
    for (const detectionWarning of detection.warnings) {
      recordWarning(detectionWarning);
    }
    return detection.relationships;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    recordWarning(`KLO composite relationship detection failed: ${reason}`);
    return [];
  }
}
|
||||
|
||||
/**
 * Merges formal-metadata counts with the resolved-graph counts: formal accepted
 * relationships are added to the graph's accepted count, and `skipped` comes
 * only from the formal metadata.
 */
function combinedRelationshipSummary(input: {
  formalAccepted: number;
  formalSkipped: number;
  resolvedRelationships: readonly KloResolvedRelationshipDiscoveryCandidate[];
}): KloScanRelationshipSummary {
  const { accepted, review, rejected } = relationshipSummary(input.resolvedRelationships);

  return {
    accepted: input.formalAccepted + accepted,
    review,
    rejected,
    skipped: input.formalSkipped,
  };
}
|
||||
|
||||
/**
 * Derives the read-only SQL executor from the connector.
 *
 * Returns a null executor plus one recoverable warning when the connector
 * either lacks the `readOnlySql` capability or advertises it without exposing
 * `executeReadOnly`. Otherwise returns an executor bound to the connector and
 * an empty warning list.
 */
function sqlExecutor(input: DiscoverKloRelationshipsInput): {
  executor: KloRelationshipReadOnlyExecutor | null;
  warnings: KloScanWarning[];
} {
  const { connector } = input;

  const unavailable = (
    code: KloScanWarning['code'],
    message: string,
  ): { executor: KloRelationshipReadOnlyExecutor | null; warnings: KloScanWarning[] } => ({
    executor: null,
    warnings: [
      {
        code,
        message,
        recoverable: true,
        metadata: { capability: 'readOnlySql' },
      },
    ],
  });

  if (!connector.capabilities.readOnlySql) {
    return unavailable(
      'connector_capability_missing',
      'KLO scan connector cannot run read-only SQL relationship validation',
    );
  }

  if (!connector.executeReadOnly) {
    return unavailable(
      'relationship_validation_failed',
      'KLO scan connector advertises readOnlySql but does not expose executeReadOnly',
    );
  }

  // Bind so the method keeps the connector as `this` when called through the executor.
  const executeReadOnly = connector.executeReadOnly.bind(connector);
  return { executor: { executeReadOnly }, warnings: [] };
}
|
||||
|
||||
/**
 * Orchestrates relationship discovery for one connection.
 *
 * Steps, in order: resolve the SQL executor, collect formal metadata
 * relationships, profile the schema, generate deterministic candidates,
 * optionally add LLM-proposed candidates, validate the merged candidates,
 * resolve the relationship graph, and detect composite relationships.
 *
 * @returns The accepted/rejected/skipped relationship update, aggregate counts
 *   (resolved graph plus composite candidates), the profile, the resolved and
 *   composite relationships, validation status flags, and collected warnings.
 */
export async function discoverKloRelationships(
  input: DiscoverKloRelationshipsInput,
): Promise<DiscoverKloRelationshipsResult> {
  const { connectionId, driver, schema, settings, context } = input;
  const { executor, warnings } = sqlExecutor(input);
  const formal = collectKloFormalMetadataRelationships(schema);

  const profile = await profileKloRelationshipSchema({
    connectionId,
    driver,
    schema,
    executor,
    ctx: context,
    profileSampleRows: settings.profileSampleRows,
    cache: createKloRelationshipProfileCache(),
  });

  const deterministicCandidates: KloRelationshipDiscoveryCandidate[] = generateKloRelationshipDiscoveryCandidates(
    schema,
    {
      maxCandidatesPerColumn: settings.maxCandidatesPerColumn,
      profiles: profile,
    },
  );

  const llmProposals = settings.llmProposals
    ? await proposeKloRelationshipCandidatesWithLlm({
        connectionId,
        schema,
        profile,
        llmProvider: input.llmProvider ?? null,
        settings: {
          maxTablesPerBatch: settings.maxLlmTablesPerBatch,
        },
        generateText: input.generateText,
      })
    : { candidates: [], warnings: [], llmCalls: 0, summary: 'skipped' as const };

  // Candidates already covered by formal metadata are not re-validated.
  const mergedCandidates = mergeKloRelationshipDiscoveryCandidates([
    ...deterministicCandidates,
    ...llmProposals.candidates,
  ]);
  const candidates = mergedCandidates.filter((candidate) => !formal.acceptedIds.has(candidate.id));
  warnings.push(...llmProposals.warnings);

  const validated = await validateKloRelationshipDiscoveryCandidates({
    connectionId,
    driver,
    candidates,
    profiles: profile,
    executor,
    ctx: context,
    tableCount: schema.tables.length,
    settings: {
      acceptThreshold: settings.acceptThreshold,
      reviewThreshold: settings.reviewThreshold,
      maxDistinctSourceValues: settings.profileSampleRows,
      concurrency: settings.validationConcurrency,
      validationBudget: settings.validationBudget,
    },
  });

  const graph = resolveKloRelationshipGraph({
    schema,
    profiles: profile,
    candidates: validated,
    settings: {
      acceptThreshold: settings.acceptThreshold,
      reviewThreshold: settings.reviewThreshold,
      validationRequiredForManifest: settings.validationRequiredForManifest,
    },
  });

  const compositeRelationships = await detectCompositeRelationships({
    connectionId,
    driver,
    schema,
    profile,
    executor,
    context,
    warnings,
  });

  const inferredAccepted = nonFormalAcceptedRelationships({
    formalIds: formal.acceptedIds,
    resolvedRelationships: graph.relationships,
  });
  const compositeAccepted = compositeRelationships
    .filter((candidate) => candidate.status === 'accepted')
    .map(relationshipFromComposite);

  // De-duplicate by id (a later entry replaces an earlier one), then order by the endpoint key.
  const acceptedPool = [...formal.accepted, ...inferredAccepted, ...compositeAccepted];
  const acceptedById = new Map(acceptedPool.map((relationship) => [relationship.id, relationship]));
  const accepted = [...acceptedById.values()].sort((left, right) =>
    relationshipId(left).localeCompare(relationshipId(right)),
  );

  const rejected = graph.relationships
    .filter((candidate) => candidate.status === 'rejected')
    .map(relationshipFromResolved);

  const graphCounts = combinedRelationshipSummary({
    formalAccepted: formal.accepted.length,
    formalSkipped: formal.skipped.length,
    resolvedRelationships: graph.relationships,
  });
  const compositeCounts = compositeSummary(compositeRelationships);

  return {
    relationshipUpdate: {
      connectionId,
      accepted,
      rejected,
      skipped: formal.skipped,
    },
    relationships: {
      accepted: graphCounts.accepted + compositeCounts.accepted,
      review: graphCounts.review + compositeCounts.review,
      rejected: graphCounts.rejected + compositeCounts.rejected,
      skipped: graphCounts.skipped,
    },
    profile,
    resolvedRelationships: graph.relationships,
    compositeRelationships,
    statisticalValidation: profile.sqlAvailable ? 'completed' : 'skipped',
    llmRelationshipValidation: llmProposals.summary,
    warnings,
  };
}
|
||||
|
|
@ -0,0 +1,211 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import {
|
||||
buildKloRelationshipFeedbackCalibrationReport,
|
||||
calibrateLocalRelationshipFeedbackLabels,
|
||||
formatKloRelationshipFeedbackCalibrationMarkdown,
|
||||
} from './relationship-feedback-calibration.js';
|
||||
import type {
|
||||
ExportLocalRelationshipFeedbackLabelsResult,
|
||||
KloRelationshipFeedbackLabel,
|
||||
} from './relationship-feedback-export.js';
|
||||
|
||||
/**
 * Test fixture: builds a full feedback label from a partial one.
 *
 * `confidence` defaults to the score (or 0 when the score is null) and
 * `fkScore` defaults to the score; any field present in `input` overrides the
 * defaults because `input` is spread last.
 */
function label(
  input: Partial<KloRelationshipFeedbackLabel> &
    Pick<KloRelationshipFeedbackLabel, 'candidateId' | 'decision' | 'score'>,
): KloRelationshipFeedbackLabel {
  const { score } = input;
  const defaultConfidence = score ?? 0;
  const defaultPkScore = input.pkScore ?? null;
  const defaultFkScore = input.fkScore ?? score;

  return {
    schemaVersion: 1,
    previousStatus: 'review',
    connectionId: 'warehouse',
    runId: 'scan-run-a',
    syncId: 'sync-a',
    decidedAt: '2026-05-07T12:00:00.000Z',
    reviewer: 'Andrey',
    note: null,
    relationshipType: 'many_to_one',
    source: 'deterministic_name',
    confidence: defaultConfidence,
    pkScore: defaultPkScore,
    fkScore: defaultFkScore,
    fromTable: 'public.orders',
    fromColumns: ['customer_id'],
    toTable: 'public.customers',
    toColumns: ['id'],
    reasons: [],
    artifactPath: 'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json',
    ...input,
  };
}
|
||||
|
||||
/**
 * Test fixture: wraps labels in an exporter result with a fixed timestamp,
 * no filters, no warnings, and a summary computed from the labels.
 */
function feedback(labels: KloRelationshipFeedbackLabel[]): ExportLocalRelationshipFeedbackLabelsResult {
  const countDecision = (decision: KloRelationshipFeedbackLabel['decision']): number =>
    labels.filter((item) => item.decision === decision).length;
  const connectionIds = new Set(labels.map((item) => item.connectionId));
  const runKeys = new Set(labels.map((item) => `${item.connectionId}:${item.runId}`));

  return {
    generatedAt: '2026-05-07T13:00:00.000Z',
    filters: { connectionId: null, decision: 'all' },
    summary: {
      total: labels.length,
      accepted: countDecision('accepted'),
      rejected: countDecision('rejected'),
      connections: connectionIds.size,
      runs: runKeys.size,
    },
    labels,
    warnings: [],
  };
}
|
||||
|
||||
describe('relationship feedback calibration', () => {
  // Thresholds shared by the report-building tests below.
  const defaultThresholds = { acceptThreshold: 0.85, reviewThreshold: 0.55 };

  it('builds score buckets and threshold-band summary from feedback labels', () => {
    const labels = [
      label({
        candidateId: 'orders:orders.customer_id->customers:customers.id',
        decision: 'accepted',
        score: 0.91,
        pkScore: 0.97,
        fkScore: 0.91,
      }),
      label({
        candidateId: 'orders:orders.account_id->accounts:accounts.id',
        decision: 'accepted',
        score: 0.61,
        pkScore: 0.88,
        fkScore: 0.61,
      }),
      label({
        candidateId: 'orders:orders.note_id->notes:notes.id',
        decision: 'rejected',
        score: 0.21,
        pkScore: 0.4,
        fkScore: 0.21,
      }),
      label({
        candidateId: 'orders:orders.region_id->regions:regions.id',
        decision: 'rejected',
        score: 0.88,
        pkScore: 0.9,
        fkScore: 0.88,
      }),
    ];

    const report = buildKloRelationshipFeedbackCalibrationReport(feedback(labels), defaultThresholds);

    expect(report.thresholds).toEqual({ accept: 0.85, review: 0.55 });
    expect(report.summary).toEqual({
      total: 4,
      scored: 4,
      unscored: 0,
      acceptedLabels: 2,
      rejectedLabels: 2,
      predictedAccepted: 2,
      predictedReview: 1,
      predictedRejected: 1,
      acceptedBandPrecision: 0.5,
      rejectedBandPrecision: 1,
      reviewBandAcceptedRate: 1,
      meanAcceptedScore: 0.76,
      meanRejectedScore: 0.545,
    });

    const bucketRows = report.buckets.map((bucket) => [
      bucket.label,
      bucket.total,
      bucket.accepted,
      bucket.rejected,
      bucket.acceptanceRate,
    ]);
    expect(bucketRows).toEqual([
      ['0.00-0.24', 1, 0, 1, 0],
      ['0.25-0.49', 0, 0, 0, null],
      ['0.50-0.74', 1, 1, 0, 1],
      ['0.75-1.00', 2, 1, 1, 0.5],
    ]);

    // Labels come back sorted by connection, run, then candidate id.
    const labelRows = report.labels.map((item) => [item.candidateId, item.predictedStatus, item.bucket]);
    expect(labelRows).toEqual([
      ['orders:orders.account_id->accounts:accounts.id', 'review', '0.50-0.74'],
      ['orders:orders.customer_id->customers:customers.id', 'accepted', '0.75-1.00'],
      ['orders:orders.note_id->notes:notes.id', 'rejected', '0.00-0.24'],
      ['orders:orders.region_id->regions:regions.id', 'accepted', '0.75-1.00'],
    ]);
  });

  it('keeps unscored labels visible without treating them as threshold predictions', () => {
    const unscoredLabel = label({
      candidateId: 'orders:orders.note_id->notes:notes.id',
      decision: 'rejected',
      score: null,
      confidence: 0.2,
      fkScore: null,
    });

    const report = buildKloRelationshipFeedbackCalibrationReport(feedback([unscoredLabel]), defaultThresholds);

    expect(report.summary).toMatchObject({
      total: 1,
      scored: 0,
      unscored: 1,
      predictedAccepted: 0,
      predictedReview: 0,
      predictedRejected: 0,
      acceptedBandPrecision: null,
      rejectedBandPrecision: null,
      reviewBandAcceptedRate: null,
      meanAcceptedScore: null,
      meanRejectedScore: null,
    });
    expect(report.labels[0]).toMatchObject({
      candidateId: 'orders:orders.note_id->notes:notes.id',
      predictedStatus: 'unscored',
      bucket: 'unscored',
    });
  });

  it('formats a stable markdown summary for human CLI output', () => {
    const report = buildKloRelationshipFeedbackCalibrationReport(
      feedback([
        label({ candidateId: 'orders:orders.customer_id->customers:customers.id', decision: 'accepted', score: 0.91 }),
        label({ candidateId: 'orders:orders.note_id->notes:notes.id', decision: 'rejected', score: 0.21 }),
      ]),
      defaultThresholds,
    );

    const expectedFragments = [
      'KLO relationship feedback calibration',
      'Total labels: 2',
      'Accepted-band precision: 1.000',
      '0.75-1.00: total=1 accepted=1 rejected=0 acceptanceRate=1.000',
    ];
    for (const fragment of expectedFragments) {
      expect(formatKloRelationshipFeedbackCalibrationMarkdown(report)).toContain(fragment);
    }
  });

  it('wraps the feedback exporter and preserves exporter warnings', async () => {
    const project = { projectDir: '/tmp/klo-project' } as KloLocalProject;
    const exporterWarning = {
      path: 'raw-sources/broken/live-database/sync/enrichment/relationship-review-decisions.json',
      message: 'Unexpected token',
    };
    const exportLocalRelationshipFeedbackLabels = vi.fn(async () => ({
      ...feedback([
        label({ candidateId: 'orders:orders.customer_id->customers:customers.id', decision: 'accepted', score: 0.91 }),
      ]),
      warnings: [{ ...exporterWarning }],
    }));

    const report = await calibrateLocalRelationshipFeedbackLabels(project, {
      connectionId: 'warehouse',
      decision: 'all',
      acceptThreshold: 0.9,
      reviewThreshold: 0.5,
      exportLocalRelationshipFeedbackLabels,
    });

    // Only the exporter filters are forwarded; thresholds stay with the calibration step.
    expect(exportLocalRelationshipFeedbackLabels).toHaveBeenCalledWith(project, {
      connectionId: 'warehouse',
      decision: 'all',
    });
    expect(report.thresholds).toEqual({ accept: 0.9, review: 0.5 });
    expect(report.warnings).toEqual([{ ...exporterWarning }]);
  });
});
|
||||
300
packages/context/src/scan/relationship-feedback-calibration.ts
Normal file
300
packages/context/src/scan/relationship-feedback-calibration.ts
Normal file
|
|
@ -0,0 +1,300 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import {
|
||||
exportLocalRelationshipFeedbackLabels,
|
||||
type ExportLocalRelationshipFeedbackLabelsInput,
|
||||
type ExportLocalRelationshipFeedbackLabelsResult,
|
||||
type KloRelationshipFeedbackExportWarning,
|
||||
type KloRelationshipFeedbackLabel,
|
||||
} from './relationship-feedback-export.js';
|
||||
import type { KloResolvedRelationshipStatus } from './relationship-graph-resolver.js';
|
||||
import type { KloRelationshipReviewDecisionValue } from './relationship-review-decisions.js';
|
||||
|
||||
const DEFAULT_ACCEPT_THRESHOLD = 0.85;
|
||||
const DEFAULT_REVIEW_THRESHOLD = 0.55;
|
||||
|
||||
type CalibrationPredictedStatus = KloResolvedRelationshipStatus | 'unscored';
|
||||
|
||||
/** Score cut-offs used to turn a label's score into a predicted status. */
interface Thresholds {
  /** Scores at or above this value are predicted as `accepted`. */
  accept: number;
  /** Scores at or above this value (and below `accept`) are predicted as `review`. */
  review: number;
}
|
||||
|
||||
/** Optional threshold overrides for building a calibration report. */
export interface BuildKloRelationshipFeedbackCalibrationReportInput {
  /** Accept cut-off; falls back to the module default when omitted. */
  acceptThreshold?: number;
  /** Review cut-off; falls back to the module default when omitted. */
  reviewThreshold?: number;
}
|
||||
|
||||
/**
 * Input for calibrating labels exported from a local project: the exporter
 * input plus threshold overrides.
 */
export interface CalibrateLocalRelationshipFeedbackLabelsInput
  extends ExportLocalRelationshipFeedbackLabelsInput,
    BuildKloRelationshipFeedbackCalibrationReportInput {
  /** Replacement exporter (e.g. a test double); the real exporter is used when omitted. */
  exportLocalRelationshipFeedbackLabels?: typeof exportLocalRelationshipFeedbackLabels;
}
|
||||
|
||||
/** Aggregated reviewer decisions for one score range. */
export interface KloRelationshipFeedbackCalibrationBucket {
  /** Range label such as `0.50-0.74`. */
  label: string;
  /** Lower bound of the range (inclusive). */
  minInclusive: number;
  /** Upper bound of the range as reported to consumers. */
  maxInclusive: number;
  /** Number of labels whose score fell into this range. */
  total: number;
  /** Labels in this range with decision `accepted`. */
  accepted: number;
  /** Labels in this range with decision `rejected`. */
  rejected: number;
  /** `accepted / total`, or null when the range holds no labels. */
  acceptanceRate: number | null;
}
|
||||
|
||||
/** One feedback label enriched with its predicted status and score bucket. */
export interface KloRelationshipFeedbackCalibrationLabel {
  candidateId: string;
  /** The reviewer's decision carried over from the feedback label. */
  decision: KloRelationshipReviewDecisionValue;
  previousStatus: KloRelationshipFeedbackLabel['previousStatus'];
  /** Status implied by the score under the report's thresholds; `unscored` when the score is null. */
  predictedStatus: CalibrationPredictedStatus;
  /** Label of the score bucket the score falls into, or `unscored`. */
  bucket: string;
  score: number | null;
  pkScore: number | null;
  fkScore: number | null;
  connectionId: string;
  runId: string;
  fromTable: string;
  fromColumns: string[];
  toTable: string;
  toColumns: string[];
  source: string;
  reasons: string[];
}
|
||||
|
||||
/** Calibration report comparing reviewer decisions with threshold-based predictions. */
export interface KloRelationshipFeedbackCalibrationReport {
  /** Timestamp copied from the exported feedback. */
  generatedAt: string;
  /** Filters copied from the exported feedback. */
  filters: ExportLocalRelationshipFeedbackLabelsResult['filters'];
  /** Thresholds the predictions were computed with. */
  thresholds: Thresholds;
  summary: {
    /** All labels, scored or not. */
    total: number;
    /** Labels with a non-null score. */
    scored: number;
    /** Labels with a null score. */
    unscored: number;
    /** Labels the reviewer accepted. */
    acceptedLabels: number;
    /** Labels the reviewer rejected. */
    rejectedLabels: number;
    /** Scored labels predicted `accepted`. */
    predictedAccepted: number;
    /** Scored labels predicted `review`. */
    predictedReview: number;
    /** Scored labels predicted `rejected`. */
    predictedRejected: number;
    /** Share of predicted-accepted labels the reviewer accepted; null when there are none. */
    acceptedBandPrecision: number | null;
    /** Share of predicted-rejected labels the reviewer rejected; null when there are none. */
    rejectedBandPrecision: number | null;
    /** Share of predicted-review labels the reviewer accepted; null when there are none. */
    reviewBandAcceptedRate: number | null;
    /** Mean score of accepted labels that have a score; null when there are none. */
    meanAcceptedScore: number | null;
    /** Mean score of rejected labels that have a score; null when there are none. */
    meanRejectedScore: number | null;
  };
  buckets: KloRelationshipFeedbackCalibrationBucket[];
  labels: KloRelationshipFeedbackCalibrationLabel[];
  /** Warnings copied from the exported feedback. */
  warnings: KloRelationshipFeedbackExportWarning[];
}
|
||||
|
||||
const BUCKETS = [
|
||||
{ label: '0.00-0.24', minInclusive: 0, maxInclusive: 0.249999 },
|
||||
{ label: '0.25-0.49', minInclusive: 0.25, maxInclusive: 0.499999 },
|
||||
{ label: '0.50-0.74', minInclusive: 0.5, maxInclusive: 0.749999 },
|
||||
{ label: '0.75-1.00', minInclusive: 0.75, maxInclusive: 1 },
|
||||
] as const;
|
||||
|
||||
/** Resolves the effective thresholds, using the module defaults for any that are missing. */
function thresholds(input: BuildKloRelationshipFeedbackCalibrationReportInput): Thresholds {
  const accept = input.acceptThreshold ?? DEFAULT_ACCEPT_THRESHOLD;
  const review = input.reviewThreshold ?? DEFAULT_REVIEW_THRESHOLD;
  return { accept, review };
}
|
||||
|
||||
/** Rounds a metric to three decimal places. */
function roundMetric(value: number): number {
  const scale = 1000;
  const scaled = Math.round(value * scale);
  return scaled / scale;
}
|
||||
|
||||
/** Returns `numerator / denominator` rounded as a metric, or null when the denominator is 0. */
function ratio(numerator: number, denominator: number): number | null {
  if (denominator === 0) {
    return null;
  }
  return roundMetric(numerator / denominator);
}
|
||||
|
||||
/** Returns the arithmetic mean rounded as a metric, or null for an empty list. */
function mean(values: readonly number[]): number | null {
  const count = values.length;
  if (count === 0) {
    return null;
  }

  let total = 0;
  for (const value of values) {
    total += value;
  }
  return roundMetric(total / count);
}
|
||||
|
||||
/**
 * Maps a score to the label of the bucket it belongs to.
 *
 * The bucket table's `maxInclusive` bounds (0.249999, 0.499999, ...) leave
 * small gaps before the next bucket's lower bound. A score inside such a gap
 * (for example 0.2499995) used to be reported as `unscored` even though it is
 * a valid score. The bucket is therefore chosen by lower bound only: a score
 * belongs to the last bucket whose `minInclusive` it reaches.
 *
 * @param score The label score, or null when the label has no score.
 * @returns The bucket label, or `unscored` when the score is null, NaN, below
 *   the lowest bucket, or above the highest bucket's upper bound.
 */
function scoreBucket(score: number | null): string {
  if (score === null) {
    return 'unscored';
  }

  // Relies on BUCKETS being listed in ascending order of `minInclusive`.
  let matchedLabel = 'unscored';
  let overallMax = Number.NEGATIVE_INFINITY;
  for (const bucket of BUCKETS) {
    if (score >= bucket.minInclusive) {
      matchedLabel = bucket.label;
    }
    overallMax = Math.max(overallMax, bucket.maxInclusive);
  }

  // NaN fails both comparisons, so it also ends up as 'unscored'.
  return score <= overallMax ? matchedLabel : 'unscored';
}
|
||||
|
||||
/**
 * Derives the status a score implies under the given thresholds: `accepted` at
 * or above the accept threshold, `review` at or above the review threshold,
 * otherwise `rejected`. A null score yields `unscored`.
 */
function predictedStatus(score: number | null, currentThresholds: Thresholds): CalibrationPredictedStatus {
  if (score === null) {
    return 'unscored';
  }

  const { accept, review } = currentThresholds;
  return score >= accept ? 'accepted' : score >= review ? 'review' : 'rejected';
}
|
||||
|
||||
/**
 * Projects a feedback label onto the calibration label shape, adding the
 * predicted status and score bucket. Array fields are shallow-copied so the
 * result does not share them with the source label.
 */
function calibrationLabel(
  label: KloRelationshipFeedbackLabel,
  currentThresholds: Thresholds,
): KloRelationshipFeedbackCalibrationLabel {
  const { score } = label;
  const prediction = predictedStatus(score, currentThresholds);
  const bucket = scoreBucket(score);

  return {
    candidateId: label.candidateId,
    decision: label.decision,
    previousStatus: label.previousStatus,
    predictedStatus: prediction,
    bucket,
    score,
    pkScore: label.pkScore,
    fkScore: label.fkScore,
    connectionId: label.connectionId,
    runId: label.runId,
    fromTable: label.fromTable,
    fromColumns: label.fromColumns.slice(),
    toTable: label.toTable,
    toColumns: label.toColumns.slice(),
    source: label.source,
    reasons: label.reasons.slice(),
  };
}
|
||||
|
||||
/**
 * Computes the report summary. Prediction counts and band rates use only
 * labels with a score; decision counts use all labels; mean scores use the
 * scored labels of each decision.
 */
function summarize(
  labels: readonly KloRelationshipFeedbackCalibrationLabel[],
): KloRelationshipFeedbackCalibrationReport['summary'] {
  type Label = KloRelationshipFeedbackCalibrationLabel;

  const scored = labels.filter((label) => label.score !== null);
  const withPrediction = (status: Label['predictedStatus']): Label[] =>
    scored.filter((label) => label.predictedStatus === status);
  const withDecision = (items: readonly Label[], decision: Label['decision']): Label[] =>
    items.filter((label) => label.decision === decision);
  const scoresOf = (items: readonly Label[]): number[] =>
    items.map((label) => label.score).filter((score): score is number => score !== null);

  const predictedAccepted = withPrediction('accepted');
  const predictedReview = withPrediction('review');
  const predictedRejected = withPrediction('rejected');
  const acceptedLabels = withDecision(labels, 'accepted');
  const rejectedLabels = withDecision(labels, 'rejected');

  return {
    total: labels.length,
    scored: scored.length,
    unscored: labels.length - scored.length,
    acceptedLabels: acceptedLabels.length,
    rejectedLabels: rejectedLabels.length,
    predictedAccepted: predictedAccepted.length,
    predictedReview: predictedReview.length,
    predictedRejected: predictedRejected.length,
    acceptedBandPrecision: ratio(withDecision(predictedAccepted, 'accepted').length, predictedAccepted.length),
    rejectedBandPrecision: ratio(withDecision(predictedRejected, 'rejected').length, predictedRejected.length),
    reviewBandAcceptedRate: ratio(withDecision(predictedReview, 'accepted').length, predictedReview.length),
    meanAcceptedScore: mean(scoresOf(acceptedLabels)),
    meanRejectedScore: mean(scoresOf(rejectedLabels)),
  };
}
|
||||
|
||||
/**
 * Aggregates labels per score bucket, in bucket-table order. The reported
 * `maxInclusive` is the two-decimal bound (0.24, 0.49, 0.74, 1) rather than
 * the internal matching bound.
 */
function buildBuckets(
  labels: readonly KloRelationshipFeedbackCalibrationLabel[],
): KloRelationshipFeedbackCalibrationBucket[] {
  // Translate the internal matching bound into the bound shown to consumers.
  const reportedMax = (internalMax: number): number => {
    switch (internalMax) {
      case 0.249999:
        return 0.24;
      case 0.499999:
        return 0.49;
      case 0.749999:
        return 0.74;
      default:
        return 1;
    }
  };

  return BUCKETS.map((bucket) => {
    const members = labels.filter((label) => label.bucket === bucket.label);
    let accepted = 0;
    let rejected = 0;
    for (const member of members) {
      if (member.decision === 'accepted') {
        accepted += 1;
      }
      if (member.decision === 'rejected') {
        rejected += 1;
      }
    }

    return {
      label: bucket.label,
      minInclusive: bucket.minInclusive,
      maxInclusive: reportedMax(bucket.maxInclusive),
      total: members.length,
      accepted,
      rejected,
      acceptanceRate: ratio(accepted, members.length),
    };
  });
}
|
||||
|
||||
/**
 * Builds a calibration report from exported feedback labels.
 *
 * @param feedback Exporter result supplying the labels, filters, timestamp and warnings.
 * @param input Optional threshold overrides; module defaults apply otherwise.
 * @returns The report, with labels sorted by connection id, run id, then candidate id.
 */
export function buildKloRelationshipFeedbackCalibrationReport(
  feedback: ExportLocalRelationshipFeedbackLabelsResult,
  input: BuildKloRelationshipFeedbackCalibrationReportInput = {},
): KloRelationshipFeedbackCalibrationReport {
  const currentThresholds = thresholds(input);

  const byConnectionRunCandidate = (
    left: KloRelationshipFeedbackCalibrationLabel,
    right: KloRelationshipFeedbackCalibrationLabel,
  ): number => {
    const byConnection = left.connectionId.localeCompare(right.connectionId);
    if (byConnection) {
      return byConnection;
    }
    const byRun = left.runId.localeCompare(right.runId);
    if (byRun) {
      return byRun;
    }
    return left.candidateId.localeCompare(right.candidateId);
  };

  const labels = feedback.labels
    .map((label) => calibrationLabel(label, currentThresholds))
    .sort(byConnectionRunCandidate);
  const summary = summarize(labels);
  const buckets = buildBuckets(labels);

  return {
    generatedAt: feedback.generatedAt,
    filters: feedback.filters,
    thresholds: currentThresholds,
    summary,
    buckets,
    labels,
    warnings: feedback.warnings.slice(),
  };
}
|
||||
|
||||
/**
 * Exports feedback labels for a local project and builds a calibration report
 * from them.
 *
 * Only `connectionId` and `decision` are forwarded to the exporter; the whole
 * `input` is then passed on as the threshold overrides for the report.
 *
 * @param project The local project to export labels from.
 * @param input Exporter filters, threshold overrides, and an optional replacement exporter.
 */
export async function calibrateLocalRelationshipFeedbackLabels(
  project: KloLocalProject,
  input: CalibrateLocalRelationshipFeedbackLabelsInput = {},
): Promise<KloRelationshipFeedbackCalibrationReport> {
  const runExport = input.exportLocalRelationshipFeedbackLabels ?? exportLocalRelationshipFeedbackLabels;
  const { connectionId, decision } = input;
  const exported = await runExport(project, { connectionId, decision });
  return buildKloRelationshipFeedbackCalibrationReport(exported, input);
}
|
||||
|
||||
/** Renders a metric with three decimals, or `n/a` when it is null. */
function formatMetric(value: number | null): string {
  if (value === null) {
    return 'n/a';
  }
  return value.toFixed(3);
}
|
||||
|
||||
/**
 * Renders the report as line-oriented text: a header block, one line per score
 * bucket, and, when the report has warnings, a `Warnings` section listing at
 * most the first five. The output ends with a trailing newline.
 */
export function formatKloRelationshipFeedbackCalibrationMarkdown(
  report: KloRelationshipFeedbackCalibrationReport,
): string {
  const { summary, thresholds: cutoffs, filters } = report;

  const headerLines = [
    'KLO relationship feedback calibration',
    `Generated: ${report.generatedAt}`,
    `Filter connection: ${filters.connectionId ?? 'all'}`,
    `Filter decision: ${filters.decision}`,
    `Thresholds: accept=${cutoffs.accept.toFixed(2)} review=${cutoffs.review.toFixed(2)}`,
    `Total labels: ${summary.total}`,
    `Scored labels: ${summary.scored}`,
    `Unscored labels: ${summary.unscored}`,
    `Accepted labels: ${summary.acceptedLabels}`,
    `Rejected labels: ${summary.rejectedLabels}`,
    `Predicted accepted: ${summary.predictedAccepted}`,
    `Predicted review: ${summary.predictedReview}`,
    `Predicted rejected: ${summary.predictedRejected}`,
    `Accepted-band precision: ${formatMetric(summary.acceptedBandPrecision)}`,
    `Rejected-band precision: ${formatMetric(summary.rejectedBandPrecision)}`,
    `Review-band accepted rate: ${formatMetric(summary.reviewBandAcceptedRate)}`,
    `Mean accepted score: ${formatMetric(summary.meanAcceptedScore)}`,
    `Mean rejected score: ${formatMetric(summary.meanRejectedScore)}`,
  ];

  const bucketLines = report.buckets.map(
    (bucket) =>
      ` - ${bucket.label}: total=${bucket.total} accepted=${bucket.accepted} rejected=${bucket.rejected} acceptanceRate=${formatMetric(bucket.acceptanceRate)}`,
  );

  const lines = [...headerLines, '', 'Score buckets', ...bucketLines];

  if (report.warnings.length > 0) {
    // Only the first five warnings are listed.
    const shownWarnings = report.warnings.slice(0, 5);
    lines.push('', 'Warnings', ...shownWarnings.map((warning) => ` - ${warning.path}: ${warning.message}`));
  }

  return `${lines.join('\n')}\n`;
}
|
||||
270
packages/context/src/scan/relationship-feedback-export.test.ts
Normal file
270
packages/context/src/scan/relationship-feedback-export.test.ts
Normal file
|
|
@ -0,0 +1,270 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import {
|
||||
exportLocalRelationshipFeedbackLabels,
|
||||
formatKloRelationshipFeedbackLabelsJsonl,
|
||||
} from './relationship-feedback-export.js';
|
||||
import type { KloRelationshipReviewDecisionArtifact } from './relationship-review-decisions.js';
|
||||
|
||||
/**
 * Test fixture: builds a project whose file store is backed by an in-memory
 * map. String values are stored verbatim; any other value is stored as
 * pretty-printed JSON followed by a newline. Only `listFiles` and `readFile`
 * are implemented; the remaining file-store methods are bare mocks.
 */
function projectWithFiles(files: Record<string, unknown>): KloLocalProject {
  const contentByPath = new Map<string, string>();
  for (const [filePath, value] of Object.entries(files)) {
    const serialized = typeof value === 'string' ? value : `${JSON.stringify(value, null, 2)}\n`;
    contentByPath.set(filePath, serialized);
  }

  const fileStore = {
    // Lists, in sorted order, every stored path that lives under `path/`.
    async listFiles(path: string) {
      const prefix = `${path}/`;
      const matches = Array.from(contentByPath.keys()).filter((file) => file.startsWith(prefix));
      return { files: matches.sort() };
    },
    // Throws for unknown paths (and for empty content, which is falsy).
    async readFile(path: string) {
      const content = contentByPath.get(path);
      if (!content) {
        throw new Error(`missing file ${path}`);
      }
      return { content };
    },
    writeFile: vi.fn(),
    deleteFile: vi.fn(),
    getFileHistory: vi.fn(),
    forWorktree: vi.fn(),
  };

  return {
    projectDir: '/tmp/klo-project',
    fileStore,
  } as unknown as KloLocalProject;
}
|
||||
|
||||
/** Test fixture: builds a review-decision artifact with a fixed `generatedAt` timestamp. */
function decisionsArtifact(input: {
  connectionId: string;
  runId: string;
  syncId: string;
  decisions: KloRelationshipReviewDecisionArtifact['decisions'];
}): KloRelationshipReviewDecisionArtifact {
  const { connectionId, runId, syncId, decisions } = input;
  return {
    connectionId,
    runId,
    syncId,
    generatedAt: '2026-05-07T12:00:00.000Z',
    decisions,
  };
}
|
||||
|
||||
const acceptedOrderCustomer = {
|
||||
candidateId: 'orders:orders.customer_id->customers:customers.id',
|
||||
decision: 'accepted' as const,
|
||||
previousStatus: 'review' as const,
|
||||
connectionId: 'warehouse',
|
||||
runId: 'scan-run-a',
|
||||
syncId: 'sync-a',
|
||||
decidedAt: '2026-05-07T12:00:00.000Z',
|
||||
reviewer: 'Andrey',
|
||||
note: 'Confirmed in warehouse docs',
|
||||
from: {
|
||||
tableId: 'orders',
|
||||
columnIds: ['orders.customer_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['customer_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'customers',
|
||||
columnIds: ['customers.id'],
|
||||
table: { catalog: null, db: 'public', name: 'customers' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one' as const,
|
||||
source: 'deterministic_name',
|
||||
score: 0.62,
|
||||
confidence: 0.62,
|
||||
pkScore: 0.91,
|
||||
fkScore: 0.62,
|
||||
reasons: ['fk_score_review'],
|
||||
};
|
||||
|
||||
const rejectedOrderNote = {
|
||||
candidateId: 'orders:orders.note_id->notes:notes.id',
|
||||
decision: 'rejected' as const,
|
||||
previousStatus: 'rejected' as const,
|
||||
connectionId: 'warehouse',
|
||||
runId: 'scan-run-a',
|
||||
syncId: 'sync-a',
|
||||
decidedAt: '2026-05-07T12:05:00.000Z',
|
||||
reviewer: 'Andrey',
|
||||
note: null,
|
||||
from: {
|
||||
tableId: 'orders',
|
||||
columnIds: ['orders.note_id'],
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
columns: ['note_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'notes',
|
||||
columnIds: ['notes.id'],
|
||||
table: { catalog: null, db: 'public', name: 'notes' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one' as const,
|
||||
source: 'deterministic_name',
|
||||
score: 0.2,
|
||||
confidence: 0.2,
|
||||
pkScore: 0.4,
|
||||
fkScore: 0.2,
|
||||
reasons: ['low_source_coverage'],
|
||||
};
|
||||
|
||||
const acceptedInvoiceAccount = {
|
||||
candidateId: 'invoices:invoices.account_id->accounts:accounts.id',
|
||||
decision: 'accepted' as const,
|
||||
previousStatus: 'accepted' as const,
|
||||
connectionId: 'billing',
|
||||
runId: 'scan-run-b',
|
||||
syncId: 'sync-b',
|
||||
decidedAt: '2026-05-07T12:10:00.000Z',
|
||||
reviewer: 'klo',
|
||||
note: null,
|
||||
from: {
|
||||
tableId: 'invoices',
|
||||
columnIds: ['invoices.account_id'],
|
||||
table: { catalog: null, db: 'billing', name: 'invoices' },
|
||||
columns: ['account_id'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'accounts',
|
||||
columnIds: ['accounts.id'],
|
||||
table: { catalog: null, db: 'billing', name: 'accounts' },
|
||||
columns: ['id'],
|
||||
},
|
||||
relationshipType: 'many_to_one' as const,
|
||||
source: 'formal_metadata',
|
||||
score: 1,
|
||||
confidence: 1,
|
||||
pkScore: 1,
|
||||
fkScore: 1,
|
||||
reasons: ['formal_metadata_relationship'],
|
||||
};
|
||||
|
||||
describe('relationship feedback export', () => {
|
||||
it('exports stable labels from all relationship review decision artifacts', async () => {
|
||||
const project = projectWithFiles({
|
||||
'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json': decisionsArtifact({
|
||||
connectionId: 'warehouse',
|
||||
runId: 'scan-run-a',
|
||||
syncId: 'sync-a',
|
||||
decisions: [rejectedOrderNote, acceptedOrderCustomer],
|
||||
}),
|
||||
'raw-sources/billing/live-database/sync-b/enrichment/relationship-review-decisions.json': decisionsArtifact({
|
||||
connectionId: 'billing',
|
||||
runId: 'scan-run-b',
|
||||
syncId: 'sync-b',
|
||||
decisions: [acceptedInvoiceAccount],
|
||||
}),
|
||||
'raw-sources/warehouse/live-database/sync-a/enrichment/relationships.json': { accepted: [], review: [], rejected: [] },
|
||||
});
|
||||
|
||||
const result = await exportLocalRelationshipFeedbackLabels(project, {
|
||||
now: () => new Date('2026-05-07T13:00:00.000Z'),
|
||||
});
|
||||
|
||||
expect(result.summary).toEqual({
|
||||
total: 3,
|
||||
accepted: 2,
|
||||
rejected: 1,
|
||||
connections: 2,
|
||||
runs: 2,
|
||||
});
|
||||
expect(result.labels.map((label) => label.candidateId)).toEqual([
|
||||
'invoices:invoices.account_id->accounts:accounts.id',
|
||||
'orders:orders.customer_id->customers:customers.id',
|
||||
'orders:orders.note_id->notes:notes.id',
|
||||
]);
|
||||
expect(result.labels[0]).toMatchObject({
|
||||
schemaVersion: 1,
|
||||
decision: 'accepted',
|
||||
connectionId: 'billing',
|
||||
source: 'formal_metadata',
|
||||
fromTable: 'billing.invoices',
|
||||
fromColumns: ['account_id'],
|
||||
toTable: 'billing.accounts',
|
||||
toColumns: ['id'],
|
||||
artifactPath: 'raw-sources/billing/live-database/sync-b/enrichment/relationship-review-decisions.json',
|
||||
});
|
||||
expect(result.warnings).toEqual([]);
|
||||
});
|
||||
|
||||
it('filters labels by connection and decision', async () => {
|
||||
const project = projectWithFiles({
|
||||
'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json': decisionsArtifact({
|
||||
connectionId: 'warehouse',
|
||||
runId: 'scan-run-a',
|
||||
syncId: 'sync-a',
|
||||
decisions: [rejectedOrderNote, acceptedOrderCustomer],
|
||||
}),
|
||||
'raw-sources/billing/live-database/sync-b/enrichment/relationship-review-decisions.json': decisionsArtifact({
|
||||
connectionId: 'billing',
|
||||
runId: 'scan-run-b',
|
||||
syncId: 'sync-b',
|
||||
decisions: [acceptedInvoiceAccount],
|
||||
}),
|
||||
});
|
||||
|
||||
const result = await exportLocalRelationshipFeedbackLabels(project, {
|
||||
connectionId: 'warehouse',
|
||||
decision: 'rejected',
|
||||
now: () => new Date('2026-05-07T13:00:00.000Z'),
|
||||
});
|
||||
|
||||
expect(result.summary).toMatchObject({ total: 1, accepted: 0, rejected: 1 });
|
||||
expect(result.labels).toHaveLength(1);
|
||||
expect(result.labels[0]?.candidateId).toBe('orders:orders.note_id->notes:notes.id');
|
||||
});
|
||||
|
||||
it('formats JSONL with one stable label object per line', async () => {
|
||||
const project = projectWithFiles({
|
||||
'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json': decisionsArtifact({
|
||||
connectionId: 'warehouse',
|
||||
runId: 'scan-run-a',
|
||||
syncId: 'sync-a',
|
||||
decisions: [acceptedOrderCustomer],
|
||||
}),
|
||||
});
|
||||
const result = await exportLocalRelationshipFeedbackLabels(project, {
|
||||
now: () => new Date('2026-05-07T13:00:00.000Z'),
|
||||
});
|
||||
|
||||
const lines = formatKloRelationshipFeedbackLabelsJsonl(result).trim().split('\n').map((line) => JSON.parse(line));
|
||||
|
||||
expect(lines).toHaveLength(1);
|
||||
expect(lines[0]).toMatchObject({
|
||||
schemaVersion: 1,
|
||||
candidateId: 'orders:orders.customer_id->customers:customers.id',
|
||||
decision: 'accepted',
|
||||
relationshipType: 'many_to_one',
|
||||
});
|
||||
});
|
||||
|
||||
it('records parse warnings and continues exporting readable decision artifacts', async () => {
|
||||
const project = projectWithFiles({
|
||||
'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json': decisionsArtifact({
|
||||
connectionId: 'warehouse',
|
||||
runId: 'scan-run-a',
|
||||
syncId: 'sync-a',
|
||||
decisions: [acceptedOrderCustomer],
|
||||
}),
|
||||
'raw-sources/broken/live-database/sync-b/enrichment/relationship-review-decisions.json': '{not-json',
|
||||
});
|
||||
|
||||
const result = await exportLocalRelationshipFeedbackLabels(project, {
|
||||
now: () => new Date('2026-05-07T13:00:00.000Z'),
|
||||
});
|
||||
|
||||
expect(result.summary.total).toBe(1);
|
||||
expect(result.warnings).toEqual([
|
||||
{
|
||||
path: 'raw-sources/broken/live-database/sync-b/enrichment/relationship-review-decisions.json',
|
||||
message: expect.any(String),
|
||||
},
|
||||
]);
|
||||
expect(result.warnings[0]?.message.length).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
179
packages/context/src/scan/relationship-feedback-export.ts
Normal file
179
packages/context/src/scan/relationship-feedback-export.ts
Normal file
|
|
@ -0,0 +1,179 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import type {
|
||||
KloRelationshipReviewDecisionArtifact,
|
||||
KloRelationshipReviewDecisionEntry,
|
||||
KloRelationshipReviewDecisionValue,
|
||||
} from './relationship-review-decisions.js';
|
||||
|
||||
const DECISION_ARTIFACT_SUFFIX = '/enrichment/relationship-review-decisions.json';
|
||||
const FEEDBACK_SCHEMA_VERSION = 1;
|
||||
|
||||
export type KloRelationshipFeedbackDecisionFilter = KloRelationshipReviewDecisionValue | 'all';
|
||||
|
||||
/** Optional filters and clock override accepted by `exportLocalRelationshipFeedbackLabels`. */
export interface ExportLocalRelationshipFeedbackLabelsInput {
|
||||
// When set to a non-empty string, only labels with this connectionId are exported; defaults to null (no filter).
connectionId?: string | null;
|
||||
// Decision to keep; defaults to 'all', which keeps every decision.
decision?: KloRelationshipFeedbackDecisionFilter;
|
||||
// Clock used for `generatedAt`; defaults to `new Date()` when omitted.
now?: () => Date;
|
||||
}
|
||||
|
||||
/**
 * One exported feedback label: a flattened copy of a relationship review
 * decision entry plus the path of the artifact it was read from.
 */
export interface KloRelationshipFeedbackLabel {
|
||||
// Version of this label shape; always the literal 1.
schemaVersion: 1;
|
||||
candidateId: string;
|
||||
decision: KloRelationshipReviewDecisionValue;
|
||||
previousStatus: KloRelationshipReviewDecisionEntry['previousStatus'];
|
||||
connectionId: string;
|
||||
runId: string;
|
||||
syncId: string;
|
||||
// Copied verbatim from the decision entry (presumably an ISO-8601 timestamp; verify against the writer).
decidedAt: string;
|
||||
reviewer: string;
|
||||
note: string | null;
|
||||
relationshipType: KloRelationshipReviewDecisionEntry['relationshipType'];
|
||||
source: string;
|
||||
score: number | null;
|
||||
confidence: number;
|
||||
pkScore: number | null;
|
||||
fkScore: number | null;
|
||||
// Dotted table name built from the non-empty catalog/db/name parts of the source table.
fromTable: string;
|
||||
fromColumns: string[];
|
||||
|
||||
// Dotted table name built from the non-empty catalog/db/name parts of the target table.
toTable: string;
|
||||
toColumns: string[];
|
||||
reasons: string[];
|
||||
// Path of the decisions artifact this label was read from.
artifactPath: string;
|
||||
}
|
||||
|
||||
/** Records a decisions artifact that could not be read or converted during export. */
export interface KloRelationshipFeedbackExportWarning {
|
||||
// Path of the artifact that failed.
path: string;
|
||||
// Message of the caught error, or its string form when it is not an Error.
message: string;
|
||||
}
|
||||
|
||||
/** Result of `exportLocalRelationshipFeedbackLabels`: filtered, sorted labels plus bookkeeping. */
export interface ExportLocalRelationshipFeedbackLabelsResult {
|
||||
// ISO string of the export time (from the injected clock or `new Date()`).
generatedAt: string;
|
||||
// The filters actually applied, after defaults were filled in.
filters: {
|
||||
connectionId: string | null;
|
||||
decision: KloRelationshipFeedbackDecisionFilter;
|
||||
};
|
||||
// Counts computed over the filtered labels only.
summary: {
|
||||
total: number;
|
||||
accepted: number;
|
||||
rejected: number;
|
||||
// Number of distinct connectionId values.
connections: number;
|
||||
// Number of distinct (connectionId, runId) pairs.
runs: number;
|
||||
};
|
||||
labels: KloRelationshipFeedbackLabel[];
|
||||
// One entry per artifact that failed to load; warnings are not affected by the filters.
warnings: KloRelationshipFeedbackExportWarning[];
|
||||
}
|
||||
|
||||
/**
 * Builds the dotted table name for one side of a decision entry.
 *
 * @param entry - Decision entry holding the `from` and `to` endpoints.
 * @param side - Which endpoint to read the table reference from.
 * @returns `catalog.db.name` with empty or null parts left out.
 */
function qualifiedTableName(entry: KloRelationshipReviewDecisionEntry, side: 'from' | 'to'): string {
  const { catalog, db, name } = entry[side].table;
  const parts: string[] = [];
  for (const part of [catalog, db, name]) {
    if (part) {
      parts.push(part);
    }
  }
  return parts.join('.');
}
|
||||
|
||||
/**
 * Flattens one review decision entry into an export label.
 *
 * Scalar fields are copied as-is, column and reason arrays are shallow-copied,
 * and table references are collapsed to dotted names. Key order matches the
 * order in which labels are serialized.
 *
 * @param entry - Decision entry read from a review-decisions artifact.
 * @param artifactPath - Path of the artifact the entry came from.
 * @returns The flattened label.
 */
function labelFromDecision(entry: KloRelationshipReviewDecisionEntry, artifactPath: string): KloRelationshipFeedbackLabel {
  const provenance = {
    candidateId: entry.candidateId,
    decision: entry.decision,
    previousStatus: entry.previousStatus,
    connectionId: entry.connectionId,
    runId: entry.runId,
    syncId: entry.syncId,
    decidedAt: entry.decidedAt,
    reviewer: entry.reviewer,
    note: entry.note,
  };
  const scoring = {
    relationshipType: entry.relationshipType,
    source: entry.source,
    score: entry.score,
    confidence: entry.confidence,
    pkScore: entry.pkScore,
    fkScore: entry.fkScore,
  };
  const endpoints = {
    fromTable: qualifiedTableName(entry, 'from'),
    fromColumns: [...entry.from.columns],
    toTable: qualifiedTableName(entry, 'to'),
    toColumns: [...entry.to.columns],
  };

  return {
    schemaVersion: FEEDBACK_SCHEMA_VERSION,
    ...provenance,
    ...scoring,
    ...endpoints,
    reasons: [...entry.reasons],
    artifactPath,
  };
}
|
||||
|
||||
/**
 * Returns a sorted copy of the labels; the input array is not modified.
 *
 * Ordering is by connectionId, then runId, then candidateId, then decidedAt,
 * each compared with `localeCompare`.
 */
function sortLabels(labels: KloRelationshipFeedbackLabel[]): KloRelationshipFeedbackLabel[] {
  const sortKeys = ['connectionId', 'runId', 'candidateId', 'decidedAt'] as const;
  const ordered = labels.slice();
  ordered.sort((left, right) => {
    for (const key of sortKeys) {
      const difference = left[key].localeCompare(right[key]);
      if (difference !== 0) {
        return difference;
      }
    }
    return 0;
  });
  return ordered;
}
|
||||
|
||||
/**
 * Tells whether a label survives the export filters.
 *
 * A null or empty `connectionId` filter matches every connection, and a
 * `decision` filter of 'all' matches every decision.
 */
function passesFilters(
  label: KloRelationshipFeedbackLabel,
  filters: { connectionId: string | null; decision: KloRelationshipFeedbackDecisionFilter },
): boolean {
  const connectionMatches = !filters.connectionId || label.connectionId === filters.connectionId;
  const decisionMatches = filters.decision === 'all' || label.decision === filters.decision;
  return connectionMatches && decisionMatches;
}
|
||||
|
||||
/** Extracts a message from a caught value: `Error.message` for errors, `String(value)` for anything else. */
function messageFromUnknownError(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
||||
|
||||
/**
 * Loads one decisions artifact and converts each of its entries to a label.
 *
 * Read failures and JSON syntax errors propagate to the caller. An artifact
 * whose `decisions` property is not an array yields no labels.
 *
 * @param project - Project whose file store is read.
 * @param artifactPath - Path of the decisions artifact inside the file store.
 */
async function readDecisionLabels(
  project: KloLocalProject,
  artifactPath: string,
): Promise<KloRelationshipFeedbackLabel[]> {
  const file = await project.fileStore.readFile(artifactPath);
  // The parsed shape is trusted, not validated; only `decisions` being an array is checked.
  const artifact = JSON.parse(file.content) as KloRelationshipReviewDecisionArtifact;
  if (!Array.isArray(artifact.decisions)) {
    return [];
  }

  const labels: KloRelationshipFeedbackLabel[] = [];
  for (const decision of artifact.decisions) {
    labels.push(labelFromDecision(decision, artifactPath));
  }
  return labels;
}
|
||||
|
||||
/**
 * Computes the summary counts for a set of labels.
 *
 * `connections` counts distinct connectionId values; `runs` counts distinct
 * (connectionId, runId) pairs.
 */
function summarize(labels: KloRelationshipFeedbackLabel[]): ExportLocalRelationshipFeedbackLabelsResult['summary'] {
  let accepted = 0;
  let rejected = 0;
  const connectionIds = new Set<string>();
  const runKeys = new Set<string>();

  for (const label of labels) {
    if (label.decision === 'accepted') {
      accepted += 1;
    }
    if (label.decision === 'rejected') {
      rejected += 1;
    }
    connectionIds.add(label.connectionId);
    runKeys.add(`${label.connectionId}:${label.runId}`);
  }

  return {
    total: labels.length,
    accepted,
    rejected,
    connections: connectionIds.size,
    runs: runKeys.size,
  };
}
|
||||
|
||||
/**
 * Collects relationship review decisions from every decisions artifact under
 * `raw-sources` and returns them as filtered, sorted feedback labels.
 *
 * Artifacts are read one at a time in sorted path order. An artifact that
 * fails to read, parse, or convert is reported in `warnings` and does not
 * stop the export.
 *
 * @param project - Project whose file store is scanned.
 * @param input - Optional connection and decision filters and a clock override.
 * @returns Labels, summary counts over the filtered labels, the applied filters, and warnings.
 */
export async function exportLocalRelationshipFeedbackLabels(
  project: KloLocalProject,
  input: ExportLocalRelationshipFeedbackLabelsInput = {},
): Promise<ExportLocalRelationshipFeedbackLabelsResult> {
  const filters = {
    connectionId: input.connectionId ?? null,
    decision: input.decision ?? 'all',
  };

  const listing = await project.fileStore.listFiles('raw-sources');
  const decisionArtifacts = listing.files
    .filter((candidatePath) => candidatePath.endsWith(DECISION_ARTIFACT_SUFFIX))
    .sort();

  const collected: KloRelationshipFeedbackLabel[] = [];
  const warnings: KloRelationshipFeedbackExportWarning[] = [];
  for (const artifactPath of decisionArtifacts) {
    try {
      const artifactLabels = await readDecisionLabels(project, artifactPath);
      collected.push(...artifactLabels);
    } catch (error) {
      // A broken artifact becomes a warning so the remaining artifacts still export.
      warnings.push({ path: artifactPath, message: messageFromUnknownError(error) });
    }
  }

  const matching = collected.filter((label) => passesFilters(label, filters));
  const labels = sortLabels(matching);
  const generatedAt = (input.now?.() ?? new Date()).toISOString();

  return {
    generatedAt,
    filters,
    summary: summarize(labels),
    labels,
    warnings,
  };
}
|
||||
|
||||
/**
 * Serializes the labels of an export result as JSONL.
 *
 * @returns One JSON object per line with a trailing newline, or an empty
 * string when there are no labels.
 */
export function formatKloRelationshipFeedbackLabelsJsonl(result: ExportLocalRelationshipFeedbackLabelsResult): string {
  const lines = result.labels.map((label) => JSON.stringify(label));
  return lines.length === 0 ? '' : `${lines.join('\n')}\n`;
}
|
||||
134
packages/context/src/scan/relationship-formal-metadata.test.ts
Normal file
134
packages/context/src/scan/relationship-formal-metadata.test.ts
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KloEnrichedRelationship, KloEnrichedSchema } from './enrichment-types.js';
|
||||
import { collectKloFormalMetadataRelationships } from './relationship-formal-metadata.js';
|
||||
|
||||
/**
 * Test fixture: a two-table schema (`accounts` with primary key `id`, and
 * `orders` with `account_id`) on connection 'warehouse', carrying the given
 * relationships unchanged.
 */
function schema(relationships: KloEnrichedRelationship[]): KloEnrichedSchema {
  type FixtureTable = KloEnrichedSchema['tables'][number];
  type FixtureColumn = FixtureTable['columns'][number];

  // Non-nullable INTEGER column; only the names and the primary-key flag vary.
  const integerColumn = (tableName: string, columnName: string, primaryKey: boolean): FixtureColumn => ({
    id: `${tableName}.${columnName}`,
    tableId: tableName,
    tableRef: { catalog: null, db: null, name: tableName },
    name: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  });

  const enabledTable = (tableName: string, columns: FixtureColumn[]): FixtureTable => ({
    id: tableName,
    ref: { catalog: null, db: null, name: tableName },
    enabled: true,
    descriptions: {},
    columns,
  });

  return {
    connectionId: 'warehouse',
    tables: [
      enabledTable('accounts', [integerColumn('accounts', 'id', true)]),
      enabledTable('orders', [integerColumn('orders', 'account_id', false)]),
    ],
    relationships,
  };
}
|
||||
|
||||
/**
 * Test fixture: a formal `orders.account_id -> accounts.id` relationship with
 * confidence 0.6. Any top-level field can be replaced through `overrides`.
 */
function formalRelationship(overrides: Partial<KloEnrichedRelationship> = {}): KloEnrichedRelationship {
  const base: KloEnrichedRelationship = {
    id: 'orders:orders.account_id->accounts:accounts.id',
    source: 'formal',
    from: {
      tableId: 'orders',
      columnIds: ['orders.account_id'],
      table: { catalog: null, db: null, name: 'orders' },
      columns: ['account_id'],
    },
    to: {
      tableId: 'accounts',
      columnIds: ['accounts.id'],
      table: { catalog: null, db: null, name: 'accounts' },
      columns: ['id'],
    },
    relationshipType: 'many_to_one',
    confidence: 0.6,
    isPrimaryKeyReference: false,
  };
  return { ...base, ...overrides };
}
|
||||
|
||||
describe('formal metadata relationship collection', () => {
|
||||
it('accepts valid formal relationships with ground-truth confidence', () => {
|
||||
const result = collectKloFormalMetadataRelationships(schema([formalRelationship()]));
|
||||
|
||||
expect(result.accepted).toEqual([
|
||||
expect.objectContaining({
|
||||
id: 'orders:orders.account_id->accounts:accounts.id',
|
||||
source: 'formal',
|
||||
confidence: 1,
|
||||
isPrimaryKeyReference: true,
|
||||
}),
|
||||
]);
|
||||
expect(result.skipped).toEqual([]);
|
||||
expect(result.acceptedIds).toEqual(new Set(['orders:orders.account_id->accounts:accounts.id']));
|
||||
});
|
||||
|
||||
it('skips duplicate and invalid formal relationships with reasons', () => {
|
||||
const result = collectKloFormalMetadataRelationships(
|
||||
schema([
|
||||
formalRelationship(),
|
||||
formalRelationship(),
|
||||
formalRelationship({
|
||||
id: 'orders:orders.missing_account_id->accounts:accounts.id',
|
||||
from: {
|
||||
tableId: 'orders',
|
||||
columnIds: ['orders.missing_account_id'],
|
||||
table: { catalog: null, db: null, name: 'orders' },
|
||||
columns: ['missing_account_id'],
|
||||
},
|
||||
}),
|
||||
formalRelationship({
|
||||
id: 'manual-edge',
|
||||
source: 'manual',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
|
||||
expect(result.accepted).toHaveLength(1);
|
||||
expect(result.skipped).toEqual([
|
||||
{
|
||||
relationshipId: 'orders:orders.account_id->accounts:accounts.id',
|
||||
reason: 'formal_metadata_duplicate',
|
||||
},
|
||||
{
|
||||
relationshipId: 'orders:orders.missing_account_id->accounts:accounts.id',
|
||||
reason: 'formal_metadata_endpoint_not_found',
|
||||
},
|
||||
]);
|
||||
});
|
||||
});
|
||||
61
packages/context/src/scan/relationship-formal-metadata.ts
Normal file
61
packages/context/src/scan/relationship-formal-metadata.ts
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
import type { KloEnrichedRelationship, KloEnrichedSchema, KloSkippedRelationship } from './enrichment-types.js';
|
||||
|
||||
/** Outcome of collecting formal-metadata relationships from an enriched schema. */
export interface KloFormalMetadataRelationshipCollection {
|
||||
// Formal relationships whose endpoints resolved, sorted by id.
accepted: KloEnrichedRelationship[];
|
||||
// Formal relationships that were dropped, each with the reason, in encounter order.
skipped: KloSkippedRelationship[];
|
||||
// Ids of the accepted relationships.
acceptedIds: Set<string>;
|
||||
}
|
||||
|
||||
/**
 * Checks that both endpoints of a relationship resolve against the schema.
 *
 * An endpoint resolves when its table exists and is enabled, it lists at
 * least one column id, and every listed column id belongs to a column of that
 * table whose name also appears in the endpoint's `columns`. Requiring every
 * column, rather than any one, stops a composite-key relationship with a
 * dangling column from being treated as valid.
 *
 * @param schema - Enriched schema to resolve the endpoints against.
 * @param relationship - Relationship whose `from` and `to` endpoints are checked.
 * @returns True only when both endpoints resolve completely.
 */
function relationshipEndpointExists(schema: KloEnrichedSchema, relationship: KloEnrichedRelationship): boolean {
  const endpointResolves = (endpoint: KloEnrichedRelationship['from']): boolean => {
    const table = schema.tables.find((candidate) => candidate.id === endpoint.tableId && candidate.enabled);
    // `every` is vacuously true on an empty list, so an endpoint without columns is rejected explicitly.
    if (!table || endpoint.columnIds.length === 0) {
      return false;
    }
    return endpoint.columnIds.every((columnId) =>
      table.columns.some((column) => column.id === columnId && endpoint.columns.includes(column.name)),
    );
  };

  return endpointResolves(relationship.from) && endpointResolves(relationship.to);
}
|
||||
|
||||
/**
 * Collects the relationships declared with source 'formal' in the schema.
 *
 * Relationships from any other source are ignored. A formal relationship whose
 * id was already accepted is skipped as a duplicate, and one whose endpoints
 * do not resolve is skipped as not found. Accepted relationships are returned
 * with confidence 1 and `isPrimaryKeyReference` set to true, sorted by id.
 *
 * @param schema - Enriched schema whose `relationships` are inspected.
 * @returns Accepted relationships, skipped relationships with reasons, and the accepted ids.
 */
export function collectKloFormalMetadataRelationships(
  schema: KloEnrichedSchema,
): KloFormalMetadataRelationshipCollection {
  const accepted: KloEnrichedRelationship[] = [];
  const skipped: KloSkippedRelationship[] = [];
  const acceptedIds = new Set<string>();

  const formalRelationships = schema.relationships.filter((candidate) => candidate.source === 'formal');
  for (const candidate of formalRelationships) {
    const relationshipId = candidate.id;
    if (acceptedIds.has(relationshipId)) {
      skipped.push({ relationshipId, reason: 'formal_metadata_duplicate' });
    } else if (!relationshipEndpointExists(schema, candidate)) {
      skipped.push({ relationshipId, reason: 'formal_metadata_endpoint_not_found' });
    } else {
      acceptedIds.add(relationshipId);
      accepted.push({
        ...candidate,
        source: 'formal',
        confidence: 1,
        isPrimaryKeyReference: true,
      });
    }
  }

  accepted.sort((left, right) => left.id.localeCompare(right.id));
  return { accepted, skipped, acceptedIds };
}
|
||||
649
packages/context/src/scan/relationship-graph-resolver.test.ts
Normal file
649
packages/context/src/scan/relationship-graph-resolver.test.ts
Normal file
|
|
@ -0,0 +1,649 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type {
|
||||
KloEnrichedColumn,
|
||||
KloEnrichedSchema,
|
||||
KloEnrichedTable,
|
||||
KloRelationshipEndpoint,
|
||||
} from './enrichment-types.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type { KloValidatedRelationshipDiscoveryCandidate } from './relationship-validation.js';
|
||||
import { resolveKloRelationshipGraph } from './relationship-graph-resolver.js';
|
||||
|
||||
/**
 * Test fixture: a column named `name` on table `tableId`.
 *
 * Defaults to a nullable, non-primary-key INTEGER column with id
 * `<tableId>.<name>`; any field can be replaced through `overrides`.
 */
function column(tableId: string, name: string, overrides: Partial<KloEnrichedColumn> = {}): KloEnrichedColumn {
  const defaults: KloEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Test fixture: an enabled table with no catalog or db. Each column is copied
 * and re-pointed at this table (`tableId` and a shared `tableRef`).
 */
function table(name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KloEnrichedColumn[] = [];
  for (const original of columns) {
    ownedColumns.push({ ...original, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Test fixture: a 'warehouse' schema with `accounts`, `account_archive` and
 * `users` tables and no relationships. `accountsPrimaryKey` controls whether
 * `accounts.id` is a declared primary key (default false).
 */
function schema(overrides: { accountsPrimaryKey?: boolean } = {}): KloEnrichedSchema {
  const accounts = table('accounts', [
    column('accounts', 'id', { nullable: false, primaryKey: overrides.accountsPrimaryKey ?? false }),
    column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const accountArchive = table('account_archive', [column('account_archive', 'id', { nullable: false })]);
  const users = table('users', [
    column('users', 'id', { nullable: false }),
    column('users', 'account_id', { nullable: false }),
  ]);

  return {
    connectionId: 'warehouse',
    tables: [accounts, accountArchive, users],
    relationships: [],
  };
}
|
||||
|
||||
/** Test fixture: a single-column relationship endpoint with column id `<tableName>.<columnName>`. */
function endpoint(tableName: string, columnName: string): KloRelationshipEndpoint {
  const columnId = `${tableName}.${columnName}`;
  const tableRef = { catalog: null, db: null, name: tableName };
  return {
    tableId: tableName,
    columnIds: [columnId],
    table: tableRef,
    columns: [columnName],
  };
}
|
||||
|
||||
/**
 * Test fixture: a sqlite profile artifact for the 'warehouse' connection in
 * which `accounts.id`, `account_archive.id` and `users.account_id` are each
 * fully unique, non-null, three-row INTEGER columns.
 */
function profiles(): KloRelationshipProfileArtifact {
  const tableRef = (name: string) => ({ catalog: null, db: null, name });

  // All three profiled columns share the same statistics; only the names differ.
  const uniqueIntegerProfile = (
    tableName: string,
    columnName: string,
  ): KloRelationshipProfileArtifact['columns'][string] => ({
    table: tableRef(tableName),
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 3,
    nullCount: 0,
    distinctCount: 3,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2', '3'],
    minTextLength: 1,
    maxTextLength: 1,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: ['accounts', 'account_archive', 'users'].map((name) => ({ table: tableRef(name), rowCount: 3 })),
    columns: {
      'accounts.id': uniqueIntegerProfile('accounts', 'id'),
      'account_archive.id': uniqueIntegerProfile('account_archive', 'id'),
      'users.account_id': uniqueIntegerProfile('users', 'account_id'),
    },
    warnings: [],
  };
}
|
||||
|
||||
/**
 * Test fixture: a validated relationship-discovery candidate, by default an
 * accepted `users.account_id -> accounts.id` many-to-one reference with
 * passing validation.
 *
 * Top-level fields in `overrides` replace the defaults. `evidence` and
 * `validation` are merged over their defaults field by field, so a partial or
 * explicitly `undefined` nested override keeps the remaining default values.
 *
 * @param overrides - Fields to replace; `from` and `to` also drive the generated id.
 * @returns The assembled candidate.
 */
function validatedCandidate(
  overrides: Partial<KloValidatedRelationshipDiscoveryCandidate> = {},
): KloValidatedRelationshipDiscoveryCandidate {
  // The nested objects are taken out of the trailing spread so it cannot overwrite the merged defaults.
  const { evidence, validation, ...topLevel } = overrides;
  const from = topLevel.from ?? endpoint('users', 'account_id');
  const to = topLevel.to ?? endpoint('accounts', 'id');
  return {
    id: `${from.tableId}:(${from.columnIds.join(',')})->${to.tableId}:(${to.columnIds.join(',')})`,
    from,
    to,
    relationshipType: 'many_to_one',
    confidence: topLevel.confidence ?? 0.95,
    source: topLevel.source ?? 'normalized_table_match',
    status: topLevel.status ?? 'accepted',
    score: topLevel.score ?? 0.96,
    evidence: {
      sourceColumnBase: 'account',
      targetTableBase: to.table.name,
      targetColumnBase: to.columns[0] ?? '',
      targetKeyScore: 0.92,
      nameScore: 0.92,
      reasons: ['foreign_key_suffix', 'normalized_table_name', 'target_key_like'],
      ...evidence,
    },
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      sourceNullRate: 0,
      targetNullRate: 0,
      childDistinct: 3,
      parentDistinct: 3,
      overlap: 3,
      checkedValues: 3,
      reasons: ['validation_passed'],
      ...validation,
    },
    ...topLevel,
  };
}
|
||||
|
||||
describe('relationship graph resolver', () => {
|
||||
it('promotes validated relationship discovery references to accepted relationships and inferred PKs', () => {
|
||||
const result = resolveKloRelationshipGraph({
|
||||
schema: schema(),
|
||||
profiles: profiles(),
|
||||
candidates: [validatedCandidate()],
|
||||
});
|
||||
|
||||
expect(result.pks).toContainEqual({
|
||||
table: 'accounts',
|
||||
columns: ['id'],
|
||||
pkScore: expect.any(Number),
|
||||
status: 'accepted',
|
||||
incomingCandidateCount: 1,
|
||||
evidence: {
|
||||
declaredPrimaryKey: false,
|
||||
targetUniqueness: 1,
|
||||
incomingAcceptedCount: 1,
|
||||
incomingReviewCount: 0,
|
||||
reasons: expect.arrayContaining(['unique_target_column', 'incoming_validated_reference']),
|
||||
},
|
||||
});
|
||||
expect(result.pks.find((pk) => pk.table === 'accounts')?.pkScore).toBeGreaterThanOrEqual(0.85);
|
||||
expect(result.relationships).toHaveLength(1);
|
||||
expect(result.relationships[0]).toMatchObject({
|
||||
from: { table: { name: 'users' }, columns: ['account_id'] },
|
||||
to: { table: { name: 'accounts' }, columns: ['id'] },
|
||||
status: 'accepted',
|
||||
pkScore: expect.any(Number),
|
||||
fkScore: expect.any(Number),
|
||||
graph: {
|
||||
reasons: expect.arrayContaining(['target_pk_score_passed', 'fk_score_passed']),
|
||||
},
|
||||
});
|
||||
expect(result.relationships[0]?.fkScore).toBeGreaterThanOrEqual(0.85);
|
||||
});
|
||||
|
||||
it('keeps validation-unavailable candidates in review even when name evidence is strong', () => {
|
||||
const result = resolveKloRelationshipGraph({
|
||||
schema: schema(),
|
||||
profiles: { ...profiles(), sqlAvailable: false, columns: {}, warnings: ['read_only_sql_unavailable'] },
|
||||
candidates: [
|
||||
validatedCandidate({
|
||||
status: 'review',
|
||||
score: 0.57,
|
||||
validation: {
|
||||
targetUniqueness: 0,
|
||||
sourceCoverage: 0,
|
||||
violationCount: 0,
|
||||
violationRatio: 1,
|
||||
sourceNullRate: 0,
|
||||
targetNullRate: 0,
|
||||
childDistinct: 0,
|
||||
parentDistinct: 0,
|
||||
overlap: 0,
|
||||
checkedValues: 0,
|
||||
reasons: ['validation_unavailable'],
|
||||
},
|
||||
}),
|
||||
],
|
||||
});
|
||||
|
||||
expect(result.relationships).toHaveLength(1);
|
||||
expect(result.relationships[0]).toMatchObject({
|
||||
status: 'review',
|
||||
graph: {
|
||||
reasons: expect.arrayContaining(['validation_unavailable_review_only']),
|
||||
},
|
||||
});
|
||||
expect(result.relationships[0]?.fkScore).toBeGreaterThanOrEqual(0.55);
|
||||
});
|
||||
|
||||
it('accepts at most one target per source column and rejects the lower-scored conflict loser', () => {
|
||||
const winner = validatedCandidate({ confidence: 0.95, score: 0.96 });
|
||||
const loser = validatedCandidate({
|
||||
from: endpoint('users', 'account_id'),
|
||||
to: endpoint('account_archive', 'id'),
|
||||
confidence: 0.85,
|
||||
score: 0.9,
|
||||
evidence: {
|
||||
sourceColumnBase: 'account',
|
||||
targetTableBase: 'account_archive',
|
||||
targetColumnBase: 'id',
|
||||
targetKeyScore: 0.92,
|
||||
nameScore: 0.78,
|
||||
reasons: ['foreign_key_suffix', 'inflection', 'target_key_like'],
|
||||
},
|
||||
});
|
||||
|
||||
const result = resolveKloRelationshipGraph({
|
||||
schema: schema(),
|
||||
profiles: profiles(),
|
||||
candidates: [loser, winner],
|
||||
});
|
||||
|
||||
expect(result.relationships.map((relationship) => relationship.status)).toEqual(['accepted', 'rejected']);
|
||||
expect(result.relationships[0]?.to.table.name).toBe('accounts');
|
||||
expect(result.relationships[1]).toMatchObject({
|
||||
to: { table: { name: 'account_archive' }, columns: ['id'] },
|
||||
status: 'rejected',
|
||||
graph: {
|
||||
reasons: expect.arrayContaining(['conflict_lost']),
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('preserves declared primary keys as accepted even without incoming candidates', () => {
|
||||
const result = resolveKloRelationshipGraph({
|
||||
schema: schema({ accountsPrimaryKey: true }),
|
||||
profiles: profiles(),
|
||||
candidates: [],
|
||||
});
|
||||
|
||||
expect(result.relationships).toEqual([]);
|
||||
expect(result.pks).toContainEqual({
|
||||
table: 'accounts',
|
||||
columns: ['id'],
|
||||
pkScore: 1,
|
||||
status: 'accepted',
|
||||
incomingCandidateCount: 0,
|
||||
evidence: {
|
||||
declaredPrimaryKey: true,
|
||||
targetUniqueness: 1,
|
||||
incomingAcceptedCount: 0,
|
||||
incomingReviewCount: 0,
|
||||
reasons: ['declared_primary_key'],
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('infers profile-only key-like columns without incoming relationship candidates', () => {
|
||||
const baseSchema = schema();
|
||||
const invoices = table('invoices', [
|
||||
column('invoices', 'id', { nullable: false }),
|
||||
column('invoices', 'invoice_number', {
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
nullable: false,
|
||||
}),
|
||||
column('invoices', 'amount', {
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
}),
|
||||
]);
|
||||
const baseProfiles = profiles();
|
||||
const result = resolveKloRelationshipGraph({
|
||||
schema: { ...baseSchema, tables: [...baseSchema.tables, invoices] },
|
||||
profiles: {
|
||||
...baseProfiles,
|
||||
tables: [...baseProfiles.tables, { table: invoices.ref, rowCount: 3 }],
|
||||
columns: {
|
||||
...baseProfiles.columns,
|
||||
'invoices.id': {
|
||||
table: invoices.ref,
|
||||
column: 'id',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['1', '2', '3'],
|
||||
minTextLength: 1,
|
||||
maxTextLength: 1,
|
||||
},
|
||||
'invoices.invoice_number': {
|
||||
table: invoices.ref,
|
||||
column: 'invoice_number',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['INV-1', 'INV-2', 'INV-3'],
|
||||
minTextLength: 5,
|
||||
maxTextLength: 5,
|
||||
},
|
||||
'invoices.amount': {
|
||||
table: invoices.ref,
|
||||
column: 'amount',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 2 / 3,
|
||||
nullRate: 0,
|
||||
sampleValues: ['100', '200'],
|
||||
minTextLength: 3,
|
||||
maxTextLength: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
candidates: [],
|
||||
});
|
||||
|
||||
expect(result.relationships).toEqual([]);
|
||||
expect(result.pks).toContainEqual({
|
||||
table: 'invoices',
|
||||
columns: ['id'],
|
||||
pkScore: 1,
|
||||
status: 'accepted',
|
||||
incomingCandidateCount: 0,
|
||||
evidence: {
|
||||
declaredPrimaryKey: false,
|
||||
targetUniqueness: 1,
|
||||
incomingAcceptedCount: 0,
|
||||
incomingReviewCount: 0,
|
||||
reasons: expect.arrayContaining([
|
||||
'unique_target_column',
|
||||
'profile_key_name',
|
||||
'not_null_profile',
|
||||
'profile_only_primary_key',
|
||||
'no_incoming_references',
|
||||
]),
|
||||
},
|
||||
});
|
||||
expect(result.pks).toContainEqual(
|
||||
expect.objectContaining({
|
||||
table: 'invoices',
|
||||
columns: ['invoice_number'],
|
||||
status: 'review',
|
||||
evidence: expect.objectContaining({
|
||||
reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
|
||||
}),
|
||||
}),
|
||||
);
|
||||
expect(result.pks.some((pk) => pk.table === 'invoices' && pk.columns[0] === 'amount')).toBe(false);
|
||||
});
|
||||
|
||||
it('pins single-incoming column_suffix_match resolver scores', () => {
|
||||
const schema = {
|
||||
connectionId: 'warehouse',
|
||||
relationships: [],
|
||||
tables: [
|
||||
{
|
||||
id: 'plans-id',
|
||||
ref: { catalog: null, db: null, name: 'stg_plans' },
|
||||
enabled: true,
|
||||
descriptions: {},
|
||||
columns: [
|
||||
{
|
||||
id: 'plan-code-col',
|
||||
tableId: 'plans-id',
|
||||
tableRef: { catalog: null, db: null, name: 'stg_plans' },
|
||||
name: 'plan_code',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
parentColumnId: null,
|
||||
descriptions: {},
|
||||
embedding: null,
|
||||
sampleValues: null,
|
||||
cardinality: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
id: 'segments-id',
|
||||
ref: { catalog: null, db: null, name: 'mart_account_segments' },
|
||||
enabled: true,
|
||||
descriptions: {},
|
||||
columns: [
|
||||
{
|
||||
id: 'current-plan-code-col',
|
||||
tableId: 'segments-id',
|
||||
tableRef: { catalog: null, db: null, name: 'mart_account_segments' },
|
||||
name: 'current_plan_code',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
dimensionType: 'string',
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
parentColumnId: null,
|
||||
descriptions: {},
|
||||
embedding: null,
|
||||
sampleValues: null,
|
||||
cardinality: null,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
} satisfies KloEnrichedSchema;
|
||||
const profiles = {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'sqlite' as const,
|
||||
sqlAvailable: true,
|
||||
queryCount: 0,
|
||||
tables: [],
|
||||
warnings: [],
|
||||
columns: {
|
||||
'stg_plans.plan_code': {
|
||||
table: { catalog: null, db: null, name: 'stg_plans' },
|
||||
column: 'plan_code',
|
||||
nativeType: 'TEXT',
|
||||
normalizedType: 'text',
|
||||
rowCount: 4,
|
||||
nullCount: 0,
|
||||
distinctCount: 4,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['basic', 'enterprise', 'free', 'pro'],
|
||||
minTextLength: 4,
|
||||
maxTextLength: 10,
|
||||
},
|
||||
},
|
||||
};
|
||||
const result = resolveKloRelationshipGraph({
|
||||
schema,
|
||||
profiles,
|
||||
candidates: [
|
||||
{
|
||||
id: 'segments:(current_plan_code)->plans:(plan_code)',
|
||||
from: {
|
||||
tableId: 'segments-id',
|
||||
columnIds: ['current-plan-code-col'],
|
||||
table: { catalog: null, db: null, name: 'mart_account_segments' },
|
||||
columns: ['current_plan_code'],
|
||||
},
|
||||
to: {
|
||||
tableId: 'plans-id',
|
||||
columnIds: ['plan-code-col'],
|
||||
table: { catalog: null, db: null, name: 'stg_plans' },
|
||||
columns: ['plan_code'],
|
||||
},
|
||||
relationshipType: 'many_to_one',
|
||||
confidence: 0.902,
|
||||
source: 'column_suffix_match',
|
||||
evidence: {
|
||||
sourceColumnBase: 'current_plan',
|
||||
targetTableBase: 'plan',
|
||||
targetColumnBase: 'plan_code',
|
||||
targetKeyScore: 0.86,
|
||||
nameScore: 0.78,
|
||||
reasons: ['column_suffix_match', 'profile_unique_target'],
|
||||
},
|
||||
status: 'accepted',
|
||||
score: 0.98,
|
||||
validation: {
|
||||
targetUniqueness: 1,
|
||||
sourceCoverage: 1,
|
||||
violationCount: 0,
|
||||
violationRatio: 0,
|
||||
sourceNullRate: 0,
|
||||
targetNullRate: 0,
|
||||
childDistinct: 4,
|
||||
parentDistinct: 4,
|
||||
overlap: 4,
|
||||
checkedValues: 4,
|
||||
reasons: ['validation_passed'],
|
||||
},
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
expect(result.pks).toEqual([
|
||||
expect.objectContaining({
|
||||
table: 'stg_plans',
|
||||
columns: ['plan_code'],
|
||||
pkScore: 0.922,
|
||||
status: 'accepted',
|
||||
}),
|
||||
]);
|
||||
expect(result.relationships).toEqual([
|
||||
expect.objectContaining({
|
||||
source: 'column_suffix_match',
|
||||
status: 'accepted',
|
||||
pkScore: 0.922,
|
||||
fkScore: 0.953,
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
||||
it('keeps strong profile-only primary key evidence when name evidence is weak', () => {
|
||||
const baseSchema = schema();
|
||||
baseSchema.tables.push(
|
||||
table('events', [
|
||||
column('events', 'warehouse_key', {
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
|
||||
const baseProfiles = profiles();
|
||||
baseProfiles.tables.push({ table: { catalog: null, db: null, name: 'events' }, rowCount: 3 });
|
||||
baseProfiles.columns['events.warehouse_key'] = {
|
||||
table: { catalog: null, db: null, name: 'events' },
|
||||
column: 'warehouse_key',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['100', '101', '102'],
|
||||
minTextLength: 3,
|
||||
maxTextLength: 3,
|
||||
};
|
||||
|
||||
const result = resolveKloRelationshipGraph({
|
||||
schema: baseSchema,
|
||||
profiles: baseProfiles,
|
||||
candidates: [],
|
||||
});
|
||||
|
||||
expect(result.pks).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
table: 'events',
|
||||
columns: ['warehouse_key'],
|
||||
status: 'review',
|
||||
evidence: expect.objectContaining({
|
||||
reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
|
||||
}),
|
||||
}),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('keeps strong profile-only primary key evidence when the column is not key-shaped', () => {
|
||||
const baseSchema = schema();
|
||||
baseSchema.tables.push(
|
||||
table('events', [
|
||||
column('events', 'opaque_reference', {
|
||||
nullable: false,
|
||||
primaryKey: false,
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
}),
|
||||
]),
|
||||
);
|
||||
|
||||
const baseProfiles = profiles();
|
||||
baseProfiles.tables.push({ table: { catalog: null, db: null, name: 'events' }, rowCount: 3 });
|
||||
baseProfiles.columns['events.opaque_reference'] = {
|
||||
table: { catalog: null, db: null, name: 'events' },
|
||||
column: 'opaque_reference',
|
||||
nativeType: 'INTEGER',
|
||||
normalizedType: 'integer',
|
||||
rowCount: 3,
|
||||
nullCount: 0,
|
||||
distinctCount: 3,
|
||||
uniquenessRatio: 1,
|
||||
nullRate: 0,
|
||||
sampleValues: ['100', '101', '102'],
|
||||
minTextLength: 3,
|
||||
maxTextLength: 3,
|
||||
};
|
||||
|
||||
const result = resolveKloRelationshipGraph({
|
||||
schema: baseSchema,
|
||||
profiles: baseProfiles,
|
||||
candidates: [],
|
||||
});
|
||||
|
||||
const inferredPk = result.pks.find((candidate) => candidate.table === 'events');
|
||||
expect(inferredPk).toMatchObject({
|
||||
table: 'events',
|
||||
columns: ['opaque_reference'],
|
||||
status: 'review',
|
||||
evidence: expect.objectContaining({
|
||||
reasons: expect.arrayContaining(['profile_only_primary_key', 'weak_name_profile_key']),
|
||||
}),
|
||||
});
|
||||
expect(inferredPk?.pkScore).toBeGreaterThanOrEqual(0.55);
|
||||
});
|
||||
});
|
||||
508
packages/context/src/scan/relationship-graph-resolver.ts
Normal file
508
packages/context/src/scan/relationship-graph-resolver.ts
Normal file
|
|
@ -0,0 +1,508 @@
|
|||
import type {
|
||||
KloEnrichedColumn,
|
||||
KloEnrichedSchema,
|
||||
KloEnrichedTable,
|
||||
KloRelationshipEndpoint,
|
||||
} from './enrichment-types.js';
|
||||
import { normalizeKloRelationshipName } from './relationship-candidates.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import { scoreKloRelationshipCandidate } from './relationship-scoring.js';
|
||||
import type { KloValidatedRelationshipDiscoveryCandidate } from './relationship-validation.js';
|
||||
|
||||
/** Outcome assigned by the resolver to a primary-key candidate or a relationship candidate. */
export type KloResolvedRelationshipStatus =
  | 'accepted'
  | 'review'
  | 'rejected';
|
||||
|
||||
/** Thresholds and switches that steer how scores are turned into statuses. */
export interface KloRelationshipGraphResolverSettings {
  /** Score at or above which a result is `accepted`, provided acceptance is otherwise allowed. */
  acceptThreshold: number;
  /** Score at or above which a result is at least `review`; anything lower is `rejected`. */
  reviewThreshold: number;
  /** Target PK score a relationship's target must reach before the relationship may be accepted. */
  minTargetPkScoreForAcceptance: number;
  /** When true, a relationship may only be accepted if its validation reasons include `validation_passed`. */
  validationRequiredForManifest: boolean;
}
|
||||
|
||||
/** Supporting facts recorded alongside a resolved primary-key candidate. */
export interface KloResolvedRelationshipPkEvidence {
  /** True when the schema column itself is flagged as a primary key. */
  declaredPrimaryKey: boolean;
  /** Uniqueness ratio of the column (taken from the profile; fixed at 1 for declared keys). */
  targetUniqueness: number;
  /** Number of incoming candidates whose validated status is `accepted`. */
  incomingAcceptedCount: number;
  /** Number of incoming candidates whose validated status is `review`. */
  incomingReviewCount: number;
  /** Machine-readable reason codes explaining the score and status. */
  reasons: string[];
}
|
||||
|
||||
/** A primary-key candidate for one table column, with its score, status and evidence. */
export interface KloResolvedRelationshipPk {
  /** Table name (taken from the table ref's `name`). */
  table: string;
  /** Column names forming the key; the resolver emits a single column per entry. */
  columns: string[];
  /** Primary-key likelihood score. */
  pkScore: number;
  /** Resolver verdict for this key candidate. */
  status: KloResolvedRelationshipStatus;
  /** Number of relationship candidates that point at this column. */
  incomingCandidateCount: number;
  /** Facts backing the score and status. */
  evidence: KloResolvedRelationshipPkEvidence;
}
|
||||
|
||||
/** Graph-level evidence attached to each resolved relationship. */
export interface KloResolvedRelationshipGraphEvidence {
  /** PK score of the column the relationship points at. */
  targetPkScore: number;
  /** Incoming candidate count copied from the target's resolved PK entry. */
  incomingCandidateCount: number;
  /** 1-based position of this relationship among candidates sharing the same source columns. */
  conflictRank: number;
  /** Machine-readable reason codes explaining the final status. */
  reasons: string[];
}
|
||||
|
||||
/**
 * A validated relationship candidate after graph resolution.
 *
 * Carries every field of the validated candidate except `status`, which is
 * replaced by the resolver's own verdict.
 */
export interface KloResolvedRelationshipDiscoveryCandidate
  extends Omit<KloValidatedRelationshipDiscoveryCandidate, 'status'> {
  /** Resolver verdict, replacing the validation-stage status. */
  status: KloResolvedRelationshipStatus;
  /** PK score of the relationship's target column. */
  pkScore: number;
  /** Foreign-key score computed by the resolver for this relationship. */
  fkScore: number;
  /** Graph-level evidence, including conflict ranking and reason codes. */
  graph: KloResolvedRelationshipGraphEvidence;
}
|
||||
|
||||
/** Output of `resolveKloRelationshipGraph`. */
export interface KloRelationshipGraphResolutionResult {
  /** Resolved primary-key candidates, highest `pkScore` first. */
  pks: KloResolvedRelationshipPk[];
  /** Resolved relationships after source-conflict handling. */
  relationships: KloResolvedRelationshipDiscoveryCandidate[];
}
|
||||
|
||||
/** Arguments accepted by `resolveKloRelationshipGraph`. */
export interface ResolveKloRelationshipGraphInput {
  /** Enriched schema; only tables with `enabled` set are scanned for key columns. */
  schema: KloEnrichedSchema;
  /** Column profiles, looked up by `"<table>.<column>"`. */
  profiles: KloRelationshipProfileArtifact;
  /** Relationship candidates that have already been through validation. */
  candidates: readonly KloValidatedRelationshipDiscoveryCandidate[];
  /** Optional overrides layered on top of the default resolver settings. */
  settings?: Partial<KloRelationshipGraphResolverSettings>;
}
|
||||
|
||||
/** Baseline resolver settings used when the caller supplies no override for a field. */
const DEFAULT_SETTINGS: KloRelationshipGraphResolverSettings = {
  acceptThreshold: 0.85,
  reviewThreshold: 0.55,
  minTargetPkScoreForAcceptance: 0.78,
  validationRequiredForManifest: true,
};

/**
 * Column-name tokens that mark a column as a measure; a profile-only key
 * candidate whose name contains one of them gets a type-compatibility of 0.
 */
const PROFILE_ONLY_PK_MEASURE_NAME_TOKENS = new Set(['amount', 'count', 'price', 'quantity', 'subtotal', 'total']);

/**
 * Layer caller overrides on top of {@link DEFAULT_SETTINGS}.
 *
 * Overrides that are explicitly `undefined` are dropped before merging.
 * `Partial<>` permits `{ acceptThreshold: undefined }`, and a plain spread
 * would let that `undefined` replace the default, after which every
 * `score >= threshold` comparison is false.
 *
 * @param settings Optional partial overrides.
 * @returns A fresh settings object; the defaults object is never returned or mutated.
 */
function mergeSettings(
  settings: Partial<KloRelationshipGraphResolverSettings> | undefined,
): KloRelationshipGraphResolverSettings {
  if (!settings) {
    return { ...DEFAULT_SETTINGS };
  }
  // The cast only restores the key typing that Object.entries/fromEntries erase.
  const definedOverrides = Object.fromEntries(
    Object.entries(settings).filter(([, value]) => value !== undefined),
  ) as Partial<KloRelationshipGraphResolverSettings>;
  return { ...DEFAULT_SETTINGS, ...definedOverrides };
}
|
||||
|
||||
/**
 * Clamp a score into `[0, 1]` and round it to three decimal places.
 *
 * `NaN` is mapped to 0. It would otherwise pass straight through the clamp,
 * and a NaN score makes the subtraction-based sort comparators in this file
 * inconsistent.
 *
 * @param value Raw score; may be out of range or NaN.
 * @returns A number in `[0, 1]` with at most three decimals.
 */
function roundScore(value: number): number {
  if (Number.isNaN(value)) {
    return 0;
  }
  const clamped = Math.max(0, Math.min(1, value));
  return Number(clamped.toFixed(3));
}
|
||||
|
||||
/**
 * Build the `"<table>.<column>"` key used to group candidates by target column.
 * Only the table's `name` and the endpoint's first column take part in the key.
 */
function endpointKey(endpoint: KloRelationshipEndpoint): string {
  const tableName = endpoint.table.name;
  const columnName = singleRelationshipColumn(endpoint);
  return `${tableName}.${columnName}`;
}
|
||||
|
||||
/**
 * Build the `"<tableId>:<columnId,columnId,...>"` key that identifies the
 * source side of a relationship when detecting conflicting targets.
 */
function sourceKey(endpoint: KloRelationshipEndpoint): string {
  const joinedColumnIds = endpoint.columnIds.join(',');
  return `${endpoint.tableId}:${joinedColumnIds}`;
}
|
||||
|
||||
/**
 * Return the first column name of an endpoint.
 *
 * @throws Error when the endpoint has no first column (or it is an empty string).
 */
function singleRelationshipColumn(endpoint: KloRelationshipEndpoint): string {
  const [firstColumn] = endpoint.columns;
  if (firstColumn) {
    return firstColumn;
  }
  throw new Error(`Expected relationship endpoint ${endpoint.table.name} to contain one column`);
}
|
||||
|
||||
/** Build the `"<table>.(<col,col,...>)"` key used to index primary-key candidates. */
function pkKey(pk: Pick<KloResolvedRelationshipPk, 'table' | 'columns'>): string {
  const columnList = pk.columns.join(',');
  return `${pk.table}.(${columnList})`;
}
|
||||
|
||||
/**
 * Build a `"<fromTable>.<fromColumn>-><toTable>.<toColumn>"` string used as the
 * final, deterministic tie-breaker when ranking relationships.
 */
function candidateSortKey(candidate: Pick<KloValidatedRelationshipDiscoveryCandidate, 'from' | 'to'>): string {
  const { from, to } = candidate;
  const sourcePart = `${from.table.name}.${singleRelationshipColumn(from)}`;
  const targetPart = `${to.table.name}.${singleRelationshipColumn(to)}`;
  return `${sourcePart}->${targetPart}`;
}
|
||||
|
||||
/**
 * Translate a score into a status.
 *
 * @param score Score to classify.
 * @param settings Supplies `acceptThreshold` and `reviewThreshold`.
 * @param acceptedAllowed When false, the result is capped at `review`.
 * @returns `accepted` if allowed and the accept threshold is met; otherwise
 * `review` if the review threshold is met; otherwise `rejected`.
 */
function statusForScore(
  score: number,
  settings: KloRelationshipGraphResolverSettings,
  acceptedAllowed: boolean,
): KloResolvedRelationshipStatus {
  const meetsAcceptance = acceptedAllowed && score >= settings.acceptThreshold;
  if (meetsAcceptance) {
    return 'accepted';
  }
  return score >= settings.reviewThreshold ? 'review' : 'rejected';
}
|
||||
|
||||
/** True when the candidate's validation reasons contain `validation_passed`. */
function candidateHasValidationPassed(candidate: KloValidatedRelationshipDiscoveryCandidate): boolean {
  const { reasons } = candidate.validation;
  return reasons.some((reason) => reason === 'validation_passed');
}
|
||||
|
||||
/**
 * True when validation could not be performed for the candidate, i.e. its
 * validation reasons contain `validation_unavailable` or `profile_unavailable`.
 */
function candidateIsValidationUnavailable(candidate: KloValidatedRelationshipDiscoveryCandidate): boolean {
  const { reasons } = candidate.validation;
  if (reasons.includes('validation_unavailable')) {
    return true;
  }
  return reasons.includes('profile_unavailable');
}
|
||||
|
||||
/**
 * Collect one accepted PK entry for every column flagged `primaryKey` on an
 * enabled table.
 *
 * Each flagged column becomes its own single-column entry with a score of 1
 * and zeroed incoming counters.
 */
function declaredPrimaryKeys(schema: KloEnrichedSchema): KloResolvedRelationshipPk[] {
  return schema.tables
    .filter((table) => table.enabled)
    .flatMap((table) =>
      table.columns
        .filter((column) => column.primaryKey)
        .map(
          (column): KloResolvedRelationshipPk => ({
            table: table.ref.name,
            columns: [column.name],
            pkScore: 1,
            status: 'accepted',
            incomingCandidateCount: 0,
            evidence: {
              declaredPrimaryKey: true,
              targetUniqueness: 1,
              incomingAcceptedCount: 0,
              incomingReviewCount: 0,
              reasons: ['declared_primary_key'],
            },
          }),
        ),
    );
}
|
||||
|
||||
/** List every `(table, column)` pair of the enabled tables, in schema order. */
function schemaTargetColumns(schema: KloEnrichedSchema): Array<{ table: KloEnrichedTable; column: KloEnrichedColumn }> {
  const pairs: Array<{ table: KloEnrichedTable; column: KloEnrichedColumn }> = [];
  for (const table of schema.tables) {
    if (!table.enabled) {
      continue;
    }
    for (const column of table.columns) {
      pairs.push({ table, column });
    }
  }
  return pairs;
}
|
||||
|
||||
/**
 * Look up the profiled uniqueness ratio of `"<table>.<column>"`.
 * Falls back to 0 when the column has no profile or no ratio.
 */
function profileUniqueness(profiles: KloRelationshipProfileArtifact, tableName: string, columnName: string): number {
  const columnProfile = profiles.columns[`${tableName}.${columnName}`];
  return columnProfile?.uniquenessRatio ?? 0;
}
|
||||
|
||||
/**
 * Look up the profiled null rate of `"<table>.<column>"`.
 * Falls back to 1 (treated as entirely null) when the column has no profile or no rate.
 */
function profileNullRate(profiles: KloRelationshipProfileArtifact, tableName: string, columnName: string): number {
  const columnProfile = profiles.columns[`${tableName}.${columnName}`];
  return columnProfile?.nullRate ?? 1;
}
|
||||
|
||||
/** True when the profile artifact has a (truthy) entry for `"<table>.<column>"`. */
function profileColumnExists(profiles: KloRelationshipProfileArtifact, tableName: string, columnName: string): boolean {
  const lookupKey = `${tableName}.${columnName}`;
  return !!profiles.columns[lookupKey];
}
|
||||
|
||||
/**
 * Score how key-like a column name is when only profile data is available.
 *
 * The normalized column name is compared against a ranked list; the first
 * match wins:
 * `id` = 1, `<singular table>_id` = 0.96, `<singular table>_key` = 0.88,
 * `key` or `uuid` = 0.76, anything else = 0.
 */
function profileOnlyPkNameScore(tableName: string, columnName: string): number {
  const singularTable = normalizeKloRelationshipName(tableName).singular;
  const normalizedColumn = normalizeKloRelationshipName(columnName).normalized;

  // Ordered from strongest to weakest naming signal.
  const rankedNames: ReadonlyArray<readonly [string, number]> = [
    ['id', 1],
    [`${singularTable}_id`, 0.96],
    [`${singularTable}_key`, 0.88],
    ['key', 0.76],
    ['uuid', 0.76],
  ];

  for (const [keyName, score] of rankedNames) {
    if (normalizedColumn === keyName) {
      return score;
    }
  }
  return 0;
}
|
||||
|
||||
/**
 * Return 0 when any `_`-separated token of the normalized column name is a
 * known measure word (see `PROFILE_ONLY_PK_MEASURE_NAME_TOKENS`), otherwise 1.
 */
function profileOnlyPkTypeCompatibility(columnName: string): number {
  const normalizedName = normalizeKloRelationshipName(columnName).normalized;
  for (const token of normalizedName.split('_')) {
    // Empty tokens (from doubled or edge underscores) are ignored.
    if (token && PROFILE_ONLY_PK_MEASURE_NAME_TOKENS.has(token)) {
      return 0;
    }
  }
  return 1;
}
|
||||
|
||||
/**
 * Evaluate whether a column with no incoming candidates and no declared key
 * still looks like a primary key based on its profile alone.
 *
 * @returns `null` when the column has no profile, when its uniqueness is below
 * 0.98 or its null rate above 0.05, or when the weighted score falls below the
 * default review threshold. Otherwise the measured values plus the score and a
 * `weakName` flag (name score below 0.74).
 */
function profileOnlyPkEvidence(input: {
  profiles: KloRelationshipProfileArtifact;
  tableName: string;
  columnName: string;
}): { nameScore: number; nullRate: number; uniqueness: number; pkScore: number; weakName: boolean } | null {
  const { profiles, tableName, columnName } = input;
  if (!profileColumnExists(profiles, tableName, columnName)) {
    return null;
  }

  const uniqueness = profileUniqueness(profiles, tableName, columnName);
  const nullRate = profileNullRate(profiles, tableName, columnName);
  const nameScore = profileOnlyPkNameScore(tableName, columnName);

  // Hard gate: the column must be (almost) unique and (almost) never null.
  const failsProfileGate = uniqueness < 0.98 || nullRate > 0.05;
  if (failsProfileGate) {
    return null;
  }

  // Value overlap and embedding similarity are scored as 0 with zero weight here.
  const features = {
    nameSimilarity: nameScore,
    typeCompatibility: profileOnlyPkTypeCompatibility(columnName),
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: uniqueness,
    profileNullRate: 1 - nullRate,
    structuralPrior: 0.65,
  };
  const weights = {
    nameSimilarity: 0.2,
    typeCompatibility: 0.08,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.48,
    profileNullRate: 0.2,
    structuralPrior: 0.04,
  };
  const { score } = scoreKloRelationshipCandidate(features, weights);

  // NOTE(review): this compares against DEFAULT_SETTINGS, not the caller's merged
  // settings, so a custom reviewThreshold does not affect this gate — confirm intent.
  if (score < DEFAULT_SETTINGS.reviewThreshold) {
    return null;
  }

  return { nameScore, nullRate, uniqueness, pkScore: score, weakName: nameScore < 0.74 };
}
|
||||
|
||||
/**
 * Resolve the primary-key entry for a single target column.
 *
 * Resolution order:
 * 1. A declared key is returned unchanged.
 * 2. With profile-only evidence, the profile score is used and acceptance is
 *    only allowed when the column name is not weak.
 * 3. Otherwise the score blends target uniqueness (0.52), the best incoming
 *    candidate score (0.28), the best incoming target-key score (0.12) and
 *    incoming volume (0.08).
 *
 * @returns The resolved PK entry with its evidence and reason codes.
 */
function resolveTargetPk(input: {
  table: string;
  column: string;
  declared: KloResolvedRelationshipPk | undefined;
  profiles: KloRelationshipProfileArtifact;
  incoming: readonly KloValidatedRelationshipDiscoveryCandidate[];
  settings: KloRelationshipGraphResolverSettings;
  profileOnly?: { nameScore: number; nullRate: number; uniqueness: number; pkScore: number; weakName: boolean } | null;
}): KloResolvedRelationshipPk {
  const { declared, incoming, profileOnly, settings } = input;
  if (declared) {
    return declared;
  }

  const targetUniqueness = profileUniqueness(input.profiles, input.table, input.column);

  // Single pass over the incoming candidates to gather counts and maxima.
  let acceptedCount = 0;
  let reviewCount = 0;
  let incomingQuality = 0;
  let keyEvidence = 0;
  for (const candidate of incoming) {
    if (candidate.status === 'accepted') {
      acceptedCount += 1;
    }
    if (candidate.status === 'review') {
      reviewCount += 1;
    }
    incomingQuality = Math.max(incomingQuality, candidate.score);
    keyEvidence = Math.max(keyEvidence, candidate.evidence.targetKeyScore);
  }
  const incomingVolume = Math.min(1, acceptedCount * 0.3 + reviewCount * 0.15);

  const reasons: string[] = [];
  if (targetUniqueness >= 0.9) {
    reasons.push('unique_target_column');
  }
  if (acceptedCount > 0) {
    reasons.push('incoming_validated_reference');
  }
  if (reviewCount > 0) {
    reasons.push('incoming_review_reference');
  }
  if (keyEvidence >= 0.8) {
    reasons.push('target_key_like');
  }
  if (incoming.length === 0) {
    reasons.push('no_incoming_references');
  }

  if (profileOnly) {
    reasons.push('not_null_profile', 'profile_only_primary_key');
    reasons.push(profileOnly.weakName ? 'weak_name_profile_key' : 'profile_key_name');
    const profileScore = profileOnly.pkScore;
    return {
      table: input.table,
      columns: [input.column],
      pkScore: profileScore,
      // A weak name caps the status at 'review'.
      status: statusForScore(profileScore, settings, !profileOnly.weakName),
      incomingCandidateCount: 0,
      evidence: {
        declaredPrimaryKey: false,
        targetUniqueness,
        incomingAcceptedCount: 0,
        incomingReviewCount: 0,
        reasons,
      },
    };
  }

  const pkScore = roundScore(0.52 * targetUniqueness + 0.28 * incomingQuality + 0.12 * keyEvidence + 0.08 * incomingVolume);
  const acceptedAllowed = acceptedCount > 0 && targetUniqueness >= 0.9;
  // A target with review-level references is kept for review even when its score is below the review threshold.
  const keepForReview = reviewCount > 0 && pkScore < settings.reviewThreshold;
  const status: KloResolvedRelationshipStatus = keepForReview
    ? 'review'
    : statusForScore(pkScore, settings, acceptedAllowed);

  return {
    table: input.table,
    columns: [input.column],
    pkScore,
    status,
    incomingCandidateCount: incoming.length,
    evidence: {
      declaredPrimaryKey: false,
      targetUniqueness,
      incomingAcceptedCount: acceptedCount,
      incomingReviewCount: reviewCount,
      reasons,
    },
  };
}
|
||||
|
||||
/**
 * Score a single validated candidate against its resolved target PK and assign
 * a status, before any conflict handling between candidates.
 *
 * The FK score blends the candidate score (0.48), the target PK score (0.3),
 * the candidate confidence (0.14) and a validation-passed bonus (0.08).
 * Candidates rejected by validation stay rejected. Candidates whose validation
 * was unavailable are forced to `review`, with the FK score raised to at least
 * the review threshold.
 */
function baseRelationshipResolution(input: {
  candidate: KloValidatedRelationshipDiscoveryCandidate;
  pk: KloResolvedRelationshipPk;
  settings: KloRelationshipGraphResolverSettings;
}): KloResolvedRelationshipDiscoveryCandidate {
  const { candidate, pk, settings } = input;
  const rejectedByValidation = candidate.status === 'rejected';
  const validationUnavailable = candidateIsValidationUnavailable(candidate);
  const validationPassed = candidateHasValidationPassed(candidate);
  const targetPkStrongEnough = pk.pkScore >= settings.minTargetPkScoreForAcceptance;

  const reasons: string[] = [];
  if (rejectedByValidation) {
    reasons.push('candidate_validation_rejected');
  }
  if (validationUnavailable) {
    reasons.push('validation_unavailable_review_only');
  }
  reasons.push(targetPkStrongEnough ? 'target_pk_score_passed' : 'target_pk_score_low');
  if (validationPassed) {
    reasons.push('validation_passed');
  }

  const validationPassBonus = validationPassed ? 1 : 0;
  let fkScore = roundScore(
    0.48 * candidate.score + 0.3 * pk.pkScore + 0.14 * candidate.confidence + 0.08 * validationPassBonus,
  );

  let status: KloResolvedRelationshipStatus;
  if (rejectedByValidation) {
    status = 'rejected';
  } else if (validationUnavailable) {
    status = 'review';
    fkScore = Math.max(fkScore, settings.reviewThreshold);
  } else {
    const acceptedAllowed =
      candidate.status === 'accepted' &&
      targetPkStrongEnough &&
      (!settings.validationRequiredForManifest || validationPassed);
    status = statusForScore(fkScore, settings, acceptedAllowed);
  }

  // The last reason always mirrors the final status.
  const statusReason: Record<KloResolvedRelationshipStatus, string> = {
    accepted: 'fk_score_passed',
    review: 'fk_score_review',
    rejected: 'fk_score_rejected',
  };
  reasons.push(statusReason[status]);

  return {
    ...candidate,
    status,
    pkScore: pk.pkScore,
    fkScore,
    graph: {
      targetPkScore: pk.pkScore,
      incomingCandidateCount: pk.incomingCandidateCount,
      // Placeholder rank; applySourceConflicts assigns the real one.
      conflictRank: 1,
      reasons,
    },
  };
}
|
||||
|
||||
/**
 * Sort comparator that puts the strongest relationship first.
 *
 * Ordering: higher FK score, then higher validation source coverage, then
 * higher PK score, then the candidate sort key in ascending locale order.
 */
function relationshipRank(
  left: KloResolvedRelationshipDiscoveryCandidate,
  right: KloResolvedRelationshipDiscoveryCandidate,
): number {
  const byFkScore = right.fkScore - left.fkScore;
  if (byFkScore) {
    return byFkScore;
  }
  const byCoverage = right.validation.sourceCoverage - left.validation.sourceCoverage;
  if (byCoverage) {
    return byCoverage;
  }
  const byPkScore = right.pkScore - left.pkScore;
  if (byPkScore) {
    return byPkScore;
  }
  return candidateSortKey(left).localeCompare(candidateSortKey(right));
}
|
||||
|
||||
/**
 * Ensure at most one accepted relationship per source column set.
 *
 * Relationships are grouped by their source key and ranked with
 * `relationshipRank`. Within a group the first accepted relationship keeps its
 * status. Every later accepted one is downgraded to `rejected`: its
 * `fk_score_passed` reason is removed and `conflict_lost` is appended.
 * Each relationship gets its 1-based rank within the group as `conflictRank`.
 *
 * @param relationships Relationships produced by `baseRelationshipResolution`; not mutated.
 * @returns New relationship objects, sorted with `relationshipRank`.
 */
function applySourceConflicts(
  relationships: readonly KloResolvedRelationshipDiscoveryCandidate[],
): KloResolvedRelationshipDiscoveryCandidate[] {
  // Group in place: appending to the existing bucket avoids re-copying the
  // bucket on every insert (quadratic for large groups).
  const bySource = new Map<string, KloResolvedRelationshipDiscoveryCandidate[]>();
  for (const relationship of relationships) {
    const key = sourceKey(relationship.from);
    const bucket = bySource.get(key);
    if (bucket) {
      bucket.push(relationship);
    } else {
      bySource.set(key, [relationship]);
    }
  }

  const resolved: KloResolvedRelationshipDiscoveryCandidate[] = [];
  for (const group of bySource.values()) {
    // The bucket is owned by this function, so sorting it in place is safe.
    const ranked = group.sort(relationshipRank);
    let acceptedSeen = false;
    for (const [index, relationship] of ranked.entries()) {
      const conflictRank = index + 1;
      const losesConflict = relationship.status === 'accepted' && acceptedSeen;
      if (losesConflict) {
        resolved.push({
          ...relationship,
          status: 'rejected',
          graph: {
            ...relationship.graph,
            conflictRank,
            reasons: [...relationship.graph.reasons.filter((reason) => reason !== 'fk_score_passed'), 'conflict_lost'],
          },
        });
        continue;
      }
      if (relationship.status === 'accepted') {
        acceptedSeen = true;
      }
      resolved.push({
        ...relationship,
        graph: {
          ...relationship.graph,
          conflictRank,
        },
      });
    }
  }

  return resolved.sort(relationshipRank);
}
|
||||
|
||||
/**
 * Resolve primary keys and relationships for a scanned schema.
 *
 * Steps:
 * 1. Merge settings and collect the declared primary keys.
 * 2. Group candidates by their target `"<table>.<column>"`.
 * 3. For every column of an enabled table that is declared as a key, has
 *    incoming candidates, or qualifies on profile evidence alone, resolve a PK entry.
 * 4. Score every candidate against its target's PK entry. A target that was
 *    not produced by step 3 is resolved on the fly and recorded.
 * 5. Apply source-conflict handling so each source keeps at most one accepted target.
 *
 * @param input Schema, profiles, validated candidates and optional settings overrides.
 * @returns PK entries sorted by descending score (ties broken by key), plus the resolved relationships.
 * @throws Error when a candidate endpoint has no column (raised by `singleRelationshipColumn`).
 */
export function resolveKloRelationshipGraph(
  input: ResolveKloRelationshipGraphInput,
): KloRelationshipGraphResolutionResult {
  const settings = mergeSettings(input.settings);
  const declared = declaredPrimaryKeys(input.schema);
  const declaredByKey = new Map(declared.map((pk) => [pkKey(pk), pk]));

  // Group candidates by target column, appending in place rather than
  // re-copying the bucket on each insert.
  const incomingByTarget = new Map<string, KloValidatedRelationshipDiscoveryCandidate[]>();
  for (const candidate of input.candidates) {
    const key = endpointKey(candidate.to);
    const bucket = incomingByTarget.get(key);
    if (bucket) {
      bucket.push(candidate);
    } else {
      incomingByTarget.set(key, [candidate]);
    }
  }

  const pkCandidates = new Map<string, KloResolvedRelationshipPk>();
  for (const item of schemaTargetColumns(input.schema)) {
    const key = `${item.table.ref.name}.(${item.column.name})`;
    const incoming = incomingByTarget.get(`${item.table.ref.name}.${item.column.name}`) ?? [];
    // Profile-only evidence is only considered for columns that have neither
    // incoming candidates nor a declared key.
    const profileOnly =
      incoming.length === 0 && !item.column.primaryKey
        ? profileOnlyPkEvidence({
            profiles: input.profiles,
            tableName: item.table.ref.name,
            columnName: item.column.name,
          })
        : null;
    if (incoming.length === 0 && !item.column.primaryKey && !profileOnly) {
      continue;
    }
    const pk = resolveTargetPk({
      table: item.table.ref.name,
      column: item.column.name,
      declared: declaredByKey.get(key),
      profiles: input.profiles,
      incoming,
      settings,
      profileOnly,
    });
    pkCandidates.set(key, pk);
  }

  const relationships: KloResolvedRelationshipDiscoveryCandidate[] = [];
  for (const candidate of input.candidates) {
    const toColumn = singleRelationshipColumn(candidate.to);
    const key = `${candidate.to.table.name}.(${toColumn})`;
    // Fallback for targets the schema scan did not yield a PK entry for.
    const pk =
      pkCandidates.get(key) ??
      resolveTargetPk({
        table: candidate.to.table.name,
        column: toColumn,
        declared: undefined,
        profiles: input.profiles,
        incoming: incomingByTarget.get(endpointKey(candidate.to)) ?? [],
        settings,
        profileOnly: null,
      });
    pkCandidates.set(key, pk);
    relationships.push(baseRelationshipResolution({ candidate, pk, settings }));
  }

  return {
    pks: Array.from(pkCandidates.values()).sort(
      (left, right) => right.pkScore - left.pkScore || pkKey(left).localeCompare(pkKey(right)),
    ),
    relationships: applySourceConflicts(relationships),
  };
}
|
||||
240
packages/context/src/scan/relationship-llm-proposal.test.ts
Normal file
240
packages/context/src/scan/relationship-llm-proposal.test.ts
Normal file
|
|
@ -0,0 +1,240 @@
|
|||
import type { KloLlmProvider } from '@klo/llm';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import { proposeKloRelationshipCandidatesWithLlm } from './relationship-llm-proposal.js';
|
||||
|
||||
/**
 * Build a `KloLlmProvider` test double whose methods are all `vi.fn` mocks.
 *
 * Both model getters return the same stub model object; `activeBackend`
 * returns the given provider name; prompt caching is reported as disabled.
 *
 * @param provider Backend name placed on the stub model and returned by `activeBackend`.
 */
function llmProvider(provider = 'anthropic'): KloLlmProvider {
  const stubModel = { modelId: 'claude-sonnet-4-6', provider };

  const getModel = vi.fn(() => stubModel as ReturnType<KloLlmProvider['getModel']>);
  const getModelByName = vi.fn(() => stubModel as ReturnType<KloLlmProvider['getModelByName']>);
  // A fresh config object is produced on every call.
  const promptCachingConfig = vi.fn(
    () =>
      ({
        enabled: false,
        systemTtl: '1h',
        toolsTtl: '1h',
        historyTtl: '5m',
        cacheSystem: true,
        cacheTools: true,
        cacheHistory: true,
        vertexFallbackTo5m: false,
      }) as ReturnType<KloLlmProvider['promptCachingConfig']>,
  );
  const activeBackend = vi.fn(() => provider as ReturnType<KloLlmProvider['activeBackend']>);

  return {
    getModel,
    getModelByName,
    cacheMarker: vi.fn(),
    repairToolCallHandler: vi.fn(),
    thinkingProviderOptions: vi.fn(() => ({})),
    telemetryConfig: vi.fn(() => undefined),
    promptCachingConfig,
    activeBackend,
  };
}
|
||||
|
||||
/**
 * Build an enriched-column fixture.
 *
 * Defaults describe a nullable, non-key INTEGER column with id
 * `"<tableId>.<name>"`. `overrides` is spread last, so any property present on
 * it (including `id` or `name`) replaces the default.
 */
function column(tableId: string, name: string, overrides: Partial<KloEnrichedColumn> = {}): KloEnrichedColumn {
  const defaults: KloEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: overrides.tableRef ?? { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Build an enabled enriched-table fixture whose id equals its name.
 *
 * Each column is copied and re-pointed at this table (`tableId` and
 * `tableRef`); the input column objects are not modified.
 */
function table(name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KloEnrichedColumn[] = [];
  for (const source of columns) {
    ownedColumns.push({ ...source, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Build the two-table schema fixture: `customers(id, email)` and
 * `orders(id, buyer_ref)`, with no declared relationships.
 */
function schema(): KloEnrichedSchema {
  const customers = table('customers', [
    column('customers', 'id', { nullable: false }),
    column('customers', 'email', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const orders = table('orders', [
    column('orders', 'id', { nullable: false }),
    column('orders', 'buyer_ref'),
  ]);
  return {
    connectionId: 'warehouse',
    relationships: [],
    tables: [customers, orders],
  };
}
|
||||
|
||||
/**
 * Test fixture: a relationship profile artifact for the `warehouse` connection.
 *
 * Both tables report two rows, and the two profiled columns (`customers.id`,
 * `orders.buyer_ref`) are fully unique, non-null INTEGER columns with samples `1` and `2`.
 */
function profile(): KloRelationshipProfileArtifact {
  const refFor = (name: string) => ({ catalog: null, db: null, name });
  // Both profiled columns share every statistic; only the table and column differ.
  const uniqueIntegerProfile = (tableName: string, columnName: string) => ({
    table: refFor(tableName),
    column: columnName,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    rowCount: 2,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 1,
    nullRate: 0,
    sampleValues: ['1', '2'],
    minTextLength: 1,
    maxTextLength: 1,
  });

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 4,
    warnings: [],
    tables: [
      { table: refFor('customers'), rowCount: 2 },
      { table: refFor('orders'), rowCount: 2 },
    ],
    columns: {
      'customers.id': uniqueIntegerProfile('customers', 'id'),
      'orders.buyer_ref': uniqueIntegerProfile('orders', 'buyer_ref'),
    },
  };
}
|
||||
|
||||
describe('relationship LLM proposals', () => {
  // Happy path: one valid FK plus a PK proposal for its target becomes one review candidate.
  it('maps valid structured FK proposals into review candidates with rationale evidence', async () => {
    const modelOutput = {
      pkCandidates: [{ table: 'customers', column: 'id', confidence: 0.94, rationale: 'Unique customer identifier.' }],
      fkCandidates: [
        {
          fromTable: 'orders',
          fromColumn: 'buyer_ref',
          toTable: 'customers',
          toColumn: 'id',
          confidence: 0.88,
          rationale: 'Buyer reference values match customer identifiers.',
        },
      ],
    };
    const generateText = vi.fn(async () => ({ output: modelOutput }));

    const result = await proposeKloRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmProvider: llmProvider(),
      generateText,
    });

    expect(result.summary).toBe('completed');
    expect(result.llmCalls).toBe(1);
    expect(result.warnings).toEqual([]);
    expect(result.candidates).toHaveLength(1);
    expect(result.candidates[0]).toMatchObject({
      from: { tableId: 'orders', columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
      to: { tableId: 'customers', columnIds: ['customers.id'], columns: ['id'] },
      source: 'llm_proposal',
      status: 'review',
      evidence: {
        llmConfidence: 0.88,
        llmRationale: 'Buyer reference values match customer identifiers.',
        reasons: ['llm_proposal', 'llm_pk_proposal'],
      },
    });

    // The evidence packet is serialized into the user message, so it must mention "tables".
    const userMessageWithTables = expect.objectContaining({
      role: 'user',
      content: expect.stringContaining('"tables"'),
    });
    expect(generateText).toHaveBeenCalledWith(
      expect.objectContaining({
        messages: expect.arrayContaining([userMessageWithTables]),
      }),
    );
  });

  // A deterministic model short-circuits before any generation is attempted.
  it('skips deterministic providers without calling generateText', async () => {
    const generateText = vi.fn();

    const result = await proposeKloRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmProvider: llmProvider('deterministic'),
      generateText,
    });

    expect(result).toMatchObject({ candidates: [], llmCalls: 0, summary: 'skipped' });
    expect(result.warnings).toEqual([]);
    expect(generateText).not.toHaveBeenCalled();
  });

  // Unknown columns and thrown generation errors are both reported as recoverable warnings.
  it('returns recoverable warnings for invalid references and generation failures', async () => {
    const outputWithUnknownColumn = {
      pkCandidates: [],
      fkCandidates: [
        {
          fromTable: 'orders',
          fromColumn: 'missing_column',
          toTable: 'customers',
          toColumn: 'id',
          confidence: 0.7,
          rationale: 'Invalid source column.',
        },
      ],
    };
    const invalidReference = await proposeKloRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmProvider: llmProvider(),
      generateText: vi.fn(async () => ({ output: outputWithUnknownColumn })),
    });
    expect(invalidReference.candidates).toEqual([]);
    expect(invalidReference.summary).toBe('completed');
    expect(invalidReference.warnings[0]).toMatchObject({
      code: 'relationship_llm_invalid_reference',
      recoverable: true,
    });

    const failed = await proposeKloRelationshipCandidatesWithLlm({
      connectionId: 'warehouse',
      schema: schema(),
      profile: profile(),
      llmProvider: llmProvider(),
      generateText: vi.fn(async () => {
        throw new Error('model unavailable');
      }),
    });
    expect(failed).toMatchObject({ candidates: [], llmCalls: 1, summary: 'failed' });
    expect(failed.warnings[0]).toMatchObject({
      code: 'relationship_llm_proposal_failed',
      message: 'KLO relationship LLM proposal failed: model unavailable',
      recoverable: true,
    });
  });
});
|
||||
281
packages/context/src/scan/relationship-llm-proposal.ts
Normal file
281
packages/context/src/scan/relationship-llm-proposal.ts
Normal file
|
|
@ -0,0 +1,281 @@
|
|||
import type { KloLlmProvider } from '@klo/llm';
|
||||
import type { generateText } from 'ai';
|
||||
import { z } from 'zod';
|
||||
import { generateKloObject } from '../llm/index.js';
|
||||
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
|
||||
import {
|
||||
normalizeKloRelationshipName,
|
||||
type KloRelationshipDiscoveryCandidate,
|
||||
} from './relationship-candidates.js';
|
||||
import type { KloRelationshipColumnProfile, KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type { KloScanEnrichmentSummary, KloScanWarning, KloTableRef } from './types.js';
|
||||
|
||||
/** One primary-key suggestion: a table/column pair with the model's confidence and rationale. */
const relationshipLlmPkCandidateSchema = z.object({
  table: z.string(),
  column: z.string(),
  confidence: z.number(),
  rationale: z.string(),
});

/** One foreign-key suggestion pointing from a source column to a target column. */
const relationshipLlmFkCandidateSchema = z.object({
  fromTable: z.string(),
  fromColumn: z.string(),
  toTable: z.string(),
  toColumn: z.string(),
  confidence: z.number(),
  rationale: z.string(),
});

/**
 * Structured output requested from the model: proposed primary keys and foreign keys.
 * `confidence` is not range-checked here; values are clamped when candidates are built.
 */
const relationshipLlmProposalSchema = z.object({
  pkCandidates: z.array(relationshipLlmPkCandidateSchema),
  fkCandidates: z.array(relationshipLlmFkCandidateSchema),
});
|
||||
|
||||
/** Parsed shape of the model's structured answer. */
type KloRelationshipLlmProposalOutput = z.infer<typeof relationshipLlmProposalSchema>;

/** Argument accepted by the `ai` package's `generateText`. */
type GenerateTextInput = Parameters<typeof generateText>[0];

/** Injectable text-generation function, forwarded to `generateKloObject` (tests pass a stub). */
export type KloRelationshipLlmProposalGenerateText = (
  input: GenerateTextInput,
) => Promise<{ text?: string; output?: unknown }>;

/** Limits applied to the evidence sent to the model and to the proposals it returns. */
export interface KloRelationshipLlmProposalSettings {
  /** Only the first N enabled tables are included in the evidence packet. */
  maxTablesPerBatch: number;
  /** Only the first N columns of each table are included. */
  maxColumnsPerTable: number;
  /** Only the first N profiled sample values of each column are included. */
  maxSampleValuesPerColumn: number;
  /** FK proposals whose confidence is below this value are dropped. */
  minConfidence: number;
}

/** Arguments of `proposeKloRelationshipCandidatesWithLlm`. */
export interface ProposeKloRelationshipCandidatesWithLlmInput {
  connectionId: string;
  schema: KloEnrichedSchema;
  profile: KloRelationshipProfileArtifact;
  /** `null` (or a deterministic model) makes the proposal step report `skipped`. */
  llmProvider: KloLlmProvider | null;
  /** Partial overrides merged over the default settings. */
  settings?: Partial<KloRelationshipLlmProposalSettings>;
  generateText?: KloRelationshipLlmProposalGenerateText;
}

/** Outcome of one proposal run. */
export interface KloRelationshipLlmProposalResult {
  candidates: KloRelationshipDiscoveryCandidate[];
  warnings: KloScanWarning[];
  /** 0 when the step was skipped, otherwise 1. */
  llmCalls: number;
  summary: KloScanEnrichmentSummary['llmRelationshipValidation'];
}
|
||||
|
||||
/** Limits used when the caller does not override them. */
const DEFAULT_SETTINGS: KloRelationshipLlmProposalSettings = {
  maxTablesPerBatch: 40,
  maxColumnsPerTable: 80,
  maxSampleValuesPerColumn: 5,
  minConfidence: 0.55,
};

/**
 * Returns the effective settings: defaults overlaid with the caller's overrides.
 *
 * Keys that are absent or explicitly `undefined` keep their default. A plain object
 * spread would copy an explicit `undefined` over the default, which silently disables
 * the confidence filter (`x < undefined` is always false) and the `slice` caps.
 *
 * @param settings Optional partial overrides.
 * @returns A new settings object; neither the defaults nor `settings` is mutated.
 */
function mergeSettings(
  settings: Partial<KloRelationshipLlmProposalSettings> | undefined,
): KloRelationshipLlmProposalSettings {
  const merged: KloRelationshipLlmProposalSettings = { ...DEFAULT_SETTINGS };
  if (!settings) {
    return merged;
  }
  for (const key of Object.keys(DEFAULT_SETTINGS) as Array<keyof KloRelationshipLlmProposalSettings>) {
    const value = settings[key];
    if (value !== undefined) {
      merged[key] = value;
    }
  }
  return merged;
}
|
||||
|
||||
/** Bounds `value` to the range [0, 1] and rounds it to three decimal places. */
function clampConfidence(value: number): number {
  const bounded = Math.min(Math.max(value, 0), 1);
  return Number.parseFloat(bounded.toFixed(3));
}
|
||||
|
||||
/**
 * Reports whether the model configured for the `candidateExtraction` role identifies
 * itself with `provider === 'deterministic'`.
 */
function modelIsDeterministic(llmProvider: KloLlmProvider): boolean {
  const { provider } = llmProvider.getModel('candidateExtraction') as { provider?: string };
  return provider === 'deterministic';
}
|
||||
|
||||
/**
 * Looks up a table by its `ref.name`, ignoring case. Catalog and db are not compared,
 * so the first table with a matching name wins. Returns `null` when nothing matches.
 */
function findTable(schema: KloEnrichedSchema, name: string): KloEnrichedTable | null {
  const wanted = name.toLowerCase();
  for (const candidate of schema.tables) {
    if (candidate.ref.name.toLowerCase() === wanted) {
      return candidate;
    }
  }
  return null;
}
|
||||
|
||||
/** Looks up a column of `table` by name, ignoring case; returns `null` when absent. */
function findColumn(table: KloEnrichedTable, name: string): KloEnrichedColumn | null {
  const wanted = name.toLowerCase();
  for (const candidate of table.columns) {
    if (candidate.name.toLowerCase() === wanted) {
      return candidate;
    }
  }
  return null;
}
|
||||
|
||||
/** Key used to look up a column in `profile.columns`: `<table name>.<column name>`, case preserved. */
function profileKey(table: KloTableRef, column: KloEnrichedColumn): string {
  return [table.name, column.name].join('.');
}
|
||||
|
||||
/**
 * Returns the profiled statistics for `column` of `table`, or `null` when the profile
 * artifact has no entry under that table/column key.
 */
function profileForColumn(
  profile: KloRelationshipProfileArtifact,
  table: KloEnrichedTable,
  column: KloEnrichedColumn,
): KloRelationshipColumnProfile | null {
  const key = profileKey(table.ref, column);
  const entry = profile.columns[key];
  return entry ?? null;
}
|
||||
|
||||
/**
 * Returns the profiled row count of `table`, matching profile entries by table name
 * without regard to case. Yields `null` when the table was not profiled or the first
 * matching entry has no row count.
 */
function rowCountForTable(profile: KloRelationshipProfileArtifact, table: KloEnrichedTable): number | null {
  const wanted = table.ref.name.toLowerCase();
  for (const item of profile.tables) {
    if (item.table.name.toLowerCase() === wanted) {
      return item.rowCount ?? null;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Builds the compact, JSON-serializable evidence that is embedded in the prompt.
 *
 * Only enabled tables are included, capped at `settings.maxTablesPerBatch`; each table
 * lists at most `settings.maxColumnsPerTable` columns, and each profiled column carries
 * at most `settings.maxSampleValuesPerColumn` sample values. Tables and columns beyond
 * the caps are left out. Columns without a profile entry report `profile: null`.
 */
function buildEvidencePacket(
  schema: KloEnrichedSchema,
  profile: KloRelationshipProfileArtifact,
  settings: KloRelationshipLlmProposalSettings,
): Record<string, unknown> {
  const describeColumn = (table: KloEnrichedTable, column: KloEnrichedColumn) => {
    const columnProfile = profileForColumn(profile, table, column);
    const statistics =
      columnProfile === null
        ? null
        : {
            rowCount: columnProfile.rowCount,
            nullCount: columnProfile.nullCount,
            distinctCount: columnProfile.distinctCount,
            uniquenessRatio: columnProfile.uniquenessRatio,
            nullRate: columnProfile.nullRate,
            sampleValues: columnProfile.sampleValues.slice(0, settings.maxSampleValuesPerColumn),
          };
    return {
      name: column.name,
      nativeType: column.nativeType,
      normalizedType: column.normalizedType,
      dimensionType: column.dimensionType,
      nullable: column.nullable,
      declaredPrimaryKey: column.primaryKey,
      profile: statistics,
    };
  };

  const describeTable = (table: KloEnrichedTable) => ({
    name: table.ref.name,
    catalog: table.ref.catalog,
    db: table.ref.db,
    rowCount: rowCountForTable(profile, table),
    columns: table.columns.slice(0, settings.maxColumnsPerTable).map((column) => describeColumn(table, column)),
  });

  const includedTables = schema.tables.filter((table) => table.enabled).slice(0, settings.maxTablesPerBatch);

  return {
    connectionId: schema.connectionId,
    sqlAvailable: profile.sqlAvailable,
    tables: includedTables.map(describeTable),
  };
}
|
||||
|
||||
/** Case-insensitive lookup key for a proposed primary key: lower-cased `table.column`. */
function pkProposalKey(table: string, column: string): string {
  return [table, column].map((part) => part.toLowerCase()).join('.');
}
|
||||
|
||||
/** Describes one side of a single-column relationship candidate. */
function endpoint(table: KloEnrichedTable, column: KloEnrichedColumn) {
  const { id: tableId, ref } = table;
  const { id: columnId, name: columnName } = column;
  return {
    tableId,
    columnIds: [columnId],
    table: ref,
    columns: [columnName],
  };
}
|
||||
|
||||
/** Candidate id in the form `<from table id>:(<from column id>)-><to table id>:(<to column id>)`. */
function relationshipId(
  fromTable: KloEnrichedTable,
  fromColumn: KloEnrichedColumn,
  toTable: KloEnrichedTable,
  toColumn: KloEnrichedColumn,
): string {
  const source = `${fromTable.id}:(${fromColumn.id})`;
  const target = `${toTable.id}:(${toColumn.id})`;
  return `${source}->${target}`;
}
|
||||
|
||||
/** Builds a recoverable `relationship_llm_invalid_reference` warning carrying `metadata`. */
function invalidReferenceWarning(message: string, metadata: Record<string, unknown>): KloScanWarning {
  const warning: KloScanWarning = {
    code: 'relationship_llm_invalid_reference',
    message,
    recoverable: true,
    metadata,
  };
  return warning;
}
|
||||
|
||||
/**
 * Converts the model's FK proposals into `review` relationship candidates.
 *
 * - Proposals below `settings.minConfidence` are dropped without a warning.
 * - Proposals naming a table or column that is not in `schema` produce a recoverable
 *   `relationship_llm_invalid_reference` warning and no candidate.
 * - When the target column was also proposed as a primary key, the candidate gets a
 *   higher `targetKeyScore` and the extra `llm_pk_proposal` reason.
 * - If the model repeats the same source/target pair, only the highest-confidence
 *   occurrence is kept, so candidate ids stay unique.
 *
 * @returns The candidates in first-seen order plus any warnings raised while mapping.
 */
function mapValidProposals(
  schema: KloEnrichedSchema,
  output: KloRelationshipLlmProposalOutput,
  settings: KloRelationshipLlmProposalSettings,
): { candidates: KloRelationshipDiscoveryCandidate[]; warnings: KloScanWarning[] } {
  const warnings: KloScanWarning[] = [];
  const pkProposals = new Set(output.pkCandidates.map((item) => pkProposalKey(item.table, item.column)));
  // Keyed by candidate id; a Map keeps first-insertion order even when an entry is replaced.
  const candidatesById = new Map<string, KloRelationshipDiscoveryCandidate>();

  for (const item of output.fkCandidates) {
    if (item.confidence < settings.minConfidence) {
      continue;
    }
    const fromTable = findTable(schema, item.fromTable);
    const toTable = findTable(schema, item.toTable);
    const fromColumn = fromTable ? findColumn(fromTable, item.fromColumn) : null;
    const toColumn = toTable ? findColumn(toTable, item.toColumn) : null;
    if (!fromTable || !toTable || !fromColumn || !toColumn) {
      warnings.push(
        invalidReferenceWarning('KLO relationship LLM proposal referenced a table or column that is not in the schema.', {
          proposal: item,
        }),
      );
      continue;
    }

    const id = relationshipId(fromTable, fromColumn, toTable, toColumn);
    const confidence = clampConfidence(item.confidence);
    const existing = candidatesById.get(id);
    if (existing && existing.confidence >= confidence) {
      // Same relationship proposed again with no better confidence: keep the earlier one.
      continue;
    }

    const pkProposalExists = pkProposals.has(pkProposalKey(toTable.ref.name, toColumn.name));
    candidatesById.set(id, {
      id,
      from: endpoint(fromTable, fromColumn),
      to: endpoint(toTable, toColumn),
      source: 'llm_proposal',
      status: 'review',
      relationshipType: 'many_to_one',
      confidence,
      evidence: {
        sourceColumnBase: normalizeKloRelationshipName(fromColumn.name).singular,
        targetTableBase: normalizeKloRelationshipName(toTable.ref.name).singular,
        targetColumnBase: normalizeKloRelationshipName(toColumn.name).singular,
        targetKeyScore: pkProposalExists ? 0.88 : 0.68,
        nameScore: 0.45,
        reasons: pkProposalExists ? ['llm_proposal', 'llm_pk_proposal'] : ['llm_proposal'],
        llmConfidence: confidence,
        llmRationale: item.rationale,
      },
    });
  }

  return { candidates: Array.from(candidatesById.values()), warnings };
}
|
||||
|
||||
/**
 * Wraps a generation failure in a recoverable `relationship_llm_proposal_failed` warning.
 * `Error` instances contribute their `message`; anything else is stringified.
 */
function generationFailureWarning(error: unknown): KloScanWarning {
  let detail: string;
  if (error instanceof Error) {
    detail = error.message;
  } else {
    detail = String(error);
  }
  return {
    code: 'relationship_llm_proposal_failed',
    message: `KLO relationship LLM proposal failed: ${detail}`,
    recoverable: true,
  };
}
|
||||
|
||||
/**
 * Asks the configured LLM to propose primary/foreign keys for the schema and maps the
 * valid proposals to `review` relationship candidates.
 *
 * - No provider, or a deterministic model: returns `skipped` with zero LLM calls.
 * - Successful generation: returns `completed`; invalid references surface as warnings.
 * - Any error thrown during generation, parsing or mapping: returns `failed` with a
 *   single recoverable warning. The function itself does not reject for those errors.
 */
export async function proposeKloRelationshipCandidatesWithLlm(
  input: ProposeKloRelationshipCandidatesWithLlmInput,
): Promise<KloRelationshipLlmProposalResult> {
  const { llmProvider } = input;
  if (!llmProvider || modelIsDeterministic(llmProvider)) {
    return { candidates: [], warnings: [], llmCalls: 0, summary: 'skipped' };
  }

  const settings = mergeSettings(input.settings);
  const evidence = buildEvidencePacket(input.schema, input.profile, settings);
  // Instructions first, then the serialized evidence, separated by blank lines.
  const promptSections = [
    'You are helping KLO review possible SQL relationships before validation.',
    'Use only the compact schema evidence. Propose likely primary keys and foreign keys for later SQL validation.',
    'Return structured output only; never assume a join is accepted.',
    JSON.stringify(evidence),
  ];
  const prompt = promptSections.join('\n\n');

  try {
    const generated = await generateKloObject<
      KloRelationshipLlmProposalOutput,
      typeof relationshipLlmProposalSchema
    >({
      llmProvider,
      role: 'candidateExtraction',
      prompt,
      schema: relationshipLlmProposalSchema,
      generateText: input.generateText,
    });
    // Re-validate so a stubbed or loosely typed generator cannot bypass the schema.
    const output = relationshipLlmProposalSchema.parse(generated);
    const { candidates, warnings } = mapValidProposals(input.schema, output, settings);
    return { candidates, warnings, llmCalls: 1, summary: 'completed' };
  } catch (error) {
    return {
      candidates: [],
      warnings: [generationFailureWarning(error)],
      llmCalls: 1,
      summary: 'failed',
    };
  }
}
|
||||
151
packages/context/src/scan/relationship-locality.test.ts
Normal file
151
packages/context/src/scan/relationship-locality.test.ts
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KloEnrichedColumn, KloEnrichedTable } from './enrichment-types.js';
|
||||
import { localCandidateTables } from './relationship-locality.js';
|
||||
|
||||
/**
 * Test fixture: builds a `KloEnrichedColumn` with an explicit `id`.
 *
 * Only the fields read below are taken from `options`; each falls back to a nullable,
 * non-key INTEGER column in the `public` db with no embedding or samples.
 */
function column(
  tableId: string,
  id: string,
  name: string,
  options: Partial<KloEnrichedColumn> = {},
): KloEnrichedColumn {
  const {
    tableRef = { catalog: null, db: 'public', name: tableId },
    nativeType,
    normalizedType,
    dimensionType,
    nullable,
    primaryKey,
    parentColumnId,
    descriptions,
    embedding,
    sampleValues,
    cardinality,
  } = options;
  return {
    id,
    tableId,
    tableRef: tableRef ?? { catalog: null, db: 'public', name: tableId },
    name,
    nativeType: nativeType ?? 'INTEGER',
    normalizedType: normalizedType ?? 'integer',
    dimensionType: dimensionType ?? 'number',
    nullable: nullable ?? true,
    primaryKey: primaryKey ?? false,
    parentColumnId: parentColumnId ?? null,
    descriptions: descriptions ?? {},
    embedding: embedding ?? null,
    sampleValues: sampleValues ?? null,
    cardinality: cardinality ?? null,
  };
}
|
||||
|
||||
/**
 * Test fixture: builds an enabled `KloEnrichedTable` in the `public` db with separate
 * `id` and display `name`. Columns are copied and re-pointed at this table.
 */
function table(id: string, name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: 'public', name };
  const ownedColumns = columns.map((item) => {
    return { ...item, tableId: id, tableRef: ref };
  });
  return {
    id,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
describe('relationship locality', () => {
  // Convenience: the ranked parent table names, in ranking order.
  const rankedNames = (ranked: ReturnType<typeof localCandidateTables>): string[] =>
    ranked.map((item) => item.table.ref.name);

  it('ranks the referenced parent table ahead of the child table for id-like source columns', () => {
    const artists = table('artist-id', 'Artist', [column('artist-id', 'artist-pk', 'ArtistId')]);
    const albums = table('album-id', 'Album', [
      column('album-id', 'album-pk', 'AlbumId'),
      column('album-id', 'artist-fk', 'ArtistId'),
    ]);
    const unrelated = table('invoice-id', 'Invoice', [column('invoice-id', 'invoice-pk', 'InvoiceId')]);

    const ranked = localCandidateTables({
      childTable: albums,
      childColumn: albums.columns[1]!,
      parentTables: [albums, unrelated, artists],
      maxParentTables: 1,
    });

    expect(rankedNames(ranked)).toEqual(['Artist']);
    expect(ranked[0]).toMatchObject({
      score: expect.any(Number),
      tokenScore: expect.any(Number),
      embeddingScore: 0,
      reasons: expect.arrayContaining(['column_table_token_overlap']),
    });
  });

  it('uses singular and plural variants so plan_code can rank stg_plans', () => {
    const plans = table('plans-id', 'stg_plans', [column('plans-id', 'plan-code', 'plan_code')]);
    const textColumn = { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' };
    const segments = table('segments-id', 'mart_account_segments', [
      column('segments-id', 'current-plan-code', 'current_plan_code', {
        nativeType: textColumn.nativeType,
        normalizedType: textColumn.normalizedType,
        dimensionType: textColumn.dimensionType,
      } as Partial<KloEnrichedColumn>),
    ]);
    const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);

    const ranked = localCandidateTables({
      childTable: segments,
      childColumn: segments.columns[0]!,
      parentTables: [accounts, segments, plans],
      maxParentTables: 1,
    });

    expect(rankedNames(ranked)).toEqual(['stg_plans']);
    expect(ranked[0]?.tokenScore).toBeGreaterThan(0);
  });

  it('returns all tables when the schema is smaller than the default locality cap', () => {
    const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
    const invoices = table('invoices-id', 'invoices', [
      column('invoices-id', 'invoice-id', 'id'),
      column('invoices-id', 'account-id', 'account_id'),
    ]);

    const ranked = localCandidateTables({
      childTable: invoices,
      childColumn: invoices.columns[1]!,
      parentTables: [invoices, accounts],
    });

    expect(rankedNames(ranked).sort()).toEqual(['accounts', 'invoices']);
  });

  it('supports an explicit zero cap for deterministic tests', () => {
    const accounts = table('accounts-id', 'accounts', [column('accounts-id', 'account-id', 'id')]);
    const invoices = table('invoices-id', 'invoices', [
      column('invoices-id', 'invoice-id', 'id'),
      column('invoices-id', 'account-id', 'account_id'),
    ]);

    const ranked = localCandidateTables({
      childTable: invoices,
      childColumn: invoices.columns[1]!,
      parentTables: [invoices, accounts],
      maxParentTables: 0,
    });

    expect(ranked).toEqual([]);
  });

  it('uses parent-column embeddings when token locality is weak', () => {
    const customers = table('customers-id', 'customers', [
      column('customers-id', 'customers-id-col', 'id', { embedding: [1, 0, 0] }),
      column('customers-id', 'customers-name-col', 'name', {
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
        embedding: [0, 1, 0],
      }),
    ]);
    const orders = table('orders-id', 'orders', [
      column('orders-id', 'orders-id-col', 'id', { embedding: [0, 0, 1] }),
      // Nearly parallel to customers.id, while the name shares no token with "customers".
      column('orders-id', 'buyer-ref-col', 'buyer_ref', { embedding: [0.995, 0.005, 0] }),
    ]);
    const invoices = table('invoices-id', 'invoices', [column('invoices-id', 'invoice-id', 'id')]);

    const ranked = localCandidateTables({
      childTable: orders,
      childColumn: orders.columns[1]!,
      parentTables: [invoices, customers],
      maxParentTables: 1,
    });

    expect(rankedNames(ranked)).toEqual(['customers']);
    expect(ranked[0]).toMatchObject({
      embeddingScore: expect.any(Number),
      reasons: expect.arrayContaining(['embedding_similarity']),
    });
    expect(ranked[0]!.embeddingScore).toBeGreaterThan(0.99);
  });
});
|
||||
164
packages/context/src/scan/relationship-locality.ts
Normal file
164
packages/context/src/scan/relationship-locality.ts
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
import type { KloEnrichedColumn, KloEnrichedTable } from './enrichment-types.js';
|
||||
import { normalizeKloRelationshipName, tokenizeKloRelationshipName } from './relationship-name-similarity.js';
|
||||
|
||||
/** A parent-table candidate together with the scores that ranked it. */
export interface KloRelationshipLocalityCandidateTable {
  table: KloEnrichedTable;
  /** Combined ranking score in [0, 1], rounded to three decimals. */
  score: number;
  /** Name-token overlap between the child column/table and the parent table name. */
  tokenScore: number;
  /** Best cosine similarity between the child column and any parent column embedding. */
  embeddingScore: number;
  /** Which signals contributed; `locality_tie_breaker` when neither score is positive. */
  reasons: string[];
}

/** Arguments of `localCandidateTables`. */
export interface LocalKloRelationshipCandidateTablesInput {
  childTable: KloEnrichedTable;
  childColumn: KloEnrichedColumn;
  /** Tables to rank; may include the child table itself. */
  parentTables: readonly KloEnrichedTable[];
  /** Maximum number of tables returned; defaults to `DEFAULT_MAX_PARENT_TABLES`. */
  maxParentTables?: number;
}

/** Cap applied when the caller does not pass `maxParentTables`. */
const DEFAULT_MAX_PARENT_TABLES = 20;

/** Key-like tokens removed from a child column name before it is compared with table names. */
const RELATIONSHIP_SUFFIX_TOKENS = new Set(['id', 'ids', 'key', 'keys', 'code', 'codes', 'uuid', 'uuids']);
|
||||
|
||||
/** Bounds `value` to [0, 1] and rounds it to three decimal places. */
function roundedScore(value: number): number {
  const bounded = Math.min(Math.max(value, 0), 1);
  return Number.parseFloat(bounded.toFixed(3));
}
|
||||
|
||||
/**
 * Collects the distinct, non-empty tokens of `name` as written plus the tokens of its
 * singular and plural forms, in first-seen order.
 */
function normalizedTokenVariants(name: string): string[] {
  const normalized = normalizeKloRelationshipName(name);
  const sources = [
    normalized.tokens,
    tokenizeKloRelationshipName(normalized.singular),
    tokenizeKloRelationshipName(normalized.plural),
  ];
  const seen = new Set<string>();
  for (const tokens of sources) {
    for (const token of tokens) {
      if (token) {
        seen.add(token);
      }
    }
  }
  return Array.from(seen);
}
|
||||
|
||||
/**
 * Tokens of the child column name with key-like suffix tokens (`id`, `key`, `code`, ...)
 * removed. If removing them would leave nothing, the unfiltered tokens are returned.
 */
function childColumnLocalityTokens(column: KloEnrichedColumn): string[] {
  const allTokens = normalizedTokenVariants(column.name);
  const meaningful = allTokens.filter((token) => !RELATIONSHIP_SUFFIX_TOKENS.has(token));
  if (meaningful.length === 0) {
    return allTokens;
  }
  return meaningful;
}
|
||||
|
||||
/** Returns the non-empty strings of `values` without duplicates, in first-seen order. */
function uniqueTokens(values: readonly string[]): string[] {
  const seen = new Set<string>();
  for (const value of values) {
    if (value.length > 0) {
      seen.add(value);
    }
  }
  return [...seen];
}
|
||||
|
||||
/**
 * Jaccard similarity of two token lists treated as sets: shared tokens divided by
 * distinct tokens overall. Returns 0 when either list is empty.
 */
function jaccard(left: readonly string[], right: readonly string[]): number {
  if (left.length === 0 || right.length === 0) {
    return 0;
  }
  const leftSet = new Set(left);
  const rightSet = new Set(right);
  let shared = 0;
  for (const token of leftSet) {
    if (rightSet.has(token)) {
      shared += 1;
    }
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|; never zero here because both inputs are non-empty.
  const distinct = leftSet.size + rightSet.size - shared;
  return shared / distinct;
}
|
||||
|
||||
/**
 * Cosine similarity of two vectors. Returns 0 when either vector is missing, empty,
 * of a different length than the other, or has zero magnitude.
 */
function cosineSimilarity(left: readonly number[] | null, right: readonly number[] | null): number {
  const comparable = Boolean(left && right && left.length > 0 && left.length === right.length);
  if (!comparable || !left || !right) {
    return 0;
  }

  let dot = 0;
  let leftMagnitude = 0;
  let rightMagnitude = 0;
  for (const [index, leftEntry] of left.entries()) {
    // Missing entries count as zero.
    const leftValue = leftEntry ?? 0;
    const rightValue = right[index] ?? 0;
    dot += leftValue * rightValue;
    leftMagnitude += leftValue * leftValue;
    rightMagnitude += rightValue * rightValue;
  }

  const hasZeroVector = leftMagnitude === 0 || rightMagnitude === 0;
  return hasZeroVector ? 0 : dot / (Math.sqrt(leftMagnitude) * Math.sqrt(rightMagnitude));
}
|
||||
|
||||
/**
 * Highest cosine similarity between the child column's embedding and any column
 * embedding of `parentTable`, floored at 0. Returns 0 when the child column has no
 * usable embedding.
 */
function parentEmbeddingScore(childColumn: KloEnrichedColumn, parentTable: KloEnrichedTable): number {
  const childEmbedding = childColumn.embedding;
  if (!Array.isArray(childEmbedding) || childEmbedding.length === 0) {
    return 0;
  }
  return parentTable.columns.reduce(
    (best, parentColumn) => Math.max(best, cosineSimilarity(childColumn.embedding, parentColumn.embedding)),
    0,
  );
}
|
||||
|
||||
/**
 * Token-overlap score between a child column and a candidate parent table name.
 *
 * The child column's tokens (minus key-like suffixes) are compared with the parent
 * table's tokens. For a different table, a second comparison also includes the child
 * table's tokens but is weighted by 0.6; the larger of the two wins. When the candidate
 * is the child table itself, only the column-only comparison is used.
 */
function tableTokenScore(input: {
  childTable: KloEnrichedTable;
  childColumn: KloEnrichedColumn;
  parentTable: KloEnrichedTable;
}): number {
  const { childTable, childColumn, parentTable } = input;
  const columnTokens = childColumnLocalityTokens(childColumn);
  const parentTokens = normalizedTokenVariants(parentTable.ref.name);
  const columnOnlyScore = jaccard(columnTokens, parentTokens);

  const isSelfReference = parentTable.id === childTable.id;
  if (isSelfReference) {
    return columnOnlyScore;
  }

  const tableTokens = normalizedTokenVariants(childTable.ref.name);
  const combinedScore = jaccard(uniqueTokens([...tableTokens, ...columnTokens]), parentTokens);
  return Math.max(columnOnlyScore, combinedScore * 0.6);
}
|
||||
|
||||
/**
 * Scores one candidate parent table for a child column.
 *
 * Without an embedding signal the score is the token score. With one, it is the largest
 * of: the token score, an 80/20 blend of token and embedding scores, and 65% of the
 * embedding score. `reasons` lists the signals that were positive, or
 * `locality_tie_breaker` when none were.
 */
function localityScore(input: {
  childTable: KloEnrichedTable;
  childColumn: KloEnrichedColumn;
  parentTable: KloEnrichedTable;
}): Omit<KloRelationshipLocalityCandidateTable, 'table'> {
  const tokenScore = roundedScore(tableTokenScore(input));
  const embeddingScore = roundedScore(parentEmbeddingScore(input.childColumn, input.parentTable));
  const hasTokenSignal = tokenScore > 0;
  const hasEmbeddingSignal = embeddingScore > 0;

  let score = tokenScore;
  if (hasEmbeddingSignal) {
    const blended = tokenScore * 0.8 + embeddingScore * 0.2;
    const embeddingOnly = embeddingScore * 0.65;
    score = roundedScore(Math.max(tokenScore, blended, embeddingOnly));
  }

  const reasons: string[] = [];
  if (hasTokenSignal) {
    reasons.push('column_table_token_overlap');
  }
  if (hasEmbeddingSignal) {
    reasons.push('embedding_similarity');
  }
  if (!hasTokenSignal && !hasEmbeddingSignal) {
    reasons.push('locality_tie_breaker');
  }

  return { score, tokenScore, embeddingScore, reasons };
}
|
||||
|
||||
/**
 * Ranks `parentTables` by how likely they are to be referenced by `childColumn` and
 * returns the top entries.
 *
 * Ordering: combined score, then token score, then embedding score (all descending),
 * then table name and table id (ascending) so the result is deterministic.
 *
 * @returns At most `floor(maxParentTables)` entries (default 20). A limit that is zero,
 *   negative, `NaN` or infinite yields an empty array. `input.parentTables` is not mutated.
 */
export function localCandidateTables(
  input: LocalKloRelationshipCandidateTablesInput,
): KloRelationshipLocalityCandidateTable[] {
  const limit = input.maxParentTables ?? DEFAULT_MAX_PARENT_TABLES;
  const limitIsUsable = Number.isFinite(limit) && limit > 0;
  if (!limitIsUsable) {
    return [];
  }

  const { childTable, childColumn } = input;
  const scored: KloRelationshipLocalityCandidateTable[] = input.parentTables.map((parentTable) => {
    const scores = localityScore({ childTable, childColumn, parentTable });
    return { table: parentTable, ...scores };
  });

  const byRelevance = (
    left: KloRelationshipLocalityCandidateTable,
    right: KloRelationshipLocalityCandidateTable,
  ): number =>
    right.score - left.score ||
    right.tokenScore - left.tokenScore ||
    right.embeddingScore - left.embeddingScore ||
    left.table.ref.name.localeCompare(right.table.ref.name) ||
    left.table.id.localeCompare(right.table.id);

  // `scored` is a fresh array from `map`, so sorting it in place is safe.
  scored.sort(byRelevance);
  return scored.slice(0, Math.floor(limit));
}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
normalizeKloRelationshipName,
|
||||
pluralizeKloRelationshipToken,
|
||||
singularizeKloRelationshipToken,
|
||||
tokenSimilarity,
|
||||
tokenizeKloRelationshipName,
|
||||
} from './relationship-name-similarity.js';
|
||||
|
||||
describe('relationship name similarity', () => {
  // Asserts each [raw name, expected subset of the normalized name] pair in order.
  const expectNormalized = (cases: ReadonlyArray<readonly [string, Record<string, unknown>]>): void => {
    for (const [raw, expected] of cases) {
      expect(normalizeKloRelationshipName(raw)).toMatchObject(expected);
    }
  };

  it('tokenizes common warehouse naming styles', () => {
    expectNormalized([
      ['AlbumId', { normalized: 'album_id', singular: 'album_id', plural: 'album_ids', tokens: ['album', 'id'] }],
      ['artistID', { normalized: 'artist_id', tokens: ['artist', 'id'] }],
      [
        'SalesLT.CustomerID',
        {
          normalized: 'sales_lt_customer_id',
          singular: 'sales_lt_customer_id',
          tokens: ['sales', 'lt', 'customer', 'id'],
        },
      ],
      ['SCREAMING_CUSTOMER_UUID', { normalized: 'screaming_customer_uuid', tokens: ['screaming', 'customer', 'uuid'] }],
      ['billing-account-key', { normalized: 'billing_account_key', tokens: ['billing', 'account', 'key'] }],
    ]);
  });

  it('removes only leading warehouse layer prefixes', () => {
    expectNormalized([
      [
        'mart__Sales_Accounts',
        { normalized: 'sales_accounts', singular: 'sales_account', plural: 'sales_accounts', tokens: ['sales', 'accounts'] },
      ],
      ['dim_users', { normalized: 'users', singular: 'user', plural: 'users', tokens: ['users'] }],
      // "dim" in the middle of a name is an ordinary token and must survive.
      ['customer_dim_id', { normalized: 'customer_dim_id', tokens: ['customer', 'dim', 'id'] }],
    ]);
  });

  it('folds accents and preserves non-suffix trailing s words', () => {
    expectNormalized([['KundénID', { normalized: 'kunden_id', tokens: ['kunden', 'id'] }]]);
    expect(singularizeKloRelationshipToken('address')).toBe('address');
    expect(singularizeKloRelationshipToken('addresses')).toBe('address');
    expect(singularizeKloRelationshipToken('status')).toBe('status');
    expect(pluralizeKloRelationshipToken('address')).toBe('addresses');
    expect(pluralizeKloRelationshipToken('company')).toBe('companies');
  });

  it('returns deterministic tokens for direct tokenization calls', () => {
    expect(tokenizeKloRelationshipName('HTTPResponseCode')).toEqual(['http', 'response', 'code']);
    expect(tokenizeKloRelationshipName('customer2AddressID')).toEqual(['customer', '2', 'address', 'id']);
  });

  it('scores token overlap and ordered suffix similarity', () => {
    expect(tokenSimilarity('artist_id', 'artist_id')).toBe(1);
    expect(tokenSimilarity('Album.ArtistId', 'ArtistID')).toBeGreaterThanOrEqual(0.74);
    const unrelatedScore = tokenSimilarity('customer_account_id', 'invoice_id');
    expect(tokenSimilarity('customer_account_id', 'account_id')).toBeGreaterThan(unrelatedScore);
    expect(tokenSimilarity('', 'artist')).toBe(0);
  });
});
|
||||
151
packages/context/src/scan/relationship-name-similarity.ts
Normal file
151
packages/context/src/scan/relationship-name-similarity.ts
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
/**
 * The canonical spellings of one table or column name, as produced by
 * `normalizeKloRelationshipName`.
 */
export interface KloRelationshipNormalizedName {
|
||||
/** The name exactly as it was passed in. */
raw: string;
|
||||
/** The tokens joined with `_`. */
normalized: string;
|
||||
/** Like `normalized`, but with the last token singularized. */
singular: string;
|
||||
/** Like `singular`, but with the last token pluralized again. */
plural: string;
|
||||
/** The lower-cased tokens returned by `tokenizeKloRelationshipName`. */
tokens: string[];
|
||||
}
|
||||
|
||||
/** Anything `tokenSimilarity` accepts: a raw name, a list of tokens, or an already-normalized name. */
export type KloRelationshipTokenInput = string | readonly string[] | KloRelationshipNormalizedName;
|
||||
|
||||
/** Layer markers that `tokenizeKloRelationshipName` drops when one of them is the first token of a name. */
const WAREHOUSE_LAYER_PREFIXES = new Set(['stg', 'stage', 'staging', 'dim', 'fct', 'fact', 'int', 'mart']);
|
||||
|
||||
/**
 * Inserts `_` at every word boundary implied by letter case or by a letter/digit transition.
 *
 * The patterns run in order: acronym followed by a capitalised word, lower-case letter or digit
 * followed by an upper-case letter, letter followed by digit, digit followed by letter.
 */
function splitCaseBoundaries(value: string): string {
  const boundaryPatterns: readonly RegExp[] = [
    /([\p{Lu}]+)([\p{Lu}][\p{Ll}])/gu,
    /([\p{Ll}\p{N}])([\p{Lu}])/gu,
    /(\p{L})(\p{N})/gu,
    /(\p{N})(\p{L})/gu,
  ];

  let separated = value;
  for (const pattern of boundaryPatterns) {
    separated = separated.replace(pattern, '$1_$2');
  }
  return separated;
}
|
||||
|
||||
/**
 * Reduces accented letters to their base letter and spells out the ligatures ß, æ and œ.
 *
 * Compatibility decomposition (NFKD) separates each base letter from its combining marks, which
 * are then removed. The ligature replacements are case-insensitive and always emit lower case.
 */
function foldAccents(value: string): string {
  const ligatureExpansions: ReadonlyArray<readonly [RegExp, string]> = [
    [/ß/giu, 'ss'],
    [/æ/giu, 'ae'],
    [/œ/giu, 'oe'],
  ];

  const withoutMarks = value.normalize('NFKD').replace(/\p{Mark}+/gu, '');
  return ligatureExpansions.reduce(
    (text, [ligature, expansion]) => text.replace(ligature, expansion),
    withoutMarks,
  );
}
|
||||
|
||||
/** Plurals whose singular ends in "f" (leaves → leaf, shelves → shelf, wolves → wolf). */
const F_STEM_VES_PLURAL = /(?:lea|loa|shea|thie|hal|cal|el|wol|scar|hoo)ves$/u;

/** Plurals whose singular ends in "fe" (knives → knife, wives → wife). */
const FE_STEM_VES_PLURAL = /(?:kni|wi)ves$/u;

/**
 * Returns a best-effort English singular for one lower-case name token.
 *
 * Rules, first match wins:
 * - tokens of one or two characters are returned unchanged;
 * - "-ies" becomes "-y" (companies → company);
 * - "-ches", "-shes", "-sses", "-xes", "-zes" lose "es" (addresses → address);
 * - "-ves" becomes "-f" / "-fe" only for known f-stem nouns (leaves → leaf, knives → knife);
 *   every other "-ves" word is a regular "-ve" noun and only loses its "s"
 *   (archives → archive, objectives → objective);
 * - any other trailing "s" is removed unless the token ends in "ss", "us" or "is"
 *   (status and address stay as they are).
 *
 * @param value Token to singularize; expected to be lower case already.
 * @returns The singular form, or `value` itself when no rule applies.
 */
export function singularizeKloRelationshipToken(value: string): string {
  if (value.length <= 2) {
    return value;
  }
  if (value.endsWith('ies') && value.length > 3) {
    return `${value.slice(0, -3)}y`;
  }
  if (/(ches|shes|sses|xes|zes)$/u.test(value)) {
    return value.slice(0, -2);
  }
  if (value.endsWith('ves') && value.length > 4) {
    if (F_STEM_VES_PLURAL.test(value)) {
      return `${value.slice(0, -3)}f`;
    }
    if (FE_STEM_VES_PLURAL.test(value)) {
      return `${value.slice(0, -3)}fe`;
    }
    // Regular "-ve" noun: turning "ves" into "f" here would yield "archif" for "archives".
    return value.slice(0, -1);
  }
  if (value.endsWith('s') && !/(ss|us|is)$/u.test(value)) {
    return value.slice(0, -1);
  }
  return value;
}
|
||||
|
||||
/**
 * Returns a best-effort English plural for one lower-case name token.
 *
 * Rules, first match wins:
 * - a "y" that is the whole token or follows a consonant becomes "ies" (company → companies);
 *   a "y" after a vowel is left in place so that key → keys and survey → surveys;
 * - tokens ending in "s", "x", "z", "ch" or "sh" gain "es" (address → addresses);
 * - everything else gains "s".
 *
 * @param value Token to pluralize; expected to be lower case already.
 * @returns The plural form.
 */
export function pluralizeKloRelationshipToken(value: string): string {
  // Only consonant + y takes "ies"; "keies" / "daies" are not words.
  if (/(?:^|[^aeiou])y$/u.test(value)) {
    return `${value.slice(0, -1)}ies`;
  }
  if (/(s|x|z|ch|sh)$/u.test(value)) {
    return `${value}es`;
  }
  return `${value}s`;
}
|
||||
|
||||
/**
 * Returns a copy of `tokens` in which only the last token has been singularized.
 * An empty list yields an empty list; an empty last token is left untouched.
 */
function singularizeTokens(tokens: readonly string[]): string[] {
  const copy = tokens.slice();
  const tailIndex = copy.length - 1;
  const tail = copy[tailIndex];
  if (tail) {
    copy[tailIndex] = singularizeKloRelationshipToken(tail);
  }
  return copy;
}
|
||||
|
||||
/**
 * Returns a copy of `tokens` in which only the last token has been pluralized.
 * An empty list yields an empty list; an empty last token is left untouched.
 */
function pluralizeTokens(tokens: readonly string[]): string[] {
  const copy = tokens.slice();
  const tailIndex = copy.length - 1;
  const tail = copy[tailIndex];
  if (tail) {
    copy[tailIndex] = pluralizeKloRelationshipToken(tail);
  }
  return copy;
}
|
||||
|
||||
/**
 * Splits a table or column name into lower-case word tokens.
 *
 * The name is trimmed, accent-folded, split at case and letter/digit boundaries, lower-cased, and
 * then cut at every run of characters that are neither letters nor digits. If the first token is
 * a warehouse layer prefix (see `WAREHOUSE_LAYER_PREFIXES`) it is dropped; the same word in any
 * later position is kept.
 *
 * @param name Raw identifier, in any casing convention.
 * @returns The tokens in their original order; empty when nothing remains.
 */
export function tokenizeKloRelationshipName(name: string): string[] {
  const folded = foldAccents(name.trim());
  const lowered = splitCaseBoundaries(folded).toLowerCase();
  const underscored = lowered.replace(/[^\p{L}\p{N}]+/gu, '_').replace(/^_+|_+$/gu, '');
  const pieces = underscored.split('_').filter(Boolean);

  const [head, ...rest] = pieces;
  if (head !== undefined && WAREHOUSE_LAYER_PREFIXES.has(head)) {
    return rest;
  }
  return pieces;
}
|
||||
|
||||
/**
 * Tokenizes `name` and derives its normalized, singular and plural spellings.
 *
 * Singular and plural differ from `normalized` only in the last token; the plural is built from
 * the singular form, not from the raw tokens.
 *
 * @param name Raw identifier.
 * @returns All spellings together with the raw name and the token list.
 */
export function normalizeKloRelationshipName(name: string): KloRelationshipNormalizedName {
  const joinTokens = (parts: readonly string[]): string => parts.join('_');

  const tokens = tokenizeKloRelationshipName(name);
  const singularForm = singularizeTokens(tokens);
  const pluralForm = pluralizeTokens(singularForm);

  return {
    raw: name,
    normalized: joinTokens(tokens),
    singular: joinTokens(singularForm),
    plural: joinTokens(pluralForm),
    tokens,
  };
}
|
||||
|
||||
/**
 * Resolves any accepted similarity input to a token list.
 *
 * - a string is tokenized;
 * - a normalized name contributes its own `tokens` array (the same array, not a copy);
 * - a token list has every entry normalized on its own, and entries that normalize to the empty
 *   string are dropped.
 */
function tokensFromInput(input: KloRelationshipTokenInput): string[] {
  if (typeof input === 'string') {
    return tokenizeKloRelationshipName(input);
  }
  return 'tokens' in input
    ? input.tokens
    : input
        .map((token) => normalizeKloRelationshipName(token).normalized)
        .filter((normalized) => normalized.length > 0);
}
|
||||
|
||||
/** Counts how many trailing tokens `left` and `right` have in common, comparing from the end. */
function longestCommonSuffixLength(left: readonly string[], right: readonly string[]): number {
  const limit = Math.min(left.length, right.length);
  let matched = 0;
  for (let offset = 1; offset <= limit; offset += 1) {
    if (left[left.length - offset] !== right[right.length - offset]) {
      break;
    }
    matched = offset;
  }
  return matched;
}
|
||||
|
||||
/** Clamps `value` to the range [0, 1] and rounds it to three decimal places. */
function roundedScore(value: number): number {
  const clamped = Math.min(1, Math.max(0, value));
  return parseFloat(clamped.toFixed(3));
}
|
||||
|
||||
/**
 * Scores how alike two names are, from 0 (nothing shared) to 1 (identical token lists).
 *
 * The score is 75% Jaccard overlap of the distinct tokens plus 25% shared-suffix ratio, where the
 * suffix ratio is the number of matching trailing tokens divided by the length of the shorter
 * list. The result is rounded to three decimals.
 *
 * @returns 0 when either input yields no tokens.
 */
export function tokenSimilarity(leftInput: KloRelationshipTokenInput, rightInput: KloRelationshipTokenInput): number {
  const left = tokensFromInput(leftInput);
  const right = tokensFromInput(rightInput);
  if (left.length === 0 || right.length === 0) {
    return 0;
  }

  const leftUnique = new Set(left);
  const rightUnique = new Set(right);
  let shared = 0;
  for (const token of leftUnique) {
    if (rightUnique.has(token)) {
      shared += 1;
    }
  }
  // |A ∪ B| = |A| + |B| − |A ∩ B|
  const combined = leftUnique.size + rightUnique.size - shared;
  const jaccard = combined === 0 ? 0 : shared / combined;

  const shorterLength = Math.min(left.length, right.length);
  const suffixScore = longestCommonSuffixLength(left, right) / shorterLength;

  return roundedScore(jaccard * 0.75 + suffixScore * 0.25);
}
|
||||
354
packages/context/src/scan/relationship-profiling.test.ts
Normal file
354
packages/context/src/scan/relationship-profiling.test.ts
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
import { readFile } from 'node:fs/promises';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';
import Database from 'better-sqlite3';
import { afterEach, describe, expect, it } from 'vitest';
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
import { snapshotToKloEnrichedSchema } from './local-enrichment.js';
import { loadKloRelationshipBenchmarkFixture, maskKloRelationshipBenchmarkSnapshot } from './relationship-benchmarks.js';
import {
  createKloRelationshipProfileCache,
  formatKloRelationshipTableRef,
  profileKloRelationshipSchema,
  quoteKloRelationshipIdentifier,
} from './relationship-profiling.js';
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanContext } from './types.js';
|
||||
|
||||
/**
 * Test executor backed by a throwaway in-memory SQLite database.
 * Every call to `executeReadOnly` increments `queryCount`, so tests can assert how many
 * statements the profiler issued.
 */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  queryCount = 0;

  /** Runs `input.sql` and reshapes the row objects into a header list plus positional rows. */
  executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    // Column order is taken from the first row; an empty result has no headers.
    const headers = Object.keys(records[0] ?? {});
    const total = records.length;
    return Promise.resolve({
      headers,
      rows: records.map((record) => headers.map((header) => record[header])),
      totalRows: total,
      rowCount: total,
    });
  }

  /** Releases the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Test executor that opens an existing SQLite file in read-only mode.
 * Every call to `executeReadOnly` increments `queryCount`.
 */
class FileSqliteExecutor {
  readonly db: Database.Database;
  queryCount = 0;

  /** @param dataPath Path of the database file; opening fails if the file does not exist. */
  constructor(dataPath: string) {
    this.db = new Database(dataPath, { readonly: true, fileMustExist: true });
  }

  /** Runs `input.sql` and reshapes the row objects into a header list plus positional rows. */
  executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;
    const records = this.db.prepare(input.sql).all() as Record<string, unknown>[];
    // Column order is taken from the first row; an empty result has no headers.
    const headers = Object.keys(records[0] ?? {});
    const total = records.length;
    return Promise.resolve({
      headers,
      rows: records.map((record) => headers.map((header) => record[header])),
      totalRows: total,
      rowCount: total,
    });
  }

  /** Releases the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Builds an enriched-column fixture: a nullable, non-key INTEGER column by default.
 * Any key present in `overrides` replaces the corresponding default.
 */
function column(tableId: string, name: string, overrides: Partial<KloEnrichedColumn> = {}): KloEnrichedColumn {
  const defaults = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Builds an enabled enriched-table fixture named `name`.
 * Each column is copied and re-pointed at this table: its `tableId` and `tableRef` are overwritten.
 */
function table(name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns = columns.map((item) => ({ ...item, tableId: name, tableRef: ref }));
  return { id: name, ref, enabled: true, descriptions: {}, columns: ownedColumns };
}
|
||||
|
||||
/** Wraps `tables` in an enriched-schema fixture for the `warehouse` connection with no relationships. */
function schema(tables: KloEnrichedTable[]): KloEnrichedSchema {
  const fixture: KloEnrichedSchema = { connectionId: 'warehouse', tables, relationships: [] };
  return fixture;
}
|
||||
|
||||
describe('relationship profiling', () => {
|
||||
let executor: InMemorySqliteExecutor | null = null;
|
||||
|
||||
afterEach(() => {
|
||||
executor?.close();
|
||||
executor = null;
|
||||
});
|
||||
|
||||
it('keeps profiling on the batched table path', async () => {
  const sourceUrl = new URL('relationship-profiling.ts', import.meta.url);
  const source = await readFile(sourceUrl, 'utf-8');

  // The helper name is assembled from two pieces so this test file never contains it verbatim.
  const perColumnHelper = new RegExp('queryColumn' + 'Profile');
  const perColumnQueryLoop = /for \(const column of table\.columns\)[\s\S]*executeReadOnly/;

  expect(source).not.toMatch(perColumnHelper);
  expect(source).not.toMatch(perColumnQueryLoop);
  expect(source).toMatch(/queryTableProfile/);
  expect(source).toMatch(/UNION ALL/);
});
|
||||
|
||||
it('quotes identifiers and formats table refs for supported local SQL drivers', () => {
  // Each dialect doubles its own closing quote character inside the identifier.
  expect(quoteKloRelationshipIdentifier('sqlite', 'odd"name')).toBe('"odd""name"');
  expect(quoteKloRelationshipIdentifier('mysql', 'odd`name')).toBe('`odd``name`');
  expect(quoteKloRelationshipIdentifier('sqlserver', 'odd]name')).toBe('[odd]]name]');

  const bareRef = { catalog: null, db: null, name: 'accounts' };
  expect(formatKloRelationshipTableRef('sqlite', bareRef)).toBe('"accounts"');

  const qualifiedRef = { catalog: null, db: 'analytics', name: 'accounts' };
  expect(formatKloRelationshipTableRef('postgres', qualifiedRef)).toBe('"analytics"."accounts"');
});
|
||||
|
||||
it('profiles row count, null rate, uniqueness, sample values, and text lengths', async () => {
  const sqlite = new InMemorySqliteExecutor();
  // Register with the suite-level handle so afterEach closes the database.
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
    INSERT INTO accounts (id, code, parent_id) VALUES
      (1, 'A-1', NULL),
      (2, 'B-2', 1),
      (3, 'C-3', 1),
      (4, 'C-3', 2);
  `);

  const accounts = table('accounts', [
    column('accounts', 'id', { primaryKey: false, nullable: false }),
    column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
    column('accounts', 'parent_id'),
  ]);

  const result = await profileKloRelationshipSchema({
    connectionId: 'warehouse',
    driver: 'sqlite',
    schema: schema([accounts]),
    executor: sqlite,
    ctx: { runId: 'profile-test' },
    sampleValuesPerColumn: 3,
  });

  expect(result.sqlAvailable).toBe(true);
  expect(result.queryCount).toBe(1);
  expect(sqlite.queryCount).toBe(1);
  expect(result.tables).toHaveLength(1);
  expect(result.tables[0]).toMatchObject({ table: { name: 'accounts' }, rowCount: 4 });

  const idProfile = result.columns['accounts.id'];
  expect(idProfile).toMatchObject({
    table: { name: 'accounts' },
    column: 'id',
    rowCount: 4,
    nullCount: 0,
    distinctCount: 4,
    uniquenessRatio: 1,
    nullRate: 0,
    minTextLength: 1,
    maxTextLength: 1,
  });

  // 'C-3' occurs twice, so it is listed first; ties are ordered by value.
  const codeProfile = result.columns['accounts.code'];
  expect(codeProfile).toMatchObject({
    distinctCount: 3,
    uniquenessRatio: 0.75,
    sampleValues: ['C-3', 'A-1', 'B-2'],
    minTextLength: 3,
    maxTextLength: 3,
  });

  const parentProfile = result.columns['accounts.parent_id'];
  expect(parentProfile).toMatchObject({
    nullCount: 1,
    distinctCount: 2,
    uniquenessRatio: 0.5,
    nullRate: 0.25,
  });
});
|
||||
|
||||
it('profiles each enabled table with one read-only SQL query', async () => {
  const sqlite = new InMemorySqliteExecutor();
  // Register with the suite-level handle so afterEach closes the database.
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, code TEXT, parent_id INTEGER);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, code, parent_id) VALUES
      (1, 'A-1', NULL),
      (2, 'B-2', 1),
      (3, 'C-3', 1),
      (4, 'C-3', 2);
    INSERT INTO users (id, account_id) VALUES
      (10, 1),
      (11, 1),
      (12, 2);
  `);

  const accounts = table('accounts', [
    column('accounts', 'id', { nullable: false }),
    column('accounts', 'code', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
    column('accounts', 'parent_id'),
  ]);
  const users = table('users', [column('users', 'id', { nullable: false }), column('users', 'account_id')]);

  const result = await profileKloRelationshipSchema({
    connectionId: 'warehouse',
    driver: 'sqlite',
    schema: schema([accounts, users]),
    executor: sqlite,
    ctx: { runId: 'profile-batched-query-count' },
    sampleValuesPerColumn: 3,
  });

  // Two tables, two statements: all columns of a table are profiled together.
  expect(result.sqlAvailable).toBe(true);
  expect(result.queryCount).toBe(2);
  expect(sqlite.queryCount).toBe(2);
  expect(result.tables).toEqual([
    { table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 },
    { table: { catalog: null, db: null, name: 'users' }, rowCount: 3 },
  ]);

  const codeProfile = result.columns['accounts.code'];
  expect(codeProfile).toMatchObject({
    distinctCount: 3,
    uniquenessRatio: 0.75,
    sampleValues: ['C-3', 'A-1', 'B-2'],
  });

  const accountIdProfile = result.columns['users.account_id'];
  expect(accountIdProfile).toMatchObject({
    rowCount: 3,
    nullCount: 0,
    distinctCount: 2,
    uniquenessRatio: 2 / 3,
  });
});
|
||||
|
||||
it('bounds column profile statistics with profileSampleRows', async () => {
  // Use the suite-level handle instead of a shadowing local: afterEach then closes the database
  // even when one of the assertions below throws, which a trailing manual close() cannot do.
  executor = new InMemorySqliteExecutor();
  executor.db.exec(`
    CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
    INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a3'), (4, 'a4');
  `);

  const profiles = await profileKloRelationshipSchema({
    connectionId: 'warehouse',
    driver: 'sqlite',
    schema: schema([
      table('accounts', [
        column('accounts', 'id', { nullable: false }),
        column('accounts', 'account_code', {
          nativeType: 'TEXT',
          normalizedType: 'text',
          dimensionType: 'string',
          nullable: false,
        }),
      ]),
    ]),
    executor,
    ctx: { runId: 'profile-sample-rows' },
    profileSampleRows: 2,
  });

  expect(profiles.queryCount).toBe(1);
  expect(executor.queryCount).toBe(1);
  // The table-level row count is exact; the per-column statistics only see the sampled rows.
  expect(profiles.tables).toEqual([{ table: { catalog: null, db: null, name: 'accounts' }, rowCount: 4 }]);
  expect(profiles.columns['accounts.id']).toMatchObject({
    rowCount: 2,
    distinctCount: 2,
    uniquenessRatio: 1,
  });
  expect(profiles.columns['accounts.account_code']?.sampleValues).toEqual(['a1', 'a2']);
});
|
||||
|
||||
it('reuses a profile cache inside one scan run but re-queries with a fresh cache', async () => {
  const sqlite = new InMemorySqliteExecutor();
  // Register with the suite-level handle so afterEach closes the database.
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER NOT NULL, account_code TEXT NOT NULL);
    INSERT INTO accounts VALUES (1, 'a1'), (2, 'a2'), (3, 'a2');
  `);

  const relationshipSchema = schema([
    table('accounts', [
      column('accounts', 'id', { nullable: false }),
      column('accounts', 'account_code', {
        nativeType: 'TEXT',
        normalizedType: 'text',
        dimensionType: 'string',
        nullable: false,
      }),
    ]),
  ]);

  // Same schema and executor every time; only the run id and the cache instance vary.
  const profileWith = (runId: string, cache: ReturnType<typeof createKloRelationshipProfileCache>) =>
    profileKloRelationshipSchema({
      connectionId: 'warehouse',
      driver: 'sqlite',
      schema: relationshipSchema,
      executor: sqlite,
      ctx: { runId },
      cache,
    });

  const sharedCache = createKloRelationshipProfileCache();
  const first = await profileWith('profile-cache-run', sharedCache);
  const second = await profileWith('profile-cache-run', sharedCache);
  const third = await profileWith('profile-cache-fresh-run', createKloRelationshipProfileCache());

  expect(first.queryCount).toBe(1);
  expect(second.queryCount).toBe(0);
  expect(third.queryCount).toBe(1);
  expect(sqlite.queryCount).toBe(2);
  expect(second.tables).toEqual(first.tables);
  expect(second.columns).toEqual(first.columns);
});
|
||||
|
||||
it('profiles the checked-in scale stress fixture with one query per table', async () => {
  // fileURLToPath decodes percent-escapes and returns a platform-native path. URL#pathname does
  // neither, so a checkout under a directory with spaces (or on Windows) would not resolve.
  const fixtureRoot = fileURLToPath(new URL('../../test/fixtures/relationship-benchmarks/', import.meta.url));
  const fixture = await loadKloRelationshipBenchmarkFixture(join(fixtureRoot, 'scale_stress_no_declared_constraints'));
  if (!fixture.dataPath) {
    throw new Error('scale_stress_no_declared_constraints is missing data.sqlite');
  }
  const maskedSnapshot = maskKloRelationshipBenchmarkSnapshot(
    fixture.snapshot,
    'declared_pks_and_declared_fks_removed',
  );
  const scaleExecutor = new FileSqliteExecutor(fixture.dataPath);
  try {
    const result = await profileKloRelationshipSchema({
      connectionId: fixture.snapshot.connectionId,
      driver: fixture.snapshot.driver,
      schema: snapshotToKloEnrichedSchema(maskedSnapshot, new Map()),
      executor: scaleExecutor,
      ctx: { runId: 'scale-stress-profile-query-count' },
      profileSampleRows: 3,
    });

    expect(fixture.snapshot.tables).toHaveLength(400);
    expect(result.queryCount).toBe(400);
    expect(result.queryCount).toBeLessThanOrEqual(2 * fixture.snapshot.tables.length);
    expect(scaleExecutor.queryCount).toBe(400);
  } finally {
    // Always release the file handle, including when an expectation above fails.
    scaleExecutor.close();
  }
});
|
||||
});
|
||||
467
packages/context/src/scan/relationship-profiling.ts
Normal file
467
packages/context/src/scan/relationship-profiling.ts
Normal file
|
|
@ -0,0 +1,467 @@
|
|||
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
|
||||
import type {
|
||||
KloConnectionDriver,
|
||||
KloQueryResult,
|
||||
KloReadOnlyQueryInput,
|
||||
KloScanContext,
|
||||
KloTableRef,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * The only capability the profiler needs from a connection: run one SQL statement and resolve
 * with a tabular result.
 */
export interface KloRelationshipReadOnlyExecutor {
|
||||
/** Executes `input.sql` for `input.connectionId` and resolves with headers plus positional rows. */
executeReadOnly(input: KloReadOnlyQueryInput, ctx: KloScanContext): Promise<KloQueryResult>;
|
||||
}
|
||||
|
||||
/**
 * Statistics gathered for one column. All counts are computed over the row-limited sample of the
 * table, not necessarily over the whole table.
 */
export interface KloRelationshipColumnProfile {
|
||||
/** Table the column belongs to. */
table: KloTableRef;
|
||||
/** Column name. */
column: string;
|
||||
/** Type as reported by the source database, copied from the schema. */
nativeType: string;
|
||||
/** Normalized type, copied from the schema. */
normalizedType: string;
|
||||
/** Number of sampled rows. */
rowCount: number;
|
||||
/** Number of sampled rows in which the column is NULL. */
nullCount: number;
|
||||
/** Result of `COUNT(DISTINCT column)` over the sample. */
distinctCount: number;
|
||||
/** `distinctCount / rowCount`, or 0 when no rows were sampled. */
uniquenessRatio: number;
|
||||
/** `nullCount / rowCount`, or 0 when no rows were sampled. */
nullRate: number;
|
||||
/** Non-null sample values rendered as text, most frequent first. */
sampleValues: string[];
|
||||
/** Shortest text length of the column's values, or null when the query returned none. */
minTextLength: number | null;
|
||||
/** Longest text length of the column's values, or null when the query returned none. */
maxTextLength: number | null;
|
||||
}
|
||||
|
||||
/** Table-level profile: the table reference and its row count. */
export interface KloRelationshipTableProfile {
|
||||
/** Table that was profiled. */
table: KloTableRef;
|
||||
/** Result of `COUNT(*)` over the full, unsampled table. */
rowCount: number;
|
||||
}
|
||||
|
||||
/** Everything `profileKloRelationshipSchema` returns for one connection. */
export interface KloRelationshipProfileArtifact {
|
||||
/** Connection that was profiled. */
connectionId: string;
|
||||
/** Driver whose SQL dialect was used. */
driver: KloConnectionDriver;
|
||||
/** False when no executor was supplied, in which case nothing was profiled. */
sqlAvailable: boolean;
|
||||
/** Number of SQL statements issued while building this artifact. */
queryCount: number;
|
||||
/** One entry per profiled table. */
tables: KloRelationshipTableProfile[];
|
||||
/** Column profiles keyed by `<table name>.<column name>`. */
columns: Record<string, KloRelationshipColumnProfile>;
|
||||
/** Warning codes, for example `read_only_sql_unavailable`. */
warnings: string[];
|
||||
}
|
||||
|
||||
/** The value stored in a profile cache for one table: its table profile, column profiles and warnings. */
interface KloRelationshipCachedTableProfile {
|
||||
/** Table-level profile. */
table: KloRelationshipTableProfile;
|
||||
/** Column profiles of that table, keyed like `KloRelationshipProfileArtifact.columns`. */
columns: Record<string, KloRelationshipColumnProfile>;
|
||||
/** Warnings recorded for that table. */
warnings: string[];
|
||||
}
|
||||
|
||||
/** Cache of per-table profiles; create one with `createKloRelationshipProfileCache`. */
export interface KloRelationshipProfileCache {
|
||||
// NOTE(review): the key is presumably the string built by tableProfileCacheKey — confirm at the lookup site.
readonly tableProfiles: Map<string, KloRelationshipCachedTableProfile>;
|
||||
}
|
||||
|
||||
/** Arguments of `profileKloRelationshipSchema`. */
export interface ProfileKloRelationshipSchemaInput {
|
||||
/** Connection to profile; forwarded with every query. */
connectionId: string;
|
||||
/** Driver that decides identifier quoting and SQL dialect. */
driver: KloConnectionDriver;
|
||||
/** Schema to profile; only tables with `enabled` set are visited. */
schema: KloEnrichedSchema;
|
||||
/** Query executor, or null when SQL is unavailable (the result then carries a warning and no profiles). */
executor: KloRelationshipReadOnlyExecutor | null;
|
||||
/** Scan context forwarded to the executor; its `runId` is part of the cache key. */
ctx: KloScanContext;
|
||||
/** Maximum number of sample values per column; defaults to 5. */
sampleValuesPerColumn?: number;
|
||||
/** Maximum number of rows sampled per table for column statistics; defaults to 10000. */
profileSampleRows?: number;
|
||||
/** Optional cache of table profiles that several calls can share. */
cache?: KloRelationshipProfileCache;
|
||||
}
|
||||
|
||||
/** Creates an empty profile cache. Every call returns an independent cache. */
export function createKloRelationshipProfileCache(): KloRelationshipProfileCache {
  const tableProfiles = new Map<string, KloRelationshipCachedTableProfile>();
  return { tableProfiles };
}
|
||||
|
||||
/**
 * Code point U+001F, used to split the aggregated `sample_values` text back into single values.
 * The SQL aggregates join the values with the same character (character code 31).
 */
const SAMPLE_VALUE_DELIMITER = '\u001f';
|
||||
|
||||
/** Identifier quoting conventions: `"name"`, `` `name` `` or `[name]`. */
type QuoteStyle = 'double' | 'backtick' | 'bracket';
|
||||
|
||||
/**
 * Chooses the identifier quoting convention for a driver.
 *
 * - MySQL, ClickHouse, PostHog and BigQuery quote identifiers with backticks. BigQuery must not
 *   fall through to double quotes: there `"name"` is a string literal, not a column reference.
 * - SQL Server uses square brackets.
 * - Every other driver gets double quotes.
 */
function quoteStyle(driver: KloConnectionDriver): QuoteStyle {
  switch (driver) {
    case 'mysql':
    case 'clickhouse':
    case 'posthog':
    case 'bigquery':
      return 'backtick';
    case 'sqlserver':
      return 'bracket';
    default:
      return 'double';
  }
}
|
||||
|
||||
/**
 * Quotes a single identifier for the driver's dialect, doubling any closing quote character that
 * occurs inside the name.
 *
 * @param driver Driver that decides the quoting convention.
 * @param identifier One unqualified name: a column, or one segment of a table reference.
 */
export function quoteKloRelationshipIdentifier(driver: KloConnectionDriver, identifier: string): string {
  const style = quoteStyle(driver);
  if (style === 'backtick') {
    return '`' + identifier.split('`').join('``') + '`';
  }
  if (style === 'bracket') {
    return '[' + identifier.split(']').join(']]') + ']';
  }
  return '"' + identifier.split('"').join('""') + '"';
}
|
||||
|
||||
/**
 * Renders a table reference as quoted SQL.
 *
 * For SQLite and PostHog only the table name is used. For every other driver the non-empty parts
 * among catalog, db and name are quoted and joined with `.`.
 */
export function formatKloRelationshipTableRef(driver: KloConnectionDriver, table: KloTableRef): string {
  const nameOnly = driver === 'sqlite' || driver === 'posthog';
  const quoted: string[] = [];

  if (nameOnly) {
    quoted.push(quoteKloRelationshipIdentifier(driver, table.name));
  } else {
    for (const part of [table.catalog, table.db, table.name]) {
      if (part) {
        quoted.push(quoteKloRelationshipIdentifier(driver, part));
      }
    }
  }
  return quoted.join('.');
}
|
||||
|
||||
/**
 * Builds the SQL expression for the character length of a column's text rendering, using each
 * dialect's cast target and length function.
 *
 * @param columnSql Already-quoted column reference.
 */
function textLengthExpression(driver: KloConnectionDriver, columnSql: string): string {
  switch (driver) {
    case 'mysql':
      return `CHAR_LENGTH(CAST(${columnSql} AS CHAR))`;
    case 'sqlserver':
      return `LEN(CAST(${columnSql} AS NVARCHAR(MAX)))`;
    case 'bigquery':
      return `LENGTH(CAST(${columnSql} AS STRING))`;
    case 'clickhouse':
    case 'posthog':
      return `length(toString(${columnSql}))`;
    default:
      return `LENGTH(CAST(${columnSql} AS TEXT))`;
  }
}
|
||||
|
||||
/**
 * Returns a trailing ` LIMIT n` clause, or an empty string for SQL Server, where the row cap is
 * written as `TOP` after `SELECT` instead. `limit` is floored and raised to at least 1.
 */
function limitSql(driver: KloConnectionDriver, limit: number): string {
  const rows = Math.max(1, Math.floor(limit));
  return driver === 'sqlserver' ? '' : ` LIMIT ${rows}`;
}
|
||||
|
||||
/**
 * Returns ` TOP (n)` for SQL Server and an empty string for every other driver, which use a
 * trailing `LIMIT` instead. `limit` is floored and raised to at least 1.
 */
function topSql(driver: KloConnectionDriver, limit: number): string {
  const rows = Math.max(1, Math.floor(limit));
  return driver === 'sqlserver' ? ` TOP (${rows})` : '';
}
|
||||
|
||||
/**
 * Wraps a table in a row-limited derived table aliased `relationship_profile_sample`.
 * SQL Server gets `SELECT TOP (n) *`; every other driver gets a trailing `LIMIT n`.
 *
 * @param tableSql Already-quoted table reference.
 * @param limit Maximum number of rows; floored and raised to at least 1.
 */
function sampledTableSql(driver: KloConnectionDriver, tableSql: string, limit: number): string {
  const safeLimit = Math.max(1, Math.floor(limit));
  // Exactly one of the two helpers yields text for any given driver.
  const top = topSql(driver, safeLimit);
  const trailingLimit = limitSql(driver, safeLimit);
  return `(SELECT${top} * FROM ${tableSql}${trailingLimit}) AS relationship_profile_sample`;
}
|
||||
|
||||
/** Returns the first row of a result, or an empty row when the result has none. */
function firstRow(result: KloQueryResult): unknown[] {
  const [head] = result.rows;
  return head ?? [];
}
|
||||
|
||||
/** Finds the position of a result header, ignoring case. Returns -1 when it is absent. */
function headerIndex(result: KloQueryResult, header: string): number {
  const wanted = header.toLowerCase();
  for (const [position, candidate] of result.headers.entries()) {
    if (candidate.toLowerCase() === wanted) {
      return position;
    }
  }
  return -1;
}
|
||||
|
||||
/**
 * Reads the cell of `row` that sits under the named header (matched case-insensitively).
 * Yields `undefined` when the header is not part of the result.
 */
function valueAt(result: KloQueryResult, row: unknown[], header: string): unknown {
  const position = headerIndex(result, header);
  return row[position];
}
|
||||
|
||||
/**
 * Converts a raw result cell to a number, falling back to 0.
 *
 * Numbers are returned as they are, bigints are converted, and non-blank strings are parsed.
 * Anything else yields 0, and so does any value that does not parse to a number, so that
 * `NaN` never reaches counts or ratios.
 */
function numberFromValue(value: unknown): number {
  let parsed: number;
  switch (typeof value) {
    case 'number':
      parsed = value;
      break;
    case 'bigint':
      parsed = Number(value);
      break;
    case 'string':
      parsed = value.trim() === '' ? 0 : Number(value);
      break;
    default:
      return 0;
  }
  // Number('abc') is NaN; treat it like any other unusable cell.
  return Number.isNaN(parsed) ? 0 : parsed;
}
|
||||
|
||||
/**
 * Converts a raw result cell to a number, or to null when there is no usable value.
 *
 * Numbers are returned as they are, bigints are converted, and non-blank strings are parsed.
 * null, undefined, blank strings, other types and values that do not parse to a number all
 * yield null, so that `NaN` is never reported as a measurement.
 */
function nullableNumberFromValue(value: unknown): number | null {
  let parsed: number;
  switch (typeof value) {
    case 'number':
      parsed = value;
      break;
    case 'bigint':
      parsed = Number(value);
      break;
    case 'string':
      if (value.trim() === '') {
        return null;
      }
      parsed = Number(value);
      break;
    default:
      // Covers null and undefined as well as every other non-numeric type.
      return null;
  }
  return Number.isNaN(parsed) ? null : parsed;
}
|
||||
|
||||
/** Reads the named column of the first result row as a number; 0 when missing or not numeric. */
function numberAt(result: KloQueryResult, header: string): number {
  const cell = valueAt(result, firstRow(result), header);
  return numberFromValue(cell);
}
|
||||
|
||||
/** Builds the `<table name>.<column name>` key under which a column profile is stored. */
function columnKey(table: KloEnrichedTable, column: KloEnrichedColumn): string {
  return [table.ref.name, column.name].join('.');
}
|
||||
|
||||
/**
 * Builds the cache key for one table profile.
 *
 * The key joins run id, connection id, driver, the three parts of the table reference and both
 * sampling limits with U+001E. A missing catalog or db contributes an empty segment.
 */
function tableProfileCacheKey(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  ctx: KloScanContext;
  table: KloTableRef;
  sampleValuesPerColumn: number;
  profileSampleRows: number;
}): string {
  const { ctx, connectionId, driver, table, sampleValuesPerColumn, profileSampleRows } = input;
  const segments = [
    ctx.runId,
    connectionId,
    driver,
    table.catalog ?? '',
    table.db ?? '',
    table.name,
    `${sampleValuesPerColumn}`,
    `${profileSampleRows}`,
  ];
  return segments.join('\u001e');
}
|
||||
|
||||
/** Wraps `value` in single quotes, doubling every single quote it contains. */
function sqlStringLiteral(value: string): string {
  const escaped = value.split("'").join("''");
  return "'" + escaped + "'";
}
|
||||
|
||||
/**
 * Wraps a query that yields a `value` column in a scalar subquery which concatenates all values
 * into one text, separated by character code 31 (U+001F).
 *
 * Each dialect needs its own aggregate and its own spelling of the separator:
 * - Postgres and SQL Server: `STRING_AGG` with `CHR(31)` / `CHAR(31)`;
 * - BigQuery: `STRING_AGG` with an escaped string literal;
 * - MySQL: `GROUP_CONCAT ... SEPARATOR`, whose separator has to be a literal, so the hex literal
 *   `0x1F` is used (a function call such as `CHAR(31)` is not accepted in that position);
 * - ClickHouse and PostHog: `arrayStringConcat(groupArray(...))`;
 * - everything else: SQLite-style two-argument `GROUP_CONCAT`.
 *
 * @param innerSql SELECT statement exposing a column named `value`.
 */
function sampleAggregateSql(driver: KloConnectionDriver, innerSql: string): string {
  let aggregate: string;
  switch (driver) {
    case 'postgres':
      aggregate = 'STRING_AGG(CAST(value AS TEXT), CHR(31))';
      break;
    case 'bigquery':
      aggregate = `STRING_AGG(CAST(value AS STRING), '\\u001F')`;
      break;
    case 'mysql':
      aggregate = 'GROUP_CONCAT(CAST(value AS CHAR) SEPARATOR 0x1F)';
      break;
    case 'sqlserver':
      aggregate = 'STRING_AGG(CAST(value AS NVARCHAR(MAX)), CHAR(31))';
      break;
    case 'clickhouse':
    case 'posthog':
      aggregate = `arrayStringConcat(groupArray(toString(value)), '\\x1F')`;
      break;
    default:
      aggregate = 'GROUP_CONCAT(CAST(value AS TEXT), char(31))';
      break;
  }
  return `(SELECT ${aggregate} FROM (${innerSql}) AS relationship_profile_values)`;
}
|
||||
|
||||
/**
 * Builds the query that lists a column's most frequent non-null values as `value`.
 *
 * Values are grouped, ordered by descending frequency and then by the value itself, and capped
 * at `limit` rows (`TOP` for SQL Server, `LIMIT` elsewhere).
 */
function sampleValuesSql(input: {
  driver: KloConnectionDriver;
  tableSql: string;
  columnSql: string;
  limit: number;
}): string {
  const { driver, tableSql, columnSql, limit } = input;
  const top = topSql(driver, limit);
  const trailingLimit = limitSql(driver, limit);
  return (
    `SELECT${top} ${columnSql} AS value` +
    ` FROM ${tableSql}` +
    ` WHERE ${columnSql} IS NOT NULL` +
    ` GROUP BY ${columnSql}` +
    ` ORDER BY COUNT(*) DESC, ${columnSql} ASC` +
    ` ${trailingLimit}`
  );
}
|
||||
|
||||
/**
 * Builds the single-row SELECT that profiles one column.
 *
 * The row carries the column name as a literal, the full-table row count, and, computed over the
 * sampled table, the row count, null count, distinct count, minimum and maximum text length and
 * the delimiter-joined sample values.
 *
 * @param input.tableSql Quoted reference to the full table; used only for `table_row_count`.
 * @param input.profileTableSql Row-limited derived table that every other statistic reads from.
 */
function columnProfileSelectSql(input: {
  connectionDriver: KloConnectionDriver;
  tableSql: string;
  profileTableSql: string;
  column: KloEnrichedColumn;
  sampleValuesPerColumn: number;
}): string {
  const { connectionDriver: driver, tableSql, profileTableSql, column } = input;
  const columnSql = quoteKloRelationshipIdentifier(driver, column.name);
  const lengthSql = textLengthExpression(driver, columnSql);
  const topValuesSql = sampleValuesSql({
    driver,
    tableSql: profileTableSql,
    columnSql,
    limit: input.sampleValuesPerColumn,
  });

  const projections = [
    `${sqlStringLiteral(column.name)} AS column_name`,
    `(SELECT COUNT(*) FROM ${tableSql}) AS table_row_count`,
    'COUNT(*) AS row_count',
    `SUM(CASE WHEN ${columnSql} IS NULL THEN 1 ELSE 0 END) AS null_count`,
    `COUNT(DISTINCT ${columnSql}) AS distinct_count`,
    `MIN(${lengthSql}) AS min_text_length`,
    `MAX(${lengthSql}) AS max_text_length`,
    `${sampleAggregateSql(driver, topValuesSql)} AS sample_values`,
  ];
  return `SELECT ${projections.join(', ')} FROM ${profileTableSql}`;
}
|
||||
|
||||
/**
 * Splits the aggregated `sample_values` cell back into individual values.
 * null, undefined and empty text yield an empty list; empty segments are dropped.
 */
function splitSampleValues(value: unknown): string[] {
  if (value == null) {
    return [];
  }
  // An empty text splits into one empty segment, which the filter removes again.
  return String(value)
    .split(SAMPLE_VALUE_DELIMITER)
    .filter((segment) => segment.length > 0);
}
|
||||
|
||||
/**
 * Counts the rows of one table with a single `SELECT COUNT(*)`.
 *
 * @returns The row count together with the number of statements issued, which is always 1.
 */
async function queryCount(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  table: KloTableRef;
  executor: KloRelationshipReadOnlyExecutor;
  ctx: KloScanContext;
}): Promise<{ rowCount: number; queryCount: number }> {
  const { connectionId, driver, table, executor, ctx } = input;
  const sql = `SELECT COUNT(*) AS row_count FROM ${formatKloRelationshipTableRef(driver, table)}`;
  const result = await executor.executeReadOnly({ connectionId, sql, maxRows: 1 }, ctx);
  return { rowCount: numberAt(result, 'row_count'), queryCount: 1 };
}
|
||||
|
||||
/**
 * Profiles one table with a single statement.
 *
 * A table without columns only gets a row count. Otherwise one profiling SELECT per column is
 * combined with `UNION ALL`, so the result holds one row per column. Rows whose `column_name`
 * is not a column of this table are ignored.
 *
 * @returns The table profile, the column profiles keyed by `<table name>.<column name>`, and the
 *   number of statements issued.
 */
async function queryTableProfile(input: {
  connectionId: string;
  driver: KloConnectionDriver;
  table: KloEnrichedTable;
  executor: KloRelationshipReadOnlyExecutor;
  ctx: KloScanContext;
  sampleValuesPerColumn: number;
  profileSampleRows: number;
}): Promise<{
  table: KloRelationshipTableProfile;
  columns: Record<string, KloRelationshipColumnProfile>;
  queryCount: number;
}> {
  const { connectionId, driver, table, executor, ctx } = input;

  if (table.columns.length === 0) {
    const counted = await queryCount({ connectionId, driver, table: table.ref, executor, ctx });
    return {
      table: { table: table.ref, rowCount: counted.rowCount },
      columns: {},
      queryCount: counted.queryCount,
    };
  }

  const tableSql = formatKloRelationshipTableRef(driver, table.ref);
  const profileTableSql = sampledTableSql(driver, tableSql, input.profileSampleRows);
  const columnSelects = table.columns.map((column) =>
    columnProfileSelectSql({
      connectionDriver: driver,
      tableSql,
      profileTableSql,
      column,
      sampleValuesPerColumn: input.sampleValuesPerColumn,
    }),
  );
  const result = await executor.executeReadOnly(
    { connectionId, sql: columnSelects.join(' UNION ALL '), maxRows: table.columns.length },
    ctx,
  );

  const columnsByName = new Map<string, KloEnrichedColumn>();
  for (const column of table.columns) {
    columnsByName.set(column.name, column);
  }

  const cell = (row: unknown[], header: string): unknown => valueAt(result, row, header);
  const profiles: Record<string, KloRelationshipColumnProfile> = {};
  let tableRowCount = 0;

  for (const row of result.rows) {
    const column = columnsByName.get(String(cell(row, 'column_name')));
    if (column === undefined) {
      continue;
    }

    const rowCount = numberFromValue(cell(row, 'row_count'));
    const nullCount = numberFromValue(cell(row, 'null_count'));
    const distinctCount = numberFromValue(cell(row, 'distinct_count'));
    // Every row repeats the full-table count; keep the largest one seen.
    tableRowCount = Math.max(tableRowCount, numberFromValue(cell(row, 'table_row_count')));

    const sampledNothing = rowCount === 0;
    profiles[columnKey(table, column)] = {
      table: table.ref,
      column: column.name,
      nativeType: column.nativeType,
      normalizedType: column.normalizedType,
      rowCount,
      nullCount,
      distinctCount,
      uniquenessRatio: sampledNothing ? 0 : distinctCount / rowCount,
      nullRate: sampledNothing ? 0 : nullCount / rowCount,
      sampleValues: splitSampleValues(cell(row, 'sample_values')),
      minTextLength: nullableNumberFromValue(cell(row, 'min_text_length')),
      maxTextLength: nullableNumberFromValue(cell(row, 'max_text_length')),
    };
  }

  return {
    table: { table: table.ref, rowCount: tableRowCount },
    columns: profiles,
    queryCount: 1,
  };
}
|
||||
|
||||
/**
 * Profiles every enabled table of a schema for relationship detection.
 *
 * Without an executor the function returns an artifact with
 * `sqlAvailable: false` and the `read_only_sql_unavailable` warning. Otherwise
 * each enabled table is profiled through `queryTableProfile`, optionally served
 * from `input.cache`. A table whose profiling throws does not abort the run:
 * the failure is recorded as a `profile_failed:<table>:<message>` warning and
 * remembered in the cache so the table is not queried again.
 *
 * @param input - Connection, schema, optional executor/cache and sampling limits
 *   (`sampleValuesPerColumn` defaults to 5, `profileSampleRows` to 10000).
 * @returns Table and column profiles, accumulated warnings, and the number of
 *   queries issued by this call (cache hits add none).
 */
export async function profileKloRelationshipSchema(
  input: ProfileKloRelationshipSchemaInput,
): Promise<KloRelationshipProfileArtifact> {
  if (!input.executor) {
    return {
      connectionId: input.connectionId,
      driver: input.driver,
      sqlAvailable: false,
      queryCount: 0,
      tables: [],
      columns: {},
      warnings: ['read_only_sql_unavailable'],
    };
  }

  // Sampling limits do not depend on the table, so resolve them once.
  const sampleValuesPerColumn = input.sampleValuesPerColumn ?? 5;
  const profileSampleRows = input.profileSampleRows ?? 10000;

  let queryTotal = 0;
  const tables: KloRelationshipTableProfile[] = [];
  const columns: Record<string, KloRelationshipColumnProfile> = {};
  const warnings: string[] = [];

  for (const table of input.schema.tables) {
    if (!table.enabled) {
      continue;
    }

    const cacheKey = tableProfileCacheKey({
      connectionId: input.connectionId,
      driver: input.driver,
      ctx: input.ctx,
      table: table.ref,
      sampleValuesPerColumn,
      profileSampleRows,
    });
    const cached = input.cache?.tableProfiles.get(cacheKey);
    if (cached) {
      // A cached failure carries a placeholder table with rowCount 0. The
      // uncached failure path below never reports that placeholder, so skip it
      // here as well; otherwise a warm cache would make a failed table look
      // like an empty one.
      const cachedFailure = cached.warnings.some((warning) => warning.startsWith('profile_failed:'));
      if (!cachedFailure) {
        tables.push(cached.table);
      }
      Object.assign(columns, cached.columns);
      warnings.push(...cached.warnings);
      continue;
    }

    try {
      const tableProfile = await queryTableProfile({
        connectionId: input.connectionId,
        driver: input.driver,
        table,
        executor: input.executor,
        ctx: input.ctx,
        sampleValuesPerColumn,
        profileSampleRows,
      });
      queryTotal += tableProfile.queryCount;
      tables.push(tableProfile.table);
      Object.assign(columns, tableProfile.columns);
      input.cache?.tableProfiles.set(cacheKey, {
        table: tableProfile.table,
        columns: tableProfile.columns,
        warnings: [],
      });
    } catch (error) {
      const failureWarning = `profile_failed:${table.ref.name}:${error instanceof Error ? error.message : String(error)}`;
      warnings.push(failureWarning);
      // Remember the failure so later calls sharing the cache do not retry it.
      input.cache?.tableProfiles.set(cacheKey, {
        table: { table: table.ref, rowCount: 0 },
        columns: {},
        warnings: [failureWarning],
      });
    }
  }

  return {
    connectionId: input.connectionId,
    driver: input.driver,
    sqlAvailable: true,
    queryCount: queryTotal,
    tables,
    columns,
    warnings,
  };
}
|
||||
352
packages/context/src/scan/relationship-review-apply.test.ts
Normal file
352
packages/context/src/scan/relationship-review-apply.test.ts
Normal file
|
|
@ -0,0 +1,352 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { initKloProject } from '../project/index.js';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { applyLocalScanRelationshipReviewDecisions } from './relationship-review-apply.js';
|
||||
import type { KloRelationshipReviewDecisionArtifact } from './relationship-review-decisions.js';
|
||||
import type { ReadLocalScanRelationshipArtifactsResult } from './relationship-artifacts.js';
|
||||
import type { WriteLocalScanManifestShardsResult } from './local-enrichment-artifacts.js';
|
||||
import type { KloSchemaSnapshot } from './types.js';
|
||||
|
||||
/** Builds one single-column relationship endpoint in the `public` schema. */
function publicColumnEndpoint(
  table: string,
  column: string,
): KloRelationshipReviewDecisionArtifact['decisions'][number]['from'] {
  return {
    tableId: `public.${table}`,
    columnIds: [`public.${table}.${column}`],
    table: { catalog: null, db: 'public', name: table },
    columns: [column],
  };
}

/**
 * Seeded review decisions for scan run `scan-run-a`: one accepted
 * orders -> customers link and one rejected orders -> notes link.
 */
const acceptedDecisionArtifact: KloRelationshipReviewDecisionArtifact = {
  connectionId: 'warehouse',
  runId: 'scan-run-a',
  syncId: 'sync-a',
  generatedAt: '2026-05-07T12:00:00.000Z',
  decisions: [
    {
      candidateId: 'orders:orders.customer_id->customers:customers.id',
      decision: 'accepted',
      previousStatus: 'review',
      connectionId: 'warehouse',
      runId: 'scan-run-a',
      syncId: 'sync-a',
      decidedAt: '2026-05-07T12:01:00.000Z',
      reviewer: 'Andrey',
      note: 'Customer link is valid.',
      from: publicColumnEndpoint('orders', 'customer_id'),
      to: publicColumnEndpoint('customers', 'id'),
      relationshipType: 'many_to_one',
      source: 'deterministic_name',
      score: 0.81,
      confidence: 0.81,
      pkScore: 0.93,
      fkScore: 0.81,
      reasons: ['review_threshold'],
    },
    {
      candidateId: 'orders:orders.note_id->notes:notes.id',
      decision: 'rejected',
      previousStatus: 'review',
      connectionId: 'warehouse',
      runId: 'scan-run-a',
      syncId: 'sync-a',
      decidedAt: '2026-05-07T12:02:00.000Z',
      reviewer: 'Andrey',
      note: null,
      from: publicColumnEndpoint('orders', 'note_id'),
      to: publicColumnEndpoint('notes', 'id'),
      relationshipType: 'many_to_one',
      source: 'embedding_similarity',
      score: 0.7,
      confidence: 0.7,
      pkScore: 0.7,
      fkScore: 0.7,
      reasons: ['review_threshold'],
    },
  ],
};
|
||||
|
||||
/** Directory holding the raw artifacts of the fixture sync `sync-a`. */
const fixtureRawSourcesDir = 'raw-sources/warehouse/live-database/sync-a';
/** Path of the relationships artifact inside the fixture sync directory. */
const fixtureRelationshipsPath = `${fixtureRawSourcesDir}/enrichment/relationships.json`;

/**
 * Relationship artifacts returned by the stubbed artifact reader: a completed
 * relationship-mode scan report for `scan-run-a` with empty relationship
 * buckets and no diagnostics or profile artifacts.
 */
const artifacts: ReadLocalScanRelationshipArtifactsResult = {
  runId: 'scan-run-a',
  connectionId: 'warehouse',
  syncId: 'sync-a',
  report: {
    connectionId: 'warehouse',
    driver: 'postgres',
    syncId: 'sync-a',
    runId: 'scan-run-a',
    trigger: 'cli',
    mode: 'relationships',
    dryRun: false,
    artifactPaths: {
      rawSourcesDir: fixtureRawSourcesDir,
      reportPath: `${fixtureRawSourcesDir}/scan-report.json`,
      manifestShards: ['semantic-layer/warehouse/_schema/public.yaml'],
      enrichmentArtifacts: [fixtureRelationshipsPath],
    },
    diffSummary: {
      tablesAdded: 0,
      tablesModified: 0,
      tablesDeleted: 0,
      tablesUnchanged: 2,
      columnsAdded: 0,
      columnsModified: 0,
      columnsDeleted: 0,
    },
    manifestShardsWritten: 1,
    structuralSyncStats: {
      tablesCreated: 0,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'completed',
    },
    capabilityGaps: [],
    warnings: [],
    relationships: { accepted: 0, review: 1, rejected: 1, skipped: 0 },
    enrichmentState: { resumedStages: [], completedStages: ['relationships'], failedStages: [] },
    createdAt: '2026-05-07T12:00:00.000Z',
  },
  relationships: {
    connectionId: 'warehouse',
    accepted: [],
    review: [],
    rejected: [],
    skipped: [],
  },
  diagnostics: null,
  profile: null,
  paths: {
    relationships: fixtureRelationshipsPath,
    diagnostics: null,
    profile: null,
  },
};
|
||||
|
||||
/** Builds a non-nullable, non-primary-key integer column for the fixture snapshot. */
function integerColumn(name: string): KloSchemaSnapshot['tables'][number]['columns'][number] {
  return {
    name,
    nativeType: 'integer',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey: false,
    comment: null,
  };
}

/** Builds a two-row `public` table that exposes a single integer column. */
function publicTable(name: string, columnName: string): KloSchemaSnapshot['tables'][number] {
  return {
    catalog: null,
    db: 'public',
    name,
    kind: 'table',
    comment: null,
    estimatedRows: 2,
    foreignKeys: [],
    columns: [integerColumn(columnName)],
  };
}

/** Structural snapshot handed to the stubbed snapshot reader: `customers` and `orders`. */
const snapshot: KloSchemaSnapshot = {
  connectionId: 'warehouse',
  driver: 'postgres',
  extractedAt: '2026-05-07T12:00:00.000Z',
  scope: { schemas: ['public'] },
  metadata: {},
  tables: [publicTable('customers', 'id'), publicTable('orders', 'customer_id')],
};
|
||||
|
||||
/**
 * Creates a throwaway KLO project under the OS temp directory and seeds it
 * with a relationship review decisions file for sync `sync-a`.
 *
 * @param decisions - Artifact to serialize; defaults to `acceptedDecisionArtifact`.
 * @returns The initialized project and the temp directory the caller must remove.
 */
async function projectWithDecisions(
  decisions = acceptedDecisionArtifact,
): Promise<{ project: KloLocalProject; tempDir: string }> {
  const tempDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-apply-'));
  const projectDir = join(tempDir, 'project');
  const project = await initKloProject({ projectDir, projectName: 'warehouse' });

  const decisionsPath = 'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json';
  const serialized = `${JSON.stringify(decisions)}\n`;
  await project.fileStore.writeFile(
    decisionsPath,
    serialized,
    'klo',
    'klo@example.com',
    'Seed relationship review decisions',
  );

  return { project, tempDir };
}
|
||||
|
||||
/** Canned result of the stubbed manifest writer: one shard written for the `public` schema. */
function manifestResult(): WriteLocalScanManifestShardsResult {
  const manifestShards = ['semantic-layer/warehouse/_schema/public.yaml'];
  return { manifestShards, manifestShardsWritten: 1 };
}
|
||||
|
||||
describe('relationship review apply', () => {
  const customerCandidateId = 'orders:orders.customer_id->customers:customers.id';
  const noteCandidateId = 'orders:orders.note_id->notes:notes.id';

  /** Runs `body` against a freshly seeded project and always removes its temp directory. */
  async function withSeededProject(body: (project: KloLocalProject) => Promise<void>): Promise<void> {
    const { project, tempDir } = await projectWithDecisions();
    try {
      await body(project);
    } finally {
      await rm(tempDir, { recursive: true, force: true });
    }
  }

  it('previews all accepted decisions without writing manifest shards', async () => {
    await withSeededProject(async (project) => {
      const writeLocalScanManifestShards = vi.fn(async () => manifestResult());

      const result = await applyLocalScanRelationshipReviewDecisions(project, {
        runId: 'scan-run-a',
        applyAllAccepted: true,
        dryRun: true,
        readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
        readLocalScanStructuralSnapshot: vi.fn(async () => snapshot),
        writeLocalScanManifestShards,
      });

      expect(result).toMatchObject({
        runId: 'scan-run-a',
        connectionId: 'warehouse',
        syncId: 'sync-a',
        dryRun: true,
        selectedDecisions: 1,
        appliedRelationships: 1,
        manifestShards: [],
        manifestShardsWritten: 0,
      });
      expect(result.relationships[0]).toMatchObject({
        id: customerCandidateId,
        source: 'manual',
        relationshipType: 'many_to_one',
        confidence: 1,
      });
      // A dry run must never reach the manifest writer.
      expect(writeLocalScanManifestShards).not.toHaveBeenCalled();
    });
  });

  it('writes selected accepted decisions as manual manifest relationships', async () => {
    await withSeededProject(async (project) => {
      const readLocalScanStructuralSnapshot = vi.fn(async () => snapshot);
      const writeLocalScanManifestShards = vi.fn(async () => manifestResult());

      const result = await applyLocalScanRelationshipReviewDecisions(project, {
        runId: 'scan-run-a',
        candidateIds: [customerCandidateId],
        readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
        readLocalScanStructuralSnapshot,
        writeLocalScanManifestShards,
      });

      expect(readLocalScanStructuralSnapshot).toHaveBeenCalledWith({
        project: expect.any(Object),
        connectionId: 'warehouse',
        driver: 'postgres',
        rawSourcesDir: 'raw-sources/warehouse/live-database/sync-a',
        extractedAtFallback: '2026-05-07T12:00:00.000Z',
      });
      expect(writeLocalScanManifestShards).toHaveBeenCalledWith({
        project: expect.any(Object),
        connectionId: 'warehouse',
        syncId: 'sync-a',
        driver: 'postgres',
        snapshot,
        dryRun: false,
        relationshipUpdate: {
          connectionId: 'warehouse',
          accepted: [
            expect.objectContaining({
              id: customerCandidateId,
              source: 'manual',
              from: expect.objectContaining({ columns: ['customer_id'] }),
              to: expect.objectContaining({ columns: ['id'] }),
            }),
          ],
          rejected: [],
          skipped: [],
        },
      });
      expect(result.manifestShardsWritten).toBe(1);
    });
  });

  it('rejects ambiguous apply selection input', async () => {
    await withSeededProject(async (project) => {
      // Neither selection mode given.
      await expect(
        applyLocalScanRelationshipReviewDecisions(project, {
          runId: 'scan-run-a',
          readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
        }),
      ).rejects.toThrow('Pass --all-accepted or at least one --candidate to choose review decisions to apply');

      // Both selection modes given at once.
      await expect(
        applyLocalScanRelationshipReviewDecisions(project, {
          runId: 'scan-run-a',
          applyAllAccepted: true,
          candidateIds: [customerCandidateId],
          readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
        }),
      ).rejects.toThrow('Use either --all-accepted or --candidate, not both');
    });
  });

  it('refuses rejected decisions and missing candidate ids', async () => {
    await withSeededProject(async (project) => {
      await expect(
        applyLocalScanRelationshipReviewDecisions(project, {
          runId: 'scan-run-a',
          candidateIds: [noteCandidateId],
          readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
        }),
      ).rejects.toThrow('Relationship review decision "orders:orders.note_id->notes:notes.id" is rejected, not accepted');

      await expect(
        applyLocalScanRelationshipReviewDecisions(project, {
          runId: 'scan-run-a',
          candidateIds: ['missing'],
          readLocalScanRelationshipArtifacts: vi.fn(async () => artifacts),
        }),
      ).rejects.toThrow('Relationship review decision "missing" was not found for scan run "scan-run-a"');
    });
  });
});
|
||||
231
packages/context/src/scan/relationship-review-apply.ts
Normal file
231
packages/context/src/scan/relationship-review-apply.ts
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import {
|
||||
readLocalScanRelationshipArtifacts,
|
||||
type ReadLocalScanRelationshipArtifactsResult,
|
||||
} from './relationship-artifacts.js';
|
||||
import {
|
||||
readLocalScanStructuralSnapshot,
|
||||
type ReadLocalScanStructuralSnapshotInput,
|
||||
} from './local-structural-artifacts.js';
|
||||
import {
|
||||
writeLocalScanManifestShards,
|
||||
type WriteLocalScanManifestShardsInput,
|
||||
type WriteLocalScanManifestShardsResult,
|
||||
} from './local-enrichment-artifacts.js';
|
||||
import type { KloEnrichedRelationship, KloRelationshipUpdate } from './enrichment-types.js';
|
||||
import type {
|
||||
KloRelationshipReviewDecisionArtifact,
|
||||
KloRelationshipReviewDecisionEntry,
|
||||
} from './relationship-review-decisions.js';
|
||||
|
||||
/** File name of the review decisions artifact; it replaces the `relationships.json` suffix of the relationships path. */
const DECISIONS_FILE = 'relationship-review-decisions.json';
|
||||
|
||||
/**
 * Options for {@link applyLocalScanRelationshipReviewDecisions}.
 *
 * Exactly one selection mode must be used: `applyAllAccepted` or a non-empty
 * `candidateIds`. The three function-valued members are injection points that
 * default to this module's imported implementations.
 */
export interface ApplyLocalScanRelationshipReviewDecisionsInput {
  /** Scan run whose relationship artifacts and review decisions are read. */
  runId: string;
  /** When `true`, every decision recorded as `accepted` is selected. */
  applyAllAccepted?: boolean;
  /** Explicit candidate ids to apply; each must resolve to an accepted decision. */
  candidateIds?: ReadonlyArray<string>;
  /** When `true`, the selection is computed and returned without writing manifest shards. */
  dryRun?: boolean;
  /** Override for loading the run's relationship artifacts. */
  readLocalScanRelationshipArtifacts?: typeof readLocalScanRelationshipArtifacts;
  /** Override for loading the structural snapshot passed to the manifest writer. */
  readLocalScanStructuralSnapshot?: (
    input: ReadLocalScanStructuralSnapshotInput,
  ) => Promise<WriteLocalScanManifestShardsInput['snapshot']>;
  /** Override for writing manifest shards. */
  writeLocalScanManifestShards?: (
    input: WriteLocalScanManifestShardsInput,
  ) => Promise<WriteLocalScanManifestShardsResult>;
}
|
||||
|
||||
/**
 * Pairs a review decision's audit fields with the relationship derived from it.
 *
 * NOTE(review): not referenced by the functions visible in this module; confirm
 * which callers consume it.
 */
export interface AppliedRelationshipReviewDecision {
  /** Candidate id the decision was recorded for. */
  candidateId: string;
  /** Timestamp string recorded with the decision. */
  decidedAt: string;
  /** Reviewer recorded with the decision. */
  reviewer: string;
  /** Optional reviewer note; `null` when none was given. */
  note: string | null;
  /** Relationship produced for the decision. */
  relationship: KloEnrichedRelationship;
}
|
||||
|
||||
/** Outcome of {@link applyLocalScanRelationshipReviewDecisions}. */
export interface ApplyLocalScanRelationshipReviewDecisionsResult {
  /** Run id taken from the loaded relationship artifacts. */
  runId: string;
  /** Connection id taken from the loaded relationship artifacts. */
  connectionId: string;
  /** Sync id taken from the loaded relationship artifacts. */
  syncId: string;
  /** `true` only when the caller requested a dry run. */
  dryRun: boolean;
  /** Project-relative path the review decisions were read from. */
  decisionsPath: string;
  /** Number of decisions selected by the input. */
  selectedDecisions: number;
  /** Number of relationships built from the selected decisions. */
  appliedRelationships: number;
  /** The relationships built from the selected decisions. */
  relationships: Array<KloEnrichedRelationship>;
  /** Manifest shards reported by the writer; empty when nothing was written. */
  manifestShards: Array<string>;
  /** Shard count reported by the writer; `0` when nothing was written. */
  manifestShardsWritten: number;
}
|
||||
|
||||
/**
 * Derives the review decisions path from the relationships artifact path by
 * swapping a trailing `relationships.json` for the decisions file name.
 * A path without that suffix is returned unchanged.
 */
function decisionsPathFromRelationshipsPath(relationshipsPath: string): string {
  const relationshipsFileSuffix = /relationships\.json$/u;
  return relationshipsPath.replace(relationshipsFileSuffix, DECISIONS_FILE);
}
|
||||
|
||||
/**
 * Loads and normalizes the relationship review decisions artifact.
 *
 * @param project - Project whose file store holds the artifact.
 * @param path - Project-relative path of the decisions file.
 * @param runId - Scan run id, used only in the "not found" error message.
 * @returns The artifact's identifying fields, with `decisions` defaulting to
 *   an empty array when the stored value is not an array.
 * @throws Error when the file cannot be read, is not valid JSON, or does not
 *   contain a JSON object.
 */
async function readDecisionArtifact(
  project: KloLocalProject,
  path: string,
  runId: string,
): Promise<KloRelationshipReviewDecisionArtifact> {
  let raw: { content: string };
  try {
    raw = await project.fileStore.readFile(path);
  } catch {
    // Any read failure is reported as "not found" for the run.
    throw new Error(`Relationship review decisions were not found for scan run "${runId}"`);
  }

  // Parse as unknown so a corrupt or hand-edited file yields an actionable
  // error instead of a bare SyntaxError or a TypeError on `null`.
  let payload: unknown;
  try {
    payload = JSON.parse(raw.content);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Relationship review decisions at "${path}" are not valid JSON: ${reason}`);
  }
  if (typeof payload !== 'object' || payload === null || Array.isArray(payload)) {
    throw new Error(`Relationship review decisions at "${path}" must be a JSON object`);
  }

  const parsed = payload as KloRelationshipReviewDecisionArtifact;
  return {
    connectionId: parsed.connectionId,
    runId: parsed.runId,
    syncId: parsed.syncId,
    generatedAt: parsed.generatedAt,
    decisions: Array.isArray(parsed.decisions) ? parsed.decisions : [],
  };
}
|
||||
|
||||
/**
 * Validates that the caller picked exactly one selection mode.
 *
 * @throws Error when both `applyAllAccepted` and candidate ids are given, or
 *   when neither is.
 */
function assertSelection(input: ApplyLocalScanRelationshipReviewDecisionsInput): void {
  const wantsAllAccepted = input.applyAllAccepted === true;
  const hasCandidates = (input.candidateIds ?? []).length > 0;

  if (wantsAllAccepted && hasCandidates) {
    throw new Error('Use either --all-accepted or --candidate, not both');
  }
  if (!wantsAllAccepted && !hasCandidates) {
    throw new Error('Pass --all-accepted or at least one --candidate to choose review decisions to apply');
  }
}
|
||||
|
||||
/**
 * Picks the accepted review decisions the caller asked to apply.
 *
 * With `applyAllAccepted`, every decision recorded as `accepted` is returned.
 * Otherwise each requested candidate id must resolve to an accepted decision.
 * A candidate id passed more than once is applied once, in first-seen order,
 * so the manifest update never contains duplicate relationships.
 *
 * @param artifact - Parsed review decisions artifact.
 * @param input - Apply options carrying the selection mode and run id.
 * @returns The selected accepted decisions.
 * @throws Error when the selection is ambiguous, a candidate id has no
 *   decision, or the decision is not `accepted`.
 */
function selectAcceptedDecisions(
  artifact: KloRelationshipReviewDecisionArtifact,
  input: ApplyLocalScanRelationshipReviewDecisionsInput,
): KloRelationshipReviewDecisionEntry[] {
  assertSelection(input);
  if (input.applyAllAccepted === true) {
    return artifact.decisions.filter((decision) => decision.decision === 'accepted');
  }

  const decisionsById = new Map(artifact.decisions.map((decision) => [decision.candidateId, decision]));
  // A Set keeps insertion order while dropping repeated ids.
  const requestedIds = new Set(input.candidateIds ?? []);
  const selected: KloRelationshipReviewDecisionEntry[] = [];
  for (const candidateId of requestedIds) {
    const decision = decisionsById.get(candidateId);
    if (!decision) {
      throw new Error(`Relationship review decision "${candidateId}" was not found for scan run "${input.runId}"`);
    }
    if (decision.decision !== 'accepted') {
      throw new Error(`Relationship review decision "${candidateId}" is ${decision.decision}, not accepted`);
    }
    selected.push(decision);
  }
  return selected;
}
|
||||
|
||||
/** Joins the non-empty parts of a table reference (catalog, db, name) with dots. */
function tableId(table: KloRelationshipReviewDecisionEntry['from']['table']): string {
  const parts: string[] = [];
  for (const part of [table.catalog, table.db, table.name]) {
    // Null and empty parts are left out of the identifier.
    if (part) {
      parts.push(part);
    }
  }
  return parts.join('.');
}
|
||||
|
||||
/** Qualifies each column name with the dotted id of its table. */
function columnIds(table: KloRelationshipReviewDecisionEntry['from']['table'], columns: readonly string[]): string[] {
  const qualifier = tableId(table);
  const qualified: string[] = [];
  for (const columnName of columns) {
    qualified.push(`${qualifier}.${columnName}`);
  }
  return qualified;
}
|
||||
|
||||
/**
 * Converts an accepted review decision into a manual relationship.
 *
 * Table and column ids are recomputed from the decision's table references
 * rather than copied from the decision. The result is always marked
 * `source: 'manual'`, `confidence: 1` and `isPrimaryKeyReference: true`.
 */
function relationshipFromDecision(decision: KloRelationshipReviewDecisionEntry): KloEnrichedRelationship {
  // Builds one side of the relationship; the column list is copied, the table reference is shared.
  const endpoint = (side: {
    table: KloRelationshipReviewDecisionEntry['from']['table'];
    columns: readonly string[];
  }) => ({
    tableId: tableId(side.table),
    columnIds: columnIds(side.table, side.columns),
    table: side.table,
    columns: [...side.columns],
  });

  return {
    id: decision.candidateId,
    source: 'manual',
    from: endpoint(decision.from),
    to: endpoint(decision.to),
    relationshipType: decision.relationshipType,
    confidence: 1,
    isPrimaryKeyReference: true,
  };
}
|
||||
|
||||
/**
 * Wraps relationships in a manifest update that accepts all of them and
 * rejects or skips none. The input array is copied, not aliased.
 */
function relationshipUpdate(
  connectionId: string,
  relationships: readonly KloEnrichedRelationship[],
): KloRelationshipUpdate {
  const accepted = relationships.slice();
  return { connectionId, accepted, rejected: [], skipped: [] };
}
|
||||
|
||||
/**
 * Returns the run's raw sources directory, which is needed to rebuild the
 * structural snapshot before rewriting manifest shards.
 *
 * @throws Error when the scan report records no raw sources directory.
 */
function assertApplyableArtifacts(artifacts: ReadLocalScanRelationshipArtifactsResult): string {
  const { rawSourcesDir } = artifacts.report.artifactPaths;
  if (rawSourcesDir) {
    return rawSourcesDir;
  }
  throw new Error(`Scan run "${artifacts.runId}" does not have raw source artifacts for manifest rewriting`);
}
|
||||
|
||||
/**
 * Applies accepted relationship review decisions of a scan run to the manifest.
 *
 * The run's relationship artifacts locate the decisions file. The selected
 * accepted decisions become manual relationships. Unless this is a dry run or
 * nothing was selected, the structural snapshot is reloaded and the manifest
 * shards are rewritten with those relationships.
 *
 * @param project - Local project providing the file store.
 * @param input - Run id, selection mode, dry-run flag and optional I/O overrides.
 * @returns A summary of the selection and of any manifest shards written.
 * @throws Error when the run or its decisions cannot be found, the selection is
 *   invalid, or the run has no raw source directory to rebuild the snapshot from.
 */
export async function applyLocalScanRelationshipReviewDecisions(
  project: KloLocalProject,
  input: ApplyLocalScanRelationshipReviewDecisionsInput,
): Promise<ApplyLocalScanRelationshipReviewDecisionsResult> {
  const loadArtifacts = input.readLocalScanRelationshipArtifacts ?? readLocalScanRelationshipArtifacts;
  const artifacts = await loadArtifacts(project, input.runId);
  if (!artifacts) {
    throw new Error(`Scan run "${input.runId}" was not found`);
  }

  const decisionsPath = decisionsPathFromRelationshipsPath(artifacts.paths.relationships);
  const decisionArtifact = await readDecisionArtifact(project, decisionsPath, input.runId);
  const selected = selectAcceptedDecisions(decisionArtifact, input);
  const relationships = selected.map((entry) => relationshipFromDecision(entry));
  const dryRun = input.dryRun === true;

  // Fields shared by the preview result and the written result.
  const summary = {
    runId: artifacts.runId,
    connectionId: artifacts.connectionId,
    syncId: artifacts.syncId,
    dryRun,
    decisionsPath,
    selectedDecisions: selected.length,
    appliedRelationships: relationships.length,
    relationships,
  };

  const nothingToWrite = dryRun || relationships.length === 0;
  if (nothingToWrite) {
    return { ...summary, manifestShards: [], manifestShardsWritten: 0 };
  }

  const rawSourcesDir = assertApplyableArtifacts(artifacts);
  const loadSnapshot = input.readLocalScanStructuralSnapshot ?? readLocalScanStructuralSnapshot;
  const writeShards = input.writeLocalScanManifestShards ?? writeLocalScanManifestShards;

  const snapshot = await loadSnapshot({
    project,
    connectionId: artifacts.connectionId,
    driver: artifacts.report.driver,
    rawSourcesDir,
    extractedAtFallback: artifacts.report.createdAt,
  });
  const manifest = await writeShards({
    project,
    connectionId: artifacts.connectionId,
    syncId: artifacts.syncId,
    driver: artifacts.report.driver,
    snapshot,
    dryRun: false,
    relationshipUpdate: relationshipUpdate(artifacts.connectionId, relationships),
  });

  return {
    ...summary,
    manifestShards: manifest.manifestShards,
    manifestShardsWritten: manifest.manifestShardsWritten,
  };
}
|
||||
365
packages/context/src/scan/relationship-review-decisions.test.ts
Normal file
365
packages/context/src/scan/relationship-review-decisions.test.ts
Normal file
|
|
@ -0,0 +1,365 @@
|
|||
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { dirname, join } from 'node:path';
|
||||
import { runLocalStageOnlyIngest, type SourceAdapter } from '../ingest/index.js';
|
||||
import { initKloProject, loadKloProject } from '../project/index.js';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { writeLocalScanRelationshipReviewDecision } from './relationship-review-decisions.js';
|
||||
import type { KloRelationshipArtifact, KloRelationshipDiagnosticsArtifact } from './relationship-diagnostics.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import type { KloScanReport } from './types.js';
|
||||
|
||||
/** Job id of the staged ingest run that the review tests operate on. */
const RUN_ID = 'scan-run-review';
/** Sync directory name for that run: a timestamp prefix followed by the run id. */
const SYNC_ID = `2026-05-07-100000-${RUN_ID}`;
|
||||
|
||||
/** Writes a UTF-8 file at `relativePath` inside the project, creating parent directories as needed. */
async function writeProjectFile(projectDir: string, relativePath: string, content: string): Promise<void> {
  const target = join(projectDir, relativePath);
  const parentDir = dirname(target);
  await mkdir(parentDir, { recursive: true });
  await writeFile(target, content, 'utf-8');
}
|
||||
|
||||
/**
 * Initializes a KLO project in `projectDir` and overwrites its `klo.yaml` with
 * a read-only SQLite connection named `warehouse` and the `live-database`
 * ingest adapter.
 */
async function createProject(projectDir: string): Promise<void> {
  await initKloProject({ projectDir, projectName: 'warehouse' });
  // Indentation is significant in YAML: the connection settings must be nested
  // under `warehouse`, and the adapter list under `adapters`.
  await writeFile(
    join(projectDir, 'klo.yaml'),
    [
      'project: warehouse',
      'connections:',
      '  warehouse:',
      '    driver: sqlite',
      '    path: warehouse.db',
      '    readonly: true',
      'ingest:',
      '  adapters:',
      '    - live-database',
      '',
    ].join('\n'),
    'utf-8',
  );
}
|
||||
|
||||
/**
 * Fake `live-database` source adapter. `fetch` stages a connection file, an
 * empty foreign-key file and a single `orders` table; `detect` always reports
 * a match; `chunk` returns one work unit for the staged table.
 */
function liveDatabaseAdapter(): SourceAdapter {
  const connectionJson = '{"connectionId":"warehouse"}\n';
  const foreignKeysJson = '{"foreignKeys":[]}\n';
  const ordersTableJson =
    '{"name":"orders","db":"public","columns":[{"name":"id","type":"integer","nullable":false,"primaryKey":true}]}\n';

  // Both fetch and detect write the same connection file.
  const writeConnectionFile = (stagedDir: string): Promise<void> =>
    writeFile(join(stagedDir, 'connection.json'), connectionJson, 'utf-8');

  return {
    source: 'live-database',
    skillNames: ['live_database_ingest'],
    async fetch(_pullConfig, stagedDir) {
      await mkdir(join(stagedDir, 'tables'), { recursive: true });
      await writeConnectionFile(stagedDir);
      await writeFile(join(stagedDir, 'foreign-keys.json'), foreignKeysJson, 'utf-8');
      await writeFile(join(stagedDir, 'tables', 'orders.json'), ordersTableJson, 'utf-8');
    },
    async detect(stagedDir) {
      await writeConnectionFile(stagedDir);
      return true;
    },
    async chunk() {
      const ordersUnit = {
        unitKey: 'live-database-public-orders',
        rawFiles: ['tables/orders.json'],
        dependencyPaths: ['connection.json', 'foreign-keys.json'],
        peerFileIndex: [],
      };
      return { workUnits: [ordersUnit] };
    },
  };
}
|
||||
|
||||
/**
 * Creates the fixture project and runs a stage-only `live-database` ingest for
 * the `warehouse` connection, using `RUN_ID` as the job id and a fixed clock
 * of 2026-05-07T10:00:00Z.
 */
async function createLiveDatabaseRun(projectDir: string): Promise<void> {
  await createProject(projectDir);
  const project = await loadKloProject({ projectDir });
  const fixedNow = (): Date => new Date('2026-05-07T10:00:00.000Z');

  await runLocalStageOnlyIngest({
    project,
    adapters: [liveDatabaseAdapter()],
    adapter: 'live-database',
    connectionId: 'warehouse',
    jobId: RUN_ID,
    now: fixedNow,
  });
}
|
||||
|
||||
/** Relationship artifact holding a single orders -> customers candidate that awaits review. */
function reviewRelationships(): KloRelationshipArtifact {
  const customerCandidate: KloRelationshipArtifact['review'][number] = {
    id: 'orders:orders.customer_id->customers:customers.id',
    status: 'review',
    source: 'deterministic_name',
    from: {
      tableId: 'orders',
      columnIds: ['orders.customer_id'],
      table: { catalog: null, db: 'public', name: 'orders' },
      columns: ['customer_id'],
    },
    to: {
      tableId: 'customers',
      columnIds: ['customers.id'],
      table: { catalog: null, db: 'public', name: 'customers' },
      columns: ['id'],
    },
    relationshipType: 'many_to_one',
    confidence: 0.62,
    pkScore: 0.91,
    fkScore: 0.62,
    score: 0.62,
    evidence: { sources: ['table_suffix'] },
    validation: { status: 'passed' },
    graph: { reasons: ['fk_score_review'] },
    reasons: ['fk_score_review'],
  };

  return {
    connectionId: 'warehouse',
    accepted: [],
    review: [customerCandidate],
    rejected: [],
    skipped: [],
  };
}
|
||||
|
||||
/** Diagnostics artifact matching the review fixture: one candidate in review, none accepted. */
function diagnostics(): KloRelationshipDiagnosticsArtifact {
  const thresholds = { acceptThreshold: 0.85, reviewThreshold: 0.55 };
  const policy = {
    validationRequiredForManifest: true,
    maxCandidatesPerColumn: 25,
    profileSampleRows: 10000,
    validationConcurrency: 4,
  };

  return {
    connectionId: 'warehouse',
    generatedAt: '2026-05-07T10:00:00.000Z',
    summary: { accepted: 0, review: 1, rejected: 0, skipped: 0 },
    noAcceptedReason: 'relationship candidates require review before manifest writes',
    candidateCountsBySource: { deterministic_name: 1 },
    validation: { available: true, sqlAvailable: true, queryCount: 3 },
    thresholds,
    policy,
    warnings: [],
    profileWarnings: [],
  };
}
|
||||
|
||||
/** Empty SQLite profile artifact that still reports three executed queries. */
function profile(): KloRelationshipProfileArtifact {
  const emptyProfile: KloRelationshipProfileArtifact = {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    tables: [],
    columns: {},
    queryCount: 3,
    warnings: [],
  };
  return emptyProfile;
}
|
||||
|
||||
/**
 * Scan report for the review fixture run: relationship mode, not a dry run,
 * no manifest shards written, and three enrichment artifacts under the sync's
 * `enrichment/` directory.
 */
function report(): KloScanReport {
  const rawSourcesDir = `raw-sources/warehouse/live-database/${SYNC_ID}`;
  const enrichmentDir = `${rawSourcesDir}/enrichment`;

  return {
    connectionId: 'warehouse',
    driver: 'sqlite',
    syncId: SYNC_ID,
    runId: RUN_ID,
    trigger: 'cli',
    mode: 'relationships',
    dryRun: false,
    artifactPaths: {
      rawSourcesDir,
      reportPath: `${rawSourcesDir}/scan-report.json`,
      manifestShards: [],
      enrichmentArtifacts: [
        `${enrichmentDir}/relationships.json`,
        `${enrichmentDir}/relationship-diagnostics.json`,
        `${enrichmentDir}/relationship-profile.json`,
      ],
    },
    diffSummary: {
      tablesAdded: 0,
      tablesModified: 0,
      tablesDeleted: 0,
      tablesUnchanged: 2,
      columnsAdded: 0,
      columnsModified: 0,
      columnsDeleted: 0,
    },
    manifestShardsWritten: 0,
    structuralSyncStats: {
      tablesCreated: 0,
      tablesUpdated: 0,
      tablesDeleted: 0,
      columnsCreated: 0,
      columnsUpdated: 0,
      columnsDeleted: 0,
    },
    enrichment: {
      dataDictionary: 'skipped',
      tableDescriptions: 'skipped',
      columnDescriptions: 'skipped',
      embeddings: 'skipped',
      deterministicRelationships: 'completed',
      llmRelationshipValidation: 'skipped',
      statisticalValidation: 'completed',
    },
    relationships: { accepted: 0, review: 1, rejected: 0, skipped: 0 },
    enrichmentState: {
      resumedStages: [],
      completedStages: ['relationships'],
      failedStages: [],
    },
    warnings: [],
    capabilityGaps: [],
    createdAt: '2026-05-07T10:00:00.000Z',
  };
}
|
||||
|
||||
/**
 * Writes the scan report and the three relationship enrichment fixtures into
 * the sync directory of `projectDir`, one file after another.
 *
 * @param projectDir Root directory of the temporary project under test.
 */
async function writeScanArtifacts(projectDir: string): Promise<void> {
  const syncDir = `raw-sources/warehouse/live-database/${SYNC_ID}`;

  // Each payload is built lazily, right before its own write, so the
  // build/write interleaving matches a hand-written sequence of calls.
  const fixtures: ReadonlyArray<readonly [relativePath: string, build: () => unknown]> = [
    [`${syncDir}/scan-report.json`, () => report()],
    [`${syncDir}/enrichment/relationships.json`, () => reviewRelationships()],
    [`${syncDir}/enrichment/relationship-diagnostics.json`, () => diagnostics()],
    [`${syncDir}/enrichment/relationship-profile.json`, () => profile()],
  ];

  for (const [relativePath, build] of fixtures) {
    await writeProjectFile(projectDir, relativePath, JSON.stringify(build(), null, 2));
  }
}
|
||||
|
||||
describe('relationship review decisions', () => {
|
||||
it('writes an accepted decision beside the scan relationship artifacts', async () => {
|
||||
const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-decisions-'));
|
||||
try {
|
||||
await createLiveDatabaseRun(projectDir);
|
||||
await writeScanArtifacts(projectDir);
|
||||
const project = await loadKloProject({ projectDir });
|
||||
|
||||
const result = await writeLocalScanRelationshipReviewDecision(project, {
|
||||
runId: 'scan-run-review',
|
||||
candidateId: 'orders:orders.customer_id->customers:customers.id',
|
||||
decision: 'accepted',
|
||||
reviewer: 'Andrey',
|
||||
note: 'Matches the warehouse model',
|
||||
decidedAt: '2026-05-07T12:00:00.000Z',
|
||||
});
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
if (!result) {
|
||||
throw new Error('Expected relationship review decision to be written');
|
||||
}
|
||||
expect(result.path).toBe(
|
||||
`raw-sources/warehouse/live-database/${SYNC_ID}/enrichment/relationship-review-decisions.json`,
|
||||
);
|
||||
expect(result.artifact.decisions).toHaveLength(1);
|
||||
expect(result.decision).toMatchObject({
|
||||
candidateId: 'orders:orders.customer_id->customers:customers.id',
|
||||
decision: 'accepted',
|
||||
previousStatus: 'review',
|
||||
reviewer: 'Andrey',
|
||||
note: 'Matches the warehouse model',
|
||||
source: 'deterministic_name',
|
||||
relationshipType: 'many_to_one',
|
||||
score: 0.62,
|
||||
reasons: ['fk_score_review'],
|
||||
});
|
||||
await expect(project.fileStore.readFile(result.path)).resolves.toMatchObject({
|
||||
path: result.path,
|
||||
content: expect.stringContaining('"decision": "accepted"'),
|
||||
});
|
||||
} finally {
|
||||
await rm(projectDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it('replaces the existing decision for the same candidate id', async () => {
|
||||
const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-replace-'));
|
||||
try {
|
||||
await createLiveDatabaseRun(projectDir);
|
||||
await writeScanArtifacts(projectDir);
|
||||
const project = await loadKloProject({ projectDir });
|
||||
|
||||
await writeLocalScanRelationshipReviewDecision(project, {
|
||||
runId: 'scan-run-review',
|
||||
candidateId: 'orders:orders.customer_id->customers:customers.id',
|
||||
decision: 'accepted',
|
||||
reviewer: 'Andrey',
|
||||
note: 'First decision',
|
||||
decidedAt: '2026-05-07T12:00:00.000Z',
|
||||
});
|
||||
const replacement = await writeLocalScanRelationshipReviewDecision(project, {
|
||||
runId: 'scan-run-review',
|
||||
candidateId: 'orders:orders.customer_id->customers:customers.id',
|
||||
decision: 'rejected',
|
||||
reviewer: 'Andrey',
|
||||
note: 'Reviewed against source data and rejected',
|
||||
decidedAt: '2026-05-07T12:05:00.000Z',
|
||||
});
|
||||
|
||||
expect(replacement).not.toBeNull();
|
||||
if (!replacement) {
|
||||
throw new Error('Expected replacement relationship review decision to be written');
|
||||
}
|
||||
expect(replacement.artifact.decisions).toHaveLength(1);
|
||||
expect(replacement.artifact.decisions[0]).toMatchObject({
|
||||
decision: 'rejected',
|
||||
note: 'Reviewed against source data and rejected',
|
||||
decidedAt: '2026-05-07T12:05:00.000Z',
|
||||
});
|
||||
} finally {
|
||||
await rm(projectDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it('returns null when the scan run does not exist', async () => {
|
||||
const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-missing-run-'));
|
||||
try {
|
||||
await createProject(projectDir);
|
||||
const project = await loadKloProject({ projectDir });
|
||||
|
||||
await expect(
|
||||
writeLocalScanRelationshipReviewDecision(project, {
|
||||
runId: 'missing-run',
|
||||
candidateId: 'orders:orders.customer_id->customers:customers.id',
|
||||
decision: 'accepted',
|
||||
reviewer: 'Andrey',
|
||||
note: null,
|
||||
decidedAt: '2026-05-07T12:00:00.000Z',
|
||||
}),
|
||||
).resolves.toBeNull();
|
||||
} finally {
|
||||
await rm(projectDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
it('rejects unknown candidate ids for an existing scan run', async () => {
|
||||
const projectDir = await mkdtemp(join(tmpdir(), 'klo-relationship-review-missing-candidate-'));
|
||||
try {
|
||||
await createLiveDatabaseRun(projectDir);
|
||||
await writeScanArtifacts(projectDir);
|
||||
const project = await loadKloProject({ projectDir });
|
||||
|
||||
await expect(
|
||||
writeLocalScanRelationshipReviewDecision(project, {
|
||||
runId: 'scan-run-review',
|
||||
candidateId: 'orders:orders.unknown_id->customers:customers.id',
|
||||
decision: 'accepted',
|
||||
reviewer: 'Andrey',
|
||||
note: null,
|
||||
decidedAt: '2026-05-07T12:00:00.000Z',
|
||||
}),
|
||||
).rejects.toThrow(
|
||||
'Relationship candidate "orders:orders.unknown_id->customers:customers.id" was not found in scan run "scan-run-review"',
|
||||
);
|
||||
} finally {
|
||||
await rm(projectDir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
});
|
||||
182
packages/context/src/scan/relationship-review-decisions.ts
Normal file
182
packages/context/src/scan/relationship-review-decisions.ts
Normal file
|
|
@ -0,0 +1,182 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import type { KloRelationshipType } from './enrichment-types.js';
|
||||
import { readLocalScanRelationshipArtifacts } from './relationship-artifacts.js';
|
||||
import type {
|
||||
KloRelationshipArtifactEdge,
|
||||
KloRelationshipArtifactEndpoint,
|
||||
} from './relationship-diagnostics.js';
|
||||
import type { KloResolvedRelationshipStatus } from './relationship-graph-resolver.js';
|
||||
|
||||
/** Author name passed to `project.fileStore.writeFile` for decision writes. */
const LOCAL_AUTHOR = 'klo';

/** Author email passed to `project.fileStore.writeFile` for decision writes. */
const LOCAL_AUTHOR_EMAIL = 'klo@example.com';

/** File name of the review-decision artifact stored beside `relationships.json`. */
const DECISIONS_FILE = 'relationship-review-decisions.json';
|
||||
|
||||
/** Verdict a reviewer can record for a relationship candidate. */
export type KloRelationshipReviewDecisionValue = 'accepted' | 'rejected';
|
||||
|
||||
/** Arguments accepted by `writeLocalScanRelationshipReviewDecision`. */
export interface WriteLocalScanRelationshipReviewDecisionInput {
  /** Scan run whose relationship artifacts contain the candidate. */
  runId: string;
  /** Id of the relationship edge being reviewed; must match an edge in the run. */
  candidateId: string;
  /** The reviewer's verdict. */
  decision: KloRelationshipReviewDecisionValue;
  /** Who made the decision; stored verbatim on the entry. */
  reviewer: string;
  /** Optional free-form justification; `null` when none was given. */
  note: string | null;
  /** Timestamp of the decision; defaults to the current time as an ISO string. */
  decidedAt?: string;
}
|
||||
|
||||
/**
 * One recorded review decision together with a snapshot of the candidate
 * edge it applies to, as persisted in the decisions artifact.
 */
export interface KloRelationshipReviewDecisionEntry {
  /** Id of the reviewed candidate edge. */
  candidateId: string;
  /** The reviewer's verdict. */
  decision: KloRelationshipReviewDecisionValue;
  /** Status the candidate carried in the scan artifacts when it was reviewed. */
  previousStatus: KloResolvedRelationshipStatus;
  connectionId: string;
  runId: string;
  syncId: string;
  /** When the decision was made. */
  decidedAt: string;
  reviewer: string;
  note: string | null;
  /** Endpoints copied from the candidate edge. */
  from: KloRelationshipArtifactEndpoint;
  to: KloRelationshipArtifactEndpoint;
  relationshipType: KloRelationshipType;
  /** Candidate `source` value copied from the edge (e.g. `deterministic_name` in the tests). */
  source: string;
  score: number | null;
  confidence: number;
  pkScore: number | null;
  fkScore: number | null;
  /** Copy of the candidate's reason codes. */
  reasons: string[];
}
|
||||
|
||||
/** On-disk shape of the `relationship-review-decisions.json` artifact. */
export interface KloRelationshipReviewDecisionArtifact {
  connectionId: string;
  runId: string;
  syncId: string;
  /** Set to the timestamp of the most recently written decision. */
  generatedAt: string;
  /** At most one entry per candidate id, ordered by candidate id. */
  decisions: KloRelationshipReviewDecisionEntry[];
}
|
||||
|
||||
/** Outcome of a successful `writeLocalScanRelationshipReviewDecision` call. */
export interface WriteLocalScanRelationshipReviewDecisionResult {
  /** File-store path the artifact was written to. */
  path: string;
  /** The entry recorded by this call. */
  decision: KloRelationshipReviewDecisionEntry;
  /** Full artifact content as written, including earlier decisions. */
  artifact: KloRelationshipReviewDecisionArtifact;
}
|
||||
|
||||
/**
 * Derives the path of the review-decision artifact that lives beside a scan's
 * relationships artifact.
 *
 * When `relationshipsPath` ends in `relationships.json`, that suffix is swapped
 * for the decisions file name. Otherwise the decisions file is placed in the
 * same directory as `relationshipsPath`. The result is therefore never the
 * input path itself, so writing decisions cannot clobber the relationships
 * artifact.
 *
 * @param relationshipsPath File-store path (`/`-separated) of the relationships artifact.
 * @returns Path of the sibling decisions artifact.
 */
function reviewDecisionPath(relationshipsPath: string): string {
  const relationshipsSuffix = /relationships\.json$/u;
  if (relationshipsSuffix.test(relationshipsPath)) {
    return relationshipsPath.replace(relationshipsSuffix, DECISIONS_FILE);
  }

  // No suffix match: a plain replace() would hand back the input unchanged and
  // the caller would overwrite the relationships artifact. Use a sibling file.
  const lastSeparator = relationshipsPath.lastIndexOf('/');
  const directory = lastSeparator >= 0 ? relationshipsPath.slice(0, lastSeparator + 1) : '';
  return `${directory}${DECISIONS_FILE}`;
}
|
||||
|
||||
/**
 * Flattens the accepted, review and rejected relationship buckets of a scan
 * into a single list, in that order.
 *
 * @param result Relationship artifacts of a scan run, or a falsy value when the run was not found.
 * @returns Every candidate edge of the run; empty when `result` is falsy.
 */
function allCandidateEdges(result: Awaited<ReturnType<typeof readLocalScanRelationshipArtifacts>>): KloRelationshipArtifactEdge[] {
  return result
    ? [...result.relationships.accepted, ...result.relationships.review, ...result.relationships.rejected]
    : [];
}
|
||||
|
||||
/**
 * Loads the decisions artifact stored at `path`, tolerating a missing or
 * damaged file.
 *
 * Reading is best-effort: if the file cannot be read or is not valid JSON, an
 * empty artifact built from `fallback` is returned so a first decision can
 * still be recorded. Parsed content is validated rather than trusted:
 * - non-object JSON yields the empty fallback artifact;
 * - header fields that are not strings are replaced by the `fallback` values;
 * - decision entries that are not objects with a string `candidateId` are
 *   dropped, because they cannot be matched or ordered by candidate id.
 *
 * @param project Project whose file store holds the artifact.
 * @param path File-store path of the decisions artifact.
 * @param fallback Header values used when the stored ones are absent or invalid.
 * @returns The stored artifact, sanitised, or an empty artifact.
 */
async function readExistingDecisions(
  project: KloLocalProject,
  path: string,
  fallback: Omit<KloRelationshipReviewDecisionArtifact, 'decisions'>,
): Promise<KloRelationshipReviewDecisionArtifact> {
  const emptyArtifact: KloRelationshipReviewDecisionArtifact = { ...fallback, decisions: [] };

  let parsed: unknown;
  try {
    const raw = await project.fileStore.readFile(path);
    parsed = JSON.parse(raw.content);
  } catch {
    // Deliberate best-effort: an absent or unparsable file means "no decisions yet".
    return emptyArtifact;
  }

  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return emptyArtifact;
  }
  const stored = parsed as Record<string, unknown>;

  const stringOr = (value: unknown, otherwise: string): string => (typeof value === 'string' ? value : otherwise);
  const isDecisionEntry = (entry: unknown): entry is KloRelationshipReviewDecisionEntry =>
    typeof entry === 'object' &&
    entry !== null &&
    typeof (entry as { candidateId?: unknown }).candidateId === 'string';

  return {
    connectionId: stringOr(stored.connectionId, fallback.connectionId),
    runId: stringOr(stored.runId, fallback.runId),
    syncId: stringOr(stored.syncId, fallback.syncId),
    generatedAt: stringOr(stored.generatedAt, fallback.generatedAt),
    decisions: Array.isArray(stored.decisions) ? stored.decisions.filter(isDecisionEntry) : [],
  };
}
|
||||
|
||||
/**
 * Assembles the persisted entry for one review decision by combining the
 * reviewer's input with a snapshot of the candidate edge.
 *
 * The candidate's `reasons` array is copied so later changes to the edge do
 * not alter the entry; `from` and `to` are carried over by reference.
 *
 * @returns A new entry whose `previousStatus` is the candidate's current status.
 */
function decisionEntry(input: {
  candidate: KloRelationshipArtifactEdge;
  connectionId: string;
  runId: string;
  syncId: string;
  decision: KloRelationshipReviewDecisionValue;
  reviewer: string;
  note: string | null;
  decidedAt: string;
}): KloRelationshipReviewDecisionEntry {
  const { candidate, connectionId, runId, syncId, decision, reviewer, note, decidedAt } = input;
  const { from, to, relationshipType, source, score, confidence, pkScore, fkScore } = candidate;

  // Property order is the order the entry is serialised in; keep it stable.
  return {
    candidateId: candidate.id,
    decision,
    previousStatus: candidate.status,
    connectionId,
    runId,
    syncId,
    decidedAt,
    reviewer,
    note,
    from,
    to,
    relationshipType,
    source,
    score,
    confidence,
    pkScore,
    fkScore,
    reasons: [...candidate.reasons],
  };
}
|
||||
|
||||
/**
 * Returns a new decision list in which `next` replaces any earlier decision
 * for the same candidate, ordered by candidate id via `localeCompare`.
 * The `existing` array is left untouched.
 */
function upsertDecision(
  existing: readonly KloRelationshipReviewDecisionEntry[],
  next: KloRelationshipReviewDecisionEntry,
): KloRelationshipReviewDecisionEntry[] {
  // filter() yields a fresh array, so appending and sorting in place is safe.
  const merged = existing.filter((entry) => entry.candidateId !== next.candidateId);
  merged.push(next);
  merged.sort((a, b) => a.candidateId.localeCompare(b.candidateId));
  return merged;
}
|
||||
|
||||
/**
 * Records a reviewer's verdict on a relationship candidate of a scan run.
 *
 * The decision is written to `relationship-review-decisions.json` beside the
 * run's relationships artifact. A previous decision for the same candidate is
 * replaced; decisions for other candidates are kept.
 *
 * @param project Project whose file store holds the scan artifacts.
 * @param input Run, candidate and verdict details; `decidedAt` defaults to now.
 * @returns The written path, the new entry and the full artifact, or `null`
 *   when no relationship artifacts exist for `input.runId`.
 * @throws Error when the run exists but has no candidate with `input.candidateId`.
 */
export async function writeLocalScanRelationshipReviewDecision(
  project: KloLocalProject,
  input: WriteLocalScanRelationshipReviewDecisionInput,
): Promise<WriteLocalScanRelationshipReviewDecisionResult | null> {
  const artifacts = await readLocalScanRelationshipArtifacts(project, input.runId);
  if (!artifacts) {
    return null;
  }

  const candidate = allCandidateEdges(artifacts).find((edge) => edge.id === input.candidateId);
  if (!candidate) {
    throw new Error(`Relationship candidate "${input.candidateId}" was not found in scan run "${input.runId}"`);
  }

  const decidedAt = input.decidedAt ?? new Date().toISOString();
  const path = reviewDecisionPath(artifacts.paths.relationships);

  // Identity of the scan as reported by its artifacts; shared by the fallback
  // header, the new entry and the rewritten artifact.
  const scanIdentity = {
    connectionId: artifacts.connectionId,
    runId: artifacts.runId,
    syncId: artifacts.syncId,
  };

  const existing = await readExistingDecisions(project, path, { ...scanIdentity, generatedAt: decidedAt });
  const decision = decisionEntry({
    ...scanIdentity,
    candidate,
    decision: input.decision,
    reviewer: input.reviewer,
    note: input.note,
    decidedAt,
  });
  const artifact: KloRelationshipReviewDecisionArtifact = {
    ...scanIdentity,
    generatedAt: decidedAt,
    decisions: upsertDecision(existing.decisions, decision),
  };

  const serialized = `${JSON.stringify(artifact, null, 2)}\n`;
  const commitMessage = `scan(live-database): record relationship review decision runId=${input.runId}`;
  await project.fileStore.writeFile(path, serialized, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, commitMessage);

  return { path, decision, artifact };
}
|
||||
108
packages/context/src/scan/relationship-scoring.test.ts
Normal file
108
packages/context/src/scan/relationship-scoring.test.ts
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
calibrateWeightsFromSyntheticFixtures,
|
||||
defaultKloRelationshipScoreWeights,
|
||||
normalizeKloRelationshipScoreWeights,
|
||||
scoreKloRelationshipCandidate,
|
||||
type KloRelationshipSignalVector,
|
||||
} from './relationship-scoring.js';
|
||||
|
||||
/**
 * Test helper: returns a fresh signal vector with neutral baseline values,
 * with any fields in `overrides` replacing the baseline.
 */
function signals(overrides: Partial<KloRelationshipSignalVector> = {}): KloRelationshipSignalVector {
  const baseline: KloRelationshipSignalVector = {
    nameSimilarity: 0.5,
    typeCompatibility: 1,
    valueOverlap: 0,
    embeddingSimilarity: 0,
    profileUniqueness: 0.5,
    profileNullRate: 0.5,
    structuralPrior: 0.5,
  };
  return Object.assign(baseline, overrides);
}
|
||||
|
||||
describe('relationship scoring', () => {
|
||||
it('scores stronger evidence higher without hard-gating on names', () => {
|
||||
const weakNameStrongProfile = scoreKloRelationshipCandidate(
|
||||
signals({
|
||||
nameSimilarity: 0.05,
|
||||
typeCompatibility: 1,
|
||||
valueOverlap: 0.7,
|
||||
profileUniqueness: 1,
|
||||
profileNullRate: 1,
|
||||
structuralPrior: 0.7,
|
||||
}),
|
||||
);
|
||||
const strongNameWeakProfile = scoreKloRelationshipCandidate(
|
||||
signals({
|
||||
nameSimilarity: 0.95,
|
||||
typeCompatibility: 1,
|
||||
valueOverlap: 0,
|
||||
profileUniqueness: 0.3,
|
||||
profileNullRate: 0.4,
|
||||
structuralPrior: 0.5,
|
||||
}),
|
||||
);
|
||||
|
||||
expect(weakNameStrongProfile.score).toBeGreaterThan(strongNameWeakProfile.score);
|
||||
expect(weakNameStrongProfile.contributions.profileUniqueness).toBeGreaterThan(0);
|
||||
expect(weakNameStrongProfile.contributions.nameSimilarity).toBeLessThan(0.02);
|
||||
});
|
||||
|
||||
it('normalizes partial and invalid weights into a usable vector', () => {
|
||||
const weights = normalizeKloRelationshipScoreWeights({
|
||||
nameSimilarity: 3,
|
||||
typeCompatibility: -1,
|
||||
valueOverlap: Number.POSITIVE_INFINITY,
|
||||
profileUniqueness: 1,
|
||||
});
|
||||
|
||||
const total = Object.values(weights).reduce((sum, value) => sum + value, 0);
|
||||
expect(total).toBeCloseTo(1, 6);
|
||||
expect(weights.nameSimilarity).toBeGreaterThan(weights.profileUniqueness);
|
||||
expect(weights.typeCompatibility).toBe(0);
|
||||
expect(weights.valueOverlap).toBe(0);
|
||||
});
|
||||
|
||||
it('returns deterministic defaults as a defensive copy', () => {
|
||||
const first = defaultKloRelationshipScoreWeights();
|
||||
const second = defaultKloRelationshipScoreWeights();
|
||||
|
||||
expect(first).toEqual(second);
|
||||
expect(first).not.toBe(second);
|
||||
expect(Object.values(first).reduce((sum, value) => sum + value, 0)).toBeCloseTo(1, 6);
|
||||
});
|
||||
|
||||
it('calibrates only from synthetic observations', () => {
|
||||
expect(() =>
|
||||
calibrateWeightsFromSyntheticFixtures([
|
||||
{
|
||||
fixtureId: 'chinook_with_declared_metadata',
|
||||
origin: 'public',
|
||||
expectedRelationship: true,
|
||||
signals: signals({ nameSimilarity: 1 }),
|
||||
},
|
||||
]),
|
||||
).toThrow(/synthetic/i);
|
||||
});
|
||||
|
||||
it('calibrates deterministic weights from positive and negative synthetic observations', () => {
|
||||
const weights = calibrateWeightsFromSyntheticFixtures([
|
||||
{
|
||||
fixtureId: 'synthetic_positive',
|
||||
origin: 'synthetic',
|
||||
expectedRelationship: true,
|
||||
signals: signals({ nameSimilarity: 0.8, valueOverlap: 0.9, profileUniqueness: 1, profileNullRate: 1 }),
|
||||
},
|
||||
{
|
||||
fixtureId: 'synthetic_negative',
|
||||
origin: 'synthetic',
|
||||
expectedRelationship: false,
|
||||
signals: signals({ nameSimilarity: 0.2, valueOverlap: 0.1, profileUniqueness: 0.4, profileNullRate: 0.5 }),
|
||||
},
|
||||
]);
|
||||
|
||||
expect(Object.values(weights).reduce((sum, value) => sum + value, 0)).toBeCloseTo(1, 6);
|
||||
expect(weights.valueOverlap).toBeGreaterThan(weights.structuralPrior);
|
||||
expect(weights.profileUniqueness).toBeGreaterThan(weights.embeddingSimilarity);
|
||||
});
|
||||
});
|
||||
155
packages/context/src/scan/relationship-scoring.ts
Normal file
155
packages/context/src/scan/relationship-scoring.ts
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
/**
 * Names of the signals that feed a relationship score, in the order they are
 * iterated when weights are normalised and contributions are summed.
 */
export const KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS = [
  'nameSimilarity',
  'typeCompatibility',
  'valueOverlap',
  'embeddingSimilarity',
  'profileUniqueness',
  'profileNullRate',
  'structuralPrior',
] as const;
|
||||
|
||||
/** Union of the signal names listed in `KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS`. */
export type KloRelationshipScoreSignal = (typeof KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS)[number];
|
||||
|
||||
/** Provenance of a calibration fixture; calibration accepts only `'synthetic'`. */
export type KloRelationshipFixtureOrigin = 'synthetic' | 'public' | 'customer';
|
||||
|
||||
/**
 * One numeric value per scoring signal for a relationship candidate.
 * Values are clamped to `[0, 1]` and rounded to three decimals before they
 * are scored; non-finite values are treated as `0`.
 */
export interface KloRelationshipSignalVector {
  nameSimilarity: number;
  /** A value of `0` or less forces the candidate's final score to `0`. */
  typeCompatibility: number;
  valueOverlap: number;
  embeddingSimilarity: number;
  profileUniqueness: number;
  profileNullRate: number;
  structuralPrior: number;
}
|
||||
|
||||
/** One number per scoring signal; used for both weights and per-signal contributions. */
export type KloRelationshipScoreWeights = Record<KloRelationshipScoreSignal, number>;
|
||||
|
||||
/** Result of scoring one candidate, with the inputs that produced it. */
export interface KloRelationshipScoreBreakdown {
  /** Final score in `[0, 1]`, rounded to three decimals. */
  score: number;
  /** The sanitised signal values that were scored. */
  signals: KloRelationshipSignalVector;
  /** The normalised weights that were applied. */
  weights: KloRelationshipScoreWeights;
  /** Per-signal product of signal and weight, rounded to six decimals. */
  contributions: KloRelationshipScoreWeights;
}
|
||||
|
||||
/** One labelled example used to calibrate score weights. */
export interface KloRelationshipScoringCalibrationObservation {
  /** Identifier of the fixture; quoted in the error raised for non-synthetic fixtures. */
  fixtureId: string;
  origin: KloRelationshipFixtureOrigin;
  /** Whether the candidate is a true relationship (positive example) or not. */
  expectedRelationship: boolean;
  signals: KloRelationshipSignalVector;
}
|
||||
|
||||
/**
 * Built-in signal weights. They sum to 1 and serve both as the default for
 * scoring and as a prior blended into calibrated weights.
 */
const DEFAULT_WEIGHTS: KloRelationshipScoreWeights = {
  nameSimilarity: 0.24,
  typeCompatibility: 0.1,
  valueOverlap: 0.22,
  embeddingSimilarity: 0.1,
  profileUniqueness: 0.22,
  profileNullRate: 0.08,
  structuralPrior: 0.04,
};
|
||||
|
||||
/**
 * Restricts `value` to the closed interval `[0, 1]`.
 * Non-finite input (`NaN`, `±Infinity`) maps to `0`.
 */
function clampScore(value: number): number {
  return Number.isFinite(value) ? Math.min(1, Math.max(0, value)) : 0;
}
|
||||
|
||||
/** Clamps `value` to `[0, 1]` and rounds it to three decimal places. */
function roundScore(value: number): number {
  const bounded = clampScore(value);
  return Number(bounded.toFixed(3));
}
|
||||
|
||||
/**
 * Returns a copy of `signals` in which every known signal is clamped to
 * `[0, 1]` and rounded to three decimals. Properties other than the seven
 * known signals are not carried over.
 */
function sanitizeSignalVector(signals: KloRelationshipSignalVector): KloRelationshipSignalVector {
  const {
    nameSimilarity,
    typeCompatibility,
    valueOverlap,
    embeddingSimilarity,
    profileUniqueness,
    profileNullRate,
    structuralPrior,
  } = signals;

  return {
    nameSimilarity: roundScore(nameSimilarity),
    typeCompatibility: roundScore(typeCompatibility),
    valueOverlap: roundScore(valueOverlap),
    embeddingSimilarity: roundScore(embeddingSimilarity),
    profileUniqueness: roundScore(profileUniqueness),
    profileNullRate: roundScore(profileNullRate),
    structuralPrior: roundScore(structuralPrior),
  };
}
|
||||
|
||||
/**
 * Returns the built-in signal weights as a fresh object, so callers may
 * modify the result without affecting later calls.
 */
export function defaultKloRelationshipScoreWeights(): KloRelationshipScoreWeights {
  return Object.assign({}, DEFAULT_WEIGHTS);
}
|
||||
|
||||
/**
 * Turns a possibly partial or invalid set of weights into a full weight
 * vector that sums to 1.
 *
 * Missing, negative and non-finite weights count as `0`. If nothing positive
 * remains, the built-in default weights are returned instead.
 *
 * @param weights Raw weights; defaults to the built-in weights.
 * @returns A new object with one weight per known signal.
 */
export function normalizeKloRelationshipScoreWeights(
  weights: Partial<KloRelationshipScoreWeights> = DEFAULT_WEIGHTS,
): KloRelationshipScoreWeights {
  const usable = {} as KloRelationshipScoreWeights;
  let total = 0;

  for (const key of KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS) {
    const raw = weights[key] ?? 0;
    const cleaned = Number.isFinite(raw) ? Math.max(0, raw) : 0;
    usable[key] = cleaned;
    total += cleaned;
  }

  if (total <= 0) {
    return defaultKloRelationshipScoreWeights();
  }

  for (const key of KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS) {
    usable[key] = usable[key] / total;
  }
  return usable;
}
|
||||
|
||||
/**
 * Scores a relationship candidate from its signal vector.
 *
 * Signals are sanitised and weights normalised, then each signal's
 * contribution (signal × weight, rounded to six decimals) is summed. The sum
 * is mapped onto the final score as `0.56 + sum × 0.65`, clamped to `[0, 1]`
 * and rounded to three decimals. A candidate whose sanitised
 * `typeCompatibility` is `0` always scores `0`.
 *
 * @param signals Raw signal values for the candidate.
 * @param weights Optional weights; defaults to the built-in weights.
 * @returns The score with the signals, weights and contributions behind it.
 */
export function scoreKloRelationshipCandidate(
  signals: KloRelationshipSignalVector,
  weights: Partial<KloRelationshipScoreWeights> = DEFAULT_WEIGHTS,
): KloRelationshipScoreBreakdown {
  const sanitizedSignals = sanitizeSignalVector(signals);
  const normalizedWeights = normalizeKloRelationshipScoreWeights(weights);

  const contributions = {} as KloRelationshipScoreWeights;
  let rawWeightedScore = 0;
  for (const key of KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS) {
    const contribution = Number((sanitizedSignals[key] * normalizedWeights[key]).toFixed(6));
    contributions[key] = contribution;
    rawWeightedScore += contribution;
  }

  // Incompatible column types veto the candidate regardless of other evidence.
  const typesIncompatible = sanitizedSignals.typeCompatibility <= 0;
  const scoredConfidence = typesIncompatible ? 0 : 0.56 + rawWeightedScore * 0.65;

  return {
    score: roundScore(scoredConfidence),
    signals: sanitizedSignals,
    weights: normalizedWeights,
    contributions,
  };
}
|
||||
|
||||
/**
 * Mean of one signal across `observations`, with each value clamped to
 * `[0, 1]` first. Returns `0` for an empty list.
 */
function averageSignal(
  observations: readonly KloRelationshipScoringCalibrationObservation[],
  key: KloRelationshipScoreSignal,
): number {
  const count = observations.length;
  if (count === 0) {
    return 0;
  }

  let sum = 0;
  for (const observation of observations) {
    sum += clampScore(observation.signals[key]);
  }
  return sum / count;
}
|
||||
|
||||
/**
 * Derives signal weights from labelled synthetic observations.
 *
 * For each signal, the weight is how much higher its average is among
 * positive examples than among negative ones (never below zero), plus a
 * quarter of the built-in default weight. The result is normalised to sum to 1.
 * Without at least one positive and one negative observation the built-in
 * defaults are returned.
 *
 * @param observations Labelled fixtures; every one must have origin `'synthetic'`.
 * @returns Normalised weights.
 * @throws Error naming the first observation whose origin is not `'synthetic'`.
 */
export function calibrateWeightsFromSyntheticFixtures(
  observations: readonly KloRelationshipScoringCalibrationObservation[],
): KloRelationshipScoreWeights {
  const positives: KloRelationshipScoringCalibrationObservation[] = [];
  const negatives: KloRelationshipScoringCalibrationObservation[] = [];

  for (const observation of observations) {
    if (observation.origin !== 'synthetic') {
      throw new Error(
        `Relationship score calibration accepts only synthetic fixtures; ${observation.fixtureId} is ${observation.origin}`,
      );
    }
    (observation.expectedRelationship ? positives : negatives).push(observation);
  }

  // Also covers an empty observation list: both groups are empty then.
  if (positives.length === 0 || negatives.length === 0) {
    return defaultKloRelationshipScoreWeights();
  }

  const calibrated = {} as KloRelationshipScoreWeights;
  for (const key of KLO_RELATIONSHIP_SCORE_SIGNAL_KEYS) {
    const separation = Math.max(0, averageSignal(positives, key) - averageSignal(negatives, key));
    calibrated[key] = separation + DEFAULT_WEIGHTS[key] * 0.25;
  }

  return normalizeKloRelationshipScoreWeights(calibrated);
}
|
||||
241
packages/context/src/scan/relationship-threshold-advice.test.ts
Normal file
241
packages/context/src/scan/relationship-threshold-advice.test.ts
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import {
|
||||
adviseLocalRelationshipFeedbackThresholds,
|
||||
buildKloRelationshipThresholdAdviceReport,
|
||||
formatKloRelationshipThresholdAdviceMarkdown,
|
||||
} from './relationship-threshold-advice.js';
|
||||
import type {
|
||||
ExportLocalRelationshipFeedbackLabelsResult,
|
||||
KloRelationshipFeedbackLabel,
|
||||
} from './relationship-feedback-export.js';
|
||||
|
||||
/**
 * Test helper: builds a feedback label from the three required fields plus
 * optional overrides.
 *
 * Unless overridden, `confidence` mirrors `score` (or `0` when the score is
 * nullish), `fkScore` mirrors `score`, and `pkScore` is `null`. Fields present
 * on `input` always win over the baseline values.
 */
function label(
  input: Partial<KloRelationshipFeedbackLabel> & Pick<KloRelationshipFeedbackLabel, 'candidateId' | 'decision' | 'score'>,
): KloRelationshipFeedbackLabel {
  const baseline: Omit<KloRelationshipFeedbackLabel, 'candidateId' | 'decision' | 'score'> = {
    schemaVersion: 1,
    previousStatus: 'review',
    connectionId: 'warehouse',
    runId: 'scan-run-a',
    syncId: 'sync-a',
    decidedAt: '2026-05-07T12:00:00.000Z',
    reviewer: 'Andrey',
    note: null,
    relationshipType: 'many_to_one',
    source: 'deterministic_name',
    confidence: input.score ?? 0,
    pkScore: input.pkScore ?? null,
    fkScore: input.fkScore ?? input.score,
    fromTable: 'public.orders',
    fromColumns: ['customer_id'],
    toTable: 'public.customers',
    toColumns: ['id'],
    reasons: [],
    artifactPath: 'raw-sources/warehouse/live-database/sync-a/enrichment/relationship-review-decisions.json',
  };

  // Caller-supplied fields are spread last so they override the baseline.
  return { ...baseline, ...input };
}
|
||||
|
||||
/**
 * Test helper: wraps `labels` in a feedback-export result with a summary
 * computed from the labels, no filters applied and no warnings.
 */
function feedback(labels: KloRelationshipFeedbackLabel[]): ExportLocalRelationshipFeedbackLabelsResult {
  let accepted = 0;
  let rejected = 0;
  const connectionIds = new Set<string>();
  const runKeys = new Set<string>();

  for (const item of labels) {
    if (item.decision === 'accepted') {
      accepted += 1;
    } else if (item.decision === 'rejected') {
      rejected += 1;
    }
    connectionIds.add(item.connectionId);
    // Runs are counted per connection, hence the composite key.
    runKeys.add(`${item.connectionId}:${item.runId}`);
  }

  return {
    generatedAt: '2026-05-07T13:00:00.000Z',
    filters: { connectionId: null, decision: 'all' },
    summary: {
      total: labels.length,
      accepted,
      rejected,
      connections: connectionIds.size,
      runs: runKeys.size,
    },
    labels,
    warnings: [],
  };
}
|
||||
|
||||
describe('relationship threshold advice', () => {
|
||||
it('selects the highest-quality threshold candidate when enough labels exist', () => {
|
||||
const report = buildKloRelationshipThresholdAdviceReport(
|
||||
feedback([
|
||||
label({
|
||||
candidateId: 'orders:orders.customer_id->customers:customers.id',
|
||||
decision: 'accepted',
|
||||
score: 0.91,
|
||||
pkScore: 0.97,
|
||||
fkScore: 0.91,
|
||||
}),
|
||||
label({
|
||||
candidateId: 'orders:orders.account_id->accounts:accounts.id',
|
||||
decision: 'accepted',
|
||||
score: 0.61,
|
||||
pkScore: 0.88,
|
||||
fkScore: 0.61,
|
||||
}),
|
||||
label({
|
||||
candidateId: 'orders:orders.note_id->notes:notes.id',
|
||||
decision: 'rejected',
|
||||
score: 0.21,
|
||||
pkScore: 0.4,
|
||||
fkScore: 0.21,
|
||||
}),
|
||||
label({
|
||||
candidateId: 'orders:orders.region_id->regions:regions.id',
|
||||
decision: 'rejected',
|
||||
score: 0.88,
|
||||
pkScore: 0.9,
|
||||
fkScore: 0.88,
|
||||
}),
|
||||
]),
|
||||
{
|
||||
acceptThresholds: [0.9, 0.85],
|
||||
reviewThresholds: [0.55],
|
||||
minTotalLabels: 4,
|
||||
minAcceptedLabels: 2,
|
||||
minRejectedLabels: 2,
|
||||
minAcceptedBandPrecision: 0.75,
|
||||
minAcceptedOrReviewRecall: 0.75,
|
||||
minRejectedBandPrecision: 0.75,
|
||||
},
|
||||
);
|
||||
|
||||
expect(report.status).toBe('ready');
|
||||
expect(report.summary).toMatchObject({
|
||||
totalLabels: 4,
|
||||
scoredLabels: 4,
|
||||
acceptedLabels: 2,
|
||||
rejectedLabels: 2,
|
||||
eligibleCandidates: 1,
|
||||
});
|
||||
expect(report.recommended).toMatchObject({
|
||||
acceptThreshold: 0.9,
|
||||
reviewThreshold: 0.55,
|
||||
eligible: true,
|
||||
acceptedBandPrecision: 1,
|
||||
acceptedRecall: 0.5,
|
||||
acceptedOrReviewRecall: 1,
|
||||
rejectedBandPrecision: 1,
|
||||
rejectedRecall: 1,
|
||||
falseAcceptedRejectedLabels: 0,
|
||||
falseRejectedAcceptedLabels: 0,
|
||||
});
|
||||
expect(report.candidates.map((candidate) => [candidate.acceptThreshold, candidate.reviewThreshold, candidate.eligible])).toEqual([
|
||||
[0.9, 0.55, true],
|
||||
[0.85, 0.55, false],
|
||||
]);
|
||||
});
|
||||
|
||||
it('reports insufficient labels without hiding evaluated candidates', () => {
|
||||
const report = buildKloRelationshipThresholdAdviceReport(
|
||||
feedback([
|
||||
label({ candidateId: 'orders:orders.customer_id->customers:customers.id', decision: 'accepted', score: 0.91 }),
|
||||
label({ candidateId: 'orders:orders.note_id->notes:notes.id', decision: 'rejected', score: 0.21 }),
|
||||
]),
|
||||
{
|
||||
acceptThresholds: [0.9],
|
||||
reviewThresholds: [0.55],
|
||||
minTotalLabels: 10,
|
||||
minAcceptedLabels: 5,
|
||||
minRejectedLabels: 5,
|
||||
},
|
||||
);
|
||||
|
||||
expect(report.status).toBe('insufficient_labels');
|
||||
expect(report.recommended).toBeNull();
|
||||
expect(report.summary).toMatchObject({
|
||||
totalLabels: 2,
|
||||
scoredLabels: 2,
|
||||
acceptedLabels: 1,
|
||||
rejectedLabels: 1,
|
||||
eligibleCandidates: 1,
|
||||
});
|
||||
expect(report.reasons).toEqual([
|
||||
'Need at least 10 scored labels; found 2.',
|
||||
'Need at least 5 accepted labels; found 1.',
|
||||
'Need at least 5 rejected labels; found 1.',
|
||||
]);
|
||||
expect(report.candidates).toHaveLength(1);
|
||||
});
|
||||
|
||||
it('reports no eligible thresholds when label counts pass but quality gates fail', () => {
|
||||
const report = buildKloRelationshipThresholdAdviceReport(
|
||||
feedback([
|
||||
label({ candidateId: 'a', decision: 'accepted', score: 0.92 }),
|
||||
label({ candidateId: 'b', decision: 'accepted', score: 0.58 }),
|
||||
label({ candidateId: 'c', decision: 'rejected', score: 0.91 }),
|
||||
label({ candidateId: 'd', decision: 'rejected', score: 0.2 }),
|
||||
]),
|
||||
{
|
||||
acceptThresholds: [0.9],
|
||||
reviewThresholds: [0.55],
|
||||
minTotalLabels: 4,
|
||||
minAcceptedLabels: 2,
|
||||
minRejectedLabels: 2,
|
||||
minAcceptedBandPrecision: 0.9,
|
||||
},
|
||||
);
|
||||
|
||||
expect(report.status).toBe('no_eligible_thresholds');
|
||||
expect(report.recommended).toBeNull();
|
||||
expect(report.reasons).toEqual(['No threshold candidate met the precision and recall gates.']);
|
||||
expect(report.candidates[0]).toMatchObject({
|
||||
acceptThreshold: 0.9,
|
||||
reviewThreshold: 0.55,
|
||||
eligible: false,
|
||||
acceptedBandPrecision: 0.5,
|
||||
});
|
||||
});
|
||||
|
||||
it('wraps the feedback exporter and preserves warnings', async () => {
|
||||
const project = { projectDir: '/tmp/klo-project' } as KloLocalProject;
|
||||
const exportLocalRelationshipFeedbackLabels = vi.fn(async () => ({
|
||||
...feedback([]),
|
||||
warnings: [
|
||||
{
|
||||
path: 'raw-sources/broken/live-database/sync/enrichment/relationship-review-decisions.json',
|
||||
message: 'Unexpected token',
|
||||
},
|
||||
],
|
||||
}));
|
||||
|
||||
const report = await adviseLocalRelationshipFeedbackThresholds(project, {
|
||||
connectionId: 'warehouse',
|
||||
exportLocalRelationshipFeedbackLabels,
|
||||
minTotalLabels: 1,
|
||||
});
|
||||
|
||||
expect(exportLocalRelationshipFeedbackLabels).toHaveBeenCalledWith(project, {
|
||||
connectionId: 'warehouse',
|
||||
decision: 'all',
|
||||
});
|
||||
expect(report.warnings).toEqual([
|
||||
{
|
||||
path: 'raw-sources/broken/live-database/sync/enrichment/relationship-review-decisions.json',
|
||||
message: 'Unexpected token',
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('formats a stable human-readable report', () => {
|
||||
const report = buildKloRelationshipThresholdAdviceReport(
|
||||
feedback([
|
||||
label({ candidateId: 'orders:orders.customer_id->customers:customers.id', decision: 'accepted', score: 0.91 }),
|
||||
label({ candidateId: 'orders:orders.account_id->accounts:accounts.id', decision: 'accepted', score: 0.61 }),
|
||||
label({ candidateId: 'orders:orders.note_id->notes:notes.id', decision: 'rejected', score: 0.21 }),
|
||||
label({ candidateId: 'orders:orders.region_id->regions:regions.id', decision: 'rejected', score: 0.88 }),
|
||||
]),
|
||||
{
|
||||
acceptThresholds: [0.9],
|
||||
reviewThresholds: [0.55],
|
||||
minTotalLabels: 4,
|
||||
minAcceptedLabels: 2,
|
||||
minRejectedLabels: 2,
|
||||
minAcceptedBandPrecision: 0.75,
|
||||
},
|
||||
);
|
||||
|
||||
expect(formatKloRelationshipThresholdAdviceMarkdown(report)).toContain('KLO relationship threshold advice');
|
||||
expect(formatKloRelationshipThresholdAdviceMarkdown(report)).toContain('Status: ready');
|
||||
expect(formatKloRelationshipThresholdAdviceMarkdown(report)).toContain('Recommended: accept=0.90 review=0.55');
|
||||
expect(formatKloRelationshipThresholdAdviceMarkdown(report)).toContain('acceptedPrecision=1.000');
|
||||
});
|
||||
});
|
||||
335
packages/context/src/scan/relationship-threshold-advice.ts
Normal file
335
packages/context/src/scan/relationship-threshold-advice.ts
Normal file
|
|
@ -0,0 +1,335 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import {
|
||||
exportLocalRelationshipFeedbackLabels,
|
||||
type ExportLocalRelationshipFeedbackLabelsInput,
|
||||
type ExportLocalRelationshipFeedbackLabelsResult,
|
||||
type KloRelationshipFeedbackExportWarning,
|
||||
type KloRelationshipFeedbackLabel,
|
||||
} from './relationship-feedback-export.js';
|
||||
import type { KloResolvedRelationshipStatus } from './relationship-graph-resolver.js';
|
||||
|
||||
/** Accept-threshold grid used when the caller does not supply `acceptThresholds`. */
const DEFAULT_ACCEPT_THRESHOLDS = [0.95, 0.9, 0.85, 0.8, 0.75] as const;

/** Review-threshold grid used when the caller does not supply `reviewThresholds`. */
const DEFAULT_REVIEW_THRESHOLDS = [0.65, 0.6, 0.55, 0.5, 0.45] as const;
|
||||
|
||||
/** Band a score is sorted into; an alias of the graph resolver's relationship status. */
type AdvicePredictedStatus = KloResolvedRelationshipStatus;

/**
 * Overall outcome of a threshold-advice report:
 * - `insufficient_labels`: at least one label-count gate was not met;
 * - `ready`: the label gates passed and at least one threshold pair is eligible;
 * - `no_eligible_thresholds`: the label gates passed but no threshold pair is eligible.
 */
export type KloRelationshipThresholdAdviceStatus =
  | 'ready'
  | 'insufficient_labels'
  | 'no_eligible_thresholds';
|
||||
|
||||
/**
 * Tuning options for building a threshold-advice report. Every field is optional;
 * the defaults noted below are the ones applied by `resolveInput`.
 */
export interface BuildKloRelationshipThresholdAdviceReportInput {
  /** Accept thresholds to evaluate. Defaults to `DEFAULT_ACCEPT_THRESHOLDS`. */
  acceptThresholds?: ReadonlyArray<number>;
  /** Review thresholds to evaluate. Defaults to `DEFAULT_REVIEW_THRESHOLDS`. */
  reviewThresholds?: ReadonlyArray<number>;
  /** Minimum number of scored labels required before advice is given. Defaults to 20. */
  minTotalLabels?: number;
  /** Minimum number of scored labels with decision `accepted`. Defaults to 5. */
  minAcceptedLabels?: number;
  /** Minimum number of scored labels with decision `rejected`. Defaults to 5. */
  minRejectedLabels?: number;
  /** Minimum precision of the accept band for a threshold pair to be eligible. Defaults to 0.9. */
  minAcceptedBandPrecision?: number;
  /** Minimum share of accepted labels that must land in the accept or review band. Defaults to 0.8. */
  minAcceptedOrReviewRecall?: number;
  /** Minimum precision of the reject band for a threshold pair to be eligible. Defaults to 0.8. */
  minRejectedBandPrecision?: number;
}
|
||||
|
||||
/**
 * Input for `adviseLocalRelationshipFeedbackThresholds`: the feedback-export filters
 * (minus `decision`, which that function always sets to `'all'`) combined with the
 * report tuning options.
 */
export interface AdviseLocalRelationshipFeedbackThresholdsInput
  extends Omit<ExportLocalRelationshipFeedbackLabelsInput, 'decision'>,
    BuildKloRelationshipThresholdAdviceReportInput {
  /** Replacement for the label exporter (used by tests); defaults to the real exporter. */
  exportLocalRelationshipFeedbackLabels?: typeof exportLocalRelationshipFeedbackLabels;
}
|
||||
|
||||
/**
 * Metrics for one (accept, review) threshold pair, computed over scored labels only.
 * Ratio fields are rounded to three decimals and are `null` when their denominator is zero.
 */
export interface KloRelationshipThresholdAdviceCandidate {
  /** Scores at or above this value are predicted `accepted`. */
  acceptThreshold: number;
  /** Scores at or above this value (and below `acceptThreshold`) are predicted `review`; lower scores are `rejected`. */
  reviewThreshold: number;
  /** True when both the accept and reject bands are non-empty and all three metric gates are met. */
  eligible: boolean;
  /** Number of scored labels predicted `accepted`. */
  predictedAccepted: number;
  /** Number of scored labels predicted `review`. */
  predictedReview: number;
  /** Number of scored labels predicted `rejected`. */
  predictedRejected: number;
  /** Share of accept-band labels whose decision is `accepted`. */
  acceptedBandPrecision: number | null;
  /** Share of `accepted` labels that land in the accept band. */
  acceptedRecall: number | null;
  /** Share of `accepted` labels that are not predicted `rejected`. */
  acceptedOrReviewRecall: number | null;
  /** Share of reject-band labels whose decision is `rejected`. */
  rejectedBandPrecision: number | null;
  /** Share of `rejected` labels that are not predicted `accepted` (review-band labels count). */
  rejectedRecall: number | null;
  /** Count of `rejected` labels that fall in the accept band. */
  falseAcceptedRejectedLabels: number;
  /** Count of `accepted` labels that fall in the reject band. */
  falseRejectedAcceptedLabels: number;
}
|
||||
|
||||
/** Result of evaluating every candidate threshold pair against exported feedback labels. */
export interface KloRelationshipThresholdAdviceReport {
  /** Copied from the feedback export this report was built from. */
  generatedAt: string;
  /** Filters of the feedback export this report was built from. */
  filters: ExportLocalRelationshipFeedbackLabelsResult['filters'];
  /** Overall outcome; see `KloRelationshipThresholdAdviceStatus`. */
  status: KloRelationshipThresholdAdviceStatus;
  /** The gate values actually applied, after defaults were filled in. */
  gates: {
    minTotalLabels: number;
    minAcceptedLabels: number;
    minRejectedLabels: number;
    minAcceptedBandPrecision: number;
    minAcceptedOrReviewRecall: number;
    minRejectedBandPrecision: number;
  };
  /** Label and candidate counts. */
  summary: {
    /** All labels in the export, scored or not. */
    totalLabels: number;
    /** Labels whose score is not `null`. */
    scoredLabels: number;
    /** `totalLabels - scoredLabels`. */
    unscoredLabels: number;
    /** Scored labels with decision `accepted`. */
    acceptedLabels: number;
    /** Scored labels with decision `rejected`. */
    rejectedLabels: number;
    /** Number of threshold pairs evaluated. */
    evaluatedCandidates: number;
    /** Number of evaluated pairs that are eligible. */
    eligibleCandidates: number;
  };
  /** Best-ranked eligible pair when `status` is `ready`; otherwise `null`. */
  recommended: KloRelationshipThresholdAdviceCandidate | null;
  /** Every evaluated pair, best-ranked first. */
  candidates: KloRelationshipThresholdAdviceCandidate[];
  /** Human-readable explanations when `status` is not `ready`; empty otherwise. */
  reasons: string[];
  /** Warnings carried over from the feedback export. */
  warnings: KloRelationshipFeedbackExportWarning[];
}
|
||||
|
||||
/**
 * Advice options after defaults have been applied: every gate is a concrete number
 * and both threshold grids are owned, mutable arrays.
 */
interface ResolvedAdviceInput
  extends Required<Omit<BuildKloRelationshipThresholdAdviceReportInput, 'acceptThresholds' | 'reviewThresholds'>> {
  acceptThresholds: number[];
  reviewThresholds: number[];
}
|
||||
|
||||
/**
 * Fills in the default for every advice option and normalizes both threshold grids.
 *
 * Each grid is copied (so neither caller-owned arrays nor the readonly defaults are
 * mutated), deduplicated (a repeated threshold would otherwise produce duplicate
 * candidates and inflate the evaluated/eligible counts) and sorted from highest to lowest.
 *
 * @param input Caller-supplied options; any omitted field falls back to its default.
 * @returns The fully populated options.
 */
function resolveInput(input: BuildKloRelationshipThresholdAdviceReportInput): ResolvedAdviceInput {
  const normalizeThresholds = (values: readonly number[]): number[] =>
    [...new Set(values)].sort((left, right) => right - left);

  return {
    acceptThresholds: normalizeThresholds(input.acceptThresholds ?? DEFAULT_ACCEPT_THRESHOLDS),
    reviewThresholds: normalizeThresholds(input.reviewThresholds ?? DEFAULT_REVIEW_THRESHOLDS),
    minTotalLabels: input.minTotalLabels ?? 20,
    minAcceptedLabels: input.minAcceptedLabels ?? 5,
    minRejectedLabels: input.minRejectedLabels ?? 5,
    minAcceptedBandPrecision: input.minAcceptedBandPrecision ?? 0.9,
    minAcceptedOrReviewRecall: input.minAcceptedOrReviewRecall ?? 0.8,
    minRejectedBandPrecision: input.minRejectedBandPrecision ?? 0.8,
  };
}
|
||||
|
||||
/** Rounds a metric to three decimal places. */
function roundMetric(value: number): number {
  const scale = 1000;
  const scaled = Math.round(value * scale);
  return scaled / scale;
}
|
||||
|
||||
/**
 * Divides `numerator` by `denominator` and rounds the result with `roundMetric`.
 *
 * @returns The rounded quotient, or `null` when `denominator` is zero.
 */
function ratio(numerator: number, denominator: number): number | null {
  if (denominator === 0) {
    return null;
  }
  return roundMetric(numerator / denominator);
}
|
||||
|
||||
/**
 * Sorts a score into a band: `accepted` at or above `acceptThreshold`, otherwise
 * `review` at or above `reviewThreshold`, otherwise `rejected`.
 */
function prediction(score: number, acceptThreshold: number, reviewThreshold: number): AdvicePredictedStatus {
  // The accept check comes first, so it wins even if reviewThreshold is the larger value.
  return score >= acceptThreshold ? 'accepted' : score >= reviewThreshold ? 'review' : 'rejected';
}
|
||||
|
||||
/** Returns true when the metric is available (not `null`) and is at least `minimum`. */
function isMetricAtLeast(value: number | null, minimum: number): boolean {
  if (value === null) {
    return false;
  }
  return value >= minimum;
}
|
||||
|
||||
/**
 * Evaluates one (accept, review) threshold pair against the scored labels.
 *
 * Labels whose score is `null` are ignored. Each remaining label is sorted into a band
 * with `prediction`, and the counts per band and per human decision are turned into the
 * precision/recall metrics of the returned candidate.
 *
 * @param labels Feedback labels, scored or not.
 * @param acceptThreshold Lower bound of the accept band.
 * @param reviewThreshold Lower bound of the review band.
 * @param gates Resolved options; only the three metric gates are read here.
 * @returns The metrics for this threshold pair, including whether it is eligible.
 */
function thresholdCandidate(
  labels: readonly KloRelationshipFeedbackLabel[],
  acceptThreshold: number,
  reviewThreshold: number,
  gates: ResolvedAdviceInput,
): KloRelationshipThresholdAdviceCandidate {
  const scored = labels.filter(
    (label): label is KloRelationshipFeedbackLabel & { score: number } => label.score !== null,
  );

  // Totals by human decision.
  let acceptedTotal = 0;
  let rejectedTotal = 0;
  // Totals by predicted band.
  let acceptBandSize = 0;
  let reviewBandSize = 0;
  let rejectBandSize = 0;
  // Band/decision intersections needed by the metrics.
  let acceptBandAccepted = 0;
  let acceptBandRejected = 0;
  let rejectBandAccepted = 0;
  let rejectBandRejected = 0;

  for (const label of scored) {
    const humanAccepted = label.decision === 'accepted';
    const humanRejected = label.decision === 'rejected';
    acceptedTotal += humanAccepted ? 1 : 0;
    rejectedTotal += humanRejected ? 1 : 0;

    const band = prediction(label.score, acceptThreshold, reviewThreshold);
    if (band === 'accepted') {
      acceptBandSize += 1;
      acceptBandAccepted += humanAccepted ? 1 : 0;
      acceptBandRejected += humanRejected ? 1 : 0;
    } else if (band === 'review') {
      reviewBandSize += 1;
    } else if (band === 'rejected') {
      rejectBandSize += 1;
      rejectBandAccepted += humanAccepted ? 1 : 0;
      rejectBandRejected += humanRejected ? 1 : 0;
    }
  }

  const acceptedBandPrecision = ratio(acceptBandAccepted, acceptBandSize);
  // Accepted labels that were not pushed into the reject band.
  const acceptedOrReviewRecall = ratio(acceptedTotal - rejectBandAccepted, acceptedTotal);
  const rejectedBandPrecision = ratio(rejectBandRejected, rejectBandSize);

  const bothOuterBandsPopulated = acceptBandSize > 0 && rejectBandSize > 0;
  const gatesMet =
    isMetricAtLeast(acceptedBandPrecision, gates.minAcceptedBandPrecision) &&
    isMetricAtLeast(acceptedOrReviewRecall, gates.minAcceptedOrReviewRecall) &&
    isMetricAtLeast(rejectedBandPrecision, gates.minRejectedBandPrecision);

  return {
    acceptThreshold,
    reviewThreshold,
    eligible: bothOuterBandsPopulated && gatesMet,
    predictedAccepted: acceptBandSize,
    predictedReview: reviewBandSize,
    predictedRejected: rejectBandSize,
    acceptedBandPrecision,
    acceptedRecall: ratio(acceptBandAccepted, acceptedTotal),
    acceptedOrReviewRecall,
    rejectedBandPrecision,
    // Rejected labels that were not pushed into the accept band (review counts as a hit).
    rejectedRecall: ratio(rejectedTotal - acceptBandRejected, rejectedTotal),
    falseAcceptedRejectedLabels: acceptBandRejected,
    falseRejectedAcceptedLabels: rejectBandAccepted,
  };
}
|
||||
|
||||
/** Maps a metric to a sortable number; an unavailable metric becomes -1. */
function metricRank(value: number | null): number {
  return value == null ? -1 : value;
}
|
||||
|
||||
/**
 * Returns a sorted copy of the candidates, best first. Ordering, each key descending:
 * eligibility, accept-band precision, accepted-or-review recall, reject-band precision,
 * accept threshold, review threshold. Unavailable metrics rank as -1 via `metricRank`.
 */
function sortCandidates(
  candidates: readonly KloRelationshipThresholdAdviceCandidate[],
): KloRelationshipThresholdAdviceCandidate[] {
  const sortKeys: ReadonlyArray<(candidate: KloRelationshipThresholdAdviceCandidate) => number> = [
    (candidate) => Number(candidate.eligible),
    (candidate) => metricRank(candidate.acceptedBandPrecision),
    (candidate) => metricRank(candidate.acceptedOrReviewRecall),
    (candidate) => metricRank(candidate.rejectedBandPrecision),
    (candidate) => candidate.acceptThreshold,
    (candidate) => candidate.reviewThreshold,
  ];

  const ranked = [...candidates];
  ranked.sort((left, right) => {
    for (const keyOf of sortKeys) {
      const difference = keyOf(right) - keyOf(left);
      // Zero or NaN means this key cannot separate the pair; fall through to the next key.
      if (difference !== 0 && !Number.isNaN(difference)) {
        return difference;
      }
    }
    return 0;
  });
  return ranked;
}
|
||||
|
||||
/**
 * Checks the three label-count gates against the scored labels.
 *
 * @returns One message per unmet gate, in the order total, accepted, rejected;
 *          an empty array when every gate is met.
 */
function labelGateReasons(labels: readonly KloRelationshipFeedbackLabel[], gates: ResolvedAdviceInput): string[] {
  const scored = labels.filter((label) => label.score !== null);
  const acceptedCount = scored.filter((label) => label.decision === 'accepted').length;
  const rejectedCount = scored.filter((label) => label.decision === 'rejected').length;

  const gateChecks: Array<[unmet: boolean, message: string]> = [
    [
      scored.length < gates.minTotalLabels,
      `Need at least ${gates.minTotalLabels} scored labels; found ${scored.length}.`,
    ],
    [
      acceptedCount < gates.minAcceptedLabels,
      `Need at least ${gates.minAcceptedLabels} accepted labels; found ${acceptedCount}.`,
    ],
    [
      rejectedCount < gates.minRejectedLabels,
      `Need at least ${gates.minRejectedLabels} rejected labels; found ${rejectedCount}.`,
    ],
  ];

  return gateChecks.filter(([unmet]) => unmet).map(([, message]) => message);
}
|
||||
|
||||
/**
 * Builds a threshold-advice report from exported feedback labels.
 *
 * Every accept threshold is paired with every strictly lower review threshold, each pair
 * is evaluated with `thresholdCandidate`, and the results are ranked with `sortCandidates`.
 * A recommendation is made only when the label-count gates pass and at least one pair is
 * eligible.
 *
 * @param feedback Result of the relationship feedback label export.
 * @param input Optional tuning; omitted fields use the defaults from `resolveInput`.
 * @returns The report, including every evaluated pair and the reasons for a non-ready status.
 */
export function buildKloRelationshipThresholdAdviceReport(
  feedback: ExportLocalRelationshipFeedbackLabelsResult,
  input: BuildKloRelationshipThresholdAdviceReportInput = {},
): KloRelationshipThresholdAdviceReport {
  const gates = resolveInput(input);
  const { labels } = feedback;

  const scored = labels.filter((label) => label.score !== null);
  const acceptedCount = scored.filter((label) => label.decision === 'accepted').length;
  const rejectedCount = scored.filter((label) => label.decision === 'rejected').length;

  // Accept thresholds form the outer loop so the pre-sort order (which decides ties) is stable.
  const evaluated: KloRelationshipThresholdAdviceCandidate[] = [];
  for (const acceptThreshold of gates.acceptThresholds) {
    for (const reviewThreshold of gates.reviewThresholds) {
      if (acceptThreshold > reviewThreshold) {
        evaluated.push(thresholdCandidate(labels, acceptThreshold, reviewThreshold, gates));
      }
    }
  }
  const candidates = sortCandidates(evaluated);
  const eligibleCandidates = candidates.filter((candidate) => candidate.eligible);
  const labelReasons = labelGateReasons(labels, gates);

  let status: KloRelationshipThresholdAdviceStatus;
  let reasons: string[];
  let recommended: KloRelationshipThresholdAdviceCandidate | null = null;
  if (labelReasons.length > 0) {
    // Too few labels takes precedence over any candidate result.
    status = 'insufficient_labels';
    reasons = labelReasons;
  } else if (eligibleCandidates.length > 0) {
    status = 'ready';
    reasons = [];
    recommended = eligibleCandidates[0] ?? null;
  } else {
    status = 'no_eligible_thresholds';
    reasons = ['No threshold candidate met the precision and recall gates.'];
  }

  return {
    generatedAt: feedback.generatedAt,
    filters: feedback.filters,
    status,
    gates: {
      minTotalLabels: gates.minTotalLabels,
      minAcceptedLabels: gates.minAcceptedLabels,
      minRejectedLabels: gates.minRejectedLabels,
      minAcceptedBandPrecision: gates.minAcceptedBandPrecision,
      minAcceptedOrReviewRecall: gates.minAcceptedOrReviewRecall,
      minRejectedBandPrecision: gates.minRejectedBandPrecision,
    },
    summary: {
      totalLabels: labels.length,
      scoredLabels: scored.length,
      unscoredLabels: labels.length - scored.length,
      acceptedLabels: acceptedCount,
      rejectedLabels: rejectedCount,
      evaluatedCandidates: candidates.length,
      eligibleCandidates: eligibleCandidates.length,
    },
    recommended,
    candidates,
    reasons,
    warnings: [...feedback.warnings],
  };
}
|
||||
|
||||
/**
 * Exports the project's relationship feedback labels and builds a threshold-advice
 * report from them.
 *
 * The exporter is called with the requested `connectionId` and `decision: 'all'`.
 *
 * @param project Local project to read feedback from.
 * @param input Export filter, report tuning and an optional exporter override.
 * @returns The threshold-advice report for the exported labels.
 */
export async function adviseLocalRelationshipFeedbackThresholds(
  project: KloLocalProject,
  input: AdviseLocalRelationshipFeedbackThresholdsInput = {},
): Promise<KloRelationshipThresholdAdviceReport> {
  const exportLabels = input.exportLocalRelationshipFeedbackLabels ?? exportLocalRelationshipFeedbackLabels;
  const exportFilter = { connectionId: input.connectionId, decision: 'all' as const };
  const feedback = await exportLabels(project, exportFilter);
  return buildKloRelationshipThresholdAdviceReport(feedback, input);
}
|
||||
|
||||
/** Renders a metric with three decimals, or `n/a` when it is unavailable. */
function formatMetric(value: number | null): string {
  if (value === null) {
    return 'n/a';
  }
  return value.toFixed(3);
}
|
||||
|
||||
/** Renders one candidate as a single line of space-separated `key=value` pairs. */
function candidateLine(candidate: KloRelationshipThresholdAdviceCandidate): string {
  const {
    acceptThreshold,
    reviewThreshold,
    eligible,
    acceptedBandPrecision,
    acceptedRecall,
    acceptedOrReviewRecall,
    rejectedBandPrecision,
    rejectedRecall,
    falseAcceptedRejectedLabels,
    falseRejectedAcceptedLabels,
  } = candidate;

  const fields = [
    `accept=${acceptThreshold.toFixed(2)}`,
    `review=${reviewThreshold.toFixed(2)}`,
    `eligible=${eligible ? 'yes' : 'no'}`,
    `acceptedPrecision=${formatMetric(acceptedBandPrecision)}`,
    `acceptedRecall=${formatMetric(acceptedRecall)}`,
    `acceptedOrReviewRecall=${formatMetric(acceptedOrReviewRecall)}`,
    `rejectedPrecision=${formatMetric(rejectedBandPrecision)}`,
    `rejectedRecall=${formatMetric(rejectedRecall)}`,
    `falseAcceptedRejected=${falseAcceptedRejectedLabels}`,
    `falseRejectedAccepted=${falseRejectedAcceptedLabels}`,
  ];
  return fields.join(' ');
}
|
||||
|
||||
/**
 * Renders a report as newline-terminated text: a fixed header, then optional
 * "Reasons", "Top candidates" and "Warnings" sections. Candidates and warnings are
 * each limited to their first five entries.
 */
export function formatKloRelationshipThresholdAdviceMarkdown(report: KloRelationshipThresholdAdviceReport): string {
  const { summary, gates, recommended } = report;
  const recommendedText = recommended
    ? `accept=${recommended.acceptThreshold.toFixed(2)} review=${recommended.reviewThreshold.toFixed(2)}`
    : 'none';

  const lines = [
    'KLO relationship threshold advice',
    `Generated: ${report.generatedAt}`,
    `Filter connection: ${report.filters.connectionId ?? 'all'}`,
    `Status: ${report.status}`,
    `Labels: total=${summary.totalLabels} scored=${summary.scoredLabels} accepted=${summary.acceptedLabels} rejected=${summary.rejectedLabels}`,
    `Gates: minTotal=${gates.minTotalLabels} minAccepted=${gates.minAcceptedLabels} minRejected=${gates.minRejectedLabels} acceptedPrecision=${gates.minAcceptedBandPrecision.toFixed(3)} acceptedOrReviewRecall=${gates.minAcceptedOrReviewRecall.toFixed(3)} rejectedPrecision=${gates.minRejectedBandPrecision.toFixed(3)}`,
    `Evaluated candidates: ${summary.evaluatedCandidates}`,
    `Eligible candidates: ${summary.eligibleCandidates}`,
    `Recommended: ${recommendedText}`,
  ];

  // A section is a blank line, a title, then its entries; empty sections are omitted.
  const appendSection = (title: string, entries: readonly string[]): void => {
    if (entries.length > 0) {
      lines.push('', title, ...entries);
    }
  };

  appendSection(
    'Reasons',
    report.reasons.map((reason) => ` - ${reason}`),
  );
  appendSection(
    'Top candidates',
    report.candidates.slice(0, 5).map((candidate) => ` - ${candidateLine(candidate)}`),
  );
  appendSection(
    'Warnings',
    report.warnings.slice(0, 5).map((warning) => ` - ${warning.path}: ${warning.message}`),
  );

  return `${lines.join('\n')}\n`;
}
|
||||
492
packages/context/src/scan/relationship-validation.test.ts
Normal file
492
packages/context/src/scan/relationship-validation.test.ts
Normal file
|
|
@ -0,0 +1,492 @@
|
|||
import Database from 'better-sqlite3';
|
||||
import { afterEach, describe, expect, it } from 'vitest';
|
||||
import type { KloEnrichedColumn, KloEnrichedSchema, KloEnrichedTable } from './enrichment-types.js';
|
||||
import { generateKloRelationshipDiscoveryCandidates } from './relationship-candidates.js';
|
||||
import type { KloRelationshipProfileArtifact } from './relationship-profiling.js';
|
||||
import { profileKloRelationshipSchema } from './relationship-profiling.js';
|
||||
import { validateKloRelationshipDiscoveryCandidates } from './relationship-validation.js';
|
||||
import type { KloQueryResult, KloReadOnlyQueryInput, KloScanContext } from './types.js';
|
||||
|
||||
/** Test executor that runs the given SQL against an in-memory SQLite database and counts the calls. */
class InMemorySqliteExecutor {
  readonly db = new Database(':memory:');
  /** Number of `executeReadOnly` calls so far; tests reset it between phases. */
  queryCount = 0;

  /**
   * Runs `input.sql` and returns the rows in positional form.
   * Headers come from the first row's keys, so an empty result has no headers.
   */
  executeReadOnly(input: KloReadOnlyQueryInput, _ctx: KloScanContext): Promise<KloQueryResult> {
    this.queryCount += 1;
    const statement = this.db.prepare(input.sql);
    const records = statement.all() as Record<string, unknown>[];
    const headers = Object.keys(records[0] ?? {});
    const positionalRows = records.map((record) => headers.map((header) => record[header]));
    return Promise.resolve({
      headers,
      rows: positionalRows,
      totalRows: records.length,
      rowCount: records.length,
    });
  }

  /** Closes the underlying database. */
  close(): void {
    this.db.close();
  }
}
|
||||
|
||||
/**
 * Builds an enriched column fixture: a nullable, non-key INTEGER column by default.
 * Anything in `overrides` replaces the corresponding default, including `id` and `tableRef`.
 */
function column(tableId: string, name: string, overrides: Partial<KloEnrichedColumn> = {}): KloEnrichedColumn {
  const defaults: KloEnrichedColumn = {
    id: `${tableId}.${name}`,
    tableId,
    tableRef: { catalog: null, db: null, name: tableId },
    name,
    nativeType: 'INTEGER',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: true,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
  };
  // Overrides are spread last, so any key they contain wins over the default.
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Builds an enabled enriched table fixture. Each column is copied and re-pointed at this
 * table, so the `tableId`/`tableRef` the columns were created with are replaced.
 */
function table(name: string, columns: KloEnrichedColumn[]): KloEnrichedTable {
  const ref = { catalog: null, db: null, name };
  const ownedColumns: KloEnrichedColumn[] = [];
  for (const item of columns) {
    ownedColumns.push({ ...item, tableId: name, tableRef: ref });
  }
  return {
    id: name,
    ref,
    enabled: true,
    descriptions: {},
    columns: ownedColumns,
  };
}
|
||||
|
||||
/**
 * Builds an enriched schema fixture for connection `warehouse` with no declared relationships.
 * Without `tables`, it contains `accounts`, `users` and `invoices`, where `users` and
 * `invoices` each have a non-nullable `account_id` column.
 */
function schema(tables?: KloEnrichedTable[]): KloEnrichedSchema {
  const requiredColumn = (tableId: string, name: string): KloEnrichedColumn =>
    column(tableId, name, { nullable: false });

  const defaultTables = (): KloEnrichedTable[] => [
    table('accounts', [
      requiredColumn('accounts', 'id'),
      column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
    ]),
    table('users', [requiredColumn('users', 'id'), requiredColumn('users', 'account_id')]),
    table('invoices', [requiredColumn('invoices', 'id'), requiredColumn('invoices', 'account_id')]),
  ];

  return {
    connectionId: 'warehouse',
    tables: tables ?? defaultTables(),
    relationships: [],
  };
}
|
||||
|
||||
describe('relationship validation', () => {
|
||||
// Database handle of the running test; tests assign it so the hook below can release it.
let executor: InMemorySqliteExecutor | null = null;

afterEach(() => {
  if (executor !== null) {
    executor.close();
  }
  executor = null;
});
|
||||
|
||||
it('accepts a relationship-discovery candidate with unique parent values and full source coverage', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite; // registered so afterEach closes the database
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, name TEXT);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    CREATE TABLE invoices (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
    INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
    INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 999);
  `);

  const connection = { connectionId: 'warehouse', driver: 'sqlite' } as const;
  const testSchema = schema();
  const profiles = await profileKloRelationshipSchema({
    ...connection,
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validate-test' },
  });

  // Only the users -> accounts candidate is validated; every users.account_id has a parent row.
  const userCandidates = generateKloRelationshipDiscoveryCandidates(testSchema).filter(
    (candidate) => candidate.from.table.name === 'users',
  );
  const validated = await validateKloRelationshipDiscoveryCandidates({
    ...connection,
    candidates: userCandidates,
    profiles,
    executor: sqlite,
    ctx: { runId: 'validate-test' },
  });

  expect(validated).toHaveLength(1);
  const [first] = validated;
  expect(first).toMatchObject({
    from: { table: { name: 'users' }, columns: ['account_id'] },
    to: { table: { name: 'accounts' }, columns: ['id'] },
    status: 'accepted',
    score: expect.any(Number),
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationCount: 0,
      violationRatio: 0,
      reasons: expect.arrayContaining(['validation_passed']),
    },
  });
  expect(first?.score).toBeGreaterThanOrEqual(0.85);
});
|
||||
|
||||
it('rejects a candidate with missing parent values and records the deterministic reason', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite; // registered so afterEach closes the database
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, name TEXT);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    CREATE TABLE invoices (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
    INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
    INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 999), (22, 1000);
  `);

  const connection = { connectionId: 'warehouse', driver: 'sqlite' } as const;
  const testSchema = schema();
  const profiles = await profileKloRelationshipSchema({
    ...connection,
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validate-test' },
  });

  // Two of the three invoices point at account ids that do not exist.
  const invoiceCandidates = generateKloRelationshipDiscoveryCandidates(testSchema).filter(
    (candidate) => candidate.from.table.name === 'invoices',
  );
  const validated = await validateKloRelationshipDiscoveryCandidates({
    ...connection,
    candidates: invoiceCandidates,
    profiles,
    executor: sqlite,
    ctx: { runId: 'validate-test' },
    settings: {
      minSourceCoverage: 0.9,
      maxViolationRatio: 0.01,
    },
  });

  expect(validated).toHaveLength(1);
  const [first] = validated;
  expect(first).toMatchObject({
    from: { table: { name: 'invoices' }, columns: ['account_id'] },
    to: { table: { name: 'accounts' }, columns: ['id'] },
    status: 'rejected',
    validation: {
      sourceCoverage: 1 / 3,
      violationCount: 2,
      violationRatio: 2 / 3,
      reasons: expect.arrayContaining(['low_source_coverage', 'excessive_violations']),
    },
  });
});
|
||||
|
||||
it('keeps over-budget candidates review-only without executing coverage SQL for them', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite; // registered so afterEach closes the database
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, name TEXT);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    CREATE TABLE invoices (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex'), (3, 'Initech');
    INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2), (12, 3);
    INSERT INTO invoices (id, account_id) VALUES (20, 1), (21, 2), (22, 3);
  `);

  const connection = { connectionId: 'warehouse', driver: 'sqlite' } as const;
  const testSchema = schema();
  const profiles = await profileKloRelationshipSchema({
    ...connection,
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validate-budget-profile' },
  });
  // Discard the profiling queries so the count below reflects validation only.
  sqlite.queryCount = 0;

  // Give the users candidate the higher confidence; the invoices candidate gets the lower one.
  const candidates = generateKloRelationshipDiscoveryCandidates(testSchema).map((candidate) => ({
    ...candidate,
    confidence: candidate.from.table.name === 'users' ? 0.99 : 0.5,
  }));

  const validated = await validateKloRelationshipDiscoveryCandidates({
    ...connection,
    candidates,
    profiles,
    executor: sqlite,
    ctx: { runId: 'validate-budget' },
    tableCount: testSchema.tables.length,
    settings: {
      validationBudget: 1,
    },
  });

  expect(sqlite.queryCount).toBe(1);
  expect(validated).toHaveLength(2);

  const fromTable = (tableName: string) => validated.find((candidate) => candidate.from.table.name === tableName);
  expect(fromTable('users')).toMatchObject({
    status: 'accepted',
    validation: { reasons: expect.arrayContaining(['validation_passed']) },
  });
  expect(fromTable('invoices')).toMatchObject({
    status: 'review',
    validation: {
      reasons: ['validation_unattempted'],
    },
  });
});
|
||||
|
||||
it('treats validation budget zero as review-only validation without coverage SQL', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite; // registered so afterEach closes the database
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER, name TEXT);
    CREATE TABLE users (id INTEGER, account_id INTEGER);
    INSERT INTO accounts (id, name) VALUES (1, 'Acme'), (2, 'Globex');
    INSERT INTO users (id, account_id) VALUES (10, 1), (11, 2);
  `);

  const connection = { connectionId: 'warehouse', driver: 'sqlite' } as const;
  const accountsTable = table('accounts', [
    column('accounts', 'id', { nullable: false }),
    column('accounts', 'name', { nativeType: 'TEXT', normalizedType: 'text', dimensionType: 'string' }),
  ]);
  const usersTable = table('users', [
    column('users', 'id', { nullable: false }),
    column('users', 'account_id', { nullable: false }),
  ]);
  const testSchema = schema([accountsTable, usersTable]);

  const profiles = await profileKloRelationshipSchema({
    ...connection,
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validate-zero-budget-profile' },
  });
  // Discard the profiling queries so the count below reflects validation only.
  sqlite.queryCount = 0;

  const validated = await validateKloRelationshipDiscoveryCandidates({
    ...connection,
    candidates: generateKloRelationshipDiscoveryCandidates(testSchema),
    profiles,
    executor: sqlite,
    ctx: { runId: 'validate-zero-budget' },
    tableCount: testSchema.tables.length,
    settings: {
      validationBudget: 0,
    },
  });

  expect(sqlite.queryCount).toBe(0);
  expect(validated).toHaveLength(1);
  const [first] = validated;
  expect(first).toMatchObject({
    status: 'review',
    score: expect.any(Number),
    validation: {
      checkedValues: 0,
      reasons: ['validation_unattempted'],
    },
  });
});
|
||||
|
||||
it('marks rejected LLM proposals with the spec rejection reason', async () => {
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite; // registered so afterEach closes the database
  sqlite.db.exec(`
    CREATE TABLE customers (id INTEGER);
    CREATE TABLE orders (buyer_ref INTEGER);
    INSERT INTO customers (id) VALUES (1), (2);
    INSERT INTO orders (buyer_ref) VALUES (98), (99);
  `);

  const connection = { connectionId: 'warehouse', driver: 'sqlite' } as const;
  const customersTable = () => table('customers', [column('customers', 'id', { nullable: false })]);

  // Profiles describe the real tables, where orders has a buyer_ref column.
  const testSchema = schema([customersTable(), table('orders', [column('orders', 'buyer_ref')])]);
  const profiles = await profileKloRelationshipSchema({
    ...connection,
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'llm-rejected-validation' },
  });

  // A customer_id column is used only to obtain a base candidate, which is then
  // re-pointed at buyer_ref and relabelled as an LLM proposal.
  const [baseCandidate] = generateKloRelationshipDiscoveryCandidates(
    schema([customersTable(), table('orders', [column('orders', 'customer_id')])]),
  );
  if (!baseCandidate) {
    throw new Error('Expected base candidate');
  }
  const llmCandidate = {
    ...baseCandidate,
    id: 'orders:(orders.buyer_ref)->customers:(customers.id)',
    from: { ...baseCandidate.from, columnIds: ['orders.buyer_ref'], columns: ['buyer_ref'] },
    source: 'llm_proposal' as const,
    evidence: {
      ...baseCandidate.evidence,
      reasons: ['llm_proposal'],
      llmConfidence: 0.84,
      llmRationale: 'Buyer references should map to customers.',
    },
  };

  const [validated] = await validateKloRelationshipDiscoveryCandidates({
    ...connection,
    candidates: [llmCandidate],
    profiles,
    executor: sqlite,
    ctx: { runId: 'llm-rejected-validation' },
  });

  expect(validated?.status).toBe('rejected');
  expect(validated?.validation.reasons).toEqual(
    expect.arrayContaining(['low_source_coverage', 'llm_proposed_but_validation_failed']),
  );
});
|
||||
|
||||
it('limits validation query concurrency', async () => {
  // Assign the suite-level handle instead of shadowing it: afterEach then closes the
  // database even when a query or the assertion below throws.
  const sqlite = new InMemorySqliteExecutor();
  executor = sqlite;
  sqlite.db.exec(`
    CREATE TABLE accounts (id INTEGER NOT NULL);
    CREATE TABLE orders (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
    CREATE TABLE invoices (id INTEGER NOT NULL, account_id INTEGER NOT NULL);
    INSERT INTO accounts VALUES (1), (2);
    INSERT INTO orders VALUES (10, 1), (11, 2);
    INSERT INTO invoices VALUES (20, 1), (21, 2);
  `);

  // Wrapper that records how many queries are in flight at once.
  let active = 0;
  let maxActive = 0;
  const throttled = {
    executeReadOnly: async (input: KloReadOnlyQueryInput, ctx: KloScanContext) => {
      active += 1;
      maxActive = Math.max(maxActive, active);
      try {
        // Queries containing 'WITH child_values' are held for 10 ms so overlapping calls would be observed.
        await new Promise((resolve) => setTimeout(resolve, input.sql.includes('WITH child_values') ? 10 : 0));
        return await sqlite.executeReadOnly(input, ctx);
      } finally {
        // Always release the slot, so a failing query cannot skew the in-flight count.
        active -= 1;
      }
    },
  };

  const testSchema = schema([
    table('accounts', [column('accounts', 'id', { nullable: false })]),
    table('orders', [column('orders', 'id', { nullable: false }), column('orders', 'account_id')]),
    table('invoices', [column('invoices', 'id', { nullable: false }), column('invoices', 'account_id')]),
  ]);
  const profiles = await profileKloRelationshipSchema({
    connectionId: 'warehouse',
    driver: 'sqlite',
    schema: testSchema,
    executor: sqlite,
    ctx: { runId: 'validation-concurrency-profile' },
  });
  const candidates = generateKloRelationshipDiscoveryCandidates(testSchema);

  await validateKloRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    driver: 'sqlite',
    candidates,
    profiles,
    executor: throttled,
    ctx: { runId: 'validation-concurrency' },
    settings: { concurrency: 1 },
  });

  expect(maxActive).toBe(1);
});
|
||||
|
||||
it('pins column_suffix_match validation scoring for plan-code suffix candidates', async () => {
  // Hand-written profiles: both columns hold the same four distinct, non-null plan codes.
  const profiles = {
    connectionId: 'warehouse',
    driver: 'sqlite',
    sqlAvailable: true,
    queryCount: 0,
    tables: [],
    warnings: [],
    columns: {
      'mart_account_segments.current_plan_code': {
        table: { catalog: null, db: null, name: 'mart_account_segments' },
        column: 'current_plan_code',
        nativeType: 'TEXT',
        normalizedType: 'text',
        rowCount: 4,
        nullCount: 0,
        distinctCount: 4,
        uniquenessRatio: 1,
        nullRate: 0,
        sampleValues: ['basic', 'enterprise', 'free', 'pro'],
        minTextLength: 4,
        maxTextLength: 10,
      },
      'stg_plans.plan_code': {
        table: { catalog: null, db: null, name: 'stg_plans' },
        column: 'plan_code',
        nativeType: 'TEXT',
        normalizedType: 'text',
        rowCount: 4,
        nullCount: 0,
        distinctCount: 4,
        uniquenessRatio: 1,
        nullRate: 0,
        sampleValues: ['basic', 'enterprise', 'free', 'pro'],
        minTextLength: 4,
        maxTextLength: 10,
      },
    },
  } satisfies KloRelationshipProfileArtifact;

  const suffixCandidate = {
    id: 'mart:(current_plan_code)->plans:(plan_code)',
    from: {
      tableId: 'mart-account-segments-id',
      columnIds: ['current-plan-code-col'],
      table: { catalog: null, db: null, name: 'mart_account_segments' },
      columns: ['current_plan_code'],
    },
    to: {
      tableId: 'plans-id',
      columnIds: ['plan-code-col'],
      table: { catalog: null, db: null, name: 'stg_plans' },
      columns: ['plan_code'],
    },
    relationshipType: 'many_to_one' as const,
    confidence: 0.902,
    source: 'column_suffix_match' as const,
    status: 'review' as const,
    evidence: {
      sourceColumnBase: 'current_plan',
      targetTableBase: 'plan',
      targetColumnBase: 'plan_code',
      targetKeyScore: 0.86,
      nameScore: 0.78,
      reasons: ['column_suffix_match', 'profile_unique_target'],
    },
  };

  // Stub executor: always answers with full overlap and no violations.
  const stubExecutor = {
    async executeReadOnly() {
      return {
        headers: ['child_distinct', 'parent_distinct', 'overlap', 'violation_count'],
        rows: [[4, 4, 4, 0]],
        rowCount: 1,
        totalRows: 1,
      };
    },
  };

  const [validated] = await validateKloRelationshipDiscoveryCandidates({
    connectionId: 'warehouse',
    driver: 'sqlite',
    candidates: [suffixCandidate],
    profiles,
    executor: stubExecutor,
    ctx: { runId: 'rule-b-validation-score' },
  });

  expect(validated).toMatchObject({
    status: 'accepted',
    score: 0.98,
    validation: {
      targetUniqueness: 1,
      sourceCoverage: 1,
      violationRatio: 0,
      reasons: ['validation_passed'],
    },
  });
});
|
||||
});
|
||||
370
packages/context/src/scan/relationship-validation.ts
Normal file
370
packages/context/src/scan/relationship-validation.ts
Normal file
|
|
@ -0,0 +1,370 @@
|
|||
import type { KloRelationshipEndpoint } from './enrichment-types.js';
|
||||
import { applyKloRelationshipValidationBudget, type KloRelationshipValidationBudget } from './relationship-budget.js';
|
||||
import type { KloRelationshipDiscoveryCandidate } from './relationship-candidates.js';
|
||||
import {
|
||||
formatKloRelationshipTableRef,
|
||||
type KloRelationshipProfileArtifact,
|
||||
type KloRelationshipReadOnlyExecutor,
|
||||
quoteKloRelationshipIdentifier,
|
||||
} from './relationship-profiling.js';
|
||||
import type { KloConnectionDriver, KloQueryResult, KloScanContext } from './types.js';
|
||||
|
||||
/** Outcome assigned to a relationship candidate by validation: accepted, kept for review, or rejected. */
export type KloValidatedRelationshipStatus = 'accepted' | 'review' | 'rejected';
|
||||
|
||||
/** Tunable thresholds and limits for relationship candidate validation. */
export interface KloRelationshipValidationSettings {
  /** Minimum score at which a candidate with no failing reason is marked `accepted`. */
  acceptThreshold: number;
  /** Minimum score at which a candidate that was not accepted is marked `review`. */
  reviewThreshold: number;
  /** Target-column uniqueness ratio below which `low_target_uniqueness` is recorded. */
  minTargetUniqueness: number;
  /** Source coverage below which `low_source_coverage` is recorded. */
  minSourceCoverage: number;
  /** Violation ratio above which `excessive_violations` is recorded. */
  maxViolationRatio: number;
  /** Cap on the number of distinct source values the coverage query samples (LIMIT / TOP). */
  maxDistinctSourceValues: number;
  /** Maximum number of candidates validated at the same time. */
  concurrency: number;
  /** Optional budget forwarded to `applyKloRelationshipValidationBudget`. */
  validationBudget?: KloRelationshipValidationBudget;
}
|
||||
|
||||
/** Measurements and reason codes recorded for one candidate during validation. */
export interface KloRelationshipValidationEvidence {
  /** Uniqueness ratio taken from the target column profile (0 when no profile exists). */
  targetUniqueness: number;
  /** `overlap / childDistinct`; 0 when no source values were checked. */
  sourceCoverage: number;
  /** Number of sampled distinct source values without a matching target value. */
  violationCount: number;
  /** `violationCount / childDistinct`; 1 when no source values were checked. */
  violationRatio: number;
  /** Null rate taken from the source column profile. */
  sourceNullRate: number;
  /** Null rate taken from the target column profile. */
  targetNullRate: number;
  /** Distinct non-null source values (query result, or the profile count when no query ran). */
  childDistinct: number;
  /** Distinct non-null target values (query result, or the profile count when no query ran). */
  parentDistinct: number;
  /** Number of sampled distinct source values that also exist in the target. */
  overlap: number;
  /** Number of source values the coverage query checked; 0 when no query ran. */
  checkedValues: number;
  /** Reason codes explaining the status, e.g. `validation_passed` or `low_source_coverage`. */
  reasons: string[];
}
|
||||
|
||||
/**
 * A discovery candidate after validation: the original candidate fields with the
 * status replaced by the validation outcome, plus the score and evidence behind it.
 */
export interface KloValidatedRelationshipDiscoveryCandidate
  extends Omit<KloRelationshipDiscoveryCandidate, 'status'> {
  /** Validation outcome; replaces the candidate's discovery-time status. */
  status: KloValidatedRelationshipStatus;
  /** Score rounded to three decimals. */
  score: number;
  /** Measurements and reason codes that produced `status` and `score`. */
  validation: KloRelationshipValidationEvidence;
}
|
||||
|
||||
/** Arguments accepted by `validateKloRelationshipDiscoveryCandidates`. */
export interface ValidateKloRelationshipDiscoveryCandidatesInput {
  /** Connection id passed through to the read-only executor. */
  connectionId: string;
  /** Driver used to pick SQL quoting and LIMIT / TOP syntax. */
  driver: KloConnectionDriver;
  /** Candidates to validate; results are returned in this order. */
  candidates: readonly KloRelationshipDiscoveryCandidate[];
  /** Column profiles, looked up by `<table name>.<column name>`. */
  profiles: KloRelationshipProfileArtifact;
  /** Read-only SQL executor; when `null`, every candidate is returned for review unvalidated. */
  executor: KloRelationshipReadOnlyExecutor | null;
  /** Scan context passed through to the executor. */
  ctx: KloScanContext;
  /** Table count handed to the validation budget; when omitted and no budget is set, `'all'` is requested. */
  tableCount?: number;
  /** Overrides applied on top of the default validation settings. */
  settings?: Partial<KloRelationshipValidationSettings>;
}
|
||||
|
||||
/** Baseline validation settings used for every key the caller does not override. */
const DEFAULT_SETTINGS: KloRelationshipValidationSettings = {
  acceptThreshold: 0.85,
  reviewThreshold: 0.55,
  minTargetUniqueness: 0.9,
  minSourceCoverage: 0.9,
  maxViolationRatio: 0.01,
  maxDistinctSourceValues: 10000,
  concurrency: 4,
};

/**
 * Overlay caller-supplied settings on top of {@link DEFAULT_SETTINGS}.
 *
 * Keys that are present but explicitly `undefined` are ignored, so they can no
 * longer erase a default. A plain object spread would copy the `undefined`
 * through, and e.g. an undefined `concurrency` turns into a NaN worker count.
 *
 * @param settings Partial overrides, or `undefined` to use the defaults as-is.
 * @returns A fresh settings object; the defaults are never mutated.
 */
function mergeSettings(
  settings: Partial<KloRelationshipValidationSettings> | undefined,
): KloRelationshipValidationSettings {
  if (!settings) {
    return { ...DEFAULT_SETTINGS };
  }
  const definedOverrides = Object.fromEntries(
    Object.entries(settings).filter(([, value]) => value !== undefined),
  );
  return { ...DEFAULT_SETTINGS, ...definedOverrides };
}
|
||||
|
||||
/** Build the `<table>.<column>` key under which a column profile is stored. */
function profileKey(table: string, column: string): string {
  return [table, column].join('.');
}
|
||||
|
||||
/**
 * Return the first column name of a relationship endpoint.
 *
 * @throws Error when the endpoint has no first column (or it is an empty string).
 */
function singleRelationshipColumn(endpointValue: KloRelationshipEndpoint): string {
  const [leadingColumn] = endpointValue.columns;
  if (leadingColumn) {
    return leadingColumn;
  }
  throw new Error(`Expected relationship endpoint ${endpointValue.table.name} to contain one column`);
}
|
||||
|
||||
/** Position of `header` in the result headers, compared case-insensitively; -1 when absent. */
function headerIndex(result: KloQueryResult, header: string): number {
  for (const [position, name] of result.headers.entries()) {
    if (name.toLowerCase() === header.toLowerCase()) {
      return position;
    }
  }
  return -1;
}
|
||||
|
||||
/** First row of a query result, or an empty array when there is none. */
function firstRow(result: KloQueryResult): unknown[] {
  const [leadingRow] = result.rows;
  return leadingRow ?? [];
}
|
||||
|
||||
/**
 * Read the first-row cell under `header` as a finite number.
 *
 * Numbers, bigints and non-blank numeric strings are converted. Anything else
 * yields 0, including a missing header, a missing row or a null cell. Values
 * that do not convert to a finite number (e.g. the string `'n/a'`, NaN,
 * Infinity) also yield 0, so NaN cannot leak into the ratios derived from
 * these counts.
 */
function numberAt(result: KloQueryResult, header: string): number {
  const cell = firstRow(result)[headerIndex(result, header)];

  let parsed: number;
  switch (typeof cell) {
    case 'number':
      parsed = cell;
      break;
    case 'bigint':
      parsed = Number(cell);
      break;
    case 'string':
      // A blank string is treated as "no value", not as Number('') === 0 by accident.
      parsed = cell.trim() === '' ? 0 : Number(cell);
      break;
    default:
      return 0;
  }
  return Number.isFinite(parsed) ? parsed : 0;
}
|
||||
|
||||
/**
 * Trailing ` LIMIT n` clause for drivers that support it.
 *
 * Returns an empty string for `sqlserver`. The limit is floored and clamped
 * to at least 1.
 */
function limitSql(driver: KloConnectionDriver, limit: number): string {
  const rowCap = Math.max(1, Math.floor(limit));
  return driver === 'sqlserver' ? '' : ` LIMIT ${rowCap}`;
}
|
||||
|
||||
/**
 * Leading ` TOP (n)` clause for `sqlserver`.
 *
 * Returns an empty string for every other driver. The limit is floored and
 * clamped to at least 1.
 */
function topSql(driver: KloConnectionDriver, limit: number): string {
  const rowCap = Math.max(1, Math.floor(limit));
  return driver === 'sqlserver' ? ` TOP (${rowCap})` : '';
}
|
||||
|
||||
/**
 * Build the single-row query that measures how well the child column's values
 * are covered by the parent column.
 *
 * The statement selects the distinct non-null child values (capped at
 * `maxDistinctSourceValues` via LIMIT or TOP) and the distinct non-null parent
 * values, left-joins them, and returns `child_distinct`, `parent_distinct`,
 * `overlap` and `violation_count`. Table references are built from the bare
 * table name with `catalog` and `db` set to null.
 */
function buildCoverageSql(input: {
  driver: KloConnectionDriver;
  childTable: string;
  childColumn: string;
  parentTable: string;
  parentColumn: string;
  maxDistinctSourceValues: number;
}): string {
  const { driver, maxDistinctSourceValues } = input;
  const unqualified = (name: string) => ({ catalog: null, db: null, name });

  const childRef = formatKloRelationshipTableRef(driver, unqualified(input.childTable));
  const parentRef = formatKloRelationshipTableRef(driver, unqualified(input.parentTable));
  const childCol = quoteKloRelationshipIdentifier(driver, input.childColumn);
  const parentCol = quoteKloRelationshipIdentifier(driver, input.parentColumn);
  const limitClause = limitSql(driver, maxDistinctSourceValues);
  const topClause = topSql(driver, maxDistinctSourceValues);

  const childValues = `SELECT DISTINCT${topClause} ${childCol} AS value FROM ${childRef} WHERE ${childCol} IS NOT NULL${limitClause}`;
  const parentValues = `SELECT DISTINCT ${parentCol} AS value FROM ${parentRef} WHERE ${parentCol} IS NOT NULL`;
  const projection = [
    '(SELECT COUNT(*) FROM child_values) AS child_distinct,',
    '(SELECT COUNT(*) FROM parent_values) AS parent_distinct,',
    'SUM(CASE WHEN parent_values.value IS NOT NULL THEN 1 ELSE 0 END) AS overlap,',
    'SUM(CASE WHEN parent_values.value IS NULL THEN 1 ELSE 0 END) AS violation_count',
  ].join(' ');

  return (
    `WITH child_values AS ( ${childValues} ), parent_values AS ( ${parentValues} ) ` +
    `SELECT ${projection} FROM child_values ` +
    'LEFT JOIN parent_values ON child_values.value = parent_values.value'
  );
}
|
||||
|
||||
/**
 * Combine discovery confidence and validation measurements into one score.
 *
 * Weights: 0.2 confidence, 0.3 target uniqueness, 0.4 source coverage and 0.1
 * for `1 - violationRatio` (floored at 0). The sum is capped at 1 and rounded
 * to three decimals.
 */
function score(input: {
  candidateConfidence: number;
  targetUniqueness: number;
  sourceCoverage: number;
  violationRatio: number;
}): number {
  const { candidateConfidence, targetUniqueness, sourceCoverage, violationRatio } = input;
  const violationScore = Math.max(0, 1 - violationRatio);
  const weighted =
    0.2 * candidateConfidence + 0.3 * targetUniqueness + 0.4 * sourceCoverage + 0.1 * violationScore;
  const capped = Math.min(1, weighted);
  return Number(capped.toFixed(3));
}
|
||||
|
||||
/** Reason codes that force a `rejected` status regardless of the score. */
const HARD_REJECTION_REASONS: readonly string[] = [
  'low_target_uniqueness',
  'low_source_coverage',
  'excessive_violations',
];

/**
 * Map a score and its reason codes onto a validation status.
 *
 * Any hard rejection reason wins outright. Otherwise the score is compared
 * against `acceptThreshold`, then `reviewThreshold`, and anything below both
 * is rejected. The accept branch no longer re-checks the rejection reasons:
 * they have already been ruled out by the first guard.
 */
function statusFor(input: {
  score: number;
  reasons: readonly string[];
  settings: KloRelationshipValidationSettings;
}): KloValidatedRelationshipStatus {
  const { score: value, reasons, settings } = input;

  if (HARD_REJECTION_REASONS.some((reason) => reasons.includes(reason))) {
    return 'rejected';
  }
  if (value >= settings.acceptThreshold) {
    return 'accepted';
  }
  if (value >= settings.reviewThreshold) {
    return 'review';
  }
  return 'rejected';
}
|
||||
|
||||
/**
 * Map `inputs` through an async function with a bounded number of calls in
 * flight, returning the outputs in input order.
 *
 * @param inputs Values to map.
 * @param concurrency Maximum number of simultaneous `mapOne` calls. It is
 *   floored and clamped to at least 1. A NaN value is treated as 1; previously
 *   NaN produced zero workers and a silently empty result array.
 * @param mapOne Async mapper invoked once per input.
 * @returns Outputs positioned at the index of their input.
 * @throws Rejects with the first error raised by `mapOne`. Once a call has
 *   failed no further inputs are started; calls already in flight still settle.
 */
async function mapWithConcurrency<TInput, TOutput>(
  inputs: readonly TInput[],
  concurrency: number,
  mapOne: (input: TInput) => Promise<TOutput>,
): Promise<TOutput[]> {
  const requested = Math.floor(concurrency);
  const safeConcurrency = Number.isNaN(requested) ? 1 : Math.max(1, requested);
  const outputs: TOutput[] = new Array(inputs.length);
  let nextIndex = 0;
  let failed = false;

  async function worker(): Promise<void> {
    while (!failed && nextIndex < inputs.length) {
      // Claiming the index is synchronous, so two workers never take the same input.
      const index = nextIndex;
      nextIndex += 1;
      try {
        outputs[index] = await mapOne(inputs[index] as TInput);
      } catch (error) {
        // Stop the other workers from starting work whose result would be discarded.
        failed = true;
        throw error;
      }
    }
  }

  await Promise.all(Array.from({ length: Math.min(safeConcurrency, inputs.length) }, () => worker()));
  return outputs;
}
|
||||
|
||||
/**
 * Produce a `review` result for a candidate whose coverage query was not run.
 *
 * The score is 60% of the discovery confidence, rounded to three decimals.
 * Evidence is filled from the column profiles where they exist and from
 * neutral values otherwise (no coverage, violation ratio 1). `reason` is
 * recorded as the only reason code.
 */
function reviewWithoutValidation(
  candidate: KloRelationshipDiscoveryCandidate,
  profiles: KloRelationshipProfileArtifact,
  reason: 'validation_unavailable' | 'profile_unavailable' | 'validation_unattempted',
): KloValidatedRelationshipDiscoveryCandidate {
  const fromColumn = singleRelationshipColumn(candidate.from);
  const toColumn = singleRelationshipColumn(candidate.to);
  const fromProfile = profiles.columns[profileKey(candidate.from.table.name, fromColumn)];
  const toProfile = profiles.columns[profileKey(candidate.to.table.name, toColumn)];

  const discountedScore = Number((candidate.confidence * 0.6).toFixed(3));
  const validation: KloRelationshipValidationEvidence = {
    targetUniqueness: toProfile?.uniquenessRatio ?? 0,
    sourceCoverage: 0,
    violationCount: 0,
    violationRatio: 1,
    sourceNullRate: fromProfile?.nullRate ?? 0,
    targetNullRate: toProfile?.nullRate ?? 0,
    childDistinct: fromProfile?.distinctCount ?? 0,
    parentDistinct: toProfile?.distinctCount ?? 0,
    overlap: 0,
    checkedValues: 0,
    reasons: [reason],
  };

  return { ...candidate, status: 'review', score: discountedScore, validation };
}
|
||||
|
||||
/**
 * Validate relationship candidates against the data and return one result per
 * candidate, in input order.
 *
 * Without an executor, or when the profiles report SQL as unavailable, every
 * candidate comes back as `review` with reason `validation_unavailable`.
 * Otherwise the validation budget splits candidates into those to validate
 * and those deferred (`validation_unattempted`). Each validated candidate
 * runs one coverage query; candidates missing a column profile are returned
 * for review with `profile_unavailable`.
 *
 * @throws Error when a candidate ends up with neither a validated nor a
 *   deferred result, or when the executor or an endpoint lookup fails.
 */
export async function validateKloRelationshipDiscoveryCandidates(
  input: ValidateKloRelationshipDiscoveryCandidatesInput,
): Promise<KloValidatedRelationshipDiscoveryCandidate[]> {
  const settings = mergeSettings(input.settings);
  const { profiles } = input;

  if (!input.executor || !profiles.sqlAvailable) {
    return input.candidates.map((candidate) =>
      reviewWithoutValidation(candidate, profiles, 'validation_unavailable'),
    );
  }
  const executor = input.executor;

  // Runs the coverage query for one candidate and turns the counts into a verdict.
  const validateCandidate = async (
    candidate: KloRelationshipDiscoveryCandidate,
  ): Promise<KloValidatedRelationshipDiscoveryCandidate> => {
    const sourceColumn = singleRelationshipColumn(candidate.from);
    const targetColumn = singleRelationshipColumn(candidate.to);
    const sourceProfile = profiles.columns[profileKey(candidate.from.table.name, sourceColumn)];
    const targetProfile = profiles.columns[profileKey(candidate.to.table.name, targetColumn)];
    if (!(sourceProfile && targetProfile)) {
      return reviewWithoutValidation(candidate, profiles, 'profile_unavailable');
    }

    const sql = buildCoverageSql({
      driver: input.driver,
      childTable: candidate.from.table.name,
      childColumn: sourceColumn,
      parentTable: candidate.to.table.name,
      parentColumn: targetColumn,
      maxDistinctSourceValues: settings.maxDistinctSourceValues,
    });
    const result = await executor.executeReadOnly(
      { connectionId: input.connectionId, sql, maxRows: 1 },
      input.ctx,
    );

    const childDistinct = numberAt(result, 'child_distinct');
    const parentDistinct = numberAt(result, 'parent_distinct');
    const overlap = numberAt(result, 'overlap');
    const violationCount = numberAt(result, 'violation_count');

    // With no source values checked, treat the candidate as uncovered and fully violating.
    const hasChildValues = childDistinct !== 0;
    const sourceCoverage = hasChildValues ? overlap / childDistinct : 0;
    const violationRatio = hasChildValues ? violationCount / childDistinct : 1;
    const targetUniqueness = targetProfile.uniquenessRatio;

    const failureChecks: Array<[boolean, string]> = [
      [targetUniqueness < settings.minTargetUniqueness, 'low_target_uniqueness'],
      [sourceCoverage < settings.minSourceCoverage, 'low_source_coverage'],
      [violationRatio > settings.maxViolationRatio, 'excessive_violations'],
    ];
    const reasons: string[] = failureChecks.filter(([failed]) => failed).map(([, code]) => code);
    if (reasons.length === 0) {
      reasons.push('validation_passed');
    }

    const candidateScore = score({
      candidateConfidence: candidate.confidence,
      targetUniqueness,
      sourceCoverage,
      violationRatio,
    });
    const candidateStatus = statusFor({ score: candidateScore, reasons, settings });
    if (candidateStatus === 'rejected' && candidate.source === 'llm_proposal') {
      reasons.push('llm_proposed_but_validation_failed');
    }

    return {
      ...candidate,
      status: candidateStatus,
      score: candidateScore,
      validation: {
        targetUniqueness,
        sourceCoverage,
        violationCount,
        violationRatio,
        sourceNullRate: sourceProfile.nullRate,
        targetNullRate: targetProfile.nullRate,
        childDistinct,
        parentDistinct,
        overlap,
        checkedValues: childDistinct,
        reasons,
      },
    };
  };

  const budgeted = applyKloRelationshipValidationBudget({
    candidates: input.candidates,
    tableCount: input.tableCount ?? 0,
    budget: settings.validationBudget ?? (input.tableCount === undefined ? 'all' : undefined),
    score: (candidate) => candidate.confidence,
  });
  const validated = await mapWithConcurrency(
    budgeted.toValidate.map((entry) => entry.candidate),
    settings.concurrency,
    validateCandidate,
  );

  // Re-key both groups by their position in the caller's list so order can be restored.
  const byOriginalIndex = new Map<number, KloValidatedRelationshipDiscoveryCandidate>();
  budgeted.toValidate.forEach((entry, position) => {
    const outcome = validated[position];
    if (entry.originalIndex !== undefined && outcome) {
      byOriginalIndex.set(entry.originalIndex, outcome);
    }
  });
  for (const deferred of budgeted.deferred) {
    byOriginalIndex.set(
      deferred.originalIndex,
      reviewWithoutValidation(deferred.candidate, profiles, 'validation_unattempted'),
    );
  }

  return input.candidates.map((_, index) => {
    const outcome = byOriginalIndex.get(index);
    if (!outcome) {
      throw new Error(`Missing relationship validation result for candidate at index ${index}`);
    }
    return outcome;
  });
}
|
||||
237
packages/context/src/scan/sqlite-local-enrichment-state-store.ts
Normal file
237
packages/context/src/scan/sqlite-local-enrichment-state-store.ts
Normal file
|
|
@ -0,0 +1,237 @@
|
|||
import { mkdirSync } from 'node:fs';
|
||||
import { dirname } from 'node:path';
|
||||
import Database from 'better-sqlite3';
|
||||
import type {
|
||||
KloScanEnrichmentCompletedStage,
|
||||
KloScanEnrichmentFailedStage,
|
||||
KloScanEnrichmentStageLookup,
|
||||
KloScanEnrichmentStageRecord,
|
||||
KloScanEnrichmentStateStore,
|
||||
} from './enrichment-state.js';
|
||||
import type { KloScanEnrichmentStage, KloScanMode } from './types.js';
|
||||
|
||||
/** Construction options for `SqliteLocalScanEnrichmentStateStore`. */
export interface SqliteLocalScanEnrichmentStateStoreOptions {
  /** Path of the SQLite database file; its parent directory is created if missing. */
  dbPath: string;
}
|
||||
|
||||
/** Raw row shape of the `local_scan_enrichment_stages` table. */
interface StageRow {
  run_id: string;
  connection_id: string;
  sync_id: string;
  mode: KloScanMode;
  stage: KloScanEnrichmentStage;
  input_hash: string;
  status: 'completed' | 'failed';
  /** JSON-encoded stage output; written as NULL for failed stages. */
  output_json: string | null;
  /** Failure message; written as NULL for completed stages. */
  error_message: string | null;
  updated_at: string;
}
|
||||
|
||||
/**
 * Convert a database row into a stage record.
 *
 * Completed rows get their `output_json` parsed (a NULL column parses to
 * `null`). Any other row becomes a failed record whose message falls back to
 * a generic text when the column is NULL.
 *
 * @throws SyntaxError when a completed row holds invalid JSON.
 */
function parseStageRow<TOutput = unknown>(row: StageRow): KloScanEnrichmentStageRecord<TOutput> {
  const identity = {
    runId: row.run_id,
    connectionId: row.connection_id,
    syncId: row.sync_id,
    mode: row.mode,
    stage: row.stage,
    inputHash: row.input_hash,
  };

  if (row.status !== 'completed') {
    return {
      ...identity,
      status: 'failed',
      output: null,
      errorMessage: row.error_message ?? 'Unknown enrichment stage failure',
      updatedAt: row.updated_at,
    };
  }

  return {
    ...identity,
    status: 'completed',
    output: JSON.parse(row.output_json ?? 'null') as TOutput,
    errorMessage: null,
    updatedAt: row.updated_at,
  };
}
|
||||
|
||||
/**
 * Whether a run id starts with an ASCII letter or digit and otherwise contains
 * only ASCII letters, digits, `_`, `.` and `-`.
 */
function isSafeRunId(runId: string): boolean {
  const safePattern = /^[a-zA-Z0-9][a-zA-Z0-9_.-]*$/;
  return safePattern.exec(runId) !== null;
}
|
||||
|
||||
/**
 * Enrichment stage state store backed by a local SQLite file.
 *
 * One row is kept per `(run_id, stage)`; saving a stage again overwrites the
 * previous row for that pair.
 */
export class SqliteLocalScanEnrichmentStateStore implements KloScanEnrichmentStateStore {
  /** Upsert shared by completed and failed saves; only status, output and error differ. */
  private static readonly UPSERT_STAGE_SQL = `
    INSERT INTO local_scan_enrichment_stages (
      run_id,
      stage,
      input_hash,
      connection_id,
      sync_id,
      mode,
      status,
      output_json,
      error_message,
      updated_at
    )
    VALUES (
      @runId,
      @stage,
      @inputHash,
      @connectionId,
      @syncId,
      @mode,
      @status,
      @outputJson,
      @errorMessage,
      @updatedAt
    )
    ON CONFLICT(run_id, stage) DO UPDATE SET
      input_hash = excluded.input_hash,
      connection_id = excluded.connection_id,
      sync_id = excluded.sync_id,
      mode = excluded.mode,
      status = excluded.status,
      output_json = excluded.output_json,
      error_message = excluded.error_message,
      updated_at = excluded.updated_at
  `;

  private readonly db: Database.Database;

  /**
   * Open (or create) the database at `options.dbPath`, creating the parent
   * directory, switching to WAL journaling and ensuring the stage table and
   * its index exist.
   */
  constructor(options: SqliteLocalScanEnrichmentStateStoreOptions) {
    mkdirSync(dirname(options.dbPath), { recursive: true });
    this.db = new Database(options.dbPath);
    this.db.pragma('journal_mode = WAL');
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS local_scan_enrichment_stages (
        run_id TEXT NOT NULL,
        stage TEXT NOT NULL,
        input_hash TEXT NOT NULL,
        connection_id TEXT NOT NULL,
        sync_id TEXT NOT NULL,
        mode TEXT NOT NULL,
        status TEXT NOT NULL,
        output_json TEXT,
        error_message TEXT,
        updated_at TEXT NOT NULL,
        PRIMARY KEY (run_id, stage)
      );

      CREATE INDEX IF NOT EXISTS local_scan_enrichment_stages_run_idx
        ON local_scan_enrichment_stages (run_id, updated_at, stage);
    `);
  }

  /**
   * Look up the completed record for a run, stage and input hash.
   *
   * @returns The completed stage, or `null` when the run id is not safe, no
   *   row matches, or the stored stage is not completed.
   */
  async findCompletedStage<TOutput = unknown>(
    input: KloScanEnrichmentStageLookup,
  ): Promise<KloScanEnrichmentCompletedStage<TOutput> | null> {
    if (!isSafeRunId(input.runId)) {
      return null;
    }
    const row = this.db
      .prepare(
        `
          SELECT *
          FROM local_scan_enrichment_stages
          WHERE run_id = ?
            AND stage = ?
            AND input_hash = ?
            AND status = 'completed'
        `,
      )
      .get(input.runId, input.stage, input.inputHash) as StageRow | undefined;

    if (!row) {
      return null;
    }
    const parsed = parseStageRow<TOutput>(row);
    return parsed.status === 'completed' ? parsed : null;
  }

  /**
   * Store a completed stage with its JSON-encoded output, replacing any
   * existing row for the same run and stage.
   *
   * NOTE(review): unlike the read methods, saves do not check `isSafeRunId`,
   * so a row saved under an unsafe run id cannot be read back — confirm this
   * is intended.
   */
  async saveCompletedStage<TOutput = unknown>(
    input: Omit<KloScanEnrichmentCompletedStage<TOutput>, 'status' | 'errorMessage'>,
  ): Promise<void> {
    this.upsertStage({
      runId: input.runId,
      stage: input.stage,
      inputHash: input.inputHash,
      connectionId: input.connectionId,
      syncId: input.syncId,
      mode: input.mode,
      status: 'completed',
      outputJson: JSON.stringify(input.output),
      errorMessage: null,
      updatedAt: input.updatedAt,
    });
  }

  /**
   * Store a failed stage with its error message and no output, replacing any
   * existing row for the same run and stage.
   */
  async saveFailedStage(input: Omit<KloScanEnrichmentFailedStage, 'status' | 'output'>): Promise<void> {
    this.upsertStage({
      runId: input.runId,
      stage: input.stage,
      inputHash: input.inputHash,
      connectionId: input.connectionId,
      syncId: input.syncId,
      mode: input.mode,
      status: 'failed',
      outputJson: null,
      errorMessage: input.errorMessage,
      updatedAt: input.updatedAt,
    });
  }

  /**
   * List every stored stage of a run, ordered by `updated_at` then `stage`.
   *
   * @returns An empty array when the run id is not safe or has no rows.
   */
  async listRunStages(runId: string): Promise<KloScanEnrichmentStageRecord[]> {
    if (!isSafeRunId(runId)) {
      return [];
    }
    const rows = this.db
      .prepare(
        `
          SELECT *
          FROM local_scan_enrichment_stages
          WHERE run_id = ?
          ORDER BY updated_at ASC, stage ASC
        `,
      )
      .all(runId) as StageRow[];
    return rows.map((row) => parseStageRow(row));
  }

  /** Insert or overwrite the row for `(runId, stage)` with the given values. */
  private upsertStage(row: {
    runId: string;
    stage: KloScanEnrichmentStage;
    inputHash: string;
    connectionId: string;
    syncId: string;
    mode: KloScanMode;
    status: 'completed' | 'failed';
    outputJson: string | null;
    errorMessage: string | null;
    updatedAt: string;
  }): void {
    this.db.prepare(SqliteLocalScanEnrichmentStateStore.UPSERT_STAGE_SQL).run(row);
  }
}
|
||||
24
packages/context/src/scan/type-normalization.test.ts
Normal file
24
packages/context/src/scan/type-normalization.test.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { inferKloDimensionType, kloColumnTypeMappingFromNative, normalizeKloNativeType } from './type-normalization.js';
|
||||
|
||||
describe('KLO scan type normalization', () => {
|
||||
it('normalizes native database type strings', () => {
|
||||
expect(normalizeKloNativeType(' NUMERIC(12, 2) ')).toBe('numeric');
|
||||
expect(normalizeKloNativeType('TIMESTAMP WITH TIME ZONE')).toBe('timestamp with time zone');
|
||||
expect(normalizeKloNativeType('')).toBe('unknown');
|
||||
});
|
||||
|
||||
it('infers dimension types from native types', () => {
|
||||
expect(inferKloDimensionType('BOOLEAN')).toBe('boolean');
|
||||
expect(inferKloDimensionType('timestamp with time zone')).toBe('time');
|
||||
expect(inferKloDimensionType('decimal(10,2)')).toBe('number');
|
||||
expect(inferKloDimensionType('varchar(255)')).toBe('string');
|
||||
});
|
||||
|
||||
it('builds a complete column type mapping', () => {
|
||||
expect(kloColumnTypeMappingFromNative('BIGINT')).toEqual({
|
||||
normalizedType: 'bigint',
|
||||
dimensionType: 'number',
|
||||
});
|
||||
});
|
||||
});
|
||||
32
packages/context/src/scan/type-normalization.ts
Normal file
32
packages/context/src/scan/type-normalization.ts
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
import type { KloSchemaDimensionType } from './types.js';
|
||||
|
||||
/** Normalized type name and dimension type derived from a native column type. */
export interface KloColumnTypeMapping {
  /** Lower-cased native type without parenthesised arguments; `'unknown'` when empty. */
  normalizedType: string;
  /** Dimension category inferred from the native type. */
  dimensionType: KloSchemaDimensionType;
}
|
||||
|
||||
/**
 * Normalize a native database type string.
 *
 * Lower-cases the input, drops parenthesised arguments such as `(12, 2)`,
 * collapses whitespace runs to a single space and trims the ends. Returns
 * `'unknown'` when nothing is left.
 */
export function normalizeKloNativeType(nativeType: string): string {
  const withoutArguments = nativeType.toLowerCase().replace(/\([^)]*\)/g, '');
  const collapsed = withoutArguments.replace(/\s+/g, ' ').trim();
  if (collapsed.length === 0) {
    return 'unknown';
  }
  return collapsed;
}
|
||||
|
||||
/** Boolean type names. */
const KLO_BOOLEAN_TYPE_PATTERN = /\b(bool|boolean)\b/;

/** Date and time type names, including sized and suffixed variants such as `datetime2` or `timestamp_ntz`. */
const KLO_TIME_TYPE_PATTERN =
  /\b(date(32)?|datetime(2|64|offset)?|smalldatetime|time(tz)?|timestamp(tz|_ntz|_ltz|_tz)?)\b/;

/** Numeric type names, including sized variants such as `int4`, `int64`, `uint8` or `float64`. */
const KLO_NUMBER_TYPE_PATTERN =
  /\b(u?int\d*|integer|tinyint|smallint|mediumint|bigint|byteint|decimal\d*|numeric|bignumeric|bigdecimal|number|float\d*|double|real|money|smallmoney|serial|smallserial|bigserial)\b/;

/**
 * Infer the dimension type of a column from its native database type.
 *
 * The native type is normalized first, then matched as whole words against
 * boolean, time and number type names, in that order. Anything unmatched is
 * `'string'`. The name lists cover sized and underscored spellings
 * (`INT64`, `FLOAT64`, `int4`, `tinyint`, `timestamptz`, `TIMESTAMP_NTZ`,
 * `datetime2`), which the plain word-boundary lists classified as strings.
 */
export function inferKloDimensionType(nativeType: string): KloSchemaDimensionType {
  const normalized = normalizeKloNativeType(nativeType);
  if (KLO_BOOLEAN_TYPE_PATTERN.test(normalized)) {
    return 'boolean';
  }
  if (KLO_TIME_TYPE_PATTERN.test(normalized)) {
    return 'time';
  }
  if (KLO_NUMBER_TYPE_PATTERN.test(normalized)) {
    return 'number';
  }
  return 'string';
}
|
||||
|
||||
/** Build the normalized type and inferred dimension type for a native column type. */
export function kloColumnTypeMappingFromNative(nativeType: string): KloColumnTypeMapping {
  const normalizedType = normalizeKloNativeType(nativeType);
  const dimensionType = inferKloDimensionType(nativeType);
  return { normalizedType, dimensionType };
}
|
||||
258
packages/context/src/scan/types.test.ts
Normal file
258
packages/context/src/scan/types.test.ts
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
createKloConnectorCapabilities,
|
||||
type KloEventPropertyDiscovery,
|
||||
type KloEventPropertyDiscoveryInput,
|
||||
type KloEventPropertyValuesInput,
|
||||
type KloEventPropertyValuesResult,
|
||||
type KloEventStreamDiscoveryPort,
|
||||
type KloEventTypeDiscovery,
|
||||
type KloEventTypeDiscoveryInput,
|
||||
type KloNetworkEndpoint,
|
||||
type KloNetworkTunnelPort,
|
||||
type KloQueryResult,
|
||||
type KloScanConnector,
|
||||
type KloScanContext,
|
||||
type KloScanInput,
|
||||
type KloSchemaSnapshot,
|
||||
} from './types.js';
|
||||
|
||||
describe('KLO scan contract types', () => {
|
||||
it('defaults to structural-only connector capabilities', () => {
|
||||
expect(createKloConnectorCapabilities()).toEqual({
|
||||
structuralIntrospection: true,
|
||||
tableSampling: false,
|
||||
columnSampling: false,
|
||||
columnStats: false,
|
||||
readOnlySql: false,
|
||||
nestedAnalysis: false,
|
||||
eventStreamDiscovery: false,
|
||||
formalForeignKeys: false,
|
||||
estimatedRowCounts: false,
|
||||
});
|
||||
});
|
||||
|
||||
it('keeps structural introspection mandatory when optional capabilities are enabled', () => {
|
||||
expect(
|
||||
createKloConnectorCapabilities({
|
||||
tableSampling: true,
|
||||
readOnlySql: true,
|
||||
eventStreamDiscovery: true,
|
||||
estimatedRowCounts: true,
|
||||
}),
|
||||
).toEqual({
|
||||
structuralIntrospection: true,
|
||||
tableSampling: true,
|
||||
columnSampling: false,
|
||||
columnStats: false,
|
||||
readOnlySql: true,
|
||||
nestedAnalysis: false,
|
||||
eventStreamDiscovery: true,
|
||||
formalForeignKeys: false,
|
||||
estimatedRowCounts: true,
|
||||
});
|
||||
});
|
||||
|
||||
it('describes the connector surface without requiring enrichment methods', async () => {
|
||||
const snapshot: KloSchemaSnapshot = {
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
extractedAt: '2026-04-29T00:00:00.000Z',
|
||||
scope: { schemas: ['public'] },
|
||||
metadata: { source: 'unit-test' },
|
||||
tables: [
|
||||
{
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
name: 'orders',
|
||||
kind: 'table',
|
||||
comment: 'Customer orders',
|
||||
estimatedRows: 42,
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
nativeType: 'integer',
|
||||
normalizedType: 'integer',
|
||||
dimensionType: 'number',
|
||||
nullable: false,
|
||||
primaryKey: true,
|
||||
comment: 'Primary key',
|
||||
},
|
||||
],
|
||||
foreignKeys: [],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const connector: KloScanConnector = {
|
||||
id: 'test-postgres',
|
||||
driver: 'postgres',
|
||||
capabilities: createKloConnectorCapabilities({ estimatedRowCounts: true }),
|
||||
async introspect(input: KloScanInput, ctx: KloScanContext) {
|
||||
expect(input.connectionId).toBe('warehouse');
|
||||
expect(ctx.runId).toBe('scan-run-1');
|
||||
return snapshot;
|
||||
},
|
||||
};
|
||||
|
||||
await expect(
|
||||
connector.introspect(
|
||||
{
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
scope: { schemas: ['public'] },
|
||||
mode: 'structural',
|
||||
},
|
||||
{ runId: 'scan-run-1' },
|
||||
),
|
||||
).resolves.toEqual(snapshot);
|
||||
});
|
||||
|
||||
it('models optional event-stream discovery as a connector capability and port', async () => {
|
||||
const eventTypes: KloEventTypeDiscovery[] = [{ value: '$pageview', count: 42 }];
|
||||
const propertyKeys: KloEventPropertyDiscovery[] = [{ key: '$browser', count: 31 }];
|
||||
const propertyValues: KloEventPropertyValuesResult = { values: ['Chrome', 'Safari'], cardinality: 2 };
|
||||
const discovery: KloEventStreamDiscoveryPort = {
|
||||
async listEventTypes(input: KloEventTypeDiscoveryInput) {
|
||||
expect(input).toEqual({
|
||||
connectionId: 'product',
|
||||
table: { catalog: '157881', db: null, name: 'events' },
|
||||
eventColumn: 'event',
|
||||
limit: 2,
|
||||
minCount: 30,
|
||||
lookbackDays: 14,
|
||||
});
|
||||
return eventTypes;
|
||||
},
|
||||
async listPropertyKeys(input: KloEventPropertyDiscoveryInput) {
|
||||
expect(input).toEqual({
|
||||
connectionId: 'product',
|
||||
table: { catalog: '157881', db: null, name: 'events' },
|
||||
jsonColumn: 'properties',
|
||||
sampleSize: 1000,
|
||||
limit: 5,
|
||||
lookbackDays: 7,
|
||||
});
|
||||
return propertyKeys;
|
||||
},
|
||||
async listPropertyValues(input: KloEventPropertyValuesInput) {
|
||||
expect(input).toEqual({
|
||||
connectionId: 'product',
|
||||
table: { catalog: '157881', db: null, name: 'events' },
|
||||
jsonColumn: 'properties',
|
||||
propertyKey: '$browser',
|
||||
limit: 3,
|
||||
maxCardinality: 1000,
|
||||
lookbackDays: 30,
|
||||
});
|
||||
return propertyValues;
|
||||
},
|
||||
};
|
||||
|
||||
const connector: KloScanConnector = {
|
||||
id: 'posthog:product',
|
||||
driver: 'posthog',
|
||||
capabilities: createKloConnectorCapabilities({ eventStreamDiscovery: true }),
|
||||
eventStreamDiscovery: discovery,
|
||||
async introspect() {
|
||||
return {
|
||||
connectionId: 'product',
|
||||
driver: 'posthog',
|
||||
extractedAt: '2026-04-29T00:00:00.000Z',
|
||||
scope: { catalogs: ['157881'] },
|
||||
metadata: {},
|
||||
tables: [],
|
||||
};
|
||||
},
|
||||
};
|
||||
|
||||
await expect(
|
||||
connector.eventStreamDiscovery?.listEventTypes(
|
||||
{
|
||||
connectionId: 'product',
|
||||
table: { catalog: '157881', db: null, name: 'events' },
|
||||
eventColumn: 'event',
|
||||
limit: 2,
|
||||
minCount: 30,
|
||||
lookbackDays: 14,
|
||||
},
|
||||
{ runId: 'scan-run-1' },
|
||||
),
|
||||
).resolves.toEqual([{ value: '$pageview', count: 42 }]);
|
||||
await expect(
|
||||
connector.eventStreamDiscovery?.listPropertyKeys(
|
||||
{
|
||||
connectionId: 'product',
|
||||
table: { catalog: '157881', db: null, name: 'events' },
|
||||
jsonColumn: 'properties',
|
||||
sampleSize: 1000,
|
||||
limit: 5,
|
||||
lookbackDays: 7,
|
||||
},
|
||||
{ runId: 'scan-run-1' },
|
||||
),
|
||||
).resolves.toEqual([{ key: '$browser', count: 31 }]);
|
||||
await expect(
|
||||
connector.eventStreamDiscovery?.listPropertyValues(
|
||||
{
|
||||
connectionId: 'product',
|
||||
table: { catalog: '157881', db: null, name: 'events' },
|
||||
jsonColumn: 'properties',
|
||||
propertyKey: '$browser',
|
||||
limit: 3,
|
||||
maxCardinality: 1000,
|
||||
lookbackDays: 30,
|
||||
},
|
||||
{ runId: 'scan-run-1' },
|
||||
),
|
||||
).resolves.toEqual({ values: ['Chrome', 'Safari'], cardinality: 2 });
|
||||
});
|
||||
|
||||
// A KloQueryResult carries only tabular data (headers, optional header types,
// rows and row counters); this case pins that exact shape.
it('keeps read-only query results separate from schema snapshots', () => {
  const headers = ['id', 'amount'];
  const headerTypes = ['integer', 'numeric'];
  const rows = [[1, 10.5]];
  const queryResult: KloQueryResult = { headers, headerTypes, rows, totalRows: 1, rowCount: 1 };

  expect(queryResult).toEqual({
    headers: ['id', 'amount'],
    headerTypes: ['integer', 'numeric'],
    rows: [[1, 10.5]],
    totalRows: 1,
    rowCount: 1,
  });
});
|
||||
|
||||
// Exercises KloNetworkTunnelPort with an in-test stub: the stub checks the request
// it receives and hands back a fixed endpoint, which the caller must get by identity.
it('models host-provided network tunnel endpoint resolution without app imports', async () => {
  type ProxyConnection = { networkProxy?: { type: 'ssh_tunnel' } };

  const localEndpoint: KloNetworkEndpoint = {
    host: '127.0.0.1',
    port: 15432,
    async close() {
      return undefined;
    },
  };

  const stubPort: KloNetworkTunnelPort<ProxyConnection> = {
    resolveEndpoint: async (request) => {
      expect(request).toEqual({
        connectionId: 'warehouse',
        driver: 'postgres',
        host: 'db.internal',
        port: 5432,
        connection: { networkProxy: { type: 'ssh_tunnel' } },
      });
      return localEndpoint;
    },
  };

  const pendingEndpoint = stubPort.resolveEndpoint({
    connectionId: 'warehouse',
    driver: 'postgres',
    host: 'db.internal',
    port: 5432,
    connection: { networkProxy: { type: 'ssh_tunnel' } },
  });

  await expect(pendingEndpoint).resolves.toBe(localEndpoint);
});
|
||||
});
|
||||
391
packages/context/src/scan/types.ts
Normal file
391
packages/context/src/scan/types.ts
Normal file
|
|
@ -0,0 +1,391 @@
|
|||
/**
 * Identifier of the backend behind a connection. It is the type of the `driver`
 * field on scan inputs, schema snapshots, connectors, tunnel requests and reports.
 *
 * NOTE(review): both 'postgres' and 'postgresql' are accepted — presumably aliases
 * for the same backend; confirm before normalizing either away.
 */
export type KloConnectionDriver =
  | 'sqlite'
  | 'postgres' | 'postgresql'
  | 'sqlserver'
  | 'bigquery'
  | 'snowflake'
  | 'posthog'
  | 'mysql'
  | 'clickhouse';
|
||||
|
||||
/** Scan mode, requested via `KloScanInput.mode` and recorded on `KloScanReport.mode`. */
export type KloScanMode =
  | 'structural'
  | 'relationships'
  | 'enriched';
|
||||
|
||||
/** Origin of a scan run, recorded on `KloScanReport.trigger`. */
export type KloScanTrigger =
  | 'cli'
  | 'mcp'
  | 'schema_scan'
  | 'scheduled'
  | 'manual';
|
||||
|
||||
/**
 * Capability flags a connector advertises through `KloScanConnector.capabilities`.
 *
 * `structuralIntrospection` is typed as the literal `true`, so it can never be
 * switched off; every other flag is a plain boolean.
 */
export interface KloConnectorCapabilities {
  /** Always `true` (literal type). */
  structuralIntrospection: true;
  tableSampling: boolean;
  columnSampling: boolean;
  columnStats: boolean;
  readOnlySql: boolean;
  nestedAnalysis: boolean;
  eventStreamDiscovery: boolean;
  formalForeignKeys: boolean;
  estimatedRowCounts: boolean;
}
|
||||
|
||||
/**
 * Every capability flag except `structuralIntrospection`, each made optional.
 * This is the parameter type of `createKloConnectorCapabilities`.
 */
export type KloOptionalConnectorCapabilities = Partial<
  Omit<KloConnectorCapabilities, 'structuralIntrospection'>
>;
|
||||
|
||||
/**
 * Builds a complete capability set from a partial one.
 *
 * `structuralIntrospection` is always `true`; every other flag takes the supplied
 * value, falling back to `false` when the value is `null` or `undefined`.
 *
 * @param capabilities - Optional flags to enable; defaults to an empty object.
 * @returns A fully populated `KloConnectorCapabilities` object.
 */
export function createKloConnectorCapabilities(
  capabilities: KloOptionalConnectorCapabilities = {},
): KloConnectorCapabilities {
  // `??` (not `||`) so only null/undefined fall back to the default.
  const orOff = (flag: boolean | undefined): boolean => flag ?? false;

  return {
    structuralIntrospection: true,
    tableSampling: orOff(capabilities.tableSampling),
    columnSampling: orOff(capabilities.columnSampling),
    columnStats: orOff(capabilities.columnStats),
    readOnlySql: orOff(capabilities.readOnlySql),
    nestedAnalysis: orOff(capabilities.nestedAnalysis),
    eventStreamDiscovery: orOff(capabilities.eventStreamDiscovery),
    formalForeignKeys: orOff(capabilities.formalForeignKeys),
    estimatedRowCounts: orOff(capabilities.estimatedRowCounts),
  };
}
|
||||
|
||||
/**
 * Optional lists of catalog, schema and dataset names. Supplied on
 * `KloScanInput.scope` and recorded on `KloSchemaSnapshot.scope`.
 *
 * NOTE(review): presumably limits what a connector introspects — confirm against
 * the connector implementations.
 */
export interface KloSchemaScope {
  catalogs?: string[];
  schemas?: string[];
  datasets?: string[];
}
|
||||
|
||||
/** Kind of relation stored in `KloSchemaTable.kind`. */
export type KloSchemaTableKind =
  | 'table'
  | 'view'
  | 'external'
  | 'event_stream';
|
||||
|
||||
/** Coarse value category stored in `KloSchemaColumn.dimensionType`. */
export type KloSchemaDimensionType =
  | 'time'
  | 'string'
  | 'number'
  | 'boolean';
|
||||
|
||||
/** One column entry of a `KloSchemaTable` (see `KloSchemaTable.columns`). */
export interface KloSchemaColumn {
  name: string;
  /** NOTE(review): presumably the type name as reported by the source system — confirm. */
  nativeType: string;
  /** NOTE(review): presumably a driver-independent type name — confirm. */
  normalizedType: string;
  dimensionType: KloSchemaDimensionType;
  nullable: boolean;
  primaryKey: boolean;
  /** Column comment, or `null` when there is none. */
  comment: string | null;
}
|
||||
|
||||
/**
 * One foreign-key entry of a `KloSchemaTable` (see `KloSchemaTable.foreignKeys`):
 * a column of the owning table pointing at a column of a target table.
 */
export interface KloSchemaForeignKey {
  fromColumn: string;
  /** Target catalog; nullable. */
  toCatalog: string | null;
  /** Target db; nullable. */
  toDb: string | null;
  toTable: string;
  toColumn: string;
  /** Constraint name; nullable. */
  constraintName: string | null;
}
|
||||
|
||||
/**
 * One table entry of a `KloSchemaSnapshot` (see `KloSchemaSnapshot.tables`),
 * including its columns and foreign keys.
 */
export interface KloSchemaTable {
  /** Nullable. */
  catalog: string | null;
  /** Nullable. */
  db: string | null;
  name: string;
  kind: KloSchemaTableKind;
  /** Table comment, or `null` when there is none. */
  comment: string | null;
  /** Estimated row count; nullable. */
  estimatedRows: number | null;
  columns: KloSchemaColumn[];
  foreignKeys: KloSchemaForeignKey[];
}
|
||||
|
||||
/** Result of `KloScanConnector.introspect`: the tables found for one connection. */
export interface KloSchemaSnapshot {
  connectionId: string;
  driver: KloConnectionDriver;
  /** Timestamp string (the spec in this commit uses ISO-8601, e.g. '2026-04-29T00:00:00.000Z'). */
  extractedAt: string;
  scope: KloSchemaScope;
  tables: KloSchemaTable[];
  /** Free-form key/value bag; contents are not constrained by this type. */
  metadata: Record<string, unknown>;
}
|
||||
|
||||
/** `KloCredentialEnvelope` variant tagged `kind: 'env'`; carries only a name. */
export interface KloCredentialEnvReference {
  kind: 'env';
  /** NOTE(review): presumably an environment variable name — confirm. */
  name: string;
}
|
||||
|
||||
/** `KloCredentialEnvelope` variant tagged `kind: 'file'`; carries only a path. */
export interface KloCredentialFileReference {
  kind: 'file';
  path: string;
}
|
||||
|
||||
/**
 * `KloCredentialEnvelope` variant tagged `kind: 'resolved'`: unlike the env/file
 * variants it carries a `values` bag rather than a reference.
 */
export interface KloResolvedCredentialEnvelope {
  kind: 'resolved';
  source: 'standalone' | 'host';
  /** Arbitrary key/value pairs; may be nested. */
  values: Record<string, unknown>;
  /** Optional flag (the credentials spec in this commit expects `true` after redaction). */
  redacted?: boolean;
}
|
||||
|
||||
/**
 * Credentials attached to a scan (`KloScanInput.credentials`). A union
 * discriminated by `kind`: 'env', 'file' or 'resolved'.
 */
export type KloCredentialEnvelope =
  | KloCredentialEnvReference
  | KloCredentialFileReference
  | KloResolvedCredentialEnvelope;
|
||||
|
||||
/** Host/port pair returned by `KloNetworkTunnelPort.resolveEndpoint`. */
export interface KloNetworkEndpoint {
  host: string;
  port: number;
  /**
   * Optional async teardown hook.
   * NOTE(review): presumably releases the underlying tunnel — confirm with the host implementation.
   */
  close?: () => Promise<void>;
}
|
||||
|
||||
/**
 * Argument of `KloNetworkTunnelPort.resolveEndpoint`.
 *
 * @typeParam TConnection - Shape of the `connection` payload; defaults to
 *   `Record<string, unknown>`.
 */
export interface KloNetworkTunnelRequest<TConnection = Record<string, unknown>> {
  connectionId: string;
  driver: KloConnectionDriver;
  host: string;
  port: number;
  connection: TConnection;
}
|
||||
|
||||
/**
 * Port for turning a tunnel request into a concrete endpoint.
 *
 * @typeParam TConnection - Forwarded to `KloNetworkTunnelRequest`.
 */
export interface KloNetworkTunnelPort<TConnection = Record<string, unknown>> {
  /**
   * Resolves to an endpoint, or to `null`.
   * NOTE(review): `null` presumably means "no tunnel needed, connect directly" — confirm.
   */
  resolveEndpoint(
    input: KloNetworkTunnelRequest<TConnection>,
  ): Promise<KloNetworkEndpoint | null>;
}
|
||||
|
||||
/**
 * First argument of `KloScanConnector.introspect`. Only `connectionId` and
 * `driver` are required.
 */
export interface KloScanInput {
  connectionId: string;
  driver: KloConnectionDriver;
  scope?: KloSchemaScope;
  mode?: KloScanMode;
  dryRun?: boolean;
  detectRelationships?: boolean;
  credentials?: KloCredentialEnvelope;
  /** Free-form key/value bag; contents are not constrained by this type. */
  metadata?: Record<string, unknown>;
}
|
||||
|
||||
/** Optional third argument of `KloProgressPort.update`. */
export interface KloProgressUpdateOptions {
  /** NOTE(review): presumably marks an update that need not be persisted — confirm with the reporter. */
  transient?: boolean;
}
|
||||
|
||||
/** Progress reporter made available to scans through `KloScanContext.progress`. */
export interface KloProgressPort {
  /**
   * Reports a numeric progress value with an optional message and options.
   * NOTE(review): the scale of `progress` (0–1 vs 0–100) is not defined here — confirm.
   */
  update(
    progress: number,
    message?: string,
    options?: KloProgressUpdateOptions,
  ): Promise<void>;

  /**
   * Returns another `KloProgressPort` for the given weight.
   * NOTE(review): presumably a child reporter covering a weighted share of the parent — confirm.
   */
  startPhase(weight: number): KloProgressPort;
}
|
||||
|
||||
/**
 * Logger made available to scans through `KloScanContext.logger`. All four levels
 * share one signature: a message plus an optional metadata bag.
 */
export interface KloScanLoggerPort {
  debug(message: string, metadata?: Record<string, unknown>): void;
  info(message: string, metadata?: Record<string, unknown>): void;
  warn(message: string, metadata?: Record<string, unknown>): void;
  error(message: string, metadata?: Record<string, unknown>): void;
}
|
||||
|
||||
/**
 * Per-run context passed as the second argument to connector and discovery
 * methods. Only `runId` is required.
 */
export interface KloScanContext {
  runId: string;
  /** Optional abort signal. NOTE(review): presumably used to cancel in-flight work — confirm connectors honour it. */
  signal?: AbortSignal;
  progress?: KloProgressPort;
  logger?: KloScanLoggerPort;
}
|
||||
|
||||
/**
 * Reference to a table by catalog, db and name. It is the `table` field of the
 * sampling, statistics and event-discovery inputs.
 */
export interface KloTableRef {
  /** Nullable. */
  catalog: string | null;
  /** Nullable. */
  db: string | null;
  name: string;
}
|
||||
|
||||
/** Argument of `KloScanConnector.sampleTable`. */
export interface KloTableSampleInput {
  connectionId: string;
  table: KloTableRef;
  /** Optional column names. NOTE(review): presumably "all columns" when omitted — confirm. */
  columns?: string[];
  limit: number;
}
|
||||
|
||||
/** Result of `KloScanConnector.sampleTable`: header names plus rows of untyped cells. */
export interface KloTableSampleResult {
  headers: string[];
  rows: unknown[][];
  totalRows: number;
}
|
||||
|
||||
/** Argument of `KloScanConnector.sampleColumn`. */
export interface KloColumnSampleInput {
  connectionId: string;
  table: KloTableRef;
  column: string;
  limit: number;
}
|
||||
|
||||
/** Result of `KloScanConnector.sampleColumn`. */
export interface KloColumnSampleResult {
  values: unknown[];
  /** Nullable. */
  nullCount: number | null;
  /** Nullable. */
  distinctCount: number | null;
}
|
||||
|
||||
/** Argument of `KloScanConnector.columnStats`. */
export interface KloColumnStatsInput {
  connectionId: string;
  table: KloTableRef;
  column: string;
}
|
||||
|
||||
/** Non-null result of `KloScanConnector.columnStats`. */
export interface KloColumnStatsResult {
  /** Untyped. */
  min: unknown;
  /** Untyped. */
  max: unknown;
  /** Nullable. */
  average: number | null;
  /** Nullable. */
  nullCount: number | null;
  /** Nullable. */
  distinctCount: number | null;
}
|
||||
|
||||
/** Argument of `KloEventStreamDiscoveryPort.listEventTypes`. */
export interface KloEventTypeDiscoveryInput {
  connectionId: string;
  table: KloTableRef;
  /** Name of the column holding the event type. */
  eventColumn: string;
  limit: number;
  /** Optional. NOTE(review): presumably a minimum occurrence threshold — confirm. */
  minCount?: number;
  /** Optional. NOTE(review): presumably a look-back window in days — confirm. */
  lookbackDays?: number;
}
|
||||
|
||||
/** One element of the array returned by `KloEventStreamDiscoveryPort.listEventTypes`. */
export interface KloEventTypeDiscovery {
  value: string;
  count: number;
}
|
||||
|
||||
/** Argument of `KloEventStreamDiscoveryPort.listPropertyKeys`. */
export interface KloEventPropertyDiscoveryInput {
  connectionId: string;
  table: KloTableRef;
  /** Name of the JSON column to inspect. */
  jsonColumn: string;
  sampleSize: number;
  limit: number;
  /** Optional. NOTE(review): presumably a look-back window in days — confirm. */
  lookbackDays?: number;
}
|
||||
|
||||
/** One element of the array returned by `KloEventStreamDiscoveryPort.listPropertyKeys`. */
export interface KloEventPropertyDiscovery {
  key: string;
  count: number;
}
|
||||
|
||||
/** Argument of `KloEventStreamDiscoveryPort.listPropertyValues`. */
export interface KloEventPropertyValuesInput {
  connectionId: string;
  table: KloTableRef;
  /** Name of the JSON column to inspect. */
  jsonColumn: string;
  propertyKey: string;
  limit: number;
  /** Optional. NOTE(review): presumably an upper bound on distinct values — confirm. */
  maxCardinality?: number;
  /** Optional. NOTE(review): presumably a look-back window in days — confirm. */
  lookbackDays?: number;
}
|
||||
|
||||
/** Non-null result of `KloEventStreamDiscoveryPort.listPropertyValues`. */
export interface KloEventPropertyValuesResult {
  values: string[];
  cardinality: number;
}
|
||||
|
||||
/**
 * Optional discovery surface a connector may expose through
 * `KloScanConnector.eventStreamDiscovery`. Every method takes its input plus the
 * scan context and is asynchronous.
 */
export interface KloEventStreamDiscoveryPort {
  /** Lists event types with their counts. */
  listEventTypes(
    input: KloEventTypeDiscoveryInput,
    ctx: KloScanContext,
  ): Promise<KloEventTypeDiscovery[]>;

  /** Lists property keys with their counts. */
  listPropertyKeys(
    input: KloEventPropertyDiscoveryInput,
    ctx: KloScanContext,
  ): Promise<KloEventPropertyDiscovery[]>;

  /**
   * Lists values of one property key; may resolve to `null`.
   * NOTE(review): when `null` is returned is not defined here — confirm with implementations.
   */
  listPropertyValues(
    input: KloEventPropertyValuesInput,
    ctx: KloScanContext,
  ): Promise<KloEventPropertyValuesResult | null>;
}
|
||||
|
||||
/** Argument of `KloScanConnector.executeReadOnly`. */
export interface KloReadOnlyQueryInput {
  connectionId: string;
  sql: string;
  /** Optional row cap. */
  maxRows?: number;
}
|
||||
|
||||
/** Result of `KloScanConnector.executeReadOnly`: header names plus rows of untyped cells. */
export interface KloQueryResult {
  headers: string[];
  /** Optional type names. NOTE(review): presumably parallel to `headers` — confirm. */
  headerTypes?: string[];
  rows: unknown[][];
  totalRows: number;
  /** Nullable. NOTE(review): how this differs from `totalRows` is not defined here — confirm. */
  rowCount: number | null;
}
|
||||
|
||||
/**
 * Contract a data-source connector implements. `introspect` is the only required
 * operation; the sampling, statistics, read-only query, event-stream discovery and
 * cleanup members are all optional.
 *
 * NOTE(review): presumably each optional member should match the corresponding
 * flag in `capabilities` — nothing in this type enforces that; confirm with callers.
 */
export interface KloScanConnector {
  id: string;
  driver: KloConnectionDriver;
  capabilities: KloConnectorCapabilities;
  eventStreamDiscovery?: KloEventStreamDiscoveryPort;

  /** Produces a schema snapshot for the given scan input. */
  introspect(input: KloScanInput, ctx: KloScanContext): Promise<KloSchemaSnapshot>;

  sampleColumn?(
    input: KloColumnSampleInput,
    ctx: KloScanContext,
  ): Promise<KloColumnSampleResult>;

  sampleTable?(
    input: KloTableSampleInput,
    ctx: KloScanContext,
  ): Promise<KloTableSampleResult>;

  /** May resolve to `null`. */
  columnStats?(
    input: KloColumnStatsInput,
    ctx: KloScanContext,
  ): Promise<KloColumnStatsResult | null>;

  executeReadOnly?(
    input: KloReadOnlyQueryInput,
    ctx: KloScanContext,
  ): Promise<KloQueryResult>;

  cleanup?(): Promise<void>;
}
|
||||
|
||||
/** Port for turning batches of text into numeric vectors. */
export interface KloEmbeddingPort {
  dimensions: number;
  maxBatchSize: number;
  /**
   * Embeds a batch of texts.
   * NOTE(review): presumably returns one vector of `dimensions` numbers per input
   * text, and callers keep batches within `maxBatchSize` — confirm.
   */
  embedBatch(texts: string[]): Promise<number[][]>;
}
|
||||
|
||||
/**
 * Created/updated/deleted counters for tables and columns, reported on
 * `KloScanReport.structuralSyncStats`.
 */
export interface KloStructuralSyncStats {
  tablesCreated: number;
  tablesUpdated: number;
  tablesDeleted: number;
  columnsCreated: number;
  columnsUpdated: number;
  columnsDeleted: number;
}
|
||||
|
||||
/**
 * Added/modified/deleted counters for tables and columns, plus unchanged tables.
 * Reported on `KloScanReport.diffSummary`.
 */
export interface KloScanDiffSummary {
  tablesAdded: number;
  tablesModified: number;
  tablesDeleted: number;
  tablesUnchanged: number;
  columnsAdded: number;
  columnsModified: number;
  columnsDeleted: number;
}
|
||||
|
||||
/** Artifact locations of a scan, reported on `KloScanReport.artifactPaths`. */
export interface KloScanArtifactPaths {
  /** Nullable. */
  rawSourcesDir: string | null;
  /** Nullable. */
  reportPath: string | null;
  manifestShards: string[];
  enrichmentArtifacts: string[];
}
|
||||
|
||||
/** Machine-readable code carried by `KloScanWarning.code`. */
export type KloScanWarningCode =
  // connector / sampling
  | 'connector_capability_missing'
  | 'sampling_failed'
  | 'statistics_failed'
  // backends
  | 'llm_unavailable'
  | 'embedding_unavailable'
  | 'scan_enrichment_backend_not_configured'
  // relationships
  | 'relationship_validation_failed'
  | 'relationship_llm_invalid_reference'
  | 'relationship_llm_proposal_failed'
  // other
  | 'credential_redacted'
  | 'enrichment_failed';
|
||||
|
||||
/** One warning entry of a scan report (see `KloScanReport.warnings`). */
export interface KloScanWarning {
  code: KloScanWarningCode;
  message: string;
  /** Optional table name the warning relates to. */
  table?: string;
  /** Optional column name the warning relates to. */
  column?: string;
  recoverable: boolean;
  /** Optional free-form key/value bag. */
  metadata?: Record<string, unknown>;
}
|
||||
|
||||
/**
 * Outcome of each enrichment step, reported on `KloScanReport.enrichment`.
 * Every step is one of 'skipped', 'completed' or 'failed'.
 */
export interface KloScanEnrichmentSummary {
  dataDictionary: 'skipped' | 'completed' | 'failed';
  tableDescriptions: 'skipped' | 'completed' | 'failed';
  columnDescriptions: 'skipped' | 'completed' | 'failed';
  embeddings: 'skipped' | 'completed' | 'failed';
  deterministicRelationships: 'skipped' | 'completed' | 'failed';
  llmRelationshipValidation: 'skipped' | 'completed' | 'failed';
  statisticalValidation: 'skipped' | 'completed' | 'failed';
}
|
||||
|
||||
/** Relationship counters by outcome, reported on `KloScanReport.relationships`. */
export interface KloScanRelationshipSummary {
  accepted: number;
  review: number;
  rejected: number;
  skipped: number;
}
|
||||
|
||||
/** Enrichment stage name, listed in the arrays of `KloScanEnrichmentStateSummary`. */
export type KloScanEnrichmentStage =
  | 'descriptions'
  | 'embeddings'
  | 'relationships';
|
||||
|
||||
/**
 * Enrichment stages grouped into resumed, completed and failed lists. Reported on
 * `KloScanReport.enrichmentState`.
 */
export interface KloScanEnrichmentStateSummary {
  resumedStages: KloScanEnrichmentStage[];
  completedStages: KloScanEnrichmentStage[];
  failedStages: KloScanEnrichmentStage[];
}
|
||||
|
||||
/**
 * Report of one scan run: identifiers, how it ran, what changed, and the
 * warnings raised. Every field is required.
 */
export interface KloScanReport {
  // identity
  connectionId: string;
  driver: KloConnectionDriver;
  syncId: string;
  runId: string;

  // how the scan ran
  trigger: KloScanTrigger;
  mode: KloScanMode;
  dryRun: boolean;

  // outputs
  artifactPaths: KloScanArtifactPaths;
  diffSummary: KloScanDiffSummary;
  manifestShardsWritten: number;
  structuralSyncStats: KloStructuralSyncStats;
  enrichment: KloScanEnrichmentSummary;

  /**
   * Names of capability flags other than `structuralIntrospection`.
   * NOTE(review): presumably the capabilities the connector lacked — confirm.
   */
  capabilityGaps: Array<keyof Omit<KloConnectorCapabilities, 'structuralIntrospection'>>;
  warnings: KloScanWarning[];
  relationships: KloScanRelationshipSummary;
  enrichmentState: KloScanEnrichmentStateSummary;

  /** Timestamp string. NOTE(review): format not defined here — presumably ISO-8601; confirm. */
  createdAt: string;
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue