mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-13 08:15:14 +02:00
fix(context): merge overlay columns onto manifest columns by name (#94)
* fix(context): merge overlay columns onto manifest columns by name composeOverlay was appending overlay columns to the manifest column list, producing duplicate entries when dbt/metabase overlays declared a column just to attach descriptions. The duplicates carried no `type`, so the pydantic SourceDefinition rejected them at semantic-query time and broke `ktx sl query` for every overlay-backed measure. Now overlay columns match base columns by name (case-insensitive): same-name entries merge onto the manifest (overlay fields win, type/role fall back to the base, descriptions merge per source key) and only new names append. * refactor(sl): split overlay columns from column_overrides and enforce TS/Python wire contract Overlay sources now have two distinct collections: `columns:` for computed columns (requiring `expr` + `type`) and `column_overrides:` for metadata patches to inherited manifest columns. Composing or loading an overlay that mixes the two — or references an unknown column — fails with a typed error. Introduce `ResolvedSemanticLayerSource` / `resolvedSourceSchema` / `toResolvedWire` as the strict shape sent to the Python engine, and add a schema contract test that diffs Zod against the Pydantic JSON schema dumped by `python -m semantic_layer dump-schema`. `SourceDefinition` is now `extra="forbid"` on the Python side. `loadAllSources` surfaces per-file load errors instead of swallowing them, so validation/query paths can report manifest shard parse failures. * fix(context): make scan description generation resilient and quiet A transient sampleTable failure during ingest used to take out every table in a connection: generateTableDescription returned a hardcoded 'Table not found' string into descriptions.ai, and KtxDescriptionGenerator was constructed without a logger, so the failure left no trail anywhere. - sampleTable / sampleColumn calls retry 3x with 200/400/800ms backoff, honouring KtxScanContext.signal via a new KtxAbortedError. 
- On retry exhaustion or missing capability, table generation falls back to a metadata-only prompt built from column name / native type / comment / rawDescriptions. The column path follows the same rule -- call the LLM when any of samples or rawDescriptions are available; skip only when both are absent. - Logger is now threaded from KtxScanContext into the generator. Failures emit structured KtxScanWarning entries (new description_fallback_used code, plus existing sampling_failed / enrichment_failed / connector_capability_missing). ktx scan groups warnings by code so a batch of identical failures collapses to one summary line plus sample. - Returns null on failure instead of the 'Table not found' sentinel; the manifest writer's existing guard already skips empty descriptions, so schema YAML no longer carries misleading text. SCAN_MANAGED_DESCRIPTION_KEYS already strips stale 'ai' on merge, so existing YAML clears on next run. Also suppress AI SDK v6 'system in messages' warning: pull system messages out of KtxMessageBuilder.wrapSimple's output via a new splitKtxSystemMessages helper and pass them top-level to generateText (preserves cacheControl providerOptions on the SystemModelMessage). Agent-runner's local splitSystemPromptMessages dedupes onto the shared helper. * test(docs): align examples-docs assertions with revamped docs PR #103 (setup/guide doc revamp) reworded several CLI examples and connection labels; the assertions in scripts/examples-docs.test.mjs still referenced the pre-revamp wording and were failing in CI on main. 
Update the regexes to match the post-revamp content: - drop the `--json` flag from the sl-query example expectation - move the `Driver:` / `Status: ok` probe to the connection reference, which is where that output now lives (driver id is lowercase `postgres`, not the display name `PostgreSQL`) - drop the obsolete `Install \`uv\`...` troubleshooting line - accept `<connectionId>` everywhere; the docs no longer use the hyphenated `<connection-id>` form - match the `warehouse` connection id used in the quickstart instead of the `postgres-warehouse` id only used in the README and setup ref * fix(sl): skip TS/Python schema contract test when uv is unavailable The TypeScript checks CI job does not install uv or Python, so the module-level `execFileSync('uv', ...)` in schemas.contract.test.ts threw ENOENT and failed the suite. Wrap the schema dump in a try/catch and guard the describe block with `describe.skipIf` so the test skips in environments without uv. Local dev and any CI job that has uv on PATH still runs the cross-language contract assertion.
This commit is contained in:
parent
6bc8d200ea
commit
cb8902f1e5
56 changed files with 1650 additions and 237 deletions
|
|
@ -133,6 +133,50 @@ function warningLine(warning: KtxScanWarning): string {
|
|||
return `${warning.code}: ${location}${warning.message}`;
|
||||
}
|
||||
|
||||
/**
 * Buckets scan warnings by their `code` field.
 *
 * The returned map iterates codes in the order each code first appears in
 * `warnings`, and every bucket keeps its warnings in input order. The input
 * array itself is never mutated.
 *
 * @param warnings - Warnings to group.
 * @returns A map from warning code to the warnings that carry that code.
 */
function groupWarningsByCode(warnings: readonly KtxScanWarning[]): Map<string, KtxScanWarning[]> {
  return warnings.reduce((byCode, entry) => {
    const bucket = byCode.get(entry.code);
    if (bucket === undefined) {
      // First warning seen for this code: start a new bucket.
      byCode.set(entry.code, [entry]);
    } else {
      bucket.push(entry);
    }
    return byCode;
  }, new Map<string, KtxScanWarning[]>());
}
|
||||
|
||||
/**
 * Builds the one-line, human-readable summary for a group of scan warnings
 * that share the same warning code.
 *
 * Nouns are inflected through the `plural` helper defined elsewhere in this
 * file (presumably it returns the singular or plural form of the word for
 * `count` — confirm against its definition). Provider/backend availability
 * codes produce a fixed sentence that does not mention `count`.
 *
 * @param code - Warning code shared by every warning in the group.
 * @param count - Number of warnings in the group.
 * @returns The summary sentence; unrecognised codes fall back to a generic
 *   `"<count> warning(s) (<code>)"` form.
 */
function describeWarningGroup(code: string, count: number): string {
  switch (code) {
    case 'sampling_failed':
      return `${count} ${plural(count, 'table')} could not be sampled (retries exhausted); descriptions used metadata-only fallback or were skipped.`;
    case 'description_fallback_used':
      return `${count} ${plural(count, 'table')} got an AI description from column metadata only (no sample rows available).`;
    case 'enrichment_failed':
      return `${count} ${plural(count, 'table/column')} could not be enriched.`;
    case 'connector_capability_missing':
      return `${count} ${plural(count, 'table')} affected by missing connector capability.`;
    case 'statistics_failed':
      return `${count} statistics ${plural(count, 'lookup')} failed.`;
    case 'llm_unavailable':
      return 'LLM provider unavailable; AI enrichment was skipped.';
    case 'embedding_unavailable':
      return 'Embedding provider unavailable; embeddings were skipped.';
    case 'relationship_validation_failed':
      return `${count} relationship ${plural(count, 'validation')} could not run.`;
    case 'relationship_llm_invalid_reference':
      return `${count} LLM-proposed ${plural(count, 'relationship')} referenced unknown columns.`;
    case 'relationship_llm_proposal_failed':
      return `${count} LLM relationship ${plural(count, 'proposal')} failed.`;
    case 'scan_enrichment_backend_not_configured':
      return 'Scan enrichment backend is not configured; AI stages were skipped.';
    case 'credential_redacted': {
      // Keep the verb in agreement with the noun: "1 credential was", "2 credentials were".
      const verb = count === 1 ? 'was' : 'were';
      return `${count} ${plural(count, 'credential')} ${verb} redacted from scan output.`;
    }
    default:
      return `${count} ${plural(count, 'warning')} (${code})`;
  }
}
|
||||
|
||||
function managedDaemonOptionsForScanRun(args: Extract<KtxScanArgs, { command: 'run' }>, io: KtxCliIo) {
|
||||
if (args.databaseIntrospectionUrl || !args.cliVersion || !args.runtimeInstallPolicy) {
|
||||
return undefined;
|
||||
|
|
@ -153,11 +197,26 @@ function writeNeedsAttention(report: KtxScanReport, io: KtxCliIo): void {
|
|||
}
|
||||
if (report.warnings.length > 0) {
|
||||
io.stdout.write(` ${report.warnings.length} ${plural(report.warnings.length, 'warning')}\n`);
|
||||
for (const warning of report.warnings.slice(0, 5)) {
|
||||
io.stdout.write(` - ${warningLine(warning)}\n`);
|
||||
}
|
||||
if (report.warnings.length > 5) {
|
||||
io.stdout.write(` - ${report.warnings.length - 5} more warnings in the JSON report\n`);
|
||||
const groups = groupWarningsByCode(report.warnings);
|
||||
for (const [code, warnings] of groups) {
|
||||
io.stdout.write(` - ${describeWarningGroup(code, warnings.length)}\n`);
|
||||
const first = warnings[0];
|
||||
if (first) {
|
||||
io.stdout.write(` ${warningLine(first)}\n`);
|
||||
}
|
||||
if (warnings.length > 1) {
|
||||
const moreTables = warnings
|
||||
.slice(1)
|
||||
.map((warning) =>
|
||||
warning.table ? (warning.column ? `${warning.table}.${warning.column}` : warning.table) : null,
|
||||
)
|
||||
.filter((value): value is string => value !== null)
|
||||
.slice(0, 3);
|
||||
if (moreTables.length > 0) {
|
||||
const suffix = warnings.length - 1 > moreTables.length ? `, …` : '';
|
||||
io.stdout.write(` also: ${moreTables.join(', ')}${suffix}\n`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (report.capabilityGaps.length > 0) {
|
||||
|
|
|
|||
|
|
@ -213,7 +213,11 @@ export async function runKtxSl(args: KtxSlArgs, io: KtxSlIo = process, deps: Ktx
|
|||
if (!source) {
|
||||
throw new Error(`Semantic-layer source "${args.connectionId}/${args.sourceName}" was not found`);
|
||||
}
|
||||
const result = await validateLocalSlSource(source.yaml, { project, connectionId: args.connectionId });
|
||||
const result = await validateLocalSlSource(source.yaml, {
|
||||
project,
|
||||
connectionId: args.connectionId,
|
||||
sourceName: args.sourceName,
|
||||
});
|
||||
if (!result.valid) {
|
||||
for (const error of result.errors) {
|
||||
io.stderr.write(`${error}\n`);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue