ktx/packages/context/src/sl/tools/sl-write-source.tool.ts

410 lines
16 KiB
TypeScript
Raw Normal View History

2026-05-10 23:12:26 +02:00
import YAML from 'yaml';
import { z } from 'zod';
feat(ingest): default local ingest to isolated diffs (#128) * docs: add isolated-diff ingestion design * Refine isolated-diff ingestion design after adversarial review iteration 1 * Refine isolated-diff ingestion design after adversarial review iteration 2 * Refine isolated-diff ingestion design after adversarial review iteration 3 * feat: persist ingest trace events * feat: add isolated ingest patch helpers * feat: validate wiki body semantic references * feat: add final ingest artifact gates * feat: execute ingest work units in child worktrees * feat: integrate isolated work unit patches * feat: route selected ingest sources through isolated diffs * test: cover isolated diff ingestion regressions * feat: add isolated diff ingestion v1 core * docs: document ingest trace inspection * docs: add isolated diff ingestion v1 core plan * fix(ingest): tighten final artifact gates * fix(ingest): gate isolated final integration tree * fix(ingest): persist postmortem failure traces * fix(ingest): trace policy conflicts and cleanup child worktrees * test(ingest): verify isolated diff postmortem coverage * docs: add isolated diff ingestion gates and trace closure plan * fix(ingest): gate provenance before isolated diff squash * docs: add isolated diff ingestion provenance gate closure plan * fix(ingest): gate final wiki references * fix(ingest): enforce SL target connection scope * fix(ingest): trace isolated SL target policy gates * test(ingest): cover isolated diff reference and target gates * chore(ingest): verify isolated diff gate closure * docs: add isolated diff ingestion reference and target gate closure plan * fix(ingest): gate global wiki references * docs: add isolated diff ingestion global wiki reference gate closure plan * fix(ingest): validate scan sources and wiki refs * test(ingest): cover isolated diff textual conflict resolver * test(ingest): cover isolated diff resolver integration * feat(ingest): repair isolated diff textual conflicts * feat(ingest): report 
isolated diff resolver outcomes * test(ingest): verify isolated diff textual conflict repair * test(ingest): align textual conflict failure coverage * docs: add isolated diff textual conflict resolver plan * test(ingest): cover isolated diff gate repair * feat(ingest): add isolated diff gate repair agent * feat(ingest): repair isolated diff semantic gate failures * feat(ingest): wire isolated diff gate repair * test(ingest): verify isolated diff final gate repair * chore(ingest): verify isolated diff gate repair * docs: add isolated diff gate repair plan * Improve ingest progress updates * feat(ingest): route direct-write connectors through isolated diffs * test(ingest): cover non-metabase isolated diff routing * feat(ingest): project metricflow semantic models before work units * test(ingest): verify metricflow isolated projection path * chore(ingest): verify isolated diff connector migration * docs: add isolated diff connector migration plan * feat(ingest): make isolated diff routing the private default * feat(ingest): promote isolated diff to default runner path * feat(ingest): default local ingest to isolated diffs * chore(ingest): remove isolated diff allowlist references * fix(ingest): preserve transient evidence for isolated work units * docs: add isolated diff default promotion plan * refactor(ingest): remove shared worktree WorkUnit path * docs(ingest): align WorkUnit prompts with isolated diffs * test(ingest): drop unused runner import * docs: add isolated diff shared worktree removal plan * docs: add isolated diff gate repair classification plan * fix: restrict claude-code mcp servers * docs: align ingest trace guidance with public CLI --------- Co-authored-by: Andrey Avtomonov <7889985+andreybavt@users.noreply.github.com>
2026-05-18 13:38:06 +02:00
import {
addTouchedSlSource,
type ToolContext,
type ToolOutput,
validateActionRawPaths,
validateActionTargetConnection,
} from '../../tools/index.js';
2026-05-10 23:12:26 +02:00
import { sourceOverlaySchema } from '../schemas.js';
import type { SemanticLayerService } from '../semantic-layer.service.js';
import type { SemanticLayerSource } from '../types.js';
import {
BaseSemanticLayerTool,
type BaseSemanticLayerToolDeps,
type SemanticLayerStructured,
sourceDefinitionSchema,
} from './base-semantic-layer.tool.js';
2026-05-11 00:31:15 -07:00
import { normalizeSemanticLayerDescriptions } from '../description-normalization.js';
2026-05-10 23:12:26 +02:00
import { slToolConnectionIdSchema } from './connection-id-schema.js';
/** A `source` payload is either a standalone source definition or an overlay. */
const sourceInputSchema = z.union([sourceDefinitionSchema, sourceOverlaySchema]);

/**
 * Input contract for the `sl_write_source` tool.
 *
 * `source` and `delete` are both optional at the schema level; the tool itself
 * decides which one drives the action.
 */
const slWriteSourceInputSchema = z.object({
  // Connection the source belongs to.
  connectionId: slToolConnectionIdSchema.describe('Data source connection ID'),

  // Lowercase letters, digits and underscores only; must not start with an underscore.
  sourceName: z
    .string()
    .regex(/^[a-z0-9][a-z0-9_]*$/, 'Source name must be snake_case (lowercase alphanumeric and underscores)')
    .describe('Name of the source to create, edit, or delete'),

  // Full definition to write; omitted when deleting.
  source: sourceInputSchema
    .optional()
    .describe(
      'Source definition (standalone with table/sql) or overlay (measures, column_overrides, computed columns, etc.)',
    ),

  // When true the source is removed instead of written.
  delete: z.boolean().optional().describe('Set to true to delete this source entirely'),

  // Optional provenance: non-empty raw file paths backing this action.
  rawPaths: z
    .array(z.string().min(1))
    .optional()
    .describe('In ingest sessions, raw source file paths that directly support this SL action.'),
});

/** Parsed shape of the tool input, derived from the schema above. */
type SlWriteSourceInput = z.infer<typeof slWriteSourceInputSchema>;
/**
 * Decides which connection id, if any, to record as the explicit target of a session action.
 *
 * @param runConnectionId - Connection id of the current run/session; may be absent.
 * @param actionConnectionId - Connection id the action was performed against.
 * @returns `actionConnectionId` when the run has a (non-empty) connection id that differs
 *   from it; otherwise `null`.
 */
function actionTargetConnectionId(
  runConnectionId: string | null | undefined,
  actionConnectionId: string,
): string | null {
  // No run-level connection (null, undefined or empty string): nothing to contrast against.
  if (!runConnectionId) {
    return null;
  }

  // The action hit the run's own connection, so no explicit target is needed.
  if (runConnectionId === actionConnectionId) {
    return null;
  }

  return actionConnectionId;
}
export class SlWriteSourceTool extends BaseSemanticLayerTool<typeof slWriteSourceInputSchema> {
readonly name = 'sl_write_source';
/**
 * Passes the shared semantic-layer tool dependencies straight through to the base class.
 *
 * @param deps - Dependency bundle expected by `BaseSemanticLayerTool`.
 */
constructor(deps: BaseSemanticLayerToolDeps) {
super(deps);
}
/**
 * Tool description text: purpose, when to use the tool versus `sl_edit_source`,
 * the fields of a source definition, and the join requirements.
 *
 * The text is returned verbatim; its lines are intentionally flush-left because
 * leading whitespace inside the template literal would become part of the string.
 */
get description(): string {
  const toolDescription = `<purpose>
Create a new semantic layer source or fully rewrite an existing one.
If the source already exists, this tool will overwrite it with the new definition.
</purpose>
<when_to_use>
- First time creating a source definition
- When modeling a new SQL-backed source (e.g., churn risk view, ARR calculation)
- When the user asks to start over / fully rewrite a source
- Consolidating multiple sources into one (write merged definition)
- For targeted edits to existing sources (add/remove measures, update joins), prefer sl_edit_source instead
</when_to_use>
<editing_approach>
- New source: provide \`source\` with full definition
- Full rewrite: provide \`source\` (overwrites existing)
- Targeted edits on an existing source: use sl_edit_source instead
- Delete: set \`delete: true\`
</editing_approach>
<source_definition>
- name: Unique identifier for the source
- table: For physical table/view sources (e.g., "public.orders"). Mutually exclusive with sql.
- sql: For SQL-based sources (the SQL query). Mutually exclusive with table.
- grain: What one row represents (e.g., ["id"], ["customer_id", "product_id"])
- columns: All columns with type (string/number/time/boolean) and optional descriptions. On overlays, columns are computed-only and require expr + type.
- column_overrides: Overlay-only metadata patches for existing manifest columns (descriptions, role, visibility, constraints, enum_values, tests). Do not include type or expr.
- joins: Relationships to other sources (to, on, relationship: many_to_one/one_to_many/one_to_one)
- measures: Pre-defined aggregations (name, expr like "sum(amount)", optional filter, optional segments bare names of segments defined on the same source, optional description)
- segments: Named, reusable boolean predicates scoped to this source (name, expr a SQL boolean over this source's columns, optional description). A measure references one with \`segments: [name]\`; a query references one with the dotted form \`source.segment_name\`. Use when the same predicate appears on 3+ measures — e.g. extract \`is_paid = true and is_refunded = '0'\` as \`segments: [{name: paid_non_refunded, expr: "..."}]\` and have each measure use \`segments: [paid_non_refunded]\` instead of re-typing the predicate inside \`sum(case when ... then x end)\`. Segments are predicates only — they cannot be selected as dimensions or grouped by; if you need to group by the predicate, add a \`columns[]\` entry instead.
</source_definition>
<join_requirements>
Sources with joins: [] are disconnected from the semantic layer join graph and cannot be composed with other sources in semantic queries.
Before writing, use discover_data to check existing sources and their grain columns.
For each grain/key column in your source (e.g., account_id, item_id), find the matching dimension source (e.g., ACCOUNTS, ITEMS) and declare a many_to_one join.
Example: a source graining on [account_id] should declare:
joins:
- to: ACCOUNTS
on: source_name.account_id = ACCOUNTS.ACCOUNT_ID
relationship: many_to_one
The on condition format: local_column = TARGET_SOURCE.target_column (right side must include target source name).
Do NOT join back to a table that the SQL already aggregates from if the grain column is not in the output (the relationship is already baked into the SQL).
</join_requirements>`;
  return toolDescription;
}
/** Zod schema used to validate the input of this tool. */
get inputSchema(): typeof slWriteSourceInputSchema {
  return slWriteSourceInputSchema;
}
/**
 * Entry point of the tool: deletes the named source when `input.delete` is set,
 * otherwise writes `input.source` through `writeFullSource`.
 *
 * Steps, in order:
 * 1. Resolve the author for `context.userId`.
 * 2. Pick the session's semantic-layer service when the session provides one.
 * 3. Validate the target connection and the raw paths against the session;
 *    either failure is returned as an unsuccessful output.
 * 4. Delete, or require `source` and delegate to `writeFullSource`.
 *
 * @param input - Parsed tool input (connection id, source name, optional source / delete flag / raw paths).
 * @param context - Invocation context; when `context.session` exists, the touched source and the
 *   performed action are recorded on it.
 * @returns Tool output describing success or the reasons for failure. Errors thrown inside the
 *   delete branch are converted into an unsuccessful output rather than rethrown.
 */
async call(input: SlWriteSourceInput, context: ToolContext): Promise<ToolOutput<SemanticLayerStructured>> {
  const { connectionId, sourceName } = input;
  const { name: author, email: authorEmail } = await this.authorResolver.resolve(context.userId);
  // Prefer the service attached to the session; fall back to the tool-level one.
  const semanticLayerService = context.session?.semanticLayerService ?? this.semanticLayerService;
  // Worktree-scoped sessions do not refresh the search index from here.
  const skipIndex = context.session?.isWorktreeScoped === true;

  const targetConnectionValidation = validateActionTargetConnection(context.session, connectionId);
  if (!targetConnectionValidation.ok) {
    return this.buildOutput(false, [targetConnectionValidation.error], sourceName);
  }

  const rawPathValidation = validateActionRawPaths(context.session, input.rawPaths);
  if (!rawPathValidation.ok) {
    return this.buildOutput(false, [rawPathValidation.error], sourceName);
  }

  // Handle delete
  if (input.delete) {
    try {
      await semanticLayerService.deleteSource(connectionId, sourceName, author, authorEmail);

      if (!skipIndex) {
        // The delete has already happened at this point, so the index refresh is
        // strictly best-effort: a failure while reloading or re-indexing must not
        // turn a completed delete into a reported failure, nor prevent the session
        // bookkeeping below from running.
        try {
          const { sources: allSources } = await semanticLayerService.loadAllSources(connectionId);
          await this.slSearchService.indexSources(connectionId, allSources);
        } catch {
          // Intentionally ignored (see comment above).
        }
      }

      if (context.session) {
        addTouchedSlSource(context.session.touchedSlSources, connectionId, sourceName);
        context.session.actions.push({
          target: 'sl',
          type: 'removed',
          key: sourceName,
          detail: 'Deleted source',
          targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId),
          ...(rawPathValidation.rawPaths ? { rawPaths: rawPathValidation.rawPaths } : {}),
        });
      }

      return this.buildOutput(true, [], sourceName, { yaml: undefined, commitHash: undefined });
    } catch (error) {
      return this.buildOutput(false, [error instanceof Error ? error.message : String(error)], sourceName);
    }
  }

  // Require source for create/rewrite
  if (!input.source) {
    return this.buildOutput(
      false,
      ['Provide `source` to create or rewrite. For targeted edits, use sl_edit_source.'],
      sourceName,
    );
  }

  return this.writeFullSource(
    connectionId,
    input.source,
    sourceName,
    author,
    authorEmail,
    context,
    semanticLayerService,
    skipIndex,
    rawPathValidation.rawPaths,
  );
}
private async writeFullSource(
connectionId: string,
source: z.infer<typeof sourceInputSchema>,
sourceName: string,
author: string,
authorEmail: string,
context: ToolContext,
semanticLayerService: SemanticLayerService,
skipIndex: boolean,
rawPaths: string[] | undefined,
2026-05-10 23:12:26 +02:00
): Promise<ToolOutput<SemanticLayerStructured>> {
2026-05-11 00:31:15 -07:00
const normalizedSource = normalizeSemanticLayerDescriptions(source, { fillMissing: !!context.session?.ingest });
const isOverlay =
!('table' in normalizedSource && normalizedSource.table) && !('sql' in normalizedSource && normalizedSource.sql);
2026-05-10 23:12:26 +02:00
const existing = await this.readSourceYamlFromService(semanticLayerService, connectionId, sourceName);
const commitMessage = existing
? `${isOverlay ? 'Update overlay' : 'Rewrite source'}: ${sourceName}`
: `${isOverlay ? 'Create overlay' : 'Create source'}: ${sourceName}`;
const yamlContent = YAML.stringify(normalizedSource, { indent: 2, lineWidth: 0, version: '1.1' });
2026-05-10 23:12:26 +02:00
const orphanError = await this.rejectOrphanOverlay(semanticLayerService, connectionId, sourceName, yamlContent);
if (orphanError) {
return this.buildOutput(false, [orphanError], sourceName, { yaml: yamlContent });
}
const shadowError = await this.rejectStandaloneShadow(semanticLayerService, connectionId, sourceName, yamlContent);
if (shadowError) {
return this.buildOutput(false, [shadowError], sourceName, { yaml: yamlContent });
}
2026-05-11 00:31:15 -07:00
const validatedSource = normalizedSource as SemanticLayerSource;
2026-05-10 23:12:26 +02:00
const validationResult = await semanticLayerService.validateWithProposedSource(connectionId, validatedSource);
const validationErrors = validationResult.errors;
const validationWarnings = [...validationResult.warnings];
const actionRequiredWarnings = validationResult.perSourceWarnings?.[sourceName] ?? [];
if (validationErrors.length > 0) {
return this.buildOutput(false, ['Validation failed — source was NOT saved:', ...validationErrors], sourceName, {
yaml: yamlContent,
validationErrors,
validationWarnings,
actionRequiredWarnings,
});
}
try {
const result = await semanticLayerService.writeSource(
connectionId,
validatedSource,
author,
authorEmail,
commitMessage,
);
if (!skipIndex) {
fix(context): merge overlay columns onto manifest columns by name (#94) * fix(context): merge overlay columns onto manifest columns by name composeOverlay was appending overlay columns to the manifest column list, producing duplicate entries when dbt/metabase overlays declared a column just to attach descriptions. The duplicates carried no `type`, so the pydantic SourceDefinition rejected them at semantic-query time and broke `ktx sl query` for every overlay-backed measure. Now overlay columns match base columns by name (case-insensitive): same-name entries merge onto the manifest (overlay fields win, type/role fall back to the base, descriptions merge per source key) and only new names append. * refactor(sl): split overlay columns from column_overrides and enforce TS/Python wire contract Overlay sources now have two distinct collections: `columns:` for computed columns (requiring `expr` + `type`) and `column_overrides:` for metadata patches to inherited manifest columns. Composing or loading an overlay that mixes the two — or references an unknown column — fails with a typed error. Introduce `ResolvedSemanticLayerSource` / `resolvedSourceSchema` / `toResolvedWire` as the strict shape sent to the Python engine, and add a schema contract test that diffs Zod against the Pydantic JSON schema dumped by `python -m semantic_layer dump-schema`. `SourceDefinition` is now `extra="forbid"` on the Python side. `loadAllSources` surfaces per-file load errors instead of swallowing them, so validation/query paths can report manifest shard parse failures. * fix(context): make scan description generation resilient and quiet A transient sampleTable failure during ingest used to take out every table in a connection: generateTableDescription returned a hardcoded 'Table not found' string into descriptions.ai, and KtxDescriptionGenerator was constructed without a logger, so the failure left no trail anywhere. 
- sampleTable / sampleColumn calls retry 3x with 200/400/800ms backoff, honouring KtxScanContext.signal via a new KtxAbortedError. - On retry exhaustion or missing capability, table generation falls back to a metadata-only prompt built from column name / native type / comment / rawDescriptions. The column path follows the same rule -- call the LLM when any of samples or rawDescriptions are available; skip only when both are absent. - Logger is now threaded from KtxScanContext into the generator. Failures emit structured KtxScanWarning entries (new description_fallback_used code, plus existing sampling_failed / enrichment_failed / connector_capability_missing). ktx scan groups warnings by code so a batch of identical failures collapses to one summary line plus sample. - Returns null on failure instead of the 'Table not found' sentinel; the manifest writer's existing guard already skips empty descriptions, so schema YAML no longer carries misleading text. SCAN_MANAGED_DESCRIPTION_KEYS already strips stale 'ai' on merge, so existing YAML clears on next run. Also suppress AI SDK v6 'system in messages' warning: pull system messages out of KtxMessageBuilder.wrapSimple's output via a new splitKtxSystemMessages helper and pass them top-level to generateText (preserves cacheControl providerOptions on the SystemModelMessage). Agent-runner's local splitSystemPromptMessages dedupes onto the shared helper. * test(docs): align examples-docs assertions with revamped docs PR #103 (setup/guide doc revamp) reworded several CLI examples and connection labels; the assertions in scripts/examples-docs.test.mjs still referenced the pre-revamp wording and were failing in CI on main. 
Update the regexes to match the post-revamp content: - drop the `--json` flag from the sl-query example expectation - move the `Driver:` / `Status: ok` probe to the connection reference, which is where that output now lives (driver id is lowercase `postgres`, not the display name `PostgreSQL`) - drop the obsolete `Install \`uv\`...` troubleshooting line - accept `<connectionId>` everywhere; the docs no longer use the hyphenated `<connection-id>` form - match the `warehouse` connection id used in the quickstart instead of the `postgres-warehouse` id only used in the README and setup ref * fix(sl): skip TS/Python schema contract test when uv is unavailable The TypeScript checks CI job does not install uv or Python, so the module-level `execFileSync('uv', ...)` in schemas.contract.test.ts threw ENOENT and failed the suite. Wrap the schema dump in a try/catch and guard the describe block with `describe.skipIf` so the test skips in environments without uv. Local dev and any CI job that has uv on PATH still runs the cross-language contract assertion.
2026-05-15 02:11:04 +02:00
const { sources: allSources } = await semanticLayerService.loadAllSources(connectionId);
2026-05-10 23:12:26 +02:00
await this.slSearchService.indexSources(connectionId, allSources).catch(() => {});
}
if (context.session) {
addTouchedSlSource(context.session.touchedSlSources, connectionId, sourceName);
context.session.actions.push({
target: 'sl',
type: existing ? 'updated' : 'created',
key: sourceName,
detail: existing ? `Rewrote source` : `Created source`,
targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId),
...(rawPaths ? { rawPaths } : {}),
2026-05-10 23:12:26 +02:00
});
}
return this.buildOutput(true, [], sourceName, {
yaml: yamlContent,
commitHash: result.commitHash ?? undefined,
validationErrors,
validationWarnings,
actionRequiredWarnings,
});
} catch (error) {
return this.buildOutput(false, [error instanceof Error ? error.message : String(error)], sourceName);
}
}
/**
 * Fetches the stored YAML text of a source through the given service.
 *
 * @param service - Semantic layer service used to read the source file.
 * @param connectionId - Connection the source belongs to.
 * @param sourceName - Name of the source to read.
 * @returns The file content, or `null` when the read fails for any reason.
 */
private async readSourceYamlFromService(
  service: SemanticLayerService,
  connectionId: string,
  sourceName: string,
): Promise<string | null> {
  try {
    const file = await service.readSourceFile(connectionId, sourceName);
    return file.content;
  } catch {
    // Any read failure is reported to the caller as "no content".
    return null;
  }
}
/**
 * Guards against writing an overlay whose name matches no manifest entry.
 *
 * A YAML document counts as an overlay when it has neither a truthy `table`
 * nor a truthy `sql` top-level key.
 *
 * @param semanticLayerService - Service used to list manifest source names.
 * @param connectionId - Connection whose manifest is consulted.
 * @param sourceName - Name the YAML is about to be written under.
 * @param content - Raw YAML text of the source.
 * @returns A multi-line error message when the overlay has no manifest entry
 *   of the same name; `null` when the content cannot be parsed, is not an
 *   object, is not an overlay, or the manifest entry exists.
 */
private async rejectOrphanOverlay(
  semanticLayerService: SemanticLayerService,
  connectionId: string,
  sourceName: string,
  content: string,
): Promise<string | null> {
  let doc: Record<string, unknown>;
  try {
    doc = YAML.parse(content) as Record<string, unknown>;
  } catch {
    // Unparseable YAML is not this guard's concern.
    return null;
  }
  if (!doc || typeof doc !== 'object') {
    return null;
  }

  // A source with its own table or inline SQL is standalone, not an overlay.
  if ('table' in doc && doc.table) {
    return null;
  }
  if ('sql' in doc && doc.sql) {
    return null;
  }

  const knownNames = await semanticLayerService.listManifestSourceNames(connectionId);
  if (knownNames.includes(sourceName)) {
    return null;
  }

  const closest = this.nearestMatches(sourceName, knownNames, 3);
  const matchHint =
    closest.length > 0
      ? `  Nearest manifest matches: ${closest.join(', ')}.`
      : `  No manifest entries resemble "${sourceName}".`;

  const messageLines = [
    `Error: cannot write "${sourceName}" as an overlay — no manifest entry with that name exists.`,
    matchHint,
    `To customize an existing base table, retarget the overlay at one of the nearest matches.`,
    `For a LookML derived_table or any source backed by inline SQL, rewrite as a standalone`,
    `curated source with a top-level "sql:" block plus explicit "grain:" and "columns:".`,
  ];
  return messageLines.join('\n');
}
/**
 * Guards against writing a standalone source under a name that the manifest
 * already provides.
 *
 * A YAML document counts as standalone when it has a truthy `table` or a
 * truthy `sql` top-level key; anything else is treated as an overlay and is
 * not rejected here.
 *
 * @param semanticLayerService - Service asked whether the name is manifest-backed.
 * @param connectionId - Connection whose manifest is consulted.
 * @param sourceName - Name the YAML is about to be written under.
 * @param content - Raw YAML text of the source.
 * @returns A multi-line error message explaining how to rewrite the YAML as an
 *   overlay when a standalone source would shadow a manifest entry; `null`
 *   when the content cannot be parsed, is not an object, is an overlay, or the
 *   name is not manifest-backed.
 */
private async rejectStandaloneShadow(
  semanticLayerService: SemanticLayerService,
  connectionId: string,
  sourceName: string,
  content: string,
): Promise<string | null> {
  let parsed: Record<string, unknown>;
  try {
    parsed = YAML.parse(content) as Record<string, unknown>;
  } catch {
    // Unparseable YAML is not this guard's concern.
    return null;
  }
  if (!parsed || typeof parsed !== 'object') {
    return null;
  }
  const isOverlay = !('table' in parsed && parsed.table) && !('sql' in parsed && parsed.sql);
  if (isOverlay) {
    return null;
  }
  const isManifestBacked = await semanticLayerService.isManifestBacked(connectionId, sourceName);
  if (!isManifestBacked) {
    return null;
  }
  return [
    `Error: cannot write "${sourceName}" as a standalone source — a manifest entry with that name already exists.`,
    `  Writing standalone would drop the manifest's columns and joins, leaving only what you list here.`,
    `To add measures/segments on top of the manifest, rewrite this YAML as an overlay:`,
    `  - Remove "sql:", "table:", "grain:", and base-table "columns:".`,
    `  - Keep "name:" plus "measures:", "segments:", "descriptions:", "joins:", "disable_joins:",`,
    `    "exclude_columns:", "column_overrides:", and/or computed-only "columns:" entries with expr + type.`,
    `  - The manifest's schema is inherited automatically.`,
    `If you really need a different base table, use a different source name.`,
  ].join('\n');
}
/**
 * Picks the candidates most similar to `needle`, best first.
 *
 * Comparison is case-insensitive. Each candidate's score is its Jaro–Winkler
 * similarity, plus 0.2 when either string is a prefix of the other, plus 0.1
 * when either string contains the other. Candidates scoring 0.4 or less are
 * dropped.
 *
 * @param needle - Name to look for.
 * @param haystack - Candidate names.
 * @param limit - Maximum number of candidates to return.
 * @returns Up to `limit` candidates in descending score order.
 */
private nearestMatches(needle: string, haystack: string[], limit: number): string[] {
  if (haystack.length === 0) {
    return [];
  }

  const target = needle.toLowerCase();
  const ranked: Array<{ candidate: string; score: number }> = [];
  for (const candidate of haystack) {
    const name = candidate.toLowerCase();
    const sharesPrefix = name.startsWith(target) || target.startsWith(name);
    const sharesSubstring = name.includes(target) || target.includes(name);
    const prefixBoost = sharesPrefix ? 0.2 : 0;
    const substringBoost = sharesSubstring ? 0.1 : 0;
    ranked.push({ candidate, score: jaroWinkler(target, name) + prefixBoost + substringBoost });
  }

  // Filtering before the (stable) sort yields the same order as sorting first.
  const plausible = ranked.filter((entry) => entry.score > 0.4);
  plausible.sort((left, right) => right.score - left.score);

  const best: string[] = [];
  for (const entry of plausible.slice(0, limit)) {
    best.push(entry.candidate);
  }
  return best;
}
}
/**
 * Computes the Jaro–Winkler similarity of two strings.
 *
 * Characters are compared by UTF-16 code unit and the comparison is
 * case-sensitive. The common-prefix bonus considers at most four characters
 * with a scaling factor of 0.1.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns 1 for identical strings, 0 when no characters match, otherwise a
 *   value in between where higher means more similar.
 */
function jaroWinkler(a: string, b: string): number {
  if (a === b) {
    return 1;
  }

  // Two equal characters count as a match only if their positions differ by at most `reach`.
  const longest = Math.max(a.length, b.length);
  const reach = Math.max(0, Math.floor(longest / 2) - 1);
  const matchedInA: boolean[] = Array.from({ length: a.length }, () => false);
  const matchedInB: boolean[] = Array.from({ length: b.length }, () => false);

  let matchCount = 0;
  for (let i = 0; i < a.length; i++) {
    const lo = Math.max(0, i - reach);
    const hi = Math.min(i + reach + 1, b.length);
    for (let j = lo; j < hi; j++) {
      if (!matchedInB[j] && a[i] === b[j]) {
        matchedInA[i] = true;
        matchedInB[j] = true;
        matchCount += 1;
        break;
      }
    }
  }
  if (matchCount === 0) {
    return 0;
  }

  // Walk the matched characters of both strings in order; each mismatch is half a transposition.
  let outOfOrder = 0;
  let cursor = 0;
  for (let i = 0; i < a.length; i++) {
    if (matchedInA[i]) {
      while (!matchedInB[cursor]) {
        cursor += 1;
      }
      if (a[i] !== b[cursor]) {
        outOfOrder += 1;
      }
      cursor += 1;
    }
  }

  const jaro = (matchCount / a.length + matchCount / b.length + (matchCount - outOfOrder / 2) / matchCount) / 3;

  // Winkler adjustment: reward a shared prefix of up to four characters.
  const prefixCap = Math.min(4, a.length, b.length);
  let sharedPrefix = 0;
  while (sharedPrefix < prefixCap && a[sharedPrefix] === b[sharedPrefix]) {
    sharedPrefix += 1;
  }
  return jaro + sharedPrefix * 0.1 * (1 - jaro);
}