From 86c818a454ffe0a3aa7da20e1ba87e2ad8c3f8b8 Mon Sep 17 00:00:00 2001 From: Luca Martial Date: Mon, 11 May 2026 00:31:15 -0700 Subject: [PATCH] Normalize semantic layer descriptions --- packages/cli/src/cli-runtime.ts | 2 +- packages/cli/src/context-build-view.test.ts | 29 +++- packages/cli/src/context-build-view.ts | 38 ++++- packages/context/src/project/project.test.ts | 2 + packages/context/src/project/project.ts | 5 +- .../context/src/project/setup-config.test.ts | 4 +- packages/context/src/project/setup-config.ts | 10 +- .../src/sl/description-normalization.ts | 136 ++++++++++++++++++ packages/context/src/sl/local-sl.ts | 5 +- packages/context/src/sl/schemas.ts | 5 + .../src/sl/semantic-layer.service.test.ts | 29 ++++ .../context/src/sl/semantic-layer.service.ts | 43 +++--- packages/context/src/sl/sl-search.service.ts | 2 + .../src/sl/tools/sl-edit-source.tool.test.ts | 33 +++++ .../src/sl/tools/sl-edit-source.tool.ts | 2 + .../src/sl/tools/sl-write-source.tool.test.ts | 83 +++++++++++ .../src/sl/tools/sl-write-source.tool.ts | 9 +- .../src/wiki/tools/wiki-write.tool.test.ts | 36 +++++ .../context/src/wiki/tools/wiki-write.tool.ts | 18 ++- python/ktx-sl/semantic_layer/models.py | 26 ++++ python/ktx-sl/tests/test_models.py | 18 +++ 21 files changed, 498 insertions(+), 37 deletions(-) create mode 100644 packages/context/src/sl/description-normalization.ts diff --git a/packages/cli/src/cli-runtime.ts b/packages/cli/src/cli-runtime.ts index 60129922..124d132d 100644 --- a/packages/cli/src/cli-runtime.ts +++ b/packages/cli/src/cli-runtime.ts @@ -22,7 +22,7 @@ export interface KtxCliPackageInfo { } export interface KtxCliIo { - stdout: { isTTY?: boolean; write(chunk: string): void }; + stdout: { isTTY?: boolean; columns?: number; write(chunk: string): void }; stderr: { write(chunk: string): void }; } diff --git a/packages/cli/src/context-build-view.test.ts b/packages/cli/src/context-build-view.test.ts index c6ef0eb1..d7069578 100644 --- a/packages/cli/src/context-build-view.test.ts +++ b/packages/cli/src/context-build-view.test.ts @@ -3,6 +3,7 @@ import { describe, expect, it, vi } from 'vitest'; import type { KtxPublicIngestProject, KtxPublicIngestTargetResult } from './public-ingest.js'; import { extractProgressMessage, + createRepainter, initViewState, parseIngestSummary, parseScanSummary, @@ -11,13 +12,14 @@ import { viewStateFromSourceProgress, } from './context-build-view.js'; -function makeIo(options: { isTTY?: boolean } = {}) { +function makeIo(options: { isTTY?: boolean; columns?: number } = {}) { let stdout = ''; let stderr = ''; return { io: { stdout: { isTTY: options.isTTY, + columns: options.columns, write: (chunk: string) => { stdout += chunk; }, @@ -305,6 +307,31 @@ describe('renderContextBuildView', () => { }); }); +describe('createRepainter', () => { + it('moves up visual rows, not just newline count, when content wraps', () => { + const io = makeIo({ isTTY: true, columns: 5 }); + const repainter = createRepainter(io.io); + + repainter.paint('abcdefghijk\n'); + repainter.paint('updated\n'); + repainter.paint('done\n'); + + const cursorMoves = [...io.stdout().matchAll(/\u001b\[(\d+)A\r/g)].map((match) => Number(match[1])); + expect(cursorMoves).toEqual([3, 2]); + }); + + it('returns to the start of a single-line frame without moving up when content has no newline', () => { + const io = makeIo({ isTTY: true, columns: 80 }); + const repainter = createRepainter(io.io); + + repainter.paint('hello'); + repainter.paint('bye'); + + expect(io.stdout()).toContain('\rbye'); + expect(io.stdout()).not.toContain('\u001b[1A\rbye'); + }); +}); + describe('runContextBuild', () => { it('executes scan targets before source-ingest targets', async () => { const io = makeIo(); diff --git a/packages/cli/src/context-build-view.ts b/packages/cli/src/context-build-view.ts index bb661655..571c71dd 100644 --- a/packages/cli/src/context-build-view.ts +++ b/packages/cli/src/context-build-view.ts @@ -226,6 +226,7 @@ export function renderContextBuildView( // --- IO Capture --- const ESC_K_RE = new RegExp(`${ESC.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}\\[K`, 'g'); +const ANSI_RE = /\x1b\[[0-9;]*m/g; export function extractProgressMessage(chunk: string): string | null { const cleaned = chunk.replace(/^\r/, '').replace(ESC_K_RE, '').replace(/\n$/, '').trim(); @@ -342,16 +343,45 @@ export function viewStateFromSourceProgress( // --- Repaint --- export function createRepainter(io: KtxCliIo) { - let lastLineCount = 0; + let hasPainted = false; + let lastCursorUpRows = 0; + + const terminalColumns = () => { + for (const columns of [io.stdout.columns, process.stdout.columns]) { + if (typeof columns === 'number' && Number.isFinite(columns) && columns > 0) return columns; + } + return 80; + }; + + const visualRows = (line: string, columns: number) => { + const plainLength = line.replace(ANSI_RE, '').length; + return Math.max(1, Math.ceil(plainLength / columns)); + }; + + const cursorUpRowsAfterWrite = (content: string) => { + const columns = terminalColumns(); + const endsWithNewline = content.endsWith('\n'); + const lines = content.split('\n'); + return lines.reduce((sum, line, index) => { + if (index === lines.length - 1) { + return endsWithNewline ? sum : sum + Math.max(0, visualRows(line, columns) - 1); + } + return sum + visualRows(line, columns); + }, 0); + }; return { paint(content: string) { - if (lastLineCount > 0) { - io.stdout.write(`${ESC}[${lastLineCount}A\r`); + if (hasPainted) { + if (lastCursorUpRows > 0) { + io.stdout.write(`${ESC}[${lastCursorUpRows}A`); + } + io.stdout.write('\r'); } io.stdout.write(content.replaceAll('\n', `${ESC}[K\n`)); io.stdout.write(`${ESC}[J`); - lastLineCount = (content.match(/\n/g) ?? []).length; + hasPainted = true; + lastCursorUpRows = cursorUpRowsAfterWrite(content); }, }; } diff --git a/packages/context/src/project/project.test.ts b/packages/context/src/project/project.test.ts index ec2120aa..b6e88604 100644 --- a/packages/context/src/project/project.test.ts +++ b/packages/context/src/project/project.test.ts @@ -32,6 +32,8 @@ describe('KTX local project runtime', () => { const gitignore = await readFile(join(projectDir, '.ktx/.gitignore'), 'utf-8'); expect(gitignore).toContain('cache/'); expect(gitignore).toContain('db.sqlite'); + expect(gitignore).toContain('db.sqlite-*'); + expect(gitignore).toContain('ingest-transcripts/'); expect(gitignore).toContain('secrets/'); expect(gitignore).toContain('setup/'); expect(gitignore).toContain('agents/'); diff --git a/packages/context/src/project/project.ts b/packages/context/src/project/project.ts index cbe522a5..59e594a2 100644 --- a/packages/context/src/project/project.ts +++ b/packages/context/src/project/project.ts @@ -35,7 +35,10 @@ export interface InitKtxProjectResult extends KtxLocalProject { } const TRACKED_SCAFFOLD_FILES: Array<{ path: string; content: string }> = [ - { path: '.ktx/.gitignore', content: 'cache/\ndb.sqlite\nsecrets/\nsetup/\nagents/\n' }, + { + path: '.ktx/.gitignore', + content: 'cache/\ndb.sqlite\ndb.sqlite-*\ningest-transcripts/\nsecrets/\nsetup/\nagents/\n', + }, { path: '.ktx/prompts/.gitkeep', content: '' }, { path: '.ktx/skills/.gitkeep', content: '' }, { path: 'knowledge/global/.gitkeep', content: '' }, diff --git a/packages/context/src/project/setup-config.test.ts b/packages/context/src/project/setup-config.test.ts index 3fc8726b..212f16e1 100644 --- a/packages/context/src/project/setup-config.test.ts +++ b/packages/context/src/project/setup-config.test.ts @@ -67,10 +67,10 @@ describe('KTX setup config helpers', () => { it('merges setup-local gitignore entries without removing existing lines', () => { expect(mergeKtxSetupGitignoreEntries('cache/\ndb.sqlite\n')).toBe( - ['cache/', 'db.sqlite', 'secrets/', 'setup/', 'agents/', ''].join('\n'), + ['cache/', 'db.sqlite', 'db.sqlite-*', 'ingest-transcripts/', 'secrets/', 'setup/', 'agents/', ''].join('\n'), ); expect(mergeKtxSetupGitignoreEntries('cache/\nsecrets/\n')).toBe( - ['cache/', 'secrets/', 'setup/', 'agents/', ''].join('\n'), + ['cache/', 'secrets/', 'db.sqlite', 'db.sqlite-*', 'ingest-transcripts/', 'setup/', 'agents/', ''].join('\n'), ); }); }); diff --git a/packages/context/src/project/setup-config.ts b/packages/context/src/project/setup-config.ts index d0f46cf0..76951ef6 100644 --- a/packages/context/src/project/setup-config.ts +++ b/packages/context/src/project/setup-config.ts @@ -4,7 +4,15 @@ export const KTX_SETUP_STEPS = ['project', 'llm', 'embeddings', 'databases', 'so export type KtxSetupStep = (typeof KTX_SETUP_STEPS)[number]; -const SETUP_GITIGNORE_ENTRIES = ['secrets/', 'setup/', 'agents/'] as const; +const SETUP_GITIGNORE_ENTRIES = [ + 'cache/', + 'db.sqlite', + 'db.sqlite-*', + 'ingest-transcripts/', + 'secrets/', + 'setup/', + 'agents/', +] as const; export function markKtxSetupStepComplete(config: KtxProjectConfig, step: KtxSetupStep): KtxProjectConfig { const databaseConnectionIds = config.setup?.database_connection_ids ?? []; diff --git a/packages/context/src/sl/description-normalization.ts b/packages/context/src/sl/description-normalization.ts new file mode 100644 index 00000000..5a1b5ab6 --- /dev/null +++ b/packages/context/src/sl/description-normalization.ts @@ -0,0 +1,136 @@ +type DescriptionMap = Record; + +interface NormalizeDescriptionOptions { + fillMissing?: boolean; +} + +function cleanText(value: unknown): string | null { + return typeof value === 'string' && value.trim().length > 0 ? value.trim() : null; +} + +function cleanDescriptionMap(value: unknown): DescriptionMap { + const result: DescriptionMap = {}; + if (!value || typeof value !== 'object' || Array.isArray(value)) { + return result; + } + for (const [key, text] of Object.entries(value)) { + const cleaned = cleanText(text); + if (cleaned) { + result[key] = cleaned; + } + } + return result; +} + +function hasDescriptions(descriptions: DescriptionMap): boolean { + return Object.keys(descriptions).length > 0; +} + +function withDescriptionMap(record: Record, fallback: string | null): Record { + const descriptions = cleanDescriptionMap(record.descriptions); + const flatDescription = cleanText(record.description); + if (flatDescription && !descriptions.user) { + descriptions.user = flatDescription; + } + if (!hasDescriptions(descriptions) && fallback) { + descriptions.ktx = fallback; + } + + const next = { ...record }; + delete next.description; + if (hasDescriptions(descriptions)) { + next.descriptions = descriptions; + } else { + delete next.descriptions; + } + return next; +} + +function humanizeIdentifier(value: string): string { + return value + .replace(/([a-z0-9])([A-Z])/g, '$1 $2') + .replace(/[_-]+/g, ' ') + .replace(/\s+/g, ' ') + .trim() + .toLowerCase(); +} + +function formatCount(count: number, singular: string, plural = `${singular}s`): string | null { + if (count <= 0) { + return null; + } + return `${count} ${count === 1 ? singular : plural}`; +} + +function sourceFallback(source: Record, sourceName: string): string { + const table = cleanText(source.table); + const sql = cleanText(source.sql); + if (table) { + return `Semantic-layer source for ${sourceName} backed by ${table}.`; + } + if (sql) { + return `Semantic-layer source for ${sourceName} backed by curated SQL.`; + } + + const counts = [ + formatCount(Array.isArray(source.measures) ? source.measures.length : 0, 'measure'), + formatCount(Array.isArray(source.segments) ? source.segments.length : 0, 'segment'), + formatCount(Array.isArray(source.columns) ? source.columns.length : 0, 'computed column'), + ].filter((item): item is string => Boolean(item)); + return counts.length > 0 + ? `Semantic-layer overlay for ${sourceName} defining ${counts.join(', ')}.` + : `Semantic-layer overlay for ${sourceName}.`; +} + +function columnFallback(column: Record, sourceName: string): string { + const columnName = cleanText(column.name) ?? 'column'; + const label = humanizeIdentifier(columnName) || columnName; + const expr = cleanText(column.expr); + + if (expr) { + return `Computed ${label} value for ${sourceName}.`; + } + + if (columnName.toLowerCase() === 'id') { + return `Identifier column for ${sourceName}.`; + } + + const idMatch = columnName.match(/^(.+)_id$/i); + if (idMatch) { + const entity = humanizeIdentifier(idMatch[1] ?? ''); + return entity ? `Identifier for the related ${entity} on ${sourceName}.` : `Identifier column for ${sourceName}.`; + } + + if (/(^|_)(date|time|timestamp|created_at|updated_at|week_start|month_start)($|_)/i.test(columnName)) { + return `Date or time value for ${label} on ${sourceName}.`; + } + + return `Column ${label} from ${sourceName}.`; +} + +export function normalizeSemanticLayerDescriptions( + source: T, + options: NormalizeDescriptionOptions = {}, +): T { + const sourceRecord = source as Record; + const sourceName = cleanText(sourceRecord.name) ?? 'source'; + const normalized = withDescriptionMap( + sourceRecord, + options.fillMissing ? sourceFallback(sourceRecord, sourceName) : null, + ); + + if (Array.isArray(sourceRecord.columns)) { + normalized.columns = sourceRecord.columns.map((column) => { + if (!column || typeof column !== 'object' || Array.isArray(column)) { + return column; + } + const columnRecord = column as Record; + return withDescriptionMap( + columnRecord, + options.fillMissing ? columnFallback(columnRecord, sourceName) : null, + ); + }); + } + + return normalized as T; +} diff --git a/packages/context/src/sl/local-sl.ts b/packages/context/src/sl/local-sl.ts index b8d29e87..676b2522 100644 --- a/packages/context/src/sl/local-sl.ts +++ b/packages/context/src/sl/local-sl.ts @@ -5,6 +5,7 @@ import type { KtxEmbeddingPort, KtxFileWriteResult } from '../core/index.js'; import type { KtxLocalProject } from '../project/index.js'; import { HybridSearchCore, type SearchCandidateGenerator } from '../search/index.js'; import { DEFAULT_PRIORITY, resolveDescription } from './descriptions.js'; +import { normalizeSemanticLayerDescriptions } from './description-normalization.js'; import { sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js'; import { composeOverlay, type ManifestTableEntry, projectManifestEntry } from './semantic-layer.service.js'; import type { PgliteSlSearchPrototypeOwnerOptions } from './pglite-sl-search-prototype.js'; @@ -180,14 +181,14 @@ function manifestTables(value: Record): Record, name: string): SemanticLayerSource { const source = parsed as Partial; - return { + return normalizeSemanticLayerDescriptions({ ...source, name, grain: Array.isArray(parsed.grain) ? (parsed.grain.filter((item) => typeof item === 'string') as string[]) : [], columns: Array.isArray(parsed.columns) ? (parsed.columns as SemanticLayerSource['columns']) : [], joins: Array.isArray(parsed.joins) ? (parsed.joins as SemanticLayerSource['joins']) : [], measures: Array.isArray(parsed.measures) ? (parsed.measures as SemanticLayerSource['measures']) : [], - }; + }); } export async function loadLocalSlSourceRecords( diff --git a/packages/context/src/sl/schemas.ts b/packages/context/src/sl/schemas.ts index 55e07f22..56f415a8 100644 --- a/packages/context/src/sl/schemas.ts +++ b/packages/context/src/sl/schemas.ts @@ -23,6 +23,8 @@ const segmentDefinitionSchema = z.object({ description: z.string().optional(), }); +const descriptionsSchema = z.record(z.string(), z.string().min(1)); + const defaultTimeDimensionDbtSchema = z.object({ dbt: z.string().optional(), }); @@ -77,6 +79,7 @@ const sourceColumnSchema = z.object({ role: z.enum(columnRoleValues).optional(), visibility: z.enum(columnVisibilityValues).optional(), description: z.string().optional(), + descriptions: descriptionsSchema.optional(), expr: z.string().optional(), constraints: sourceKeyedColumnConstraintsSchema.optional(), enum_values: sourceKeyedStringArraySchema.optional(), @@ -91,6 +94,7 @@ const overlayColumnSchema = z role: z.enum(columnRoleValues).optional(), visibility: z.enum(columnVisibilityValues).optional(), description: z.string().optional(), + descriptions: descriptionsSchema.optional(), expr: z.string().optional(), }) .refine((col) => !col.type || col.expr, { @@ -102,6 +106,7 @@ export const sourceDefinitionSchema = z .object({ name: z.string().min(1), description: z.string().optional(), + descriptions: descriptionsSchema.optional(), // Accepted for documentation parity with the Python spec; behavior is driven // by the `table` / `sql` fields, not by this discriminator. source_type: z.enum(['table', 'sql']).optional(), diff --git a/packages/context/src/sl/semantic-layer.service.test.ts b/packages/context/src/sl/semantic-layer.service.test.ts index 5d7413ac..0b9656de 100644 --- a/packages/context/src/sl/semantic-layer.service.test.ts +++ b/packages/context/src/sl/semantic-layer.service.test.ts @@ -257,12 +257,14 @@ describe('sourceDefinitionSchema', () => { it('preserves dbt structural metadata fields used by manifest-backed SL readers', () => { const result = sourceDefinitionSchema.safeParse({ name: 'orders', + descriptions: { dbt: 'Order facts from dbt.' }, table: 'public.orders', grain: ['id'], columns: [ { name: 'status', type: 'string', + descriptions: { dbt: 'Order lifecycle status.' }, constraints: { dbt: { not_null: true, unique: true } }, enum_values: { dbt: ['placed', 'shipped'] }, tests: { @@ -282,7 +284,9 @@ describe('sourceDefinitionSchema', () => { if (!result.success) { return; } + expect(result.data.descriptions).toEqual({ dbt: 'Order facts from dbt.' }); expect(result.data.columns[0]).toMatchObject({ + descriptions: { dbt: 'Order lifecycle status.' }, constraints: { dbt: { not_null: true, unique: true } }, enum_values: { dbt: ['placed', 'shipped'] }, tests: { @@ -528,6 +532,31 @@ describe('loadAllSources — standalone enrichment via inherits_columns_from', ( const aav = sources.find((s) => s.name === 'aav_consignments'); expect(aav?.columns).toEqual([{ name: 'FOO', type: 'string' }]); }); + + it('normalizes legacy flat source and column descriptions when loading standalone files', async () => { + const standalonePath = 'semantic-layer/conn-1/orders.yaml'; + configService.listFiles.mockResolvedValue({ files: [standalonePath] }); + configService.readFile.mockResolvedValue({ + content: [ + 'name: orders', + 'description: Finance orders used for invoice reconciliation.', + 'table: public.orders', + 'grain: [id]', + 'columns:', + ' - name: id', + ' type: string', + ' description: Stable order identifier.', + ].join('\n'), + }); + + const sources = await service.loadAllSources('conn-1'); + + expect(sources[0]).toMatchObject({ + name: 'orders', + descriptions: { user: 'Finance orders used for invoice reconciliation.' }, + columns: [{ name: 'id', type: 'string', descriptions: { user: 'Stable order identifier.' } }], + }); + }); }); describe('validateWithProposedSource', () => { diff --git a/packages/context/src/sl/semantic-layer.service.ts b/packages/context/src/sl/semantic-layer.service.ts index 5d559a31..0ccce66a 100644 --- a/packages/context/src/sl/semantic-layer.service.ts +++ b/packages/context/src/sl/semantic-layer.service.ts @@ -2,6 +2,7 @@ import YAML from 'yaml'; import type { KtxFileStorePort, KtxLogger } from '../core/index.js'; import { noopLogger } from '../core/index.js'; import type { SlConnectionCatalogPort, SlPythonPort } from './ports.js'; +import { normalizeSemanticLayerDescriptions } from './description-normalization.js'; import { isOverlaySource, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js'; import type { SemanticLayerQueryExecutionResult, SemanticLayerQueryInput, SemanticLayerSource } from './types.js'; @@ -101,6 +102,7 @@ export class SemanticLayerService { const warnings: string[] = []; if (!options?.skipValidation) { + source = normalizeSemanticLayerDescriptions(source); const sourceData: Record = { ...source }; if ((sourceData.table || sourceData.sql) && (await this.isManifestBacked(connectionId, source.name))) { @@ -129,7 +131,8 @@ export class SemanticLayerService { } const path = this.sourcePath(connectionId, source.name); - const content = YAML.stringify(source, { indent: 2, lineWidth: 0 }); + const normalizedSource = normalizeSemanticLayerDescriptions(source); + const content = YAML.stringify(normalizedSource, { indent: 2, lineWidth: 0 }); const message = commitMessage ?? `Update semantic layer source: ${source.name}`; const result = await this.configService.writeFile(path, content, author, authorEmail, message, { skipLock: options?.skipLock, @@ -199,14 +202,14 @@ export class SemanticLayerService { if (sources.has(name)) { this.logger.warn(`Standalone source '${name}' in ${filePath} overrides manifest entry of the same name`); } - let standalone: SemanticLayerSource = { + let standalone: SemanticLayerSource = normalizeSemanticLayerDescriptions({ ...(data as Partial), name, grain: Array.isArray(data.grain) ? (data.grain as string[]) : [], columns: Array.isArray(data.columns) ? (data.columns as SemanticLayerSource['columns']) : [], joins: Array.isArray(data.joins) ? (data.joins as SemanticLayerSource['joins']) : [], measures: Array.isArray(data.measures) ? (data.measures as SemanticLayerSource['measures']) : [], - }; + }); // If the source declares `inherits_columns_from`, fill any blank // type/descriptions/role from the matching manifest entry. Lets the // agent write `columns: [{name: FOO}]` without redeclaring known fields. @@ -1005,7 +1008,8 @@ const COMPOSE_KNOWN_KEYS = new Set([ ]); export function composeOverlay(base: SemanticLayerSource, overlay: Record): SemanticLayerSource { - const unknownKeys = Object.keys(overlay).filter((k) => !COMPOSE_KNOWN_KEYS.has(k)); + const normalizedOverlay = normalizeSemanticLayerDescriptions(overlay); + const unknownKeys = Object.keys(normalizedOverlay).filter((k) => !COMPOSE_KNOWN_KEYS.has(k)); if (unknownKeys.length > 0) { throw new Error( `composeOverlay: overlay for '${base.name}' has unhandled keys [${unknownKeys.join(', ')}]. ` + @@ -1015,50 +1019,47 @@ export function composeOverlay(base: SemanticLayerSource, overlay: Record), + ...(normalizedOverlay.descriptions as Record), }; } // Filter out excluded columns - const excluded = new Set((overlay.exclude_columns as string[] | undefined) ?? []); + const excluded = new Set((normalizedOverlay.exclude_columns as string[] | undefined) ?? []); let columns = result.columns.filter((c) => !excluded.has(c.name)); // Append overlay computed columns - const overlayColumns = (overlay.columns as SemanticLayerSource['columns'] | undefined) ?? []; + const overlayColumns = (normalizedOverlay.columns as SemanticLayerSource['columns'] | undefined) ?? []; columns = [...columns, ...overlayColumns]; result.columns = columns; // Measures from overlay only - result.measures = (overlay.measures as SemanticLayerSource['measures'] | undefined) ?? []; + result.measures = (normalizedOverlay.measures as SemanticLayerSource['measures'] | undefined) ?? []; // Segments: overlay-replaces semantics. Manifest tables don't carry segments today; // if that changes, add a union branch here. - if (overlay.segments !== undefined) { - result.segments = overlay.segments as SemanticLayerSource['segments']; + if (normalizedOverlay.segments !== undefined) { + result.segments = normalizedOverlay.segments as SemanticLayerSource['segments']; } // Override grain - if (overlay.grain) { - result.grain = overlay.grain as string[]; + if (normalizedOverlay.grain) { + result.grain = normalizedOverlay.grain as string[]; } - if (overlay.default_time_dimension !== undefined) { - result.default_time_dimension = overlay.default_time_dimension as SemanticLayerSource['default_time_dimension']; + if (normalizedOverlay.default_time_dimension !== undefined) { + result.default_time_dimension = + normalizedOverlay.default_time_dimension as SemanticLayerSource['default_time_dimension']; } // Union + dedupe joins, apply suppressions - const disabled = new Set(((overlay.disable_joins as string[] | undefined) ?? []).map(normalizeWs)); + const disabled = new Set(((normalizedOverlay.disable_joins as string[] | undefined) ?? []).map(normalizeWs)); const manifestJoins = result.joins.filter((j) => !disabled.has(normalizeWs(j.on))); - const overlayJoins = (overlay.joins as SemanticLayerSource['joins'] | undefined) ?? []; + const overlayJoins = (normalizedOverlay.joins as SemanticLayerSource['joins'] | undefined) ?? []; const existingKeys = new Set(manifestJoins.map((j) => `${j.to}::${normalizeWs(j.on)}`)); const newJoins = overlayJoins.filter((j) => !existingKeys.has(`${j.to}::${normalizeWs(j.on)}`)); result.joins = [...manifestJoins, ...newJoins]; diff --git a/packages/context/src/sl/sl-search.service.ts b/packages/context/src/sl/sl-search.service.ts index e351011f..47743ae1 100644 --- a/packages/context/src/sl/sl-search.service.ts +++ b/packages/context/src/sl/sl-search.service.ts @@ -1,6 +1,7 @@ import type { KtxEmbeddingPort, KtxLogger } from '../core/index.js'; import { noopLogger } from '../core/index.js'; import { DEFAULT_PRIORITY, resolveDescription } from './descriptions.js'; +import { normalizeSemanticLayerDescriptions } from './description-normalization.js'; import type { SlSourcesIndexPort } from './ports.js'; import type { SemanticLayerSource } from './types.js'; @@ -8,6 +9,7 @@ export function buildSemanticLayerSourceSearchText( source: SemanticLayerSource, priority: string[] = DEFAULT_PRIORITY, ): string { + source = normalizeSemanticLayerDescriptions(source); const config = { priority }; const parts: string[] = [source.name.replace(/_/g, ' ')]; diff --git a/packages/context/src/sl/tools/sl-edit-source.tool.test.ts b/packages/context/src/sl/tools/sl-edit-source.tool.test.ts index 5165112a..d90f0356 100644 --- a/packages/context/src/sl/tools/sl-edit-source.tool.test.ts +++ b/packages/context/src/sl/tools/sl-edit-source.tool.test.ts @@ -127,6 +127,39 @@ describe('SlEditSourceTool — session gating', () => { ); expect((session.semanticLayerService as any).writeSource).toHaveBeenCalled(); }); + + it('fills missing descriptions when an ingest session edits a source', async () => { + const { tool } = makeTool(); + const session = makeSession({ + ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'dbt' }, + }); + const context: ToolContext = { ...baseContext, session }; + + const result = await tool.call( + { + connectionId: session.connectionId, + sourceName: 'orders', + yaml_edits: [{ oldText: 'measures: []', newText: 'measures: []' }], + } as any, + context, + ); + + expect(result.structured.success).toBe(true); + expect((session.semanticLayerService as any).writeSource).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ + descriptions: { ktx: expect.stringContaining('orders') }, + columns: [ + expect.objectContaining({ + descriptions: { ktx: expect.stringContaining('Identifier') }, + }), + ], + }), + expect.any(String), + expect.any(String), + expect.any(String), + ); + }); }); describe('SlEditSourceTool — manifest-backed source without overlay', () => { diff --git a/packages/context/src/sl/tools/sl-edit-source.tool.ts b/packages/context/src/sl/tools/sl-edit-source.tool.ts index 29fa275d..17a85990 100644 --- a/packages/context/src/sl/tools/sl-edit-source.tool.ts +++ b/packages/context/src/sl/tools/sl-edit-source.tool.ts @@ -2,6 +2,7 @@ import YAML from 'yaml'; import { z } from 'zod'; import { addTouchedSlSource, type ToolContext, type ToolOutput } from '../../tools/index.js'; import { applySqlEdits } from '../../tools/sql-edit-replacer.js'; +import { normalizeSemanticLayerDescriptions } from '../description-normalization.js'; import type { SemanticLayerSource } from '../types.js'; import { BaseSemanticLayerTool, @@ -147,6 +148,7 @@ If no source exists yet, use sl_write_source instead — this tool will reject t } catch (e) { return this.buildOutput(false, [`YAML parse error after edits: ${e}`], sourceName); } + source = normalizeSemanticLayerDescriptions(source, { fillMissing: !!context.session?.ingest }); // Re-serialize and write const updatedYaml = YAML.stringify(source, { indent: 2, lineWidth: 0 }); diff --git a/packages/context/src/sl/tools/sl-write-source.tool.test.ts b/packages/context/src/sl/tools/sl-write-source.tool.test.ts index 4ad6bf53..1502c177 100644 --- a/packages/context/src/sl/tools/sl-write-source.tool.test.ts +++ b/packages/context/src/sl/tools/sl-write-source.tool.test.ts @@ -175,6 +175,89 @@ describe('SlWriteSourceTool — session gating', () => { ); expect((session.semanticLayerService as any).writeSource).toHaveBeenCalled(); }); + + it('normalizes flat source and column descriptions before writing', async () => { + const { tool, semanticLayerService } = makeTool(); + const result = await tool.call( + { + connectionId: '11111111-1111-1111-1111-111111111111', + sourceName: 'orders', + source: { + name: 'orders', + description: 'Finance orders used for invoice reconciliation.', + table: 'public.orders', + grain: ['id'], + columns: [{ name: 'id', type: 'string', description: 'Stable order identifier.' }], + measures: [], + joins: [], + } as any, + } as any, + baseContext, + ); + + expect(result.structured.success).toBe(true); + expect(semanticLayerService.writeSource).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ + descriptions: { user: 'Finance orders used for invoice reconciliation.' }, + columns: [expect.objectContaining({ descriptions: { user: 'Stable order identifier.' } })], + }), + expect.any(String), + expect.any(String), + expect.any(String), + ); + }); + + it('fills missing descriptions for ingest-written overlays and columns', async () => { + const session = makeSession({ + ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'metabase' }, + semanticLayerService: { + loadSource: vi.fn().mockResolvedValue(null), + loadAllSources: vi.fn().mockResolvedValue([]), + validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }), + writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }), + deleteSource: vi.fn().mockResolvedValue(undefined), + listManifestSourceNames: vi.fn().mockResolvedValue(['mart_account_segments']), + isManifestBacked: vi.fn().mockResolvedValue(false), + readSourceFile: vi.fn().mockRejectedValue(new Error('not found')), + findManifestEntryByTableRef: vi.fn().mockResolvedValue(null), + } as any, + }); + const { tool } = makeTool(); + + const result = await tool.call( + { + connectionId: session.connectionId, + sourceName: 'mart_account_segments', + source: { + name: 'mart_account_segments', + columns: [{ name: 'is_large_contract', type: 'boolean', expr: 'contract_arr_cents >= 20000000' }], + measures: [{ name: 'account_count', expr: 'count(account_id)' }], + } as any, + } as any, + { ...baseContext, session }, + ); + + expect(result.structured.success).toBe(true); + expect((session.semanticLayerService as any).writeSource).toHaveBeenCalledWith( + expect.any(String), + expect.objectContaining({ + descriptions: { + ktx: expect.stringContaining('mart_account_segments'), + }, + columns: [ + expect.objectContaining({ + descriptions: { + ktx: expect.stringContaining('is large contract'), + }, + }), + ], + }), + expect.any(String), + expect.any(String), + expect.any(String), + ); + }); }); describe('SlWriteSourceTool — disconnected-components warning in markdown', () => { diff --git a/packages/context/src/sl/tools/sl-write-source.tool.ts b/packages/context/src/sl/tools/sl-write-source.tool.ts index 39a5ad5e..638b130e 100644 --- a/packages/context/src/sl/tools/sl-write-source.tool.ts +++ b/packages/context/src/sl/tools/sl-write-source.tool.ts @@ -10,6 +10,7 @@ import { type SemanticLayerStructured, sourceDefinitionSchema, } from './base-semantic-layer.tool.js'; +import { normalizeSemanticLayerDescriptions } from '../description-normalization.js'; import { slToolConnectionIdSchema } from './connection-id-schema.js'; const sourceInputSchema = z.union([sourceDefinitionSchema, sourceOverlaySchema]); @@ -154,14 +155,16 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co semanticLayerService: SemanticLayerService, skipIndex: boolean, ): Promise> { - const isOverlay = !('table' in source && source.table) && !('sql' in source && source.sql); + const normalizedSource = normalizeSemanticLayerDescriptions(source, { fillMissing: !!context.session?.ingest }); + const isOverlay = + !('table' in normalizedSource && normalizedSource.table) && !('sql' in normalizedSource && normalizedSource.sql); const existing = await this.readSourceYamlFromService(semanticLayerService, connectionId, sourceName); const commitMessage = existing ? `${isOverlay ? 'Update overlay' : 'Rewrite source'}: ${sourceName}` : `${isOverlay ? 'Create overlay' : 'Create source'}: ${sourceName}`; - const yamlContent = YAML.stringify(source); + const yamlContent = YAML.stringify(normalizedSource); const orphanError = await this.rejectOrphanOverlay(semanticLayerService, connectionId, sourceName, yamlContent); if (orphanError) { @@ -172,7 +175,7 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co return this.buildOutput(false, [shadowError], sourceName, { yaml: yamlContent }); } - const validatedSource = source as SemanticLayerSource; + const validatedSource = normalizedSource as SemanticLayerSource; const validationResult = await semanticLayerService.validateWithProposedSource(connectionId, validatedSource); const validationErrors = validationResult.errors; const validationWarnings = [...validationResult.warnings]; diff --git a/packages/context/src/wiki/tools/wiki-write.tool.test.ts b/packages/context/src/wiki/tools/wiki-write.tool.test.ts index 3b51c6e3..9e947d84 100644 --- a/packages/context/src/wiki/tools/wiki-write.tool.test.ts +++ b/packages/context/src/wiki/tools/wiki-write.tool.test.ts @@ -37,6 +37,42 @@ describe('WikiWriteTool', () => { expect(result.markdown).toMatch(/created/i); }); + it('normalizes accidentally escaped markdown newlines before writing', async () => { + const { tool, wikiService } = makeTool(); + + await tool.call( + { + key: 'large-contract-requesters', + summary: 'Cross-schema Metabase query', + content: + '# Large Contract Requesters\\n\\n**Source card:** Metabase #110\\n\\n## SQL\\n\\n```sql\\nselect * from orbit_analytics.mart_account_segments\\n```\\n', + } as any, + baseContext, + ); + + expect(wikiService.writePage.mock.calls[0][4]).toBe( + '# Large Contract Requesters\n\n**Source card:** Metabase #110\n\n## SQL\n\n```sql\nselect * from orbit_analytics.mart_account_segments\n```\n', + ); + expect(wikiService.syncSinglePage.mock.calls[0][4]).toBe( + '# Large Contract Requesters\n\n**Source card:** Metabase #110\n\n## SQL\n\n```sql\nselect * from orbit_analytics.mart_account_segments\n```\n', + ); + }); + + it('preserves intentional escaped newline examples in inline code', async () => { + const { tool, wikiService } = makeTool(); + + await tool.call( + { + key: 'newline-token', + summary: 'Escaped newline token', + content: 'Use `\\n\\n` when documenting the literal separator.', + } as any, + baseContext, + ); + + expect(wikiService.writePage.mock.calls[0][4]).toBe('Use `\\n\\n` when documenting the literal separator.'); + }); + it('skips syncSinglePage when session is worktree-scoped', async () => { const { tool, wikiService } = makeTool(); const session: ToolSession = { diff --git a/packages/context/src/wiki/tools/wiki-write.tool.ts b/packages/context/src/wiki/tools/wiki-write.tool.ts index f0ba954d..a2930fd8 100644 --- a/packages/context/src/wiki/tools/wiki-write.tool.ts +++ b/packages/context/src/wiki/tools/wiki-write.tool.ts @@ -47,6 +47,22 @@ interface WikiWriteStructured { action?: 'created' | 'updated'; } +function looksLikeEscapedMarkdown(content: string): boolean { + const withoutInlineCode = content.replace(/`[^`]*`/g, ''); + return /\\n\\n|(?:^|\\n)#{1,6}\s|\\n[-*]\s|\\n\d+\.\s|\\n```|\\n\|/.test(withoutInlineCode); +} + +function normalizeAccidentalEscapedMarkdownNewlines(content: string): string { + const escapedBreaks = content.match(/\\[rn]/g)?.length ?? 0; + if (escapedBreaks < 2) return content; + + const actualBreaks = content.match(/\r?\n/g)?.length ?? 0; + if (actualBreaks > 0 && escapedBreaks <= actualBreaks * 4) return content; + if (!looksLikeEscapedMarkdown(content)) return content; + + return content.replace(/\\r\\n/g, '\n').replace(/\\n/g, '\n').replace(/\\r/g, '\n'); +} + export class WikiWriteTool extends BaseTool { readonly name = 'wiki_write'; @@ -125,7 +141,7 @@ tags/refs/sl_refs use REPLACE semantics: omit to keep existing on update, [] to }; if (input.content) { - finalContent = input.content; + finalContent = normalizeAccidentalEscapedMarkdownNewlines(input.content); } else { const editResult = applySqlEdits(existing?.content ?? '', input.replacements ?? []); if (!editResult.success) { diff --git a/python/ktx-sl/semantic_layer/models.py b/python/ktx-sl/semantic_layer/models.py index 9a6a514f..7e922933 100644 --- a/python/ktx-sl/semantic_layer/models.py +++ b/python/ktx-sl/semantic_layer/models.py @@ -36,6 +36,22 @@ class SourceColumnTests(BaseModel): dbt_by_package: dict[str, list[str]] | None = None +_DEFAULT_DESCRIPTION_PRIORITY = ["user", "ai", "dbt", "db"] + + +def _resolve_description_map(descriptions: dict[str, str] | None) -> str | None: + if not descriptions: + return None + for source in _DEFAULT_DESCRIPTION_PRIORITY: + text = descriptions.get(source) + if text: + return text + for text in descriptions.values(): + if text: + return text + return None + + class FreshnessDbt(BaseModel): raw: Any | None = None loaded_at_field: str | None = None @@ -47,12 +63,19 @@ class SourceColumn(BaseModel): visibility: ColumnVisibility = ColumnVisibility.PUBLIC role: ColumnRole = ColumnRole.DEFAULT description: str | None = None + descriptions: dict[str, str] | None = None expr: str | None = None natural_granularity: str | None = None constraints: dict[str, ColumnDbtConstraints] | None = None enum_values: dict[str, list[str]] | None = None tests: SourceColumnTests | None = None + @model_validator(mode="after") + def resolve_description(self) -> SourceColumn: + if self.description is None: + self.description = _resolve_description_map(self.descriptions) + return self + class JoinDeclaration(BaseModel): to: str @@ -84,6 +107,7 @@ class DefaultTimeDimensionDbt(BaseModel): class SourceDefinition(BaseModel): name: str description: str | None = None + descriptions: dict[str, str] | None = None table: str | None = None sql: str | None = None grain: list[str] @@ -97,6 +121,8 @@ class SourceDefinition(BaseModel): @model_validator(mode="after") def validate_source(self) -> SourceDefinition: + if self.description is None: + self.description = _resolve_description_map(self.descriptions) if self.table and self.sql: raise ValueError("'table' and 'sql' are mutually exclusive") if not self.grain: diff --git a/python/ktx-sl/tests/test_models.py b/python/ktx-sl/tests/test_models.py index b6468462..e227bef9 100644 --- a/python/ktx-sl/tests/test_models.py +++ b/python/ktx-sl/tests/test_models.py @@ -33,6 +33,14 @@ class TestSourceColumn: assert col.visibility == ColumnVisibility.HIDDEN assert col.role == ColumnRole.TIME + def test_descriptions_map_resolves_visible_description(self): + col = SourceColumn( + name="account_id", + type="string", + descriptions={"ktx": "Identifier for the related account."}, + ) + assert col.description == "Identifier for the related account." + def test_invalid_type(self): with pytest.raises(ValidationError): SourceColumn(name="id", type="integer") @@ -63,6 +71,16 @@ class TestSourceDefinition: assert src.is_sql_source assert not src.is_table_source + def test_descriptions_map_resolves_visible_description(self): + src = SourceDefinition( + name="orders", + descriptions={"ktx": "Semantic-layer source for orders."}, + table="public.orders", + grain=["id"], + columns=[SourceColumn(name="id", type="number")], + ) + assert src.description == "Semantic-layer source for orders." + def test_table_and_sql_mutually_exclusive(self): with pytest.raises(ValidationError, match="mutually exclusive"): SourceDefinition(