From f3f893bf01e5b7e7db30a58b1996364540ae82ed Mon Sep 17 00:00:00 2001 From: Andrey Avtomonov Date: Wed, 10 Jun 2026 14:06:13 +0200 Subject: [PATCH] fix: read semantic sources safely (#284) * fix: read semantic sources safely * test: retarget reindex per-scope error case to a broken manifest Reading a broken standalone source was made non-fatal in de1f1a8d (it is surfaced for repair instead of throwing), so the reindex per-scope error test no longer captured an error. Point it at a corrupt manifest shard, which is the remaining fatal read failure the per-scope catch must isolate, and assert the captured error names the offending file. * fix(sl): decouple semantic-layer file names from warehouse naming rules The in-file `name:` field is now the sole source identity; the filename is a derived label that never participates in identity. This removes the "Unsafe semantic-layer source name" failure class entirely: any warehouse identifier (Snowflake's uppercase SIGNED_UP, EVENT$LOG, dotted names) can be read, overlaid, edited, and deleted. - New `source-files.ts`: one total filename derivation (safe lowercase names verbatim; otherwise slug + sha256-hash suffix, immune to case-insensitive-filesystem collisions) and one by-name file resolver. - Reads resolve by name everywhere; the path-from-name fast path and `assertSafeSourceName` are gone. - Writes resolve-then-write: rewrites land on the file that declares the name (human renames survive); new sources get a derived filename; a derived path occupied by a different source fails instead of clobbering. - `readSourceFile` returns null for missing files instead of forcing every caller to launder IO errors; `deleteSource` distinguishes manifest-backed sources from not-found instead of silently succeeding. - `sl_write_source` accepts verbatim warehouse identifiers (snake_case is now a recommendation for new sources) and rejects sourceName/source.name mismatches; `sl_edit_source` rejects name-changing edits. - Ingest projection commits, gate-repair allowlists, and touched-source derivation use resolved paths / in-file names instead of interpolating `/.yaml`. - Collapsed the five parallel path derivations and duplicated path-token helpers onto the shared module; dropped dead service methods. * fix(sl): resolve sources by declared name end-to-end and gate warehouse SQL with the parser-backed validator - Key broken/renamed semantic-layer files by their recoverable in-file name (slSourceNameForFile) so mid-edit sources stay reachable under their real identity in reads, listings, and search - Derive finalization touched sources from composed-source diffs and recover deleted files' declared names from the pre-change commit instead of parsing hash-derived filenames - Resolve revert/rollback paths against history (listFilesAtCommit) so human-renamed files are restored where they lived at preHead - Validate ingest sql_execution through the daemon's sqlglot validateReadOnly in the connection's dialect, sharing one driver-to-dialect map (sql-analysis/dialect.ts) across MCP and ingest - Harden the local read-only SQL backstop: accept leading comments, reject smuggled second statements, and strip trailing semicolons/comments before row-limit wrapping --- .../content/docs/guides/writing-context.mdx | 11 +- .../cli/src/connectors/sqlite/connector.ts | 26 +-- .../cli/src/connectors/sqlserver/connector.ts | 4 +- .../src/context/connections/read-only-sql.ts | 121 +++++++++- packages/cli/src/context/core/git.service.ts | 17 +- .../adapters/historic-sql/projection.ts | 3 +- .../src/context/ingest/final-gate-repair.ts | 7 +- .../src/context/ingest/finalization-scope.ts | 32 ++- .../context/ingest/ingest-bundle.runner.ts | 105 +++++++-- .../context/ingest/local-bundle-runtime.ts | 16 +- .../cli/src/context/ingest/local-ingest.ts | 5 + .../create-warehouse-verification-tools.ts | 4 +- .../sql-execution.tool.ts | 22 +- .../src/context/mcp/local-project-ports.ts | 69 +----- .../cli/src/context/memory/local-memory.ts | 3 + .../context/memory/memory-agent.service.ts | 2 +- .../scan/local-enrichment-artifacts.ts | 3 +- packages/cli/src/context/sl/local-query.ts | 41 +--- packages/cli/src/context/sl/local-sl.ts | 140 ++++------- .../src/context/sl/semantic-layer.service.ts | 127 +++++----- packages/cli/src/context/sl/source-files.ts | 160 +++++++++++++ .../sl/tools/base-semantic-layer.tool.ts | 8 +- .../context/sl/tools/sl-edit-source.tool.ts | 23 +- .../sl/tools/sl-warehouse-validation.ts | 86 ++++--- .../context/sl/tools/sl-write-source.tool.ts | 27 ++- .../cli/src/context/sql-analysis/dialect.ts | 23 ++ packages/cli/src/ingest.ts | 11 +- packages/cli/src/local-adapters.ts | 4 +- .../cli/src/skills/lookml_ingest/SKILL.md | 2 +- .../cli/src/skills/metricflow_ingest/SKILL.md | 4 +- packages/cli/src/skills/sl/SKILL.md | 2 +- .../context/connections/read-only-sql.test.ts | 89 ++++++- .../cli/test/context/core/git.service.test.ts | 23 +- .../test/context/index-sync/reindex.test.ts | 9 +- .../context/ingest/final-gate-repair.test.ts | 10 +- .../context/ingest/finalization-scope.test.ts | 46 +++- ...ingest-bundle.runner.isolated-diff.test.ts | 22 +- .../ingest/ingest-bundle.runner.test.ts | 51 +++- .../sql-execution.tool.test.ts | 65 +++++- .../context/mcp/local-project-ports.test.ts | 101 +++++++- .../memory/memory-agent.service.test.ts | 4 + .../backend-conformance.test-utils.test.ts | 7 +- packages/cli/test/context/sl/local-sl.test.ts | 219 +++++++++++++++--- .../sl/pglite-sl-search-prototype.test.ts | 9 +- .../context/sl/semantic-layer.service.test.ts | 207 ++++++++++++++++- .../sl/sl-source-seeding.test-utils.ts | 22 ++ .../cli/test/context/sl/source-files.test.ts | 154 ++++++++++++ .../sl/tools/sl-edit-source.tool.test.ts | 21 +- .../context/sl/tools/sl-rollback.tool.test.ts | 38 ++- .../sl/tools/sl-write-source.tool.test.ts | 39 +++- .../test/context/sql-analysis/dialect.test.ts | 29 +++ 51 files changed, 1797 insertions(+), 476 deletions(-) create mode 100644 packages/cli/src/context/sl/source-files.ts create mode 100644 packages/cli/src/context/sql-analysis/dialect.ts create mode 100644 packages/cli/test/context/sl/sl-source-seeding.test-utils.ts create mode 100644 packages/cli/test/context/sl/source-files.test.ts create mode 100644 packages/cli/test/context/sql-analysis/dialect.test.ts diff --git a/docs-site/content/docs/guides/writing-context.mdx b/docs-site/content/docs/guides/writing-context.mdx index e4ac70c0..8703030e 100644 --- a/docs-site/content/docs/guides/writing-context.mdx +++ b/docs-site/content/docs/guides/writing-context.mdx @@ -44,12 +44,17 @@ Use this order for most context changes: Semantic sources are YAML files for queryable tables or custom SQL. They define agent-facing measures, dimensions, segments, joins, and grain. -Semantic source files live at: +Semantic source files live under: ```text -semantic-layer//.yaml +semantic-layer// ``` +The file's `name:` field is the source's identity — it carries the warehouse +identifier verbatim, including case. The filename is a derived label: simple +lowercase names get `.yaml`, anything else gets a slugged +filename. Renaming a file does not rename the source. + ### Minimal source ```yaml @@ -152,7 +157,7 @@ joins: | Field | Required | Description | |-------|----------|-------------| -| `name` | Yes | Source identifier. Use lowercase words and underscores. | +| `name` | Yes | Source identity (not the filename). When overlaying an ingested table, match the manifest identifier verbatim, including case (e.g. `SIGNED_UP`); for a new standalone source, lowercase words and underscores are recommended. | | `descriptions` | No | Description map keyed by source, such as `user`, `dbt`, or `ai`. | | `table` or `sql` | Yes | Database table or custom SQL expression. Use exactly one. | | `grain` | Yes | Columns that uniquely identify a row at the source grain. | diff --git a/packages/cli/src/connectors/sqlite/connector.ts b/packages/cli/src/connectors/sqlite/connector.ts index f5ba2a55..4cbeeada 100644 --- a/packages/cli/src/connectors/sqlite/connector.ts +++ b/packages/cli/src/connectors/sqlite/connector.ts @@ -97,30 +97,6 @@ function sqlitePathFromUrl(url: string): string { return url; } -function stripLeadingSqlComments(sql: string): string { - let index = 0; - while (index < sql.length) { - while (/\s/.test(sql[index] ?? '')) { - index += 1; - } - if (sql.startsWith('--', index)) { - const end = sql.indexOf('\n', index + 2); - index = end === -1 ? sql.length : end + 1; - continue; - } - if (sql.startsWith('/*', index)) { - const end = sql.indexOf('*/', index + 2); - if (end === -1) { - return sql.slice(index); - } - index = end + 2; - continue; - } - break; - } - return sql.slice(index); -} - export function isKtxSqliteConnectionConfig( connection: KtxSqliteConnectionConfig | undefined, ): connection is KtxSqliteConnectionConfig { @@ -255,7 +231,7 @@ export class KtxSqliteScanConnector implements KtxScanConnector { async executeReadOnly(input: KtxSqliteReadOnlyQueryInput, _ctx: KtxScanContext): Promise { this.assertConnection(input.connectionId); - const result = this.query(limitSqlForExecution(stripLeadingSqlComments(input.sql), input.maxRows), input.params); + const result = this.query(limitSqlForExecution(input.sql, input.maxRows), input.params); return { ...result, rowCount: result.rows.length }; } diff --git a/packages/cli/src/connectors/sqlserver/connector.ts b/packages/cli/src/connectors/sqlserver/connector.ts index 5dd9969b..fa7531ba 100644 --- a/packages/cli/src/connectors/sqlserver/connector.ts +++ b/packages/cli/src/connectors/sqlserver/connector.ts @@ -1,4 +1,4 @@ -import { assertReadOnlySql } from '../../context/connections/read-only-sql.js'; +import { assertReadOnlySql, stripTrailingSqlNoise } from '../../context/connections/read-only-sql.js'; import { getDialectForDriver } from '../../context/connections/dialects.js'; import { tryConstraintQuery } from '../../context/scan/constraint-discovery.js'; import { scopedTableNames } from '../../context/scan/table-ref.js'; @@ -284,7 +284,7 @@ function isDeniedError(error: unknown): boolean { } function limitSqlForSqlServerExecution(sqlText: string, maxRows: number | undefined): string { - const trimmed = assertReadOnlySql(sqlText).replace(/;+\s*$/, ''); + const trimmed = stripTrailingSqlNoise(assertReadOnlySql(sqlText)); if (!maxRows) { return trimmed; } diff --git a/packages/cli/src/context/connections/read-only-sql.ts b/packages/cli/src/context/connections/read-only-sql.ts index fe71a0c3..cd2ab0b1 100644 --- a/packages/cli/src/context/connections/read-only-sql.ts +++ b/packages/cli/src/context/connections/read-only-sql.ts @@ -2,16 +2,133 @@ const MUTATING_SQL = /^\s*(insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh)\b/i; const READ_SQL = /^\s*(select|with)\b/i; +// Agents (and the daemon's sqlglot validator, which ignores comments) routinely +// emit read-only queries prefixed with `-- ...` or `/* ... */`. Strip leading +// comments so the prefix check sees the real statement; otherwise valid SELECT/WITH +// SQL is rejected here while the parser-backed validator accepts it. +function stripLeadingSqlComments(sql: string): string { + let index = 0; + while (index < sql.length) { + while (/\s/.test(sql[index] ?? '')) { + index += 1; + } + if (sql.startsWith('--', index)) { + const end = sql.indexOf('\n', index + 2); + index = end === -1 ? sql.length : end + 1; + continue; + } + if (sql.startsWith('/*', index)) { + const end = sql.indexOf('*/', index + 2); + if (end === -1) { + return sql.slice(index); + } + index = end + 2; + continue; + } + break; + } + return sql.slice(index); +} + +// Lexes past one string literal, quoted identifier, or comment starting at +// `index`, using standard-SQL rules ('' and "" escapes; no dialect extensions +// such as backslash escapes or dollar quoting). Returns the index after the +// token, or `index` unchanged when no quoted/comment token starts there. +function skipQuotedOrComment(sql: string, index: number): number { + const quote = sql[index]; + if (quote === "'" || quote === '"') { + let i = index + 1; + while (i < sql.length) { + if (sql[i] === quote) { + if (sql[i + 1] === quote) { + i += 2; + continue; + } + return i + 1; + } + i += 1; + } + return sql.length; + } + if (sql.startsWith('--', index)) { + const end = sql.indexOf('\n', index + 2); + return end === -1 ? sql.length : end + 1; + } + if (sql.startsWith('/*', index)) { + const end = sql.indexOf('*/', index + 2); + return end === -1 ? sql.length : end + 2; + } + return index; +} + +// Backstop against statement smuggling (`select 1; drop table x`): reject any +// semicolon that is followed by real content. Semicolons inside string +// literals, quoted identifiers, and comments are fine, as are trailing +// semicolons (optionally followed by whitespace and comments). This deliberately +// lexes standard SQL only, so dialect-specific escapes can cause a false +// reject — never a false accept; the canonical gate is the daemon's +// sqlglot-backed validateReadOnly. +function assertSingleSqlStatement(sql: string): void { + let index = 0; + let sawSemicolon = false; + while (index < sql.length) { + const skipped = skipQuotedOrComment(sql, index); + if (skipped > index) { + index = skipped; + continue; + } + if (sql[index] === ';') { + sawSemicolon = true; + } else if (sawSemicolon && !/\s/.test(sql[index])) { + throw new Error('Only one SQL statement can be executed.'); + } + index += 1; + } +} + export function assertReadOnlySql(sql: string): string { - const trimmed = sql.trim(); + const trimmed = stripLeadingSqlComments(sql).trim(); if (!READ_SQL.test(trimmed) || MUTATING_SQL.test(trimmed)) { throw new Error('Only read-only SELECT/WITH queries can be executed locally.'); } + assertSingleSqlStatement(trimmed); return trimmed; } +// `assertReadOnlySql` deliberately keeps trailing semicolons, comments, and +// whitespace (e.g. `select 1; -- done`) — harmless for direct single-statement +// execution. A row-limit subquery wrapper needs a bare expression instead: a +// trailing `;` would sit illegally inside the subquery, and a trailing line +// comment would comment out the closing paren and limit clause. Lex forward with +// the same standard-SQL rules as the single-statement gate and truncate at the +// end of the last meaningful token, dropping trailing semicolons, comments, and +// whitespace. Characters inside string literals and quoted identifiers stay +// meaningful, so a `;` or `--` within a literal is never mistaken for a +// terminator (a plain regex cannot make that distinction). +export function stripTrailingSqlNoise(sql: string): string { + let index = 0; + let meaningfulEnd = 0; + while (index < sql.length) { + if (sql.startsWith('--', index) || sql.startsWith('/*', index)) { + index = skipQuotedOrComment(sql, index); + continue; + } + const afterQuoted = skipQuotedOrComment(sql, index); + if (afterQuoted > index) { + meaningfulEnd = afterQuoted; + index = afterQuoted; + continue; + } + if (sql[index] !== ';' && !/\s/.test(sql[index] ?? '')) { + meaningfulEnd = index + 1; + } + index += 1; + } + return sql.slice(0, meaningfulEnd); +} + export function limitSqlForExecution(sql: string, maxRows: number | undefined): string { - const trimmed = assertReadOnlySql(sql).replace(/;+\s*$/, ''); + const trimmed = stripTrailingSqlNoise(assertReadOnlySql(sql)); if (!maxRows) { return trimmed; } diff --git a/packages/cli/src/context/core/git.service.ts b/packages/cli/src/context/core/git.service.ts index a9638ea5..9a826e4b 100644 --- a/packages/cli/src/context/core/git.service.ts +++ b/packages/cli/src/context/core/git.service.ts @@ -557,12 +557,13 @@ export class GitService { } /** - * List all paths under the working tree that match `pathSpec`, scoped to HEAD. - * Used for the reconciler's first-ever run when there's no watermark to diff from. + * List all paths matching `pathSpec` as they exist at `commitHash`. Reads from + * git object storage, so it's safe against concurrent working-tree mutations + * and can recover paths (e.g. a human-renamed file) that no longer exist on disk. */ - async listFilesAtHead(pathSpec: string): Promise { + async listFilesAtCommit(pathSpec: string, commitHash: string): Promise { try { - const raw = await this.git.raw(['ls-tree', '-r', '-z', '--name-only', 'HEAD', '--', pathSpec]); + const raw = await this.git.raw(['ls-tree', '-r', '-z', '--name-only', commitHash, '--', pathSpec]); if (!raw) { return []; } @@ -572,6 +573,14 @@ export class GitService { } } + /** + * List all paths under the working tree that match `pathSpec`, scoped to HEAD. + * Used for the reconciler's first-ever run when there's no watermark to diff from. + */ + async listFilesAtHead(pathSpec: string): Promise { + return this.listFilesAtCommit(pathSpec, 'HEAD'); + } + /** * Collapse all commits between `preHead` and current HEAD into a single commit with the given * message. Used by the memory agent to squash N per-tool-call commits into one ingest commit. diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/projection.ts b/packages/cli/src/context/ingest/adapters/historic-sql/projection.ts index 2c7830a2..b3470441 100644 --- a/packages/cli/src/context/ingest/adapters/historic-sql/projection.ts +++ b/packages/cli/src/context/ingest/adapters/historic-sql/projection.ts @@ -3,6 +3,7 @@ import { dirname, join, relative } from 'node:path'; import YAML from 'yaml'; import type { MemoryAction } from '../../../../context/memory/types.js'; import { rawSourcesDirForSync } from '../../raw-sources-paths.js'; +import { isSlYamlPath } from '../../../sl/source-files.js'; import type { FinalizationOverrideReplay } from '../../types.js'; import { mergeUsagePreservingExternal } from '../live-database/manifest.js'; import { historicSqlEvidenceEnvelopeSchema, type HistoricSqlEvidenceEnvelope } from './evidence.js'; @@ -251,7 +252,7 @@ export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInp const patternEvidence = evidence.filter((entry): entry is HistoricSqlEvidenceEnvelope & { kind: 'pattern' } => entry.kind === 'pattern'); const schemaRoot = join(input.workdir, 'semantic-layer', input.connectionId, '_schema'); - for (const file of (await walkFiles(schemaRoot)).filter((candidate) => candidate.endsWith('.yaml') || candidate.endsWith('.yml'))) { + for (const file of (await walkFiles(schemaRoot)).filter(isSlYamlPath)) { const path = join(schemaRoot, file); const before = await readFile(path, 'utf-8'); const shard = (YAML.parse(before) ?? {}) as ManifestShard; diff --git a/packages/cli/src/context/ingest/final-gate-repair.ts b/packages/cli/src/context/ingest/final-gate-repair.ts index f32178d8..2b8e5c33 100644 --- a/packages/cli/src/context/ingest/final-gate-repair.ts +++ b/packages/cli/src/context/ingest/final-gate-repair.ts @@ -2,7 +2,6 @@ import { mkdir, readFile, writeFile } from 'node:fs/promises'; import { dirname, join } from 'node:path'; import { z } from 'zod'; import type { AgentRunnerPort, KtxRuntimeToolSet } from '../../context/llm/runtime-port.js'; -import type { TouchedSlSource } from '../../context/tools/touched-sl-sources.js'; import type { IngestTraceWriter } from './ingest-trace.js'; import { traceTimed } from './ingest-trace.js'; @@ -149,11 +148,13 @@ function buildToolSet(input: { export function finalGateRepairPaths(input: { changedWikiPageKeys: string[]; - touchedSlSources: TouchedSlSource[]; + // Resolved by the caller: SL filenames are derived labels, so the repair + // allowlist must carry the real on-disk paths, not name-interpolated ones. + touchedSlSourcePaths: string[]; }): string[] { return [ ...new Set([ - ...input.touchedSlSources.map((source) => `semantic-layer/${source.connectionId}/${source.sourceName}.yaml`), + ...input.touchedSlSourcePaths, ...input.changedWikiPageKeys.map((pageKey) => `wiki/global/${pageKey}.md`), ]), ].sort(); diff --git a/packages/cli/src/context/ingest/finalization-scope.ts b/packages/cli/src/context/ingest/finalization-scope.ts index b5ace2b9..6d7e4a42 100644 --- a/packages/cli/src/context/ingest/finalization-scope.ts +++ b/packages/cli/src/context/ingest/finalization-scope.ts @@ -1,3 +1,4 @@ +import { isSlYamlPath } from '../../context/sl/source-files.js'; import type { SemanticLayerSource } from '../../context/sl/types.js'; import type { TouchedSlSource } from '../../context/tools/touched-sl-sources.js'; import type { IngestReportFinalizationMismatch } from './reports.js'; @@ -64,39 +65,36 @@ export function deriveFinalizationWikiPageKeys(paths: string[]): string[] { ); } -export async function deriveFinalizationTouchedSources( - input: DeriveTouchedSourcesInput, -): Promise { +// Source identity is the in-file `name:`; filenames are derived labels (see +// source-files.ts), so a changed path — manifest shard or standalone file — +// cannot be mapped to a source by parsing its filename. Instead, every changed +// semantic-layer file is attributed through the before/after diff of its +// connection's composed sources. A changed file whose connection diff is empty +// cannot be attributed to any source and is surfaced as unresolved. +export function deriveFinalizationTouchedSources(input: DeriveTouchedSourcesInput): DeriveTouchedSourcesResult { const touched = new Map(); const unresolvedPaths: string[] = []; + const pathsByConnection = new Map(); for (const path of input.changedPaths) { - if (!path.startsWith('semantic-layer/') || !(path.endsWith('.yaml') || path.endsWith('.yml'))) { + if (!path.startsWith('semantic-layer/') || !isSlYamlPath(path)) { continue; } - const parts = path.split('/'); - const connectionId = parts[1] ?? ''; + const connectionId = path.split('/')[1] ?? ''; if (!connectionId) { unresolvedPaths.push(path); continue; } - if (parts[2] !== '_schema') { - const fileName = parts.at(-1) ?? ''; - const sourceName = fileName.replace(/\.ya?ml$/, ''); - if (!sourceName) { - unresolvedPaths.push(path); - continue; - } - touched.set(`${connectionId}:${sourceName}`, { connectionId, sourceName }); - continue; - } + pathsByConnection.set(connectionId, [...(pathsByConnection.get(connectionId) ?? []), path]); + } + for (const [connectionId, paths] of pathsByConnection) { const changedNames = changedSourceNames( input.beforeSourcesByConnection.get(connectionId) ?? [], input.afterSourcesByConnection.get(connectionId) ?? [], ); if (changedNames.length === 0) { - unresolvedPaths.push(path); + unresolvedPaths.push(...paths); continue; } for (const sourceName of changedNames) { diff --git a/packages/cli/src/context/ingest/ingest-bundle.runner.ts b/packages/cli/src/context/ingest/ingest-bundle.runner.ts index 6f5372d2..0e5086e4 100644 --- a/packages/cli/src/context/ingest/ingest-bundle.runner.ts +++ b/packages/cli/src/context/ingest/ingest-bundle.runner.ts @@ -8,6 +8,7 @@ import { createRuntimeToolDescriptorFromAiTool } from '../../context/llm/runtime import type { KtxRuntimeToolSet } from '../../context/llm/runtime-port.js'; import type { CaptureSession, MemoryAction } from '../../context/memory/types.js'; import type { SemanticLayerService } from '../../context/sl/semantic-layer.service.js'; +import { isSlYamlPath, slSourceFilePath, slSourceNameForFile, sourceNameFromPath } from '../../context/sl/source-files.js'; import type { SemanticLayerSource } from '../../context/sl/types.js'; import type { SlValidationDeps } from '../../context/sl/tools/sl-warehouse-validation.js'; import { createTouchedSlSources, type TouchedSlSource } from '../../context/tools/touched-sl-sources.js'; @@ -498,7 +499,7 @@ export class IngestBundleRunner { const files = await this.deps.semanticLayerService.listFilesForConnection(connectionId); const names = files .filter((f) => !f.startsWith('_schema/')) - .map((f) => f.replace(/\.yaml$/, '')) + .map((f) => sourceNameFromPath(f)) .sort((left, right) => left.localeCompare(right)); const body = names.length > 0 ? names.join('\n') : '(no sources yet)'; return `## ${connectionId}\n${body}`; @@ -791,14 +792,52 @@ export class IngestBundleRunner { ].sort(); } - private touchedSlSourcesFromPaths(paths: string[]): TouchedSlSource[] { - return paths - .filter((path) => path.startsWith('semantic-layer/') && path.endsWith('.yaml') && !path.includes('/_schema/')) - .map((path) => { - const [, connectionId, fileName] = path.split('/'); - return { connectionId: connectionId ?? '', sourceName: (fileName ?? '').replace(/\.yaml$/, '') }; - }) - .filter((source) => source.connectionId.length > 0 && source.sourceName.length > 0); + private async touchedSlSourcesFromPaths( + worktree: IngestSessionWorktree, + paths: string[], + deletedFileSha: string, + ): Promise { + const sources: TouchedSlSource[] = []; + for (const path of paths) { + if (!path.startsWith('semantic-layer/') || !isSlYamlPath(path) || path.includes('/_schema/')) { + continue; + } + const [, connectionId] = path.split('/'); + if (!connectionId) { + continue; + } + // Source identity is the in-file `name:`, never the filename — an uppercase + // warehouse source like `WIDGET_SALES` lives in a hash-derived + // `widget_sales-.yaml`, so parsing the basename yields a phantom name. + // Read the live file; when it was deleted this run, recover its declared + // name from the pre-change commit the way `revertSourceToPreHead` resolves a + // gone file from history. The filename is a last resort only when the content + // is unrecoverable from both. + let content: string | null; + try { + content = await readFile(join(worktree.workdir, path), 'utf-8'); + } catch { + content = await worktree.git.getFileAtCommit(path, deletedFileSha).catch(() => null); + } + const sourceName = content === null ? sourceNameFromPath(path) : slSourceNameForFile(path, content); + if (sourceName.length > 0) { + sources.push({ connectionId, sourceName }); + } + } + return sources; + } + + // Inverse direction for commits and repair allowlists: resolve each touched + // source to its real on-disk path, falling back to the writer's derived + // filename when the file was deleted in this run. + private async touchedSlSourcePaths(workdir: string, touched: TouchedSlSource[]): Promise { + const service = this.deps.semanticLayerService.forWorktree(workdir); + const paths: string[] = []; + for (const source of touched) { + const file = await service.readSourceFile(source.connectionId, source.sourceName); + paths.push(file?.path ?? slSourceFilePath(source.connectionId, source.sourceName)); + } + return paths; } private touchedSlSourcesFromActions(actions: MemoryAction[], fallbackConnectionId: string): TouchedSlSource[] { @@ -1558,7 +1597,7 @@ export class IngestBundleRunner { projectionTouchedSources = projection.touchedSources; projectionChangedWikiPageKeys = projection.changedWikiPageKeys; const projectionPaths = [ - ...projection.touchedSources.map((source) => `semantic-layer/${source.connectionId}/${source.sourceName}.yaml`), + ...(await this.touchedSlSourcePaths(sessionWorktree.workdir, projection.touchedSources)), ...projection.changedWikiPageKeys.map((pageKey) => `wiki/global/${pageKey}.md`), ]; projectionTouchedPaths = projectionPaths; @@ -1740,7 +1779,11 @@ export class IngestBundleRunner { await validateFinalIngestArtifacts({ connectionIds: slConnectionIds, changedWikiPageKeys: this.wikiPageKeysFromPaths(touchedPaths), - touchedSlSources: this.touchedSlSourcesFromPaths(touchedPaths), + touchedSlSources: await this.touchedSlSourcesFromPaths( + sessionWorktree, + touchedPaths, + await sessionWorktree.git.revParseHead(), + ), wikiService: this.deps.wikiService.forWorktree(sessionWorktree.workdir), semanticLayerService: this.deps.semanticLayerService.forWorktree(sessionWorktree.workdir), validateTouchedSources: (touched) => @@ -2289,20 +2332,34 @@ export class IngestBundleRunner { ) : []; - const changedConnectionIds = [ - ...new Set([ - ...slConnectionIds, - ...finalizationTouchedPaths - .filter((path) => path.startsWith('semantic-layer/')) - .map((path) => path.split('/')[1]) - .filter((connectionId): connectionId is string => Boolean(connectionId)), - ]), - ].sort(); + // Validate the write scope before deriving touched sources: attribution + // by before/after diff is only defined for connections whose + // pre-finalization snapshot was loaded (slConnectionIds), and an + // out-of-scope write would otherwise surface downstream as a bogus + // unresolved-path or declaration-mismatch failure instead of the real + // policy violation. + await traceTimed( + runTrace, + 'finalization', + 'semantic_layer_target_policy', + { + sourceKey: job.sourceKey, + allowedTargetConnectionIds: slConnectionIds, + touchedPaths: [...new Set(finalizationTouchedPaths)].sort(), + }, + async () => { + assertSemanticLayerTargetPathsAllowed({ + paths: finalizationTouchedPaths, + allowedConnectionIds: new Set(slConnectionIds), + }); + }, + ); + const postFinalizationSourcesByConnection = await this.loadSourcesByConnection( sessionWorktree.workdir, - changedConnectionIds, + slConnectionIds, ); - const scope = await deriveFinalizationTouchedSources({ + const scope = deriveFinalizationTouchedSources({ changedPaths: finalizationTouchedPaths, beforeSourcesByConnection: preFinalizationSourcesByConnection, afterSourcesByConnection: postFinalizationSourcesByConnection, @@ -2437,7 +2494,7 @@ export class IngestBundleRunner { ...(isolatedDiffEnabled ? projectionTouchedSources : []), ...workUnitOutcomes.flatMap((outcome) => outcome.touchedSlSources), ...this.touchedSlSourcesFromActions(reconcileActions, job.connectionId), - ...this.touchedSlSourcesFromPaths(postReconciliationPaths), + ...(await this.touchedSlSourcesFromPaths(sessionWorktree, postReconciliationPaths, preReconciliationSha)), ...finalizationTouchedSources, ]); const finalWikiGateScope = await this.wikiPageKeysForFinalGates({ @@ -2528,7 +2585,7 @@ export class IngestBundleRunner { const gateError = this.errorMessage(error); const repairPaths = finalGateRepairPaths({ changedWikiPageKeys: finalChangedWikiPageKeys, - touchedSlSources: finalTouchedSlSources, + touchedSlSourcePaths: await this.touchedSlSourcePaths(sessionWorktree.workdir, finalTouchedSlSources), }); emitStageProgress('final_gates', 89, 'Repairing final artifact gates'); const gateRepair = await repairFinalGateFailure({ diff --git a/packages/cli/src/context/ingest/local-bundle-runtime.ts b/packages/cli/src/context/ingest/local-bundle-runtime.ts index 69b9159a..90f1b3ae 100644 --- a/packages/cli/src/context/ingest/local-bundle-runtime.ts +++ b/packages/cli/src/context/ingest/local-bundle-runtime.ts @@ -4,6 +4,7 @@ import { fileURLToPath } from 'node:url'; import YAML from 'yaml'; import { localConnectionInfoFromConfig } from '../../context/connections/local-warehouse-descriptor.js'; import type { KtxSqlQueryExecutorPort } from '../../context/connections/query-executor.js'; +import type { SqlAnalysisPort } from '../../context/sql-analysis/ports.js'; import type { KtxEmbeddingPort } from '../../context/core/embedding.js'; import type { KtxLogger } from '../../context/core/config.js'; import { noopLogger } from '../../context/core/config.js'; @@ -95,6 +96,7 @@ export interface CreateLocalBundleIngestRuntimeOptions { memoryModel?: string; semanticLayerCompute?: KtxSemanticLayerComputePort; queryExecutor?: KtxSqlQueryExecutorPort; + sqlAnalysis?: SqlAnalysisPort; jobIdFactory?: () => string; logger?: KtxLogger; embeddingProvider?: KtxEmbeddingProvider | null; @@ -271,16 +273,13 @@ class LocalShapeOnlySlValidator implements SlValidatorPort { } async validateSingleSource(deps: SlValidationDeps, connectionId: string, sourceName: string) { - let content: string; - try { - const file = await deps.semanticLayerService.readSourceFile(connectionId, sourceName); - content = file.content; - } catch (error) { - return this.validateComposedSource(deps, connectionId, sourceName, error); + const file = await deps.semanticLayerService.readSourceFile(connectionId, sourceName); + if (!file) { + return this.validateComposedSource(deps, connectionId, sourceName, 'no standalone or overlay file found'); } try { - const parsed = YAML.parse(content) as unknown as Record; + const parsed = YAML.parse(file.content) as unknown as Record; return this.validateParsedSource(sourceName, parsed); } catch (error) { return { @@ -519,6 +518,7 @@ class LocalIngestToolsetFactory implements IngestToolsetFactoryPort { authorResolver: GitAuthorResolverPort; slSourcesRepository: SlSourcesIndexPort; connections: SlConnectionCatalogPort; + sqlAnalysis?: SqlAnalysisPort; contextStore: SqliteContextEvidenceStore; embedding: KtxEmbeddingPort; }) { @@ -551,6 +551,7 @@ class LocalIngestToolsetFactory implements IngestToolsetFactoryPort { const slDiscoverTool = new SlDiscoverTool(slDeps, { maxSources: 25, minRrfScore: 0, maxDetailedSources: 5 }); const warehouseVerificationTools = createWarehouseVerificationTools({ connections: deps.connections, + ...(deps.sqlAnalysis ? { sqlAnalysis: deps.sqlAnalysis } : {}), fallbackFileStore: deps.project.fileStore, wikiSearchTool, slDiscoverTool, @@ -699,6 +700,7 @@ export function createLocalBundleIngestRuntime( authorResolver: new LocalAuthorResolver(), slSourcesRepository, connections, + ...(options.sqlAnalysis ? { sqlAnalysis: options.sqlAnalysis } : {}), contextStore, embedding, }); diff --git a/packages/cli/src/context/ingest/local-ingest.ts b/packages/cli/src/context/ingest/local-ingest.ts index 1a219629..5f7f8c5a 100644 --- a/packages/cli/src/context/ingest/local-ingest.ts +++ b/packages/cli/src/context/ingest/local-ingest.ts @@ -2,6 +2,7 @@ import { randomUUID } from 'node:crypto'; import { cp, mkdir, rm } from 'node:fs/promises'; import { isAbsolute, resolve } from 'node:path'; import type { KtxSqlQueryExecutorPort } from '../../context/connections/query-executor.js'; +import type { SqlAnalysisPort } from '../../context/sql-analysis/ports.js'; import type { KtxLogger } from '../../context/core/config.js'; import { createAbortError, isAbortError } from '../../context/core/abort.js'; import type { KtxSemanticLayerComputePort } from '../../context/daemon/semantic-layer-compute.js'; @@ -35,6 +36,7 @@ export interface RunLocalIngestOptions { memoryModel?: string; semanticLayerCompute?: KtxSemanticLayerComputePort; queryExecutor?: KtxSqlQueryExecutorPort; + sqlAnalysis?: SqlAnalysisPort; logger?: KtxLogger; embeddingProvider?: import('../../llm/types.js').KtxEmbeddingProvider | null; abortSignal?: AbortSignal; @@ -159,6 +161,7 @@ async function runScheduledPullJob(options: { memoryModel?: string; semanticLayerCompute?: KtxSemanticLayerComputePort; queryExecutor?: KtxSqlQueryExecutorPort; + sqlAnalysis?: SqlAnalysisPort; logger?: KtxLogger; embeddingProvider?: import('../../llm/types.js').KtxEmbeddingProvider | null; abortSignal?: AbortSignal; @@ -214,6 +217,7 @@ export async function runLocalIngest(options: RunLocalIngestOptions): Promise { readonly name = 'sql_execution'; - constructor(private readonly connections: SlConnectionCatalogPort) { + constructor( + private readonly connections: SlConnectionCatalogPort, + private readonly sqlAnalysis?: SqlAnalysisPort, + ) { super(); } @@ -69,9 +74,24 @@ export class SqlExecutionTool extends BaseTool { }; } + if (!this.sqlAnalysis) { + throw new Error('sql_execution requires parser-backed SQL validation.'); + } + let sql: string; let wrappedSql: string; try { + const connection = await this.connections.getConnectionById(input.connectionId); + if (!connection) { + throw new Error(`Connection not found: ${input.connectionId}`); + } + const validation = await this.sqlAnalysis.validateReadOnly( + input.sql, + sqlAnalysisDialectForDriver(connection.connectionType), + ); + if (!validation.ok) { + throw new Error(validation.error ?? 'SQL is not read-only.'); + } sql = assertReadOnlySql(input.sql); wrappedSql = limitSqlForExecution(sql, input.rowLimit); } catch (error) { diff --git a/packages/cli/src/context/mcp/local-project-ports.ts b/packages/cli/src/context/mcp/local-project-ports.ts index 8971c51d..4bada831 100644 --- a/packages/cli/src/context/mcp/local-project-ports.ts +++ b/packages/cli/src/context/mcp/local-project-ports.ts @@ -8,9 +8,12 @@ import { createKtxEntityDetailsService } from '../../context/scan/entity-details import type { KtxScanConnector } from '../../context/scan/types.js'; import type { LocalScanMcpOptions } from '../../context/scan/local-scan.js'; import { createKtxDiscoverDataService } from '../../context/search/discover.js'; -import type { SqlAnalysisDialect, SqlAnalysisPort } from '../../context/sql-analysis/ports.js'; +import { sqlAnalysisDialectForDriver } from '../../context/sql-analysis/dialect.js'; +import type { SqlAnalysisPort } from '../../context/sql-analysis/ports.js'; import { compileLocalSlQuery } from '../../context/sl/local-query.js'; import { createKtxDictionarySearchService } from '../../context/sl/dictionary-search.js'; +import { readLocalSlSource } from '../../context/sl/local-sl.js'; +import { assertSafeConnectionId } from '../../context/sl/source-files.js'; import { readLocalKnowledgePage, searchLocalKnowledgePages } from '../wiki/local-knowledge.js'; import type { KtxMcpContextPorts, KtxMcpProgressCallback, KtxSqlExecutionResponse } from './types.js'; @@ -22,64 +25,12 @@ interface CreateLocalProjectMcpContextPortsOptions { embeddingService: KtxEmbeddingPort | null; } -function dialectForDriver(driver: string | undefined): string { - const normalized = (driver ?? 'postgres').toUpperCase(); - const map: Record = { - POSTGRES: 'postgres', - BIGQUERY: 'bigquery', - SNOWFLAKE: 'snowflake', - MYSQL: 'mysql', - SQLSERVER: 'tsql', - SQLITE: 'sqlite', - DUCKDB: 'duckdb', - CLICKHOUSE: 'clickhouse', - DATABRICKS: 'databricks', - }; - return map[normalized] ?? 'postgres'; -} - -function sqlAnalysisDialectForDriver(driver: string | undefined): SqlAnalysisDialect { - return dialectForDriver(driver) as SqlAnalysisDialect; -} - -function assertSafePathToken(kind: string, value: string): string { - if ( - value.trim().length === 0 || - value.includes('..') || - value.includes('\\') || - value.startsWith('/') || - value.startsWith('.') || - value.includes('//') - ) { - throw new Error(`Unsafe ${kind}: ${value}`); - } - return value; -} - -function assertSafeConnectionId(connectionId: string): string { - if (!/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId)) { - throw new Error(`Unsafe connection id: ${connectionId}`); - } - return assertSafePathToken('connection id', connectionId); -} - -function assertSafeSourceName(sourceName: string): string { - if (!/^[a-z0-9][a-z0-9_]*$/.test(sourceName)) { - throw new Error(`Unsafe semantic-layer source name: ${sourceName}`); - } - return assertSafePathToken('semantic-layer source name', sourceName); -} - async function cleanupConnector(connector: KtxScanConnector | null): Promise { if (connector?.cleanup) { await connector.cleanup(); } } -function slPath(connectionId: string, sourceName: string): string { - return `semantic-layer/${assertSafeConnectionId(connectionId)}/${assertSafeSourceName(sourceName)}.yaml`; -} - async function executeValidatedReadOnlySql( project: KtxLocalProject, options: CreateLocalProjectMcpContextPortsOptions, @@ -201,13 +152,11 @@ export function createLocalProjectMcpContextPorts( }, semanticLayer: { async readSource(input) { - const path = slPath(input.connectionId, input.sourceName); - try { - const result = await project.fileStore.readFile(path); - return { sourceName: input.sourceName, yaml: result.content }; - } catch { - return null; - } + const source = await readLocalSlSource(project, { + connectionId: input.connectionId, + sourceName: input.sourceName, + }); + return source ? { sourceName: source.name, yaml: source.yaml } : null; }, async query(input, executionOptions) { if (!options.semanticLayerCompute) { diff --git a/packages/cli/src/context/memory/local-memory.ts b/packages/cli/src/context/memory/local-memory.ts index b72bc9ce..92bb5b78 100644 --- a/packages/cli/src/context/memory/local-memory.ts +++ b/packages/cli/src/context/memory/local-memory.ts @@ -375,6 +375,9 @@ class LocalShapeOnlySlValidator implements SlValidatorPort { async validateSingleSource(deps: SlValidationDeps, connectionId: string, sourceName: string) { try { const file = await deps.semanticLayerService.readSourceFile(connectionId, sourceName); + if (!file) { + return { errors: [`${sourceName}: no standalone or overlay file found`], warnings: [] }; + } const parsed = YAML.parse(file.content) as SemanticLayerSource; const isOverlay = parsed.table == null && parsed.sql == null; const result = (isOverlay ? sourceOverlaySchema : sourceDefinitionSchema).safeParse(parsed); diff --git a/packages/cli/src/context/memory/memory-agent.service.ts b/packages/cli/src/context/memory/memory-agent.service.ts index 7f6438d3..491cd159 100644 --- a/packages/cli/src/context/memory/memory-agent.service.ts +++ b/packages/cli/src/context/memory/memory-agent.service.ts @@ -483,7 +483,7 @@ export class MemoryAgentService { if (session.connectionId) { for (const { connectionId, sourceName } of listTouchedSlSources(session.touchedSlSources)) { try { - const file = await this.deps.semanticLayerService.readSourceFile(connectionId, sourceName).catch(() => null); + const file = await this.deps.semanticLayerService.readSourceFile(connectionId, sourceName); if (file?.content) { const parsed = this.parseYamlOrNull(file.content); if (parsed) { diff --git a/packages/cli/src/context/scan/local-enrichment-artifacts.ts b/packages/cli/src/context/scan/local-enrichment-artifacts.ts index 9de46a98..e2072d6b 100644 --- a/packages/cli/src/context/scan/local-enrichment-artifacts.ts +++ b/packages/cli/src/context/scan/local-enrichment-artifacts.ts @@ -3,6 +3,7 @@ import { buildLiveDatabaseManifestShards, type LiveDatabaseManifestExistingDescr import type { TableUsageOutput } from '../../context/ingest/adapters/historic-sql/skill-schemas.js'; import type { KtxScanRelationshipConfig } from '../project/config.js'; import type { KtxLocalProject } from '../../context/project/project.js'; +import { isSlYamlPath } from '../../context/sl/source-files.js'; import type { KtxLocalScanEnrichmentResult } from './local-enrichment.js'; import { buildKtxRelationshipArtifacts, @@ -205,7 +206,7 @@ async function loadExistingManifestState( let files: string[]; try { - files = (await project.fileStore.listFiles(schemaDir(connectionId))).files.filter((file) => file.endsWith('.yaml')); + files = (await project.fileStore.listFiles(schemaDir(connectionId))).files.filter(isSlYamlPath); } catch { return { descriptions, preservedJoins, usage }; } diff --git a/packages/cli/src/context/sl/local-query.ts b/packages/cli/src/context/sl/local-query.ts index 781d8b94..f6de3aab 100644 --- a/packages/cli/src/context/sl/local-query.ts +++ b/packages/cli/src/context/sl/local-query.ts @@ -2,8 +2,10 @@ import type { KtxSqlQueryExecutorPort } from '../../context/connections/query-ex import type { KtxSemanticLayerComputePort } from '../../context/daemon/semantic-layer-compute.js'; import type { KtxMcpProgressCallback } from '../mcp/types.js'; import type { KtxLocalProject } from '../../context/project/project.js'; +import { sqlAnalysisDialectForDriver } from '../sql-analysis/dialect.js'; import { loadLocalSlSourceRecords } from './local-sl.js'; import { toResolvedWire } from './semantic-layer.service.js'; +import { assertSafeConnectionId } from './source-files.js'; import type { SemanticLayerQueryExecutionResult, SemanticLayerQueryInput } from './types.js'; const COMPILE_ONLY_REASON = @@ -24,43 +26,6 @@ export interface CompileLocalSlQueryResult extends SemanticLayerQueryExecutionRe dialect: string; } -function assertSafePathToken(kind: string, value: string): string { - if ( - value.trim().length === 0 || - value.includes('..') || - value.includes('\\') || - value.startsWith('/') || - value.startsWith('.') || - value.includes('//') - ) { - throw new Error(`Unsafe ${kind}: ${value}`); - } - return value; -} - -function assertSafeConnectionId(connectionId: string): string { - if (!/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId)) { - throw new Error(`Unsafe connection id: ${connectionId}`); - } - return assertSafePathToken('connection id', connectionId); -} - -function dialectForDriver(driver: string | undefined): string { - const normalized = (driver ?? 'postgres').toUpperCase(); - const map: Record = { - POSTGRES: 'postgres', - BIGQUERY: 'bigquery', - SNOWFLAKE: 'snowflake', - MYSQL: 'mysql', - SQLSERVER: 'tsql', - SQLITE: 'sqlite', - DUCKDB: 'duckdb', - CLICKHOUSE: 'clickhouse', - DATABRICKS: 'databricks', - }; - return map[normalized] ?? 'postgres'; -} - function resolveLocalConnectionId(project: KtxLocalProject, requested: string | undefined): string { if (requested) { return assertSafeConnectionId(requested); @@ -93,7 +58,7 @@ export async function compileLocalSlQuery( ): Promise { await options.onProgress?.({ progress: 0, message: 'Compiling query' }); const connectionId = resolveLocalConnectionId(project, options.connectionId); - const dialect = dialectForDriver(project.config.connections[connectionId]?.driver); + const dialect = sqlAnalysisDialectForDriver(project.config.connections[connectionId]?.driver); const sources = await loadComputableSources(project, connectionId); await options.onProgress?.({ progress: 0.3, message: 'Generating SQL' }); diff --git a/packages/cli/src/context/sl/local-sl.ts b/packages/cli/src/context/sl/local-sl.ts index fb573392..1c12ef67 100644 --- a/packages/cli/src/context/sl/local-sl.ts +++ b/packages/cli/src/context/sl/local-sl.ts @@ -2,7 +2,6 @@ import { join } from 'node:path'; import YAML from 'yaml'; import { z } from 'zod'; import type { KtxEmbeddingPort } from '../../context/core/embedding.js'; -import type { KtxFileWriteResult } from '../../context/core/file-store.js'; import type { KtxLocalProject } from '../../context/project/project.js'; import { HybridSearchCore } from '../../context/search/hybrid-search-core.js'; import type { SearchCandidateGenerator } from '../../context/search/types.js'; @@ -18,6 +17,13 @@ import { } from './semantic-layer.service.js'; import type { PgliteSlSearchPrototypeOwnerOptions } from './pglite-sl-search-prototype.js'; import { loadLatestSlDictionaryEntries } from './sl-dictionary-profile.js'; +import { + assertSafeConnectionId, + isSafeConnectionId, + isSlYamlPath, + slSourceNameForFile, + sourceNameFromPath, +} from './source-files.js'; import { buildSemanticLayerSourceSearchText, SlSearchService } from './sl-search.service.js'; import { SqliteSlSourcesIndex } from './sqlite-sl-sources-index.js'; import type { SemanticLayerSource, SlDictionaryMatch, SlSearchLaneSummary, SlSearchMatchReason } from './types.js'; @@ -69,58 +75,10 @@ export type ResolvedSlSource = | { kind: 'not-found' } | { kind: 'ambiguous'; connectionIds: string[] }; -const LOCAL_AUTHOR = 'ktx'; -const LOCAL_AUTHOR_EMAIL = 'ktx@example.com'; - -function assertSafePathToken(kind: string, value: string): string { - if ( - value.trim().length === 0 || - value.includes('..') || - value.includes('\\') || - value.startsWith('/') || - value.startsWith('.') || - value.includes('//') - ) { - throw new Error(`Unsafe ${kind}: ${value}`); - } - return value; -} - -function assertSafeConnectionId(connectionId: string): string { - if (!/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId)) { - throw new Error(`Unsafe connection id: ${connectionId}`); - } - return assertSafePathToken('connection id', connectionId); -} - -function isSafeConnectionId(connectionId: string | undefined): connectionId is string { - return typeof connectionId === 'string' && /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId); -} - -function assertSafeSourceName(sourceName: string): string { - if (!/^[a-z0-9][a-z0-9_]*$/.test(sourceName)) { - throw new Error(`Unsafe semantic-layer source name: ${sourceName}`); - } - return assertSafePathToken('semantic-layer source name', sourceName); -} - function isRecord(value: unknown): value is Record { return typeof value === 'object' && value !== null && !Array.isArray(value); } -function slPath(connectionId: string, sourceName: string): string { - return `semantic-layer/${assertSafeConnectionId(connectionId)}/${assertSafeSourceName(sourceName)}.yaml`; -} - -function sourceNameFromPath(path: string): string { - return ( - path - .split('/') - .at(-1) - ?.replace(/\.ya?ml$/, '') ?? path - ); -} - function parseYamlRecord(raw: string): Record { const parsed = YAML.parse(raw) as unknown; if (!isRecord(parsed)) { @@ -215,12 +173,17 @@ export async function loadLocalSlSourceRecords( const dir = `semantic-layer/${connectionId}`; const schemaDir = `${dir}/_schema`; const listed = await project.fileStore.listFiles(dir); - const paths = listed.files.filter((file) => file.endsWith('.yaml') || file.endsWith('.yml')).sort(); + const paths = listed.files.filter(isSlYamlPath).sort(); const sources = new Map(); for (const path of paths.filter((file) => file.startsWith(`${schemaDir}/`))) { const raw = await project.fileStore.readFile(path); - const tables = manifestTables(parseYamlRecord(raw.content)); + let tables: Record | null; + try { + tables = manifestTables(parseYamlRecord(raw.content)); + } catch (error) { + throw new Error(`${path}: ${error instanceof Error ? error.message : String(error)}`); + } if (!tables) { continue; } @@ -237,7 +200,29 @@ export async function loadLocalSlSourceRecords( for (const path of paths.filter((file) => !file.startsWith(`${schemaDir}/`))) { const raw = await project.fileStore.readFile(path); - const parsed = parseYamlRecord(raw.content); + let parsed: Record; + try { + parsed = parseYamlRecord(raw.content); + } catch { + // A source mid-edit (e.g. an agent saved half-written YAML) must not take + // down reads, listings, or search for its siblings. Key it by the same + // name the writer side uses (the intact top-level `name:`, recovered even + // when the YAML is broken below it; filename only as a last resort) so a + // broken uppercase/hashed/human-renamed source stays reachable under its + // real name, and surface the raw content for repair. + const brokenName = slSourceNameForFile(path, raw.content); + sources.set(brokenName, { + connectionId, + name: brokenName, + path, + columnCount: 0, + measureCount: 0, + joinCount: 0, + yaml: raw.content, + source: { name: brokenName, grain: [], columns: [], joins: [], measures: [] }, + }); + continue; + } const name = typeof parsed.name === 'string' && parsed.name.length > 0 ? parsed.name : sourceNameFromPath(path); if (parsed.table || parsed.sql) { const source = parsedStandaloneSource(parsed, name); @@ -292,50 +277,21 @@ export async function validateLocalSlSource( } } -/** @internal */ -export async function writeLocalSlSource( - project: KtxLocalProject, - input: { connectionId: string; sourceName: string; yaml: string }, -): Promise { - const validation = await validateLocalSlSource(input.yaml, { project, connectionId: input.connectionId }); - if (!validation.valid) { - throw new Error(`Invalid semantic-layer source: ${validation.errors.join('; ')}`); - } - - const parsed = parseYamlRecord(input.yaml); - if (typeof parsed.name === 'string' && parsed.name !== input.sourceName) { - throw new Error(`Semantic-layer source name "${parsed.name}" does not match requested path "${input.sourceName}"`); - } - - const path = slPath(input.connectionId, input.sourceName); - return project.fileStore.writeFile( - path, - input.yaml.endsWith('\n') ? input.yaml : `${input.yaml}\n`, - LOCAL_AUTHOR, - LOCAL_AUTHOR_EMAIL, - `Write semantic-layer source: ${input.connectionId}/${input.sourceName}`, - ); -} - -/** @internal */ export async function readLocalSlSource( project: KtxLocalProject, input: { connectionId: string; sourceName: string }, ): Promise { - const path = slPath(input.connectionId, input.sourceName); - try { - const result = await project.fileStore.readFile(path); - return { - ...summarizeSource({ connectionId: input.connectionId, path, raw: result.content }), - yaml: result.content, - }; - } catch { - const records = await loadLocalSlSourceRecords(project, { - connectionId: input.connectionId, - }); - const record = records.find((source) => source.name === input.sourceName); - return record ? { ...record } : null; - } + // Source identity is the in-file `name:` (mirroring the warehouse identifier + // verbatim, e.g. Snowflake's uppercase `WIDGET_SALES`), never the filename. The + // record loader resolves standalone files, overlays, manifest-backed sources, + // and mid-edit files whose YAML no longer parses — so readers — `ktx sl read`, + // `ktx sl validate`, and the `sl_read_source` MCP tool — can surface broken + // content for repair instead of failing on it. + const records = await loadLocalSlSourceRecords(project, { + connectionId: input.connectionId, + }); + const record = records.find((source) => source.name === input.sourceName); + return record ? { ...record } : null; } export async function resolveLocalSlSource( diff --git a/packages/cli/src/context/sl/semantic-layer.service.ts b/packages/cli/src/context/sl/semantic-layer.service.ts index 28bf826d..797a8bf7 100644 --- a/packages/cli/src/context/sl/semantic-layer.service.ts +++ b/packages/cli/src/context/sl/semantic-layer.service.ts @@ -6,6 +6,7 @@ import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-sch import type { SlConnectionCatalogPort, SlPythonPort } from './ports.js'; import { normalizeSemanticLayerDescriptions } from './description-normalization.js'; import { isOverlaySource, resolvedSourceSchema, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js'; +import { isSlYamlPath, resolveSlSourceFile, slDeclaredSourceName, slSourceFilePath } from './source-files.js'; import type { ResolvedSemanticLayerSource, SemanticLayerColumnOverride, @@ -135,8 +136,30 @@ export class SemanticLayerService { // ── YAML File Operations ──────────────────────────────── - private sourcePath(connectionId: string, sourceName: string): string { - return `${SL_DIR_PREFIX}/${connectionId}/${sourceName}.yaml`; + // The in-file `name:` is the source's identity; the filename is only a derived + // label. Rewrites land on the file that already declares the name (humans may + // rename files freely); new sources get a derived filename. A file already + // sitting at the derived path that declares a name declares a *different* one + // (the resolver would have matched it otherwise) — fail instead of clobbering + // it. A nameless/unparseable file there is the broken remains of this very + // source (the derived path is a function of the name), so overwriting it is + // the repair path, not data loss. + private async resolveWritePath(connectionId: string, sourceName: string): Promise { + const existing = await resolveSlSourceFile(this.configService, connectionId, sourceName); + if (existing) { + return existing.path; + } + const path = slSourceFilePath(connectionId, sourceName); + let occupant: string | null = null; + try { + occupant = slDeclaredSourceName((await this.configService.readFile(path)).content); + } catch { + return path; + } + if (occupant !== null) { + throw new Error(`Cannot write source '${sourceName}': ${path} already defines source '${occupant}'`); + } + return path; } async writeSource( @@ -185,39 +208,42 @@ export class SemanticLayerService { } } - const path = this.sourcePath(connectionId, source.name); + const path = await this.resolveWritePath(connectionId, source.name); const normalizedSource = normalizeSemanticLayerDescriptions(source); const content = YAML.stringify(normalizedSource, { indent: 2, lineWidth: 0, version: '1.1' }); const message = commitMessage ?? `Update semantic layer source: ${source.name}`; const result = await this.configService.writeFile(path, content, author, authorEmail, message, { skipLock: options?.skipLock, }); - return { ...result, warnings }; + // The filename is derived from (or resolved by) the source name — surface + // the actual path so callers don't have to re-resolve it. + return { ...result, path, warnings }; } - async readSourceFile(connectionId: string, sourceName: string): Promise<{ content: string; path: string }> { - const path = this.sourcePath(connectionId, sourceName); - const result = await this.configService.readFile(path); - return { content: result.content, path }; + /** + * Raw standalone/overlay file for a source, resolved by its in-file `name:`. + * Returns null when no file declares the name (the source may still exist as + * a manifest entry under `_schema/`). + */ + async readSourceFile(connectionId: string, sourceName: string): Promise<{ content: string; path: string } | null> { + const file = await resolveSlSourceFile(this.configService, connectionId, sourceName); + return file ? { content: file.content, path: file.path } : null; } async loadSource(connectionId: string, sourceName: string): Promise { - let content: string; - try { - const result = await this.readSourceFile(connectionId, sourceName); - content = result.content; - } catch { + const file = await this.readSourceFile(connectionId, sourceName); + if (!file) { return null; } try { - return YAML.parse(content) as SemanticLayerSource; + return YAML.parse(file.content) as SemanticLayerSource; } catch (error) { // Distinguish a YAML parse failure from a missing file. The file exists but // its contents are unparseable — callers that treat null as "does not exist" // could otherwise overwrite the broken file. Surface the parse failure via // the service logger so the broken source is at least visible. this.logger.warn( - `[loadSource] ${connectionId}/${sourceName}.yaml: YAML parse failed: ${error instanceof Error ? error.message : String(error)}`, + `[loadSource] ${file.path}: YAML parse failed: ${error instanceof Error ? error.message : String(error)}`, ); return null; } @@ -231,7 +257,7 @@ export class SemanticLayerService { let allFiles: string[]; try { const result = await this.configService.listFiles(dir); - allFiles = result.files.filter((f) => f.endsWith('.yaml')); + allFiles = result.files.filter((f) => isSlYamlPath(f)); } catch (e) { const message = `Failed to list semantic-layer files under ${dir}: ${e instanceof Error ? e.message : String(e)}`; loadErrors.push(message); @@ -338,7 +364,7 @@ export class SemanticLayerService { let allFiles: string[]; try { const listing = await this.configService.listFiles(dir); - allFiles = listing.files.filter((f) => f.endsWith('.yaml')); + allFiles = listing.files.filter((f) => isSlYamlPath(f)); } catch { return result; } @@ -408,7 +434,7 @@ export class SemanticLayerService { const schemaDir = `${SL_DIR_PREFIX}/${connectionId}/_schema`; try { const result = await this.configService.listFiles(schemaDir); - const yamlFiles = result.files.filter((f) => f.endsWith('.yaml')); + const yamlFiles = result.files.filter((f) => isSlYamlPath(f)); for (const filePath of yamlFiles) { try { const { content } = await this.configService.readFile(filePath); @@ -449,7 +475,7 @@ export class SemanticLayerService { let yamlFiles: string[]; try { const result = await this.configService.listFiles(schemaDir); - yamlFiles = result.files.filter((f) => f.endsWith('.yaml')); + yamlFiles = result.files.filter((f) => isSlYamlPath(f)); } catch { return null; } @@ -533,7 +559,7 @@ export class SemanticLayerService { .map((c) => c.name); if (absentDeclaredColumns.length > 0) { errors.push( - `${source.name}.yaml: table "${source.table}" matched manifest ${manifestLabel}, ` + + `${source.name}: table "${source.table}" matched manifest ${manifestLabel}, ` + `but declared column(s) absent from physical table: ${absentDeclaredColumns.join(', ')}. ` + `Available columns: ${[...manifestColumns.values()].join(', ')}`, ); @@ -545,7 +571,7 @@ export class SemanticLayerService { }); if (missingGrainColumns.length > 0) { errors.push( - `${source.name}.yaml: grain column(s) absent from physical table "${source.table}": ${missingGrainColumns.join(', ')}`, + `${source.name}: grain column(s) absent from physical table "${source.table}": ${missingGrainColumns.join(', ')}`, ); } @@ -562,7 +588,7 @@ export class SemanticLayerService { }); if (missing.length > 0) { errors.push( - `${source.name}.yaml: computed column "${column.name}" references unknown column(s): ${missing.join(', ')}`, + `${source.name}: computed column "${column.name}" references unknown column(s): ${missing.join(', ')}`, ); } } @@ -577,7 +603,7 @@ export class SemanticLayerService { }); if (missing.length > 0) { errors.push( - `${source.name}.yaml: segment "${segment.name}" references unknown column(s): ${missing.join(', ')}`, + `${source.name}: segment "${segment.name}" references unknown column(s): ${missing.join(', ')}`, ); } } @@ -592,7 +618,7 @@ export class SemanticLayerService { }); if (exprMissing.length > 0) { errors.push( - `${source.name}.yaml: measure "${measure.name}" references unknown column(s): ${exprMissing.join(', ')}`, + `${source.name}: measure "${measure.name}" references unknown column(s): ${exprMissing.join(', ')}`, ); } @@ -606,7 +632,7 @@ export class SemanticLayerService { }); if (filterMissing.length > 0) { errors.push( - `${source.name}.yaml: measure "${measure.name}" filter references unknown column(s): ${filterMissing.join(', ')}`, + `${source.name}: measure "${measure.name}" filter references unknown column(s): ${filterMissing.join(', ')}`, ); } } @@ -619,7 +645,7 @@ export class SemanticLayerService { } if (!validOutputColumns.has(parsed.localColumn.toLowerCase())) { errors.push( - `${source.name}.yaml: join to "${join.to}" references local column ` + + `${source.name}: join to "${join.to}" references local column ` + `"${parsed.localColumn}" that is not a valid output column`, ); } @@ -631,7 +657,7 @@ export class SemanticLayerService { const targetColumns = new Set(targetSource.columns.map((c) => c.name.toLowerCase())); if (!targetColumns.has(parsed.targetColumn.toLowerCase())) { errors.push( - `${source.name}.yaml: join to "${join.to}" references target column ` + + `${source.name}: join to "${join.to}" references target column ` + `"${parsed.targetColumn}" that does not exist on the target source`, ); } @@ -650,43 +676,30 @@ export class SemanticLayerService { return SemanticLayerService.mapDialect(connection.connectionType); } - async listSourceNames(connectionId: string): Promise { - const dir = `${SL_DIR_PREFIX}/${connectionId}`; - try { - const result = await this.configService.listFiles(dir); - return result.files.filter((f) => f.endsWith('.yaml')).map((f) => f.replace(`${dir}/`, '').replace('.yaml', '')); - } catch { - return []; - } - } - async listFilesForConnection(connectionId: string): Promise { const dir = `${SL_DIR_PREFIX}/${connectionId}`; try { const result = await this.configService.listFiles(dir, true); - return result.files.filter((f) => f.endsWith('.yaml')); + return result.files.filter((f) => isSlYamlPath(f)); } catch { return []; } } - async readFileByPath(connectionId: string, relativePath: string): Promise<{ content: string; readOnly: boolean }> { - const fullPath = `${SL_DIR_PREFIX}/${connectionId}/${relativePath}`; - const result = await this.configService.readFile(fullPath); - return { - content: result.content, - readOnly: relativePath.startsWith('_schema/'), - }; - } - async deleteSource(connectionId: string, sourceName: string, author: string, authorEmail: string) { - const path = this.sourcePath(connectionId, sourceName); - return this.configService.deleteFile(path, author, authorEmail, `Delete semantic layer source: ${sourceName}`); - } - - async getSourceHistory(connectionId: string, sourceName: string) { - const path = this.sourcePath(connectionId, sourceName); - return this.configService.getFileHistory(path); + const file = await resolveSlSourceFile(this.configService, connectionId, sourceName); + if (!file) { + // `deleteFile` returns null for a missing path, which would let a no-op + // delete read as success. Distinguish the two real cases instead. + if (await this.isManifestBacked(connectionId, sourceName)) { + throw new Error( + `Source '${sourceName}' is defined by the scan manifest (_schema/) and has no overlay file to delete. ` + + `Rescan the connection to remove it from the manifest.`, + ); + } + throw new Error(`Semantic-layer source not found: ${connectionId}/${sourceName}`); + } + return this.configService.deleteFile(file.path, author, authorEmail, `Delete semantic layer source: ${sourceName}`); } /** @@ -815,7 +828,7 @@ export class SemanticLayerService { return []; } - const schemaFiles = files.filter((file) => /^semantic-layer\/[^/]+\/_schema\/.+\.ya?ml$/.test(file)); + const schemaFiles = files.filter((file) => /^semantic-layer\/[^/]+\/_schema\//.test(file) && isSlYamlPath(file)); const entries: Array<{ connectionId: string; source: SemanticLayerSource }> = []; for (const filePath of schemaFiles) { const connectionId = filePath.split('/')[1]; @@ -844,7 +857,7 @@ export class SemanticLayerService { let allFiles: string[]; try { const result = await this.configService.listFiles(dir); - allFiles = result.files.filter((f) => f.endsWith('.yaml')); + allFiles = result.files.filter((f) => isSlYamlPath(f)); } catch { return warnings; } @@ -1030,7 +1043,7 @@ export class SemanticLayerService { try { const result = await this.configService.listFiles(dir); - const yamlFiles = result.files.filter((f) => f.endsWith('.yaml')); + const yamlFiles = result.files.filter((f) => isSlYamlPath(f)); for (const filePath of yamlFiles) { try { diff --git a/packages/cli/src/context/sl/source-files.ts b/packages/cli/src/context/sl/source-files.ts new file mode 100644 index 00000000..ae44c683 --- /dev/null +++ b/packages/cli/src/context/sl/source-files.ts @@ -0,0 +1,160 @@ +import { createHash } from 'node:crypto'; +import YAML from 'yaml'; +import type { KtxFileStorePort } from '../../context/core/file-store.js'; + +// Semantic-layer source identity lives in the file's `name:` field, which mirrors +// the warehouse identifier verbatim (Snowflake's uppercase `SIGNED_UP`, `EVENT$LOG`). +// The filename is a derived label and never participates in identity: reads resolve +// a source by scanning the connection directory and matching `name:`, and writes +// reuse the resolved file's path, so files can be freely renamed by humans without +// changing which source they define. + +function assertSafePathToken(kind: string, value: string): string { + if ( + value.trim().length === 0 || + value.includes('..') || + value.includes('\\') || + value.startsWith('/') || + value.startsWith('.') || + value.includes('//') + ) { + throw new Error(`Unsafe ${kind}: ${value}`); + } + return value; +} + +export function assertSafeConnectionId(connectionId: string): string { + if (!isSafeConnectionId(connectionId)) { + throw new Error(`Unsafe connection id: ${connectionId}`); + } + return assertSafePathToken('connection id', connectionId); +} + +export function isSafeConnectionId(connectionId: string | undefined): connectionId is string { + return typeof connectionId === 'string' && /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId); +} + +export function sourceNameFromPath(path: string): string { + return ( + path + .split('/') + .at(-1) + ?.replace(/\.ya?ml$/, '') ?? path + ); +} + +// The one predicate for "this path is a semantic-layer YAML file". ktx itself +// always writes `.yaml` (see `slSourceFileName`), but humans rename freely and +// the dbt ecosystem's habit is `.yml`, so every reader must accept both — a +// listing that recognizes only one extension makes the same file visible to +// some entry points and invisible to others. +export function isSlYamlPath(path: string): boolean { + return path.endsWith('.yaml') || path.endsWith('.yml'); +} + +// Windows refuses these basenames regardless of extension — a genuinely universal +// filesystem invariant, so the static list is acceptable. +const WINDOWS_RESERVED_BASENAME = /^(?:con|prn|aux|nul|com[0-9]|lpt[0-9])$/; + +const SAFE_FILE_BASENAME = /^[a-z0-9][a-z0-9_]{0,63}$/; + +/** + * Derive the filename for a semantic-layer source. Total over all possible + * source names — never throws. + * + * Names that are already safe lowercase snake_case become `.yaml`; + * anything else becomes `-<8 hex of sha256(name)>.yaml`. The two ranges + * are disjoint and the mapping is injective: safe filenames contain no `-`, + * hashed filenames always end in `-<8 hex>`, and slugs are lowercased so names + * differing only by case get distinct hashes instead of colliding paths on + * case-insensitive filesystems (macOS APFS, Windows). + * + * @internal + */ +export function slSourceFileName(sourceName: string): string { + if (SAFE_FILE_BASENAME.test(sourceName) && !WINDOWS_RESERVED_BASENAME.test(sourceName)) { + return `${sourceName}.yaml`; + } + const slug = sourceName + .toLowerCase() + .replace(/[^a-z0-9_]+/g, '_') + .replace(/_+/g, '_') + .replace(/^_+|_+$/g, '') + .slice(0, 64); + const hash = createHash('sha256').update(sourceName, 'utf-8').digest('hex').slice(0, 8); + return `${slug || 'src'}-${hash}.yaml`; +} + +export function slSourceFilePath(connectionId: string, sourceName: string): string { + return `semantic-layer/${assertSafeConnectionId(connectionId)}/${slSourceFileName(sourceName)}`; +} + +export interface SlSourceFile { + path: string; + content: string; +} + +// Same keying as `loadLocalSlSourceRecords`: the in-file `name:` is the identity; +// the filename is only a fallback for files so broken that even the `name:` is +// unrecoverable, or genuinely nameless ones. A file left mid-edit with a syntax +// error below its `name:` line keeps its declared identity (see +// `slDeclaredSourceName`), so a human-renamed source is still addressed by name +// while broken instead of silently reverting to its filename. +export function slSourceNameForFile(path: string, content: string): string { + return slDeclaredSourceName(content) ?? sourceNameFromPath(path); +} + +/** + * The `name:` a semantic-layer YAML file declares, or null when the file is + * nameless or so broken even the name is unrecoverable. Null is how + * `writeSource` tells a genuine name conflict at a derived path apart from the + * broken remains of the source being written, which a rewrite must repair + * rather than refuse. + * + * Uses `parseDocument`, not `parse`: a file with a syntax error below the + * `name:` line still parses into a partial tree whose top-level `name:` is + * intact. `parse` would throw on the same input and drop the source to its + * filename — wrong for human-renamed files, whose filename is not the name. + */ +export function slDeclaredSourceName(content: string): string | null { + let doc: ReturnType; + try { + doc = YAML.parseDocument(content); + } catch { + return null; + } + const name = doc.get('name'); + return typeof name === 'string' && name.length > 0 ? name : null; +} + +/** + * Find the standalone/overlay file that defines `sourceName` for a connection. + * Returns null when no file declares the name (the source may still exist as a + * manifest entry under `_schema/`). Throws when more than one file declares the + * same name — that breaks the one-file-per-name invariant and must be repaired + * by hand rather than silently picking one. + */ +export async function resolveSlSourceFile( + fileStore: Pick, + connectionId: string, + sourceName: string, +): Promise { + const dir = `semantic-layer/${assertSafeConnectionId(connectionId)}`; + const schemaDir = `${dir}/_schema`; + const listed = await fileStore.listFiles(dir); + const paths = listed.files.filter((file) => isSlYamlPath(file) && !file.startsWith(`${schemaDir}/`)).sort(); + + const matches: SlSourceFile[] = []; + for (const path of paths) { + const raw = await fileStore.readFile(path); + if (slSourceNameForFile(path, raw.content) === sourceName) { + matches.push({ path, content: raw.content }); + } + } + if (matches.length > 1) { + throw new Error( + `Multiple semantic-layer files declare source "${sourceName}": ${matches.map((match) => match.path).join(', ')}`, + ); + } + return matches[0] ?? null; +} diff --git a/packages/cli/src/context/sl/tools/base-semantic-layer.tool.ts b/packages/cli/src/context/sl/tools/base-semantic-layer.tool.ts index 22822368..2c14ec1a 100644 --- a/packages/cli/src/context/sl/tools/base-semantic-layer.tool.ts +++ b/packages/cli/src/context/sl/tools/base-semantic-layer.tool.ts @@ -46,12 +46,8 @@ export abstract class BaseSemanticLayerTool ex ): Promise { const semanticLayerService = context?.session?.semanticLayerService ?? this.semanticLayerService; - try { - const { content } = await semanticLayerService.readSourceFile(connectionId, sourceName); - return content; - } catch { - return null; - } + const file = await semanticLayerService.readSourceFile(connectionId, sourceName); + return file?.content ?? null; } protected buildMarkdown( diff --git a/packages/cli/src/context/sl/tools/sl-edit-source.tool.ts b/packages/cli/src/context/sl/tools/sl-edit-source.tool.ts index 94abbb36..c8998fe2 100644 --- a/packages/cli/src/context/sl/tools/sl-edit-source.tool.ts +++ b/packages/cli/src/context/sl/tools/sl-edit-source.tool.ts @@ -113,13 +113,8 @@ If no source exists yet, use sl_write_source instead — this tool will reject t } // Read existing source - let currentYaml: string | null = null; - try { - const { content } = await semanticLayerService.readSourceFile(connectionId, sourceName); - currentYaml = content; - } catch { - currentYaml = null; - } + const currentFile = await semanticLayerService.readSourceFile(connectionId, sourceName); + const currentYaml = currentFile?.content ?? null; if (!currentYaml) { const manifestBacked = await semanticLayerService.isManifestBacked(connectionId, sourceName); if (manifestBacked) { @@ -165,6 +160,20 @@ If no source exists yet, use sl_write_source instead — this tool will reject t } catch (e) { return this.buildOutput(false, [`YAML parse error after edits: ${e}`], sourceName); } + + // The in-file `name:` is the source's identity — an edited name would make + // writeSource create a second source instead of updating this one. + if (source.name !== sourceName) { + return this.buildOutput( + false, + [ + `Edits change "name:" from "${sourceName}" to "${source.name ?? ''}" — renaming is not supported. ` + + `Delete the source and recreate it under the new name.`, + ], + sourceName, + ); + } + source = normalizeSemanticLayerDescriptions(source, { fillMissing: !!context.session?.ingest }); // Re-serialize and write diff --git a/packages/cli/src/context/sl/tools/sl-warehouse-validation.ts b/packages/cli/src/context/sl/tools/sl-warehouse-validation.ts index 742d855a..56f80833 100644 --- a/packages/cli/src/context/sl/tools/sl-warehouse-validation.ts +++ b/packages/cli/src/context/sl/tools/sl-warehouse-validation.ts @@ -1,10 +1,11 @@ import YAML from 'yaml'; import type { GitService } from '../../../context/core/git.service.js'; -import type { KtxFileStorePort } from '../../../context/core/file-store.js'; +import type { KtxFileListResult, KtxFileReadResult, KtxFileStorePort } from '../../../context/core/file-store.js'; import { SYSTEM_GIT_AUTHOR } from '../../../context/tools/authors.js'; import type { SlConnectionCatalogPort, SlSourcesIndexPort } from '../ports.js'; import { sourceOverlaySchema } from '../schemas.js'; import { SemanticLayerService } from '../semantic-layer.service.js'; +import { resolveSlSourceFile, slSourceFilePath } from '../source-files.js'; import type { SemanticLayerSource } from '../types.js'; import { sourceDefinitionSchema } from './base-semantic-layer.tool.js'; @@ -23,9 +24,6 @@ export interface SourceValidationResult { warnings: string[]; } -const slSourcePath = (connectionId: string, sourceName: string): string => - `semantic-layer/${connectionId}/${sourceName}.yaml`; - function resolveDialect(warehouse: string | null): string | null { if (!warehouse) { return null; @@ -63,24 +61,21 @@ export async function validateSingleSource( const errors: string[] = []; const warnings: string[] = []; - let content: string; - try { - const result = await deps.semanticLayerService.readSourceFile(connectionId, sourceName); - content = result.content; - } catch { - errors.push(`${sourceName}.yaml: file not found`); + const file = await deps.semanticLayerService.readSourceFile(connectionId, sourceName); + if (!file) { + errors.push(`${sourceName}: no standalone or overlay file found`); return { errors, warnings }; } let parsed: Record; try { - parsed = YAML.parse(content); + parsed = YAML.parse(file.content); } catch (e) { - errors.push(`${sourceName}.yaml: invalid YAML — ${e instanceof Error ? e.message : String(e)}`); + errors.push(`${sourceName}: invalid YAML — ${e instanceof Error ? e.message : String(e)}`); return { errors, warnings }; } if (!parsed || typeof parsed !== 'object') { - errors.push(`${sourceName}.yaml: top-level content is not an object`); + errors.push(`${sourceName}: top-level content is not an object`); return { errors, warnings }; } @@ -89,7 +84,7 @@ export async function validateSingleSource( const isManifestBacked = await deps.semanticLayerService.isManifestBacked(connectionId, sourceName); if (isManifestBacked) { errors.push( - `${sourceName}.yaml: standalone source shadows an existing manifest entry — ` + + `${sourceName}: standalone source shadows an existing manifest entry — ` + `writing it as-is drops the manifest's columns and joins. ` + `Remove "sql:", "table:", "grain:", and base-table "columns:" and keep only ` + `"name:" plus overlay fields such as "measures:", "segments:", "descriptions:", ` + @@ -103,21 +98,21 @@ export async function validateSingleSource( const result = schema.safeParse(parsed); if (!result.success) { const issues = result.error.issues.map((i) => `${i.path.join('.')}: ${i.message}`).join('; '); - errors.push(`${sourceName}.yaml: schema — ${issues}`); + errors.push(`${sourceName}: schema — ${issues}`); const errorPaths = new Set(result.error.issues.map((i) => String(i.path[0]))); if (errorPaths.has('joins')) { warnings.push( - `${sourceName}.yaml: hint — join format: {to, on: 'local_col = TARGET.col', relationship: 'many_to_one|one_to_many|one_to_one'}`, + `${sourceName}: hint — join format: {to, on: 'local_col = TARGET.col', relationship: 'many_to_one|one_to_many|one_to_one'}`, ); } if (errorPaths.has('columns')) { warnings.push( - `${sourceName}.yaml: hint — overlay columns must be computed: {name, expr, type}. Use column_overrides for manifest column descriptions or metadata.`, + `${sourceName}: hint — overlay columns must be computed: {name, expr, type}. Use column_overrides for manifest column descriptions or metadata.`, ); } if (errorPaths.has('measures')) { warnings.push( - `${sourceName}.yaml: hint — measure format: {name, expr, description (optional), filter (optional)}`, + `${sourceName}: hint — measure format: {name, expr, description (optional), filter (optional)}`, ); } return { errors, warnings }; @@ -135,7 +130,7 @@ export async function validateSingleSource( const seenMeasures = new Set(); for (const m of measures) { if (seenMeasures.has(m.name)) { - errors.push(`${sourceName}.yaml: duplicate measure name "${m.name}"`); + errors.push(`${sourceName}: duplicate measure name "${m.name}"`); } seenMeasures.add(m.name); } @@ -168,7 +163,7 @@ export async function validateSingleSource( const missing = sourceColumns.map((c) => c.name).filter((n) => !actual.has(n.toLowerCase())); if (missing.length > 0) { errors.push( - `${sourceName}.yaml: declared columns absent from sql result — ${missing.join(', ')} (warehouse returned: ${[...actual].slice(0, 10).join(', ')}${actual.size > 10 ? ', …' : ''})`, + `${sourceName}: declared columns absent from sql result — ${missing.join(', ')} (warehouse returned: ${[...actual].slice(0, 10).join(', ')}${actual.size > 10 ? ', …' : ''})`, ); } } catch (e) { @@ -205,7 +200,7 @@ function formatProbeError(args: { const errMsg = error instanceof Error ? error.message : String(error); const refColumns = sourceColumns.filter((c) => referencesColumn(probeSql, c.name)); const lines: string[] = [ - measureName ? `${sourceName}.yaml: measure "${measureName}" ${headline}.` : `${sourceName}.yaml: ${headline}.`, + measureName ? `${sourceName}: measure "${measureName}" ${headline}.` : `${sourceName}: ${headline}.`, ]; if (warehouse) { lines.push(` Warehouse: ${warehouse}`); @@ -249,7 +244,7 @@ async function probeOverlayMeasures( composed = all.find((s) => s.name === sourceName); } catch (e) { errors.push( - `${sourceName}.yaml: failed to load composed source for probe — ${e instanceof Error ? e.message : String(e)}`, + `${sourceName}: failed to load composed source for probe — ${e instanceof Error ? e.message : String(e)}`, ); return errors; } @@ -289,6 +284,26 @@ async function probeOverlayMeasures( return errors; } +/** + * A read-only view of the config repo at one commit, shaped for + * `resolveSlSourceFile` so name→file resolution runs against history exactly as + * it does against the working tree — one resolver, two backing stores. Used to + * recover the path a source occupied at `preHead` after the live file is gone. + */ +function gitCommitFileStore( + git: GitService, + commitHash: string, +): Pick { + return { + async listFiles(path: string): Promise { + return { files: await git.listFilesAtCommit(path, commitHash) }; + }, + async readFile(path: string): Promise { + return { content: await git.getFileAtCommit(path, commitHash) }; + }, + }; +} + /** * Restore `sourceName` to the content it had at `preHead`, or delete it if it didn't * exist then. Used by sl_rollback (agent-driven) and the pre-squash revert gate @@ -300,14 +315,29 @@ export async function revertSourceToPreHead( preHead: string | null, sourceName: string, ): Promise { - const relPath = slSourcePath(connectionId, sourceName); + // Find the file that defines this source. While it is still on disk + // (invalid-but-present) the live resolver finds it by its in-file `name:`. + // Once the session deleted it, the path is gone too — and humans rename files + // freely, so it is NOT the writer-derived filename. Recover it from history by + // resolving the name against the preHead commit instead of guessing. + const live = await resolveSlSourceFile(deps.configService, connectionId, sourceName); + let relPath: string; let preContent: string | null = null; - if (preHead) { - try { - preContent = await deps.gitService.getFileAtCommit(relPath, preHead); - } catch { - preContent = null; + if (live) { + relPath = live.path; + if (preHead) { + try { + preContent = await deps.gitService.getFileAtCommit(relPath, preHead); + } catch { + preContent = null; + } } + } else { + const atPreHead = preHead + ? await resolveSlSourceFile(gitCommitFileStore(deps.gitService, preHead), connectionId, sourceName) + : null; + relPath = atPreHead?.path ?? slSourceFilePath(connectionId, sourceName); + preContent = atPreHead?.content ?? null; } if (preContent !== null) { diff --git a/packages/cli/src/context/sl/tools/sl-write-source.tool.ts b/packages/cli/src/context/sl/tools/sl-write-source.tool.ts index 1ff0f7ee..6f59ce57 100644 --- a/packages/cli/src/context/sl/tools/sl-write-source.tool.ts +++ b/packages/cli/src/context/sl/tools/sl-write-source.tool.ts @@ -22,8 +22,12 @@ const slWriteSourceInputSchema = z.object({ connectionId: slToolConnectionIdSchema.describe('Data source connection ID'), sourceName: z .string() - .regex(/^[a-z0-9][a-z0-9_]*$/, 'Source name must be snake_case (lowercase alphanumeric and underscores)') - .describe('Name of the source to create, edit, or delete'), + .min(1) + .describe( + "Name of the source to create, edit, or delete. Must equal the source's `name:`. Use the verbatim " + + 'warehouse identifier when overlaying a manifest source (e.g. SIGNED_UP); snake_case is recommended ' + + 'for new standalone sources.', + ), source: sourceInputSchema .optional() .describe( @@ -152,6 +156,17 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co ); } + // The in-file `name:` is the source's identity; the file is written under + // source.name while the orphan/shadow checks key on sourceName — a mismatch + // would validate one source and save another. + if (input.source.name !== sourceName) { + return this.buildOutput( + false, + [`source.name "${input.source.name}" does not match sourceName "${sourceName}" — they must be identical.`], + sourceName, + ); + } + return this.writeFullSource( connectionId, input.source, @@ -253,12 +268,8 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co connectionId: string, sourceName: string, ): Promise { - try { - const { content } = await service.readSourceFile(connectionId, sourceName); - return content; - } catch { - return null; - } + const file = await service.readSourceFile(connectionId, sourceName); + return file?.content ?? null; } private async rejectOrphanOverlay( diff --git a/packages/cli/src/context/sql-analysis/dialect.ts b/packages/cli/src/context/sql-analysis/dialect.ts new file mode 100644 index 00000000..54bc9525 --- /dev/null +++ b/packages/cli/src/context/sql-analysis/dialect.ts @@ -0,0 +1,23 @@ +import type { SqlAnalysisDialect } from './ports.js'; + +// One mapping from ktx connection identity to the sqlglot dialect name used by +// the Python daemon (SQL analysis, read-only validation) and semantic-layer +// compute. Keys cover both vocabularies that name a connection's engine: +// ktx.yaml driver names ("postgres", "sqlserver") and the local connection-type +// spellings exposed by KtxConnectionInfo.connectionType ("POSTGRESQL"). +const SQLGLOT_DIALECTS: Record = { + postgres: 'postgres', + postgresql: 'postgres', + bigquery: 'bigquery', + snowflake: 'snowflake', + mysql: 'mysql', + sqlserver: 'tsql', + sqlite: 'sqlite', + duckdb: 'duckdb', + clickhouse: 'clickhouse', + databricks: 'databricks', +}; + +export function sqlAnalysisDialectForDriver(driver: string | undefined): SqlAnalysisDialect { + return SQLGLOT_DIALECTS[(driver ?? '').toLowerCase()] ?? 'postgres'; +} diff --git a/packages/cli/src/ingest.ts b/packages/cli/src/ingest.ts index 233b1b6e..716cf7b8 100644 --- a/packages/cli/src/ingest.ts +++ b/packages/cli/src/ingest.ts @@ -13,7 +13,7 @@ import { resolveProjectEmbeddingProvider } from './embedding-resolution.js'; import { createKtxCliIngestQueryExecutor } from './ingest-query-executor.js'; import { readIngestReportSnapshotFile } from './ingest-report-file.js'; import { createCliOperationalLogger } from './io/logger.js'; -import { createKtxCliLocalIngestAdapters } from './local-adapters.js'; +import { createKtxCliLocalIngestAdapters, resolveKtxCliSqlAnalysis } from './local-adapters.js'; import type { KtxManagedPythonInstallPolicy } from './managed-python-command.js'; import { type KtxMemoryFlowStdin, renderMemoryFlowInteractively } from './memory-flow-interactive.js'; import { @@ -87,6 +87,7 @@ export interface KtxIngestDeps { | 'memoryModel' | 'semanticLayerCompute' | 'queryExecutor' + | 'sqlAnalysis' | 'logger' | 'pullConfigOptions' >; @@ -724,7 +725,7 @@ export async function runKtxIngest( const localIngestOptions = deps.localIngestOptions ?? {}; const managedDaemon = managedDaemonOptionsForIngestRun(args, deps.runtimeIo ?? io); const operationalLogger = createCliOperationalLogger(io, args.outputMode); - const adapterOptions = { + const baseAdapterOptions = { ...(localIngestOptions.pullConfigOptions ?? {}), ...(args.databaseIntrospectionUrl ? { databaseIntrospectionUrl: args.databaseIntrospectionUrl } : {}), ...(managedDaemon ? { managedDaemon } : {}), @@ -734,6 +735,10 @@ export async function runKtxIngest( : {}), logger: operationalLogger, }; + // One parser-backed SQL analysis port per run: the historic-sql adapter and + // the ingest sql_execution tool share the same daemon-backed validator. + const sqlAnalysis = localIngestOptions.sqlAnalysis ?? resolveKtxCliSqlAnalysis(baseAdapterOptions); + const adapterOptions = { ...baseAdapterOptions, sqlAnalysis }; const queryExecutor = localIngestOptions.queryExecutor ?? (deps.createQueryExecutor ?? createKtxCliIngestQueryExecutor)(ingestProject); @@ -783,6 +788,7 @@ export async function runKtxIngest( metabaseConnectionId: args.connectionId, ...localIngestOptions, queryExecutor, + sqlAnalysis, trigger: 'manual_resync', jobIdFactory: deps.jobIdFactory, embeddingProvider, @@ -861,6 +867,7 @@ export async function runKtxIngest( jobId, ...localIngestOptions, queryExecutor, + sqlAnalysis, pullConfigOptions: adapterOptions, embeddingProvider, ...(args.debugLlmRequestFile ? { llmDebugRequestFile: args.debugLlmRequestFile } : {}), diff --git a/packages/cli/src/local-adapters.ts b/packages/cli/src/local-adapters.ts index 3e3b0486..671bce67 100644 --- a/packages/cli/src/local-adapters.ts +++ b/packages/cli/src/local-adapters.ts @@ -71,7 +71,7 @@ function ktxCliLookerOptions( }; } -function ktxCliHistoricSqlAnalysis(options: KtxCliLocalIngestAdaptersOptions) { +export function resolveKtxCliSqlAnalysis(options: KtxCliLocalIngestAdaptersOptions): SqlAnalysisPort { if (options.sqlAnalysis) { return options.sqlAnalysis; } @@ -289,7 +289,7 @@ function historicSqlOptionsForLocalRun( } const base = { - sqlAnalysis: ktxCliHistoricSqlAnalysis(options), + sqlAnalysis: resolveKtxCliSqlAnalysis(options), }; if (dialect === 'postgres') { diff --git a/packages/cli/src/skills/lookml_ingest/SKILL.md b/packages/cli/src/skills/lookml_ingest/SKILL.md index 91640504..8f5afc61 100644 --- a/packages/cli/src/skills/lookml_ingest/SKILL.md +++ b/packages/cli/src/skills/lookml_ingest/SKILL.md @@ -12,7 +12,7 @@ LookML views map to SL sources, `measure:` to measures, `explore: { join: }` to | LookML | KTX form | Notes | |---|---|---| -| `view: X { sql_table_name: …; measure:/dimension:/join: }` | **Overlay** at `/X.yaml` with `measures`, computed-only `columns`, `column_overrides`, `joins`, `segments` | Manifest-backed; inherit grain/columns | +| `view: X { sql_table_name: …; measure:/dimension:/join: }` | **Overlay** named `X` with `measures`, computed-only `columns`, `column_overrides`, `joins`, `segments` | Manifest-backed; inherit grain/columns | | `view: X { derived_table: { sql: … } }` | **Standalone** with top-level `sql:`, explicit `grain:` + `columns:` | No manifest entry exists | | `view: X { sql_always_where:

}` | **Standalone** with `sql: SELECT * FROM WHERE

` | Enforcement, not opt-in | | `explore: { join: Y { sql_on: …; relationship: … } }` | `joins:` entry `{ to: Y, on: " = Y.", relationship: … }` | On the overlay or standalone | diff --git a/packages/cli/src/skills/metricflow_ingest/SKILL.md b/packages/cli/src/skills/metricflow_ingest/SKILL.md index f46d29ec..ab26bb3e 100644 --- a/packages/cli/src/skills/metricflow_ingest/SKILL.md +++ b/packages/cli/src/skills/metricflow_ingest/SKILL.md @@ -12,8 +12,8 @@ A MetricFlow `semantic_model` maps to an SL source; MetricFlow `measures` map to | MetricFlow | KTX form | Notes | |---|---|---| -| `semantic_model: X { model: ref('t') }` with measures + dimensions | **Overlay** at `/X.yaml` with `measures`, computed-only `columns`, `column_overrides`, `joins` | The `model:` ref resolves to a manifest table. | -| `semantic_model: X { model: source('s','t') }` | **Overlay** at `/X.yaml` over table `t`. | Same shape; `source()` still resolves to a physical table. | +| `semantic_model: X { model: ref('t') }` with measures + dimensions | **Overlay** named `X` with `measures`, computed-only `columns`, `column_overrides`, `joins` | The `model:` ref resolves to a manifest table. | +| `semantic_model: X { model: source('s','t') }` | **Overlay** named `X` over table `t`. | Same shape; `source()` still resolves to a physical table. | | `semantic_model: X { model: }` with no manifest entry | **Standalone** with explicit `sql:`, `grain:`, `columns:` | Happens when the dbt manifest isn't available. | | `semantic_model: Y { extends: X }` | **Merge** Y's measures/dimensions/entities into X's overlay, or write a single overlay named for the most-derived child (Y) containing both X's and Y's primitives | Do not emit a second overlay for X - flatten. | | `measures: [{ name, agg, expr }]` | `measures: [{ name, expr: "()" }]` | Aggregation inlined. `agg: count_distinct` → `count(distinct ...)`. | diff --git a/packages/cli/src/skills/sl/SKILL.md b/packages/cli/src/skills/sl/SKILL.md index 53e1bd22..bb072e14 100644 --- a/packages/cli/src/skills/sl/SKILL.md +++ b/packages/cli/src/skills/sl/SKILL.md @@ -21,7 +21,7 @@ skills must verify warehouse identifiers with `discover_data`, ## Part 1 - Schema reference -An SL source is a YAML file at `semantic-layer//.yaml`. There are three flavors: +An SL source is a YAML file under `semantic-layer//`. The file's `name:` field is the source's identity — it mirrors the warehouse identifier verbatim (e.g. Snowflake's uppercase `SIGNED_UP`); the filename is only a derived label. Always address sources by name through the `sl_*` tools, never by file path. There are three flavors: ### Overlay sources diff --git a/packages/cli/test/context/connections/read-only-sql.test.ts b/packages/cli/test/context/connections/read-only-sql.test.ts index 90affa04..4504a126 100644 --- a/packages/cli/test/context/connections/read-only-sql.test.ts +++ b/packages/cli/test/context/connections/read-only-sql.test.ts @@ -1,5 +1,9 @@ import { describe, expect, it } from 'vitest'; -import { assertReadOnlySql, limitSqlForExecution } from '../../../src/context/connections/read-only-sql.js'; +import { + assertReadOnlySql, + limitSqlForExecution, + stripTrailingSqlNoise, +} from '../../../src/context/connections/read-only-sql.js'; describe('assertReadOnlySql', () => { it('allows select and with queries', () => { @@ -15,6 +19,51 @@ describe('assertReadOnlySql', () => { 'Only read-only SELECT/WITH queries can be executed locally', ); }); + + it('accepts read-only queries that begin with leading comments', () => { + expect(assertReadOnlySql('-- daily widget sales\nselect count(*) from public.widget_sales')).toBe( + 'select count(*) from public.widget_sales', + ); + expect(assertReadOnlySql('/* block */\n with paid as (select 1) select * from paid')).toContain('with paid'); + }); + + it('still rejects mutating statements hidden behind leading comments', () => { + expect(() => assertReadOnlySql('-- harmless\n delete from orders')).toThrow( + 'Only read-only SELECT/WITH queries can be executed locally', + ); + }); + + it('rejects a second statement smuggled after a semicolon', () => { + expect(() => assertReadOnlySql('select 1; drop table orders')).toThrow( + 'Only one SQL statement can be executed.', + ); + expect(() => assertReadOnlySql('select 1;\n-- pad\ndelete from orders')).toThrow( + 'Only one SQL statement can be executed.', + ); + expect(() => assertReadOnlySql('select 1; /* pad */ truncate orders;')).toThrow( + 'Only one SQL statement can be executed.', + ); + }); + + it('accepts trailing semicolons, including repeated ones followed by comments', () => { + expect(assertReadOnlySql('select 1;')).toBe('select 1;'); + expect(assertReadOnlySql('select 1 ;; \n')).toBe('select 1 ;;'); + expect(assertReadOnlySql('select 1; -- done')).toBe('select 1; -- done'); + }); + + it('ignores semicolons inside string literals, quoted identifiers, and comments', () => { + expect(assertReadOnlySql("select string_agg(name, '; ') from t")).toBe("select string_agg(name, '; ') from t"); + expect(assertReadOnlySql("select 'it''s; quoted' from t")).toBe("select 'it''s; quoted' from t"); + expect(assertReadOnlySql('select ";" from "t;u"')).toBe('select ";" from "t;u"'); + expect(assertReadOnlySql('select 1 -- tail; comment')).toBe('select 1 -- tail; comment'); + expect(assertReadOnlySql('select 1 /* a;b */ + 2')).toBe('select 1 /* a;b */ + 2'); + }); + + it('rejects statements smuggled after a string literal that closes a semicolon early', () => { + expect(() => assertReadOnlySql("select 'a'; delete from orders")).toThrow( + 'Only one SQL statement can be executed.', + ); + }); }); describe('limitSqlForExecution', () => { @@ -27,4 +76,42 @@ describe('limitSqlForExecution', () => { it('returns the trimmed SQL when no maxRows value is provided', () => { expect(limitSqlForExecution('select * from orders; ', undefined)).toBe('select * from orders'); }); + + it('strips leading comments before wrapping with a row limit', () => { + expect(limitSqlForExecution('-- top customers\nselect * from public.orders', 25)).toBe( + 'select * from (select * from public.orders) as ktx_query_result limit 25', + ); + }); + + it('drops a trailing semicolon followed by a comment so the subquery stays valid', () => { + // The single-statement gate accepts `select 1; -- done`; without stripping + // the terminator the wrapper would embed `select 1; -- done` and comment out + // the closing paren and limit clause. + expect(limitSqlForExecution('select 1; -- done', 5)).toBe( + 'select * from (select 1) as ktx_query_result limit 5', + ); + expect(limitSqlForExecution('select 1; /* note */', 5)).toBe( + 'select * from (select 1) as ktx_query_result limit 5', + ); + }); + + it('drops a trailing line comment with no semicolon before wrapping', () => { + expect(limitSqlForExecution('select 1 -- done', 5)).toBe('select * from (select 1) as ktx_query_result limit 5'); + }); +}); + +describe('stripTrailingSqlNoise', () => { + it('removes trailing semicolons, comments, and whitespace', () => { + expect(stripTrailingSqlNoise('select 1;')).toBe('select 1'); + expect(stripTrailingSqlNoise('select 1 ;; ')).toBe('select 1'); + expect(stripTrailingSqlNoise('select 1; -- done')).toBe('select 1'); + expect(stripTrailingSqlNoise('select 1 -- done')).toBe('select 1'); + expect(stripTrailingSqlNoise('select 1; /* trailing */')).toBe('select 1'); + }); + + it('preserves semicolons and comment markers inside literals and mid-statement', () => { + expect(stripTrailingSqlNoise("select 'a; -- b'")).toBe("select 'a; -- b'"); + expect(stripTrailingSqlNoise('select 1 /* a;b */ + 2')).toBe('select 1 /* a;b */ + 2'); + expect(stripTrailingSqlNoise('select ";" from "t;u"')).toBe('select ";" from "t;u"'); + }); }); diff --git a/packages/cli/test/context/core/git.service.test.ts b/packages/cli/test/context/core/git.service.test.ts index 8b19ed09..c07a5313 100644 --- a/packages/cli/test/context/core/git.service.test.ts +++ b/packages/cli/test/context/core/git.service.test.ts @@ -1,6 +1,6 @@ -import { mkdtemp, readFile, realpath, rm, writeFile } from 'node:fs/promises'; +import { mkdir, mkdtemp, readFile, realpath, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; -import { join } from 'node:path'; +import { dirname, join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; import type { KtxCoreConfig } from '../../../src/context/core/config.js'; import { GitService } from '../../../src/context/core/git.service.js'; @@ -35,10 +35,29 @@ describe('GitService', () => { }); const writeAndCommit = async (filePath: string, content: string, message = 'msg') => { + await mkdir(dirname(join(tempDir, filePath)), { recursive: true }); await writeFile(join(tempDir, filePath), content, 'utf-8'); return service.commitFile(filePath, message, 'Test', 'test@example.com'); }; + describe('listFilesAtCommit', () => { + it('lists matching paths at a commit and recovers files deleted since', async () => { + await writeAndCommit('semantic-layer/warehouse/custom.yaml', 'name: orders\n'); + const atSeed = await service.revParseHead(); + await service.deleteFile('semantic-layer/warehouse/custom.yaml', 'drop', 'Test', 'test@example.com'); + + // HEAD no longer has the file; the seed commit still does. + await expect(service.listFilesAtCommit('semantic-layer/warehouse', 'HEAD')).resolves.toEqual([]); + await expect(service.listFilesAtCommit('semantic-layer/warehouse', atSeed)).resolves.toEqual([ + 'semantic-layer/warehouse/custom.yaml', + ]); + }); + + it('returns [] for a pathspec that matches nothing', async () => { + await expect(service.listFilesAtCommit('does/not/exist', 'HEAD')).resolves.toEqual([]); + }); + }); + describe('cold-start bootstrap commit', () => { it('writes an empty commit on init so HEAD always resolves', async () => { // beforeEach already ran onModuleInit() against an empty temp dir. diff --git a/packages/cli/test/context/index-sync/reindex.test.ts b/packages/cli/test/context/index-sync/reindex.test.ts index fe7698ba..3f6cb6b1 100644 --- a/packages/cli/test/context/index-sync/reindex.test.ts +++ b/packages/cli/test/context/index-sync/reindex.test.ts @@ -159,13 +159,16 @@ describe('reindexLocalIndexes', () => { '---\nsummary: Revenue\nusage_mode: auto\n---\n\nPaid orders.\n', 'utf-8', ); - await mkdir(join(project.projectDir, 'semantic-layer/warehouse'), { recursive: true }); - await writeFile(join(project.projectDir, 'semantic-layer/warehouse/broken.yaml'), 'not: [valid', 'utf-8'); + // A broken standalone source is surfaced for repair rather than failing the + // scope, so use a corrupt machine-generated manifest shard, which is the + // remaining fatal read failure that the per-scope catch must isolate. + await mkdir(join(project.projectDir, 'semantic-layer/warehouse/_schema'), { recursive: true }); + await writeFile(join(project.projectDir, 'semantic-layer/warehouse/_schema/broken.yaml'), 'not: [valid', 'utf-8'); const summary = await reindexLocalIndexes(project, { force: false, embeddingService: null }); expect(summary.scopes.find((scope) => scope.label === 'global')?.error).toBeUndefined(); - expect(summary.scopes.find((scope) => scope.label === 'warehouse')?.error).toContain('YAML'); + expect(summary.scopes.find((scope) => scope.label === 'warehouse')?.error).toContain('_schema/broken.yaml'); }); it('marks a scope errored when configured embeddings fail', async () => { diff --git a/packages/cli/test/context/ingest/final-gate-repair.test.ts b/packages/cli/test/context/ingest/final-gate-repair.test.ts index 1a52442c..8a21eab6 100644 --- a/packages/cli/test/context/ingest/final-gate-repair.test.ts +++ b/packages/cli/test/context/ingest/final-gate-repair.test.ts @@ -33,14 +33,14 @@ async function makeHarness() { } describe('finalGateRepairPaths', () => { - it('derives sorted wiki and semantic-layer file paths', () => { + it('derives sorted, deduplicated wiki and semantic-layer file paths', () => { expect( finalGateRepairPaths({ changedWikiPageKeys: ['account-segments', 'overview', 'account-segments'], - touchedSlSources: [ - { connectionId: 'warehouse', sourceName: 'mart_account_segments' }, - { connectionId: 'warehouse', sourceName: 'orders' }, - { connectionId: 'warehouse', sourceName: 'orders' }, + touchedSlSourcePaths: [ + 'semantic-layer/warehouse/mart_account_segments.yaml', + 'semantic-layer/warehouse/orders.yaml', + 'semantic-layer/warehouse/orders.yaml', ], }), ).toEqual([ diff --git a/packages/cli/test/context/ingest/finalization-scope.test.ts b/packages/cli/test/context/ingest/finalization-scope.test.ts index 02c535df..1913a17f 100644 --- a/packages/cli/test/context/ingest/finalization-scope.test.ts +++ b/packages/cli/test/context/ingest/finalization-scope.test.ts @@ -18,19 +18,49 @@ describe('deriveFinalizationWikiPageKeys', () => { }); describe('deriveFinalizationTouchedSources', () => { - it('maps standalone semantic-layer files directly', async () => { - const result = await deriveFinalizationTouchedSources({ + it('resolves standalone files by the source diff, not the filename', () => { + // The file carries a derived label (`signed_up-.yaml`); the source it + // defines is the in-file `name:` (`SIGNED_UP`), visible only via the diff. + const result = deriveFinalizationTouchedSources({ + changedPaths: ['semantic-layer/warehouse/signed_up-1a2b3c4d.yaml'], + beforeSourcesByConnection: new Map([['warehouse', []]]), + afterSourcesByConnection: new Map([ + ['warehouse', [{ name: 'SIGNED_UP', grain: [], columns: [], joins: [], measures: [] }]], + ]), + }); + expect(result).toEqual({ + touchedSources: [{ connectionId: 'warehouse', sourceName: 'SIGNED_UP' }], + unresolvedPaths: [], + }); + }); + + it('resolves deleted standalone files by the name that disappeared', () => { + const result = deriveFinalizationTouchedSources({ + changedPaths: ['semantic-layer/warehouse/signed_up-1a2b3c4d.yaml'], + beforeSourcesByConnection: new Map([ + ['warehouse', [{ name: 'SIGNED_UP', grain: [], columns: [], joins: [], measures: [] }]], + ]), + afterSourcesByConnection: new Map([['warehouse', []]]), + }); + expect(result).toEqual({ + touchedSources: [{ connectionId: 'warehouse', sourceName: 'SIGNED_UP' }], + unresolvedPaths: [], + }); + }); + + it('flags standalone changes that produce no source diff', () => { + const result = deriveFinalizationTouchedSources({ changedPaths: ['semantic-layer/warehouse/orders.yaml'], beforeSourcesByConnection: new Map(), afterSourcesByConnection: new Map(), }); expect(result).toEqual({ - touchedSources: [{ connectionId: 'warehouse', sourceName: 'orders' }], - unresolvedPaths: [], + touchedSources: [], + unresolvedPaths: ['semantic-layer/warehouse/orders.yaml'], }); }); - it('resolves aggregate _schema changes by comparing loaded source snapshots', async () => { + it('resolves aggregate _schema changes by comparing loaded source snapshots', () => { const beforeSourcesByConnection = new Map([ [ 'warehouse', @@ -72,7 +102,7 @@ describe('deriveFinalizationTouchedSources', () => { ], ]); - const result = await deriveFinalizationTouchedSources({ + const result = deriveFinalizationTouchedSources({ changedPaths: ['semantic-layer/warehouse/_schema/public.yaml'], beforeSourcesByConnection, afterSourcesByConnection, @@ -84,11 +114,11 @@ describe('deriveFinalizationTouchedSources', () => { }); }); - it('flags aggregate _schema changes that cannot be resolved to logical sources', async () => { + it('flags aggregate _schema changes that cannot be resolved to logical sources', () => { const beforeSourcesByConnection = new Map([['warehouse', []]]); const afterSourcesByConnection = new Map([['warehouse', []]]); - const result = await deriveFinalizationTouchedSources({ + const result = deriveFinalizationTouchedSources({ changedPaths: ['semantic-layer/warehouse/_schema/public.yaml'], beforeSourcesByConnection, afterSourcesByConnection, diff --git a/packages/cli/test/context/ingest/ingest-bundle.runner.isolated-diff.test.ts b/packages/cli/test/context/ingest/ingest-bundle.runner.isolated-diff.test.ts index bad40098..f5d5822b 100644 --- a/packages/cli/test/context/ingest/ingest-bundle.runner.isolated-diff.test.ts +++ b/packages/cli/test/context/ingest/ingest-bundle.runner.isolated-diff.test.ts @@ -70,6 +70,15 @@ async function loadSourcesFromRoot(root: string) { }; } +// Mirrors the production contract: resolve the standalone/overlay file for a +// source, null when absent. Fixtures keep filename == name, so a direct read +// is a faithful shortcut. +async function readSourceFileFromRoot(root: string, connectionId: string, sourceName: string) { + const relPath = `semantic-layer/${connectionId}/${sourceName}.yaml`; + const content = await readFile(join(root, relPath), 'utf-8').catch(() => null); + return content === null ? null : { content, path: relPath }; +} + async function listGlobalWikiPageKeys(root: string): Promise { const dir = join(root, 'wiki/global'); const entries = await readdir(dir).catch(() => []); @@ -172,11 +181,17 @@ function makeDeps( const semanticLayerService: any = { loadAllSources: vi.fn(async () => loadSourcesFromRoot(runtime.configDir)), listFilesForConnection: vi.fn().mockResolvedValue(['mart_account_segments.yaml']), + readSourceFile: vi.fn((connectionId: string, sourceName: string) => + readSourceFileFromRoot(runtime.configDir, connectionId, sourceName), + ), }; semanticLayerService.forWorktree = vi.fn((workdir: string) => ({ ...semanticLayerService, loadAllSources: vi.fn(async () => loadSourcesFromRoot(workdir)), listFilesForConnection: vi.fn().mockResolvedValue(['mart_account_segments.yaml']), + readSourceFile: vi.fn((connectionId: string, sourceName: string) => + readSourceFileFromRoot(workdir, connectionId, sourceName), + ), })); const deps: IngestBundleRunnerDeps = { @@ -2366,8 +2381,11 @@ describe('IngestBundleRunner isolated diff path', () => { join(runtime.configDir, '.ktx/ingest-traces/job-finalization-target-policy/trace.jsonl'), 'utf-8', ); - expect(trace).toContain('finalization_committed'); - expect(trace).toContain('semantic_layer_target_policy'); + // The policy check runs inside finalization, before touched-source + // derivation — an out-of-scope write fails the finalization stage + // instead of reading as committed. + expect(trace).not.toContain('finalization_committed'); + expect(trace).toContain('semantic_layer_target_policy_failed'); expect(trace).toContain('ingest_failed'); } finally { await rm(runtime.homeDir, { recursive: true, force: true }); diff --git a/packages/cli/test/context/ingest/ingest-bundle.runner.test.ts b/packages/cli/test/context/ingest/ingest-bundle.runner.test.ts index 68814792..9f668b7b 100644 --- a/packages/cli/test/context/ingest/ingest-bundle.runner.test.ts +++ b/packages/cli/test/context/ingest/ingest-bundle.runner.test.ts @@ -1776,12 +1776,24 @@ describe('IngestBundleRunner — Stages 1 → 7', () => { }, ], }); - deps.semanticLayerService.loadAllSources.mockImplementation((connectionId: string) => - Promise.resolve({ sources: [{ name: `${connectionId}_source` }], loadErrors: [] }), - ); let head = 'pre-finalization'; + // Touched-source derivation diffs composed sources before/after finalization + // (the filename never carries identity), so the mock must reflect the write: + // `orders` exists only once the finalization commit lands. + deps.semanticLayerService.loadAllSources.mockImplementation((connectionId: string) => + Promise.resolve({ + sources: + connectionId === 'warehouse-2' && head === 'post-finalization' + ? [{ name: `${connectionId}_source` }, { name: 'orders' }] + : [{ name: `${connectionId}_source` }], + loadErrors: [], + }), + ); const git = { revParseHead: vi.fn(async () => head), + // Touched-source derivation reads each changed file's `name:`; the worktree + // is mocked (no files on disk), so serve the source content from history. + getFileAtCommit: vi.fn(async () => 'name: orders\n'), commitFiles: vi.fn().mockImplementation(async (paths: string[]) => { if (paths.includes('semantic-layer/warehouse-2/orders.yaml')) { head = 'post-finalization'; @@ -1854,10 +1866,41 @@ describe('IngestBundleRunner — Stages 1 → 7', () => { }), ); expect(deps.semanticLayerService.loadAllSources).toHaveBeenCalledWith('warehouse-2'); - expect(deps.slSearchService.indexSources).toHaveBeenCalledWith('warehouse-2', [{ name: 'warehouse-2_source' }]); + expect(deps.slSearchService.indexSources).toHaveBeenCalledWith('warehouse-2', [ + { name: 'warehouse-2_source' }, + { name: 'orders' }, + ]); expect(deps.sessionWorktreeService.cleanup).toHaveBeenCalledWith(expect.any(Object), 'success'); }); + it('recovers a deleted hash-named SL source by its in-file name, not its filename', async () => { + const runner = buildRunner(); + // An uppercase warehouse source lives in a hash-derived filename, so parsing + // the basename yields the phantom `widget_sales-1a2b3c4d`. The real name must + // come from the file's `name:`, recovered from history once it was deleted. + const deletedPath = 'semantic-layer/warehouse/widget_sales-1a2b3c4d.yaml'; + const getFileAtCommit = vi.fn(async () => 'name: WIDGET_SALES\ntable: WIDGET_SALES\n'); + const worktree = { workdir: join(tmpdir(), 'ktx-absent-worktree-recover'), git: { getFileAtCommit } }; + + const touched = await (runner as any).touchedSlSourcesFromPaths(worktree, [deletedPath], 'pre-change-sha'); + + expect(touched).toEqual([{ connectionId: 'warehouse', sourceName: 'WIDGET_SALES' }]); + expect(getFileAtCommit).toHaveBeenCalledWith(deletedPath, 'pre-change-sha'); + }); + + it('falls back to the filename only when a deleted SL file is unrecoverable from history', async () => { + const runner = buildRunner(); + const deletedPath = 'semantic-layer/warehouse/orders.yaml'; + const getFileAtCommit = vi.fn(async () => { + throw new Error('path not present at commit'); + }); + const worktree = { workdir: join(tmpdir(), 'ktx-absent-worktree-fallback'), git: { getFileAtCommit } }; + + const touched = await (runner as any).touchedSlSourcesFromPaths(worktree, [deletedPath], 'pre-change-sha'); + + expect(touched).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]); + }); + it('includes finalization actions in memory-flow saved counts', async () => { const deps = makeDeps(); deps.adapter.source = 'historic-sql'; diff --git a/packages/cli/test/context/ingest/tools/warehouse-verification/sql-execution.tool.test.ts b/packages/cli/test/context/ingest/tools/warehouse-verification/sql-execution.tool.test.ts index 0fb87655..308c7466 100644 --- a/packages/cli/test/context/ingest/tools/warehouse-verification/sql-execution.tool.test.ts +++ b/packages/cli/test/context/ingest/tools/warehouse-verification/sql-execution.tool.test.ts @@ -1,13 +1,21 @@ -import { describe, expect, it, vi } from 'vitest'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; import type { SlConnectionCatalogPort } from '../../../../../src/context/sl/ports.js'; +import type { SqlAnalysisPort } from '../../../../../src/context/sql-analysis/ports.js'; import type { ToolContext } from '../../../../../src/context/tools/base-tool.js'; import { SqlExecutionTool } from '../../../../../src/context/ingest/tools/warehouse-verification/sql-execution.tool.js'; describe('SqlExecutionTool', () => { const connections = { executeQuery: vi.fn(), - } as unknown as SlConnectionCatalogPort & { executeQuery: ReturnType }; - const tool = new SqlExecutionTool(connections); + getConnectionById: vi.fn(async () => ({ id: 'warehouse', name: 'warehouse', connectionType: 'POSTGRESQL' })), + } as unknown as SlConnectionCatalogPort & { + executeQuery: ReturnType; + getConnectionById: ReturnType; + }; + const sqlAnalysis = { + validateReadOnly: vi.fn(async () => ({ ok: true, error: null })), + } as unknown as SqlAnalysisPort & { validateReadOnly: ReturnType }; + const tool = new SqlExecutionTool(connections, sqlAnalysis); const context: ToolContext = { sourceId: 'ingest', messageId: 'm1', @@ -15,7 +23,15 @@ describe('SqlExecutionTool', () => { session: { allowedConnectionNames: new Set(['warehouse']) } as any, }; - it('wraps read-only SQL with a capped row limit', async () => { + beforeEach(() => { + connections.executeQuery.mockReset(); + connections.getConnectionById.mockReset(); + connections.getConnectionById.mockResolvedValue({ id: 'warehouse', name: 'warehouse', connectionType: 'POSTGRESQL' }); + sqlAnalysis.validateReadOnly.mockReset(); + sqlAnalysis.validateReadOnly.mockResolvedValue({ ok: true, error: null }); + }); + + it('validates with the parser-backed validator in the connection dialect, then wraps with a capped row limit', async () => { connections.executeQuery.mockResolvedValue({ headers: ['status'], rows: [['paid']], totalRows: 1 }); const result = await tool.call( @@ -23,6 +39,7 @@ describe('SqlExecutionTool', () => { context, ); + expect(sqlAnalysis.validateReadOnly).toHaveBeenCalledWith('select status from public.orders', 'postgres'); expect(connections.executeQuery).toHaveBeenCalledWith( 'warehouse', 'select * from (select status from public.orders) as ktx_query_result limit 5', @@ -31,15 +48,47 @@ describe('SqlExecutionTool', () => { expect(result.structured.wrappedSql).toContain('limit 5'); }); - it.each(['insert into x values (1)', 'drop table x', 'vacuum'])('rejects mutating SQL: %s', async (sql) => { - connections.executeQuery.mockClear(); + it('maps connection types to sqlglot dialects', async () => { + connections.getConnectionById.mockResolvedValue({ id: 'warehouse', name: 'warehouse', connectionType: 'SNOWFLAKE' }); + connections.executeQuery.mockResolvedValue({ headers: [], rows: [], totalRows: 0 }); - const result = await tool.call({ connectionId: 'warehouse', sql }, context); + await tool.call({ connectionId: 'warehouse', sql: 'select 1' }, context); - expect(result.markdown).toContain('Only read-only SELECT/WITH queries can be executed locally.'); + expect(sqlAnalysis.validateReadOnly).toHaveBeenCalledWith('select 1', 'snowflake'); + }); + + it('returns the validator error without executing when validation fails', async () => { + sqlAnalysis.validateReadOnly.mockResolvedValue({ ok: false, error: 'SQL contains read/write operation: Insert' }); + + const result = await tool.call( + { connectionId: 'warehouse', sql: 'with x as (insert into t values (1) returning *) select * from x' }, + context, + ); + + expect(result.markdown).toContain('SQL contains read/write operation: Insert'); + expect(result.structured.error).toContain('SQL contains read/write operation: Insert'); expect(connections.executeQuery).not.toHaveBeenCalled(); }); + it('throws when no parser-backed validator is configured', async () => { + const unvalidated = new SqlExecutionTool(connections); + + await expect(unvalidated.call({ connectionId: 'warehouse', sql: 'select 1' }, context)).rejects.toThrow( + 'sql_execution requires parser-backed SQL validation.', + ); + expect(connections.executeQuery).not.toHaveBeenCalled(); + }); + + it.each(['insert into x values (1)', 'drop table x', 'vacuum'])( + 'keeps the local backstop even when the validator approves: %s', + async (sql) => { + const result = await tool.call({ connectionId: 'warehouse', sql }, context); + + expect(result.markdown).toContain('Only read-only SELECT/WITH queries can be executed locally.'); + expect(connections.executeQuery).not.toHaveBeenCalled(); + }, + ); + it('surfaces connector errors verbatim', async () => { connections.executeQuery.mockRejectedValue(new Error('relation "orbit_analytics.customer" does not exist')); diff --git a/packages/cli/test/context/mcp/local-project-ports.test.ts b/packages/cli/test/context/mcp/local-project-ports.test.ts index 18cea01b..b2699d53 100644 --- a/packages/cli/test/context/mcp/local-project-ports.test.ts +++ b/packages/cli/test/context/mcp/local-project-ports.test.ts @@ -5,7 +5,9 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { initKtxProject } from '../../../src/context/project/project.js'; import { KtxQueryError } from '../../../src/errors.js'; import { createKtxConnectorCapabilities, type KtxQueryResult, type KtxScanConnector, type KtxSchemaSnapshot } from '../../../src/context/scan/types.js'; -import { writeLocalSlSource } from '../../../src/context/sl/local-sl.js'; +import { SemanticLayerService } from '../../../src/context/sl/semantic-layer.service.js'; +import type { SemanticLayerSource } from '../../../src/context/sl/types.js'; +import { seedSlSourceFile } from '../sl/sl-source-seeding.test-utils.js'; import { createLocalProjectMcpContextPorts } from '../../../src/context/mcp/local-project-ports.js'; describe('createLocalProjectMcpContextPorts', () => { @@ -739,7 +741,7 @@ describe('createLocalProjectMcpContextPorts', () => { it('reads seeded semantic-layer sources', async () => { const project = await initKtxProject({ projectDir: tempDir }); - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: [ @@ -763,7 +765,92 @@ describe('createLocalProjectMcpContextPorts', () => { }); }); - it('rejects path traversal keys before touching the project directory', async () => { + it('reads manifest-backed sources with uppercase warehouse identifiers', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/PUBLIC.yaml', + [ + 'tables:', + ' WIDGET_SALES:', + ' table: PUBLIC.WIDGET_SALES', + ' columns:', + ' - name: ID', + ' type: number', + ' pk: true', + '', + ].join('\n'), + 'ktx', + 'ktx@example.com', + 'seed uppercase manifest shard', + ); + const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null }); + + await expect( + ports.semanticLayer?.readSource({ connectionId: 'warehouse', sourceName: 'WIDGET_SALES' }), + ).resolves.toMatchObject({ + sourceName: 'WIDGET_SALES', + yaml: expect.stringContaining('table: PUBLIC.WIDGET_SALES'), + }); + }); + + it('composes an overlay written for an uppercase manifest source at a derived filename', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/PUBLIC.yaml', + [ + 'tables:', + ' WIDGET_SALES:', + ' table: PUBLIC.WIDGET_SALES', + ' columns:', + ' - name: ID', + ' type: number', + ' pk: true', + '', + ].join('\n'), + 'ktx', + 'ktx@example.com', + 'seed uppercase manifest shard', + ); + + // The production write path: agents overlay manifest sources via + // SemanticLayerService.writeSource using the verbatim warehouse name. + const service = new SemanticLayerService(project.fileStore as never, {} as never, {} as never); + const overlay = { + name: 'WIDGET_SALES', + measures: [{ name: 'widget_sales_count', expr: 'count(*)' }], + } as SemanticLayerSource; + const write = await service.writeSource('warehouse', overlay, 'ktx', 'ktx@example.com'); + expect(write.path).toMatch(/^semantic-layer\/warehouse\/widget_sales-[0-9a-f]{8}\.yaml$/); + + const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null }); + await expect( + ports.semanticLayer?.readSource({ connectionId: 'warehouse', sourceName: 'WIDGET_SALES' }), + ).resolves.toMatchObject({ + sourceName: 'WIDGET_SALES', + yaml: expect.stringContaining('widget_sales_count'), + }); + }); + + it('returns a standalone source verbatim even when its YAML is currently broken', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + await project.fileStore.writeFile( + 'semantic-layer/warehouse/orders.yaml', + 'name: orders\nmeasures:\n - name: revenue\n expr: [unterminated\n', + 'ktx', + 'ktx@example.com', + 'seed broken source mid-edit', + ); + const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null }); + + await expect( + ports.semanticLayer?.readSource({ connectionId: 'warehouse', sourceName: 'orders' }), + ).resolves.toMatchObject({ + sourceName: 'orders', + yaml: expect.stringContaining('[unterminated'), + }); + }); + + it('keeps path-traversal keys away from the project directory', async () => { const project = await initKtxProject({ projectDir: tempDir }); const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null }); @@ -774,12 +861,14 @@ describe('createLocalProjectMcpContextPorts', () => { }), ).rejects.toThrow('Invalid wiki key "../outside". Wiki keys must be flat; use "outside".'); + // Source reads never derive a file path from the name; a traversal-style + // name simply matches no record. await expect( ports.semanticLayer?.readSource({ connectionId: 'warehouse', sourceName: '../orders', }), - ).rejects.toThrow('Unsafe semantic-layer source name'); + ).resolves.toBeNull(); }); it('uses semantic compute for compile-only sl_query when supplied', async () => { @@ -788,7 +877,7 @@ describe('createLocalProjectMcpContextPorts', () => { driver: 'postgres', url: 'env:DATABASE_URL', }; - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: [ @@ -850,7 +939,7 @@ describe('createLocalProjectMcpContextPorts', () => { driver: 'postgres', url: 'env:DATABASE_URL', }; - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: [ diff --git a/packages/cli/test/context/memory/memory-agent.service.test.ts b/packages/cli/test/context/memory/memory-agent.service.test.ts index ba91444e..36bae425 100644 --- a/packages/cli/test/context/memory/memory-agent.service.test.ts +++ b/packages/cli/test/context/memory/memory-agent.service.test.ts @@ -357,6 +357,10 @@ describe('MemoryAgentService.gateRevertInvalidSources (J3)', () => { const configService = { writeFile: overrides.writeFile ?? vi.fn().mockResolvedValue({}), deleteFile: overrides.deleteFile ?? vi.fn().mockResolvedValue({}), + // Revert resolves the live file by name; with no listing it falls back + // to the writer-derived filename. + listFiles: vi.fn().mockResolvedValue({ files: [] }), + readFile: vi.fn().mockRejectedValue(new Error('ENOENT')), }; const gitService = { getFileAtCommit: overrides.getFileAtCommit ?? vi.fn().mockRejectedValue(new Error('not present')), diff --git a/packages/cli/test/context/search/backend-conformance.test-utils.test.ts b/packages/cli/test/context/search/backend-conformance.test-utils.test.ts index b49f866a..9e7cd337 100644 --- a/packages/cli/test/context/search/backend-conformance.test-utils.test.ts +++ b/packages/cli/test/context/search/backend-conformance.test-utils.test.ts @@ -5,7 +5,8 @@ import { afterEach, beforeEach, describe, it } from 'vitest'; import { SqliteContextEvidenceStore } from '../../../src/context/ingest/context-evidence/sqlite-context-evidence-store.js'; import type { JsonValue } from '../../../src/context/ingest/ports.js'; import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js'; -import { type LocalSlSourceSearchResult, searchLocalSlSources, writeLocalSlSource } from '../../../src/context/sl/local-sl.js'; +import { type LocalSlSourceSearchResult, searchLocalSlSources } from '../../../src/context/sl/local-sl.js'; +import { seedSlSourceFile } from '../sl/sl-source-seeding.test-utils.js'; import type { ContextEvidenceSearchResult } from '../../../src/context/tools/context-evidence-tool-store.js'; import { type LocalKnowledgeSearchResult, @@ -99,12 +100,12 @@ function toContextConformanceResult(result: ContextEvidenceSearchResult): Search } async function seedSemanticLayerProject(project: KtxLocalProject): Promise { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, }); - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'finance', sourceName: 'orders', yaml: FINANCE_ORDERS_YAML, diff --git a/packages/cli/test/context/sl/local-sl.test.ts b/packages/cli/test/context/sl/local-sl.test.ts index b3a9b7d6..c7272f40 100644 --- a/packages/cli/test/context/sl/local-sl.test.ts +++ b/packages/cli/test/context/sl/local-sl.test.ts @@ -9,8 +9,8 @@ import { resolveLocalSlSource, searchLocalSlSources, validateLocalSlSource, - writeLocalSlSource, } from '../../../src/context/sl/local-sl.js'; +import { seedSlSourceFile } from './sl-source-seeding.test-utils.js'; const ORDERS_YAML = [ 'name: orders', @@ -60,7 +60,7 @@ describe('local semantic-layer helpers', () => { }); it('writes, reads, lists, and validates semantic-layer sources', async () => { - const write = await writeLocalSlSource(project, { + const write = await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, @@ -92,7 +92,7 @@ describe('local semantic-layer helpers', () => { }); it('resolves a scoped source by connection id', async () => { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, @@ -115,7 +115,7 @@ describe('local semantic-layer helpers', () => { }); it('returns not-found for a missing scoped source', async () => { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, @@ -130,12 +130,12 @@ describe('local semantic-layer helpers', () => { }); it('resolves a unique source name across all connections', async () => { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, }); - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'analytics', sourceName: 'tickets', yaml: SUPPORT_YAML, @@ -157,7 +157,7 @@ describe('local semantic-layer helpers', () => { }); it('returns not-found for a missing unscoped source', async () => { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, @@ -169,12 +169,12 @@ describe('local semantic-layer helpers', () => { }); it('reports sorted ambiguous connection ids for duplicate source names', async () => { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, }); - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'analytics', sourceName: 'orders', yaml: ORDERS_YAML, @@ -261,6 +261,153 @@ describe('local semantic-layer helpers', () => { ); }); + it('reads manifest-backed scan sources whose warehouse identifiers are uppercase', async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/PUBLIC.yaml', + `tables: + WIDGET_SALES: + table: PUBLIC.WIDGET_SALES + columns: + - name: ID + type: number + pk: true + - name: EMAIL + type: string +`, + 'ktx', + 'ktx@example.com', + 'Add uppercase manifest shard', + ); + + await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'WIDGET_SALES' })).resolves.toEqual( + expect.objectContaining({ + connectionId: 'warehouse', + name: 'WIDGET_SALES', + path: 'semantic-layer/warehouse/_schema/PUBLIC.yaml#WIDGET_SALES', + yaml: expect.stringContaining('table: PUBLIC.WIDGET_SALES'), + }), + ); + }); + + it('reads manifest-backed sources whose names are not filename-safe', async () => { + // Snowflake and Postgres unquoted identifiers allow `$`; manifest keys + // carry the warehouse name verbatim, so the lookup must accept it. + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/PUBLIC.yaml', + `tables: + EVENT$LOG: + table: PUBLIC.EVENT$LOG + columns: + - name: ID + type: number + pk: true +`, + 'ktx', + 'ktx@example.com', + 'Add manifest shard with dollar-sign table name', + ); + + await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'EVENT$LOG' })).resolves.toEqual( + expect.objectContaining({ + connectionId: 'warehouse', + name: 'EVENT$LOG', + path: 'semantic-layer/warehouse/_schema/PUBLIC.yaml#EVENT$LOG', + yaml: expect.stringContaining('table: PUBLIC.EVENT$LOG'), + }), + ); + }); + + it('reads a manifest-backed source while a sibling standalone file has broken YAML', async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/PUBLIC.yaml', + `tables: + WIDGET_SALES: + table: PUBLIC.WIDGET_SALES + columns: + - name: ID + type: number + pk: true +`, + 'ktx', + 'ktx@example.com', + 'Add manifest shard', + ); + await project.fileStore.writeFile( + 'semantic-layer/warehouse/orders.yaml', + 'name: orders\nmeasures:\n - name: revenue\n expr: [unterminated\n', + 'ktx', + 'ktx@example.com', + 'seed a sibling source mid-edit with broken YAML', + ); + + await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'WIDGET_SALES' })).resolves.toEqual( + expect.objectContaining({ + name: 'WIDGET_SALES', + yaml: expect.stringContaining('table: PUBLIC.WIDGET_SALES'), + }), + ); + + // The broken sibling stays visible in listings instead of hiding or + // failing the whole connection. + await expect(listLocalSlSources(project, { connectionId: 'warehouse' })).resolves.toEqual([ + expect.objectContaining({ name: 'orders', columnCount: 0 }), + expect.objectContaining({ name: 'WIDGET_SALES', columnCount: 1 }), + ]); + }); + + it('returns the raw YAML of a standalone source whose content no longer parses', async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/orders.yaml', + 'name: orders\nmeasures:\n - name: revenue\n expr: [unterminated\n', + 'ktx', + 'ktx@example.com', + 'seed a source mid-edit with broken YAML', + ); + + await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'orders' })).resolves.toEqual( + expect.objectContaining({ + connectionId: 'warehouse', + name: 'orders', + path: 'semantic-layer/warehouse/orders.yaml', + yaml: expect.stringContaining('[unterminated'), + }), + ); + }); + + it('reads a broken source by its declared name even when the filename differs', async () => { + // Identity is the intact top-level `name:`, recovered via parseDocument even + // when the YAML is broken below it — never the filename. A human-renamed or + // hashed-filename source (e.g. an uppercase warehouse name) saved mid-edit + // must stay reachable under the name it declares, matching the writer side + // (resolveSlSourceFile). Keying it by the filename would make it invisible + // under its real name. + await project.fileStore.writeFile( + 'semantic-layer/warehouse/renamed-by-hand.yaml', + 'name: SIGNED_UP\nmeasures:\n - name: signups\n expr: [unterminated\n', + 'ktx', + 'ktx@example.com', + 'seed a human-renamed source mid-edit with broken YAML', + ); + + await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'SIGNED_UP' })).resolves.toEqual( + expect.objectContaining({ + connectionId: 'warehouse', + name: 'SIGNED_UP', + path: 'semantic-layer/warehouse/renamed-by-hand.yaml', + yaml: expect.stringContaining('[unterminated'), + }), + ); + + // The filename is not the identity, so it does not resolve a source. + await expect( + readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'renamed-by-hand' }), + ).resolves.toBeNull(); + + await expect(listLocalSlSources(project, { connectionId: 'warehouse' })).resolves.toEqual([ + expect.objectContaining({ name: 'SIGNED_UP', columnCount: 0 }), + ]); + }); + it('expands manifest-backed scan sources when listing all connections', async () => { await project.fileStore.writeFile( 'semantic-layer/warehouse/_schema/public.yaml', @@ -292,12 +439,12 @@ describe('local semantic-layer helpers', () => { }); it('searches local semantic-layer source text through SQLite FTS', async () => { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, }); - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'tickets', yaml: SUPPORT_YAML, @@ -365,12 +512,12 @@ describe('local semantic-layer helpers', () => { }); it('searches all connections with one global hybrid ranking pass', async () => { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, }); - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'finance', sourceName: 'orders', yaml: [ @@ -403,7 +550,7 @@ describe('local semantic-layer helpers', () => { }); it('returns dictionary evidence when collected sample values explain a match', async () => { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, @@ -456,7 +603,7 @@ describe('local semantic-layer helpers', () => { }); it('adds the token lane alongside lexical matches for normalized query terms', async () => { - await writeLocalSlSource(project, { + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML, @@ -471,21 +618,13 @@ describe('local semantic-layer helpers', () => { }); }); - it('reports schema validation errors without writing invalid YAML', async () => { + it('reports schema validation errors for invalid YAML', async () => { const invalidYaml = ['name: broken', 'table: public.orders', 'columns: []', ''].join('\n'); await expect(validateLocalSlSource(invalidYaml)).resolves.toMatchObject({ valid: false, errors: expect.arrayContaining([expect.stringContaining('grain')]), }); - - await expect( - writeLocalSlSource(project, { - connectionId: 'warehouse', - sourceName: 'broken', - yaml: invalidYaml, - }), - ).rejects.toThrow('Invalid semantic-layer source'); }); it('reports overlay columns that are not computed columns', async () => { @@ -506,12 +645,40 @@ describe('local semantic-layer helpers', () => { }); }); - it('rejects unsafe source paths', async () => { + it('never derives a file path from a traversal-style source name', async () => { + // Reads match names against loaded records, so a traversal-style name is + // simply not found; the writer-side guarantee (derived filenames contain + // no separators) is covered by the source-files tests. await expect( readLocalSlSource(project, { connectionId: 'warehouse', sourceName: '../orders', }), - ).rejects.toThrow('Unsafe semantic-layer source name'); + ).resolves.toBeNull(); + }); + + it('reads a source from a human-renamed file by its in-file name', async () => { + // The filename is a derived label, not identity: a file renamed by a human + // still resolves under the `name:` it declares. + await project.fileStore.writeFile( + 'semantic-layer/warehouse/custom-file-name.yaml', + ORDERS_YAML, + 'ktx', + 'ktx@example.com', + 'Seed source at a human-chosen filename', + ); + + await expect( + readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'orders' }), + ).resolves.toMatchObject({ + connectionId: 'warehouse', + name: 'orders', + path: 'semantic-layer/warehouse/custom-file-name.yaml', + yaml: ORDERS_YAML, + }); + + await expect( + readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'custom-file-name' }), + ).resolves.toBeNull(); }); }); diff --git a/packages/cli/test/context/sl/pglite-sl-search-prototype.test.ts b/packages/cli/test/context/sl/pglite-sl-search-prototype.test.ts index 8ebb8646..d2ecc7ea 100644 --- a/packages/cli/test/context/sl/pglite-sl-search-prototype.test.ts +++ b/packages/cli/test/context/sl/pglite-sl-search-prototype.test.ts @@ -5,8 +5,9 @@ import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js'; import { assertSearchBackendConformanceCase } from '../search/backend-conformance.test-utils.js'; -import { searchLocalSlSources, writeLocalSlSource, type LocalSlSourceSearchResult } from '../../../src/context/sl/local-sl.js'; +import { searchLocalSlSources, type LocalSlSourceSearchResult } from '../../../src/context/sl/local-sl.js'; import { searchLocalSlSourcesWithPglitePrototype } from '../../../src/context/sl/pglite-sl-search-prototype.js'; +import { seedSlSourceFile } from './sl-source-seeding.test-utils.js'; const ORDERS_YAML = [ 'name: orders', @@ -107,9 +108,9 @@ function toConformanceResult(result: LocalSlSourceSearchResult) { } async function seedSemanticLayerProject(project: KtxLocalProject): Promise { - await writeLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML }); - await writeLocalSlSource(project, { connectionId: 'finance', sourceName: 'orders', yaml: FINANCE_ORDERS_YAML }); - await writeLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'customers', yaml: CUSTOMERS_YAML }); + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML }); + await seedSlSourceFile(project, { connectionId: 'finance', sourceName: 'orders', yaml: FINANCE_ORDERS_YAML }); + await seedSlSourceFile(project, { connectionId: 'warehouse', sourceName: 'customers', yaml: CUSTOMERS_YAML }); await project.fileStore.writeFile( 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json', diff --git a/packages/cli/test/context/sl/semantic-layer.service.test.ts b/packages/cli/test/context/sl/semantic-layer.service.test.ts index f8b919bb..b004e635 100644 --- a/packages/cli/test/context/sl/semantic-layer.service.test.ts +++ b/packages/cli/test/context/sl/semantic-layer.service.test.ts @@ -1,5 +1,9 @@ +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; import type { Mock } from 'vitest'; -import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js'; import { ColumnNameCollisionError, @@ -71,6 +75,7 @@ describe('loadSource', () => { it('warns and returns null when an existing source file has invalid YAML', async () => { const logger = { log: vi.fn(), warn: vi.fn(), error: vi.fn() }; const configService = { + listFiles: vi.fn().mockResolvedValue({ files: ['semantic-layer/warehouse/orders.yaml'] }), readFile: vi.fn().mockResolvedValue({ content: 'name: [' }), }; const service = new SemanticLayerService(configService as never, connectionCatalog(), pythonPort, logger as never); @@ -79,9 +84,33 @@ describe('loadSource', () => { expect(configService.readFile).toHaveBeenCalledWith('semantic-layer/warehouse/orders.yaml'); expect(logger.warn).toHaveBeenCalledWith( - expect.stringContaining('[loadSource] warehouse/orders.yaml: YAML parse failed:'), + expect.stringContaining('[loadSource] semantic-layer/warehouse/orders.yaml: YAML parse failed:'), ); }); + + it('returns null when no file declares the source name', async () => { + const configService = { + listFiles: vi.fn().mockResolvedValue({ files: [] }), + readFile: vi.fn(), + }; + const service = new SemanticLayerService(configService as never, connectionCatalog(), pythonPort); + + await expect(service.loadSource('warehouse', 'orders')).resolves.toBeNull(); + expect(configService.readFile).not.toHaveBeenCalled(); + }); + + it('resolves a source by its in-file name when the filename differs', async () => { + const configService = { + listFiles: vi.fn().mockResolvedValue({ files: ['semantic-layer/warehouse/renamed.yaml'] }), + readFile: vi.fn().mockResolvedValue({ content: 'name: SIGNED_UP\nmeasures: []\n' }), + }; + const service = new SemanticLayerService(configService as never, connectionCatalog(), pythonPort); + + await expect(service.loadSource('warehouse', 'SIGNED_UP')).resolves.toEqual({ + name: 'SIGNED_UP', + measures: [], + }); + }); }); describe('composeOverlay', () => { @@ -1242,3 +1271,177 @@ describe('findDanglingSegmentRefs', () => { expect(findDanglingSegmentRefs({})).toEqual([]); }); }); + +describe('writeSource / deleteSource file naming', () => { + let tempDir: string; + let project: KtxLocalProject; + let service: SemanticLayerService; + + const author = 'T U'; + const authorEmail = 't@u.com'; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-sl-service-files-')); + project = await initKtxProject({ projectDir: join(tempDir, 'project') }); + service = new SemanticLayerService(project.fileStore as never, connectionCatalog() as never, pythonPort as never); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + const signedUp: SemanticLayerSource = { + name: 'SIGNED_UP', + table: 'PUBLIC.SIGNED_UP', + grain: ['ID'], + columns: [{ name: 'ID', type: 'number' }], + joins: [], + measures: [], + }; + + it('writes a new uppercase source at a derived filename and reads it back by name', async () => { + const result = await service.writeSource('warehouse', signedUp, author, authorEmail); + + expect(result.path).toMatch(/^semantic-layer\/warehouse\/signed_up-[0-9a-f]{8}\.yaml$/); + + const file = await service.readSourceFile('warehouse', 'SIGNED_UP'); + expect(file?.path).toBe(result.path); + expect(file?.content).toContain('name: SIGNED_UP'); + + // Rewriting lands on the same file instead of deriving a second one. + const rewrite = await service.writeSource('warehouse', signedUp, author, authorEmail); + expect(rewrite.path).toBe(result.path); + }); + + it('repairs a broken file occupying the derived path instead of refusing the write', async () => { + const written = await service.writeSource('warehouse', signedUp, author, authorEmail); + await project.fileStore.writeFile( + written.path, + 'name: SIGNED_UP\nmeasures: [unterminated\n', + author, + authorEmail, + 'break the file', + ); + + const repaired = await service.writeSource('warehouse', signedUp, author, authorEmail); + + expect(repaired.path).toBe(written.path); + const file = await service.readSourceFile('warehouse', 'SIGNED_UP'); + expect(file?.path).toBe(written.path); + expect(file?.content).toContain('name: SIGNED_UP'); + }); + + it('rewrites a human-renamed file in place', async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/custom.yaml', + 'name: orders\nmeasures: []\n', + author, + authorEmail, + 'seed renamed file', + ); + + const result = await service.writeSource( + 'warehouse', + { name: 'orders', grain: [], columns: [], joins: [], measures: [] }, + author, + authorEmail, + ); + + expect(result.path).toBe('semantic-layer/warehouse/custom.yaml'); + const listed = await project.fileStore.listFiles('semantic-layer/warehouse'); + expect(listed.files).toEqual(['semantic-layer/warehouse/custom.yaml']); + }); + + it('repairs a human-renamed broken file in place instead of deriving a second one', async () => { + // Renamed (filename ≠ name) AND mid-edit broken: identity must survive the + // syntax error so the rewrite lands on the original file rather than creating + // a duplicate at the derived path that later trips the by-name resolver. + await project.fileStore.writeFile( + 'semantic-layer/warehouse/custom.yaml', + 'name: SIGNED_UP\nmeasures: [unterminated\n', + author, + authorEmail, + 'seed broken renamed file', + ); + + const repaired = await service.writeSource('warehouse', signedUp, author, authorEmail); + + expect(repaired.path).toBe('semantic-layer/warehouse/custom.yaml'); + const listed = await project.fileStore.listFiles('semantic-layer/warehouse'); + expect(listed.files).toEqual(['semantic-layer/warehouse/custom.yaml']); + const file = await service.readSourceFile('warehouse', 'SIGNED_UP'); + expect(file?.path).toBe('semantic-layer/warehouse/custom.yaml'); + expect(file?.content).toContain('name: SIGNED_UP'); + }); + + it('keeps a .yml-renamed file visible to the loader and the by-name resolver alike', async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/custom.yml', + 'name: orders\ntable: public.orders\ngrain: [id]\ncolumns:\n - name: id\n type: number\nmeasures: []\n', + author, + authorEmail, + 'seed .yml file', + ); + + const { sources, loadErrors } = await service.loadAllSources('warehouse'); + expect(loadErrors).toEqual([]); + expect(sources.map((source) => source.name)).toEqual(['orders']); + + const file = await service.readSourceFile('warehouse', 'orders'); + expect(file?.path).toBe('semantic-layer/warehouse/custom.yml'); + }); + + it('refuses to clobber a derived path occupied by a different source', async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/orders.yaml', + 'name: other_source\nmeasures: []\n', + author, + authorEmail, + 'seed conflicting file', + ); + + await expect( + service.writeSource( + 'warehouse', + { name: 'orders', grain: [], columns: [], joins: [], measures: [] }, + author, + authorEmail, + ), + ).rejects.toThrow("already defines source 'other_source'"); + }); + + it('deletes the file resolved by name, wherever it lives', async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/custom.yaml', + 'name: orders\nmeasures: []\n', + author, + authorEmail, + 'seed renamed file', + ); + + await service.deleteSource('warehouse', 'orders', author, authorEmail); + + const listed = await project.fileStore.listFiles('semantic-layer/warehouse'); + expect(listed.files).toEqual([]); + }); + + it('explains manifest-backed deletes instead of silently succeeding', async () => { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/_schema/public.yaml', + 'tables:\n payments:\n table: public.payments\n columns:\n - name: id\n type: number\n', + author, + authorEmail, + 'seed manifest shard', + ); + + await expect(service.deleteSource('warehouse', 'payments', author, authorEmail)).rejects.toThrow( + /scan manifest/, + ); + }); + + it('throws a plain not-found error for unknown sources', async () => { + await expect(service.deleteSource('warehouse', 'missing', author, authorEmail)).rejects.toThrow( + 'Semantic-layer source not found: warehouse/missing', + ); + }); +}); diff --git a/packages/cli/test/context/sl/sl-source-seeding.test-utils.ts b/packages/cli/test/context/sl/sl-source-seeding.test-utils.ts new file mode 100644 index 00000000..edd23719 --- /dev/null +++ b/packages/cli/test/context/sl/sl-source-seeding.test-utils.ts @@ -0,0 +1,22 @@ +import type { KtxFileWriteResult } from '../../../src/context/core/file-store.js'; +import type { KtxLocalProject } from '../../../src/context/project/project.js'; +import { slSourceFilePath } from '../../../src/context/sl/source-files.js'; + +/** + * Seed a standalone/overlay semantic-layer file at the writer-derived path, + * bypassing tool-level validation. Production writes go through + * `SemanticLayerService.writeSource`; tests that only need a file on disk use + * this instead. + */ +export async function seedSlSourceFile( + project: KtxLocalProject, + input: { connectionId: string; sourceName: string; yaml: string }, +): Promise { + return project.fileStore.writeFile( + slSourceFilePath(input.connectionId, input.sourceName), + input.yaml, + 'ktx', + 'ktx@example.com', + `Seed semantic-layer source: ${input.connectionId}/${input.sourceName}`, + ); +} diff --git a/packages/cli/test/context/sl/source-files.test.ts b/packages/cli/test/context/sl/source-files.test.ts new file mode 100644 index 00000000..f8026d14 --- /dev/null +++ b/packages/cli/test/context/sl/source-files.test.ts @@ -0,0 +1,154 @@ +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { initKtxProject, type KtxLocalProject } from '../../../src/context/project/project.js'; +import { + resolveSlSourceFile, + slSourceFileName, + slSourceFilePath, + slSourceNameForFile, +} from '../../../src/context/sl/source-files.js'; + +describe('slSourceFileName', () => { + it('keeps safe lowercase snake_case names verbatim', () => { + expect(slSourceFileName('orders')).toBe('orders.yaml'); + expect(slSourceFileName('mart_account_segments')).toBe('mart_account_segments.yaml'); + expect(slSourceFileName('orders2')).toBe('orders2.yaml'); + }); + + it('derives a slug-hash filename for any other name and never throws', () => { + expect(slSourceFileName('SIGNED_UP')).toMatch(/^signed_up-[0-9a-f]{8}\.yaml$/); + expect(slSourceFileName('EVENT$LOG')).toMatch(/^event_log-[0-9a-f]{8}\.yaml$/); + expect(slSourceFileName('my.dotted.name')).toMatch(/^my_dotted_name-[0-9a-f]{8}\.yaml$/); + expect(slSourceFileName('汉字')).toMatch(/^src-[0-9a-f]{8}\.yaml$/); + expect(slSourceFileName(' ')).toMatch(/^src-[0-9a-f]{8}\.yaml$/); + }); + + it('is deterministic', () => { + expect(slSourceFileName('EVENT$LOG')).toBe(slSourceFileName('EVENT$LOG')); + }); + + it('never emits path separators or traversal segments', () => { + for (const name of ['../orders', 'a/b', 'a\\b', '..', './x']) { + const fileName = slSourceFileName(name); + expect(fileName).not.toContain('/'); + expect(fileName).not.toContain('\\'); + expect(fileName).not.toContain('..'); + } + }); + + it('keeps case-differing names disjoint on case-insensitive filesystems', () => { + // Safe-branch filenames contain no `-`; hash-branch filenames always end + // in `-<8 hex>` with a hash of the raw name, so `events` vs `EVENTS` + // cannot collide even when the filesystem folds case (macOS, Windows). + const lower = slSourceFileName('events'); + const upper = slSourceFileName('EVENTS'); + expect(lower).toBe('events.yaml'); + expect(upper).toMatch(/^events-[0-9a-f]{8}\.yaml$/); + expect(upper.toLowerCase()).not.toBe(lower.toLowerCase()); + expect(lower).not.toContain('-'); + }); + + it('routes Windows reserved device basenames through the hash branch', () => { + expect(slSourceFileName('con')).toMatch(/^con-[0-9a-f]{8}\.yaml$/); + expect(slSourceFileName('lpt1')).toMatch(/^lpt1-[0-9a-f]{8}\.yaml$/); + }); + + it('caps overlong names', () => { + const longName = `a${'b'.repeat(300)}`; + const fileName = slSourceFileName(longName); + expect(fileName.length).toBeLessThanOrEqual(64 + '-12345678.yaml'.length); + expect(fileName).toMatch(/^ab+-[0-9a-f]{8}\.yaml$/); + }); +}); + +describe('slSourceFilePath', () => { + it('rejects unsafe connection ids but accepts any source name', () => { + expect(slSourceFilePath('warehouse', 'EVENT$LOG')).toMatch( + /^semantic-layer\/warehouse\/event_log-[0-9a-f]{8}\.yaml$/, + ); + expect(() => slSourceFilePath('../warehouse', 'orders')).toThrow('Unsafe connection id'); + }); +}); + +describe('slSourceNameForFile', () => { + it('prefers the in-file name and falls back to the filename', () => { + expect(slSourceNameForFile('semantic-layer/warehouse/custom.yaml', 'name: SIGNED_UP\n')).toBe('SIGNED_UP'); + expect(slSourceNameForFile('semantic-layer/warehouse/orders.yaml', 'measures: []\n')).toBe('orders'); + expect(slSourceNameForFile('semantic-layer/warehouse/orders.yaml', 'measures: [unterminated\n')).toBe('orders'); + }); + + it('recovers the declared name when the file is broken below the name: line', () => { + // A human-renamed file left mid-edit keeps its identity: the syntax error is + // under `measures:`, so the top-level `name:` is still recoverable and must + // win over the (unrelated) filename. + expect(slSourceNameForFile('semantic-layer/warehouse/renamed-by-hand.yaml', 'name: SIGNED_UP\nmeasures: [oops\n')).toBe( + 'SIGNED_UP', + ); + }); +}); + +describe('resolveSlSourceFile', () => { + let tempDir: string; + let project: KtxLocalProject; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-sl-source-files-')); + project = await initKtxProject({ projectDir: join(tempDir, 'project') }); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + async function seed(path: string, content: string): Promise { + await project.fileStore.writeFile(path, content, 'ktx', 'ktx@example.com', `seed ${path}`); + } + + it('matches by in-file name regardless of the filename', async () => { + await seed('semantic-layer/warehouse/renamed-by-hand.yaml', 'name: SIGNED_UP\nmeasures: []\n'); + + await expect(resolveSlSourceFile(project.fileStore, 'warehouse', 'SIGNED_UP')).resolves.toEqual({ + path: 'semantic-layer/warehouse/renamed-by-hand.yaml', + content: 'name: SIGNED_UP\nmeasures: []\n', + }); + }); + + it('returns null when no file declares the name and ignores manifest shards', async () => { + await seed('semantic-layer/warehouse/_schema/public.yaml', 'tables:\n orders:\n table: public.orders\n'); + + await expect(resolveSlSourceFile(project.fileStore, 'warehouse', 'orders')).resolves.toBeNull(); + }); + + it('falls back to the filename for broken YAML', async () => { + const broken = 'name: orders\nmeasures: [unterminated\n'; + await seed('semantic-layer/warehouse/orders.yaml', broken); + + await expect(resolveSlSourceFile(project.fileStore, 'warehouse', 'orders')).resolves.toEqual({ + path: 'semantic-layer/warehouse/orders.yaml', + content: broken, + }); + }); + + it('matches a human-renamed broken file by its still-recoverable name', async () => { + // Filename ≠ name, so the filename fallback cannot find it; resolution must + // come from the intact top-level `name:` even though the YAML is broken. + const broken = 'name: SIGNED_UP\nmeasures: [unterminated\n'; + await seed('semantic-layer/warehouse/renamed-by-hand.yaml', broken); + + await expect(resolveSlSourceFile(project.fileStore, 'warehouse', 'SIGNED_UP')).resolves.toEqual({ + path: 'semantic-layer/warehouse/renamed-by-hand.yaml', + content: broken, + }); + }); + + it('throws when two files declare the same source name', async () => { + await seed('semantic-layer/warehouse/orders.yaml', 'name: orders\nmeasures: []\n'); + await seed('semantic-layer/warehouse/orders_copy.yaml', 'name: orders\nmeasures: []\n'); + + await expect(resolveSlSourceFile(project.fileStore, 'warehouse', 'orders')).rejects.toThrow( + 'Multiple semantic-layer files declare source "orders"', + ); + }); +}); diff --git a/packages/cli/test/context/sl/tools/sl-edit-source.tool.test.ts b/packages/cli/test/context/sl/tools/sl-edit-source.tool.test.ts index fee83ea6..2c0beb8e 100644 --- a/packages/cli/test/context/sl/tools/sl-edit-source.tool.test.ts +++ b/packages/cli/test/context/sl/tools/sl-edit-source.tool.test.ts @@ -188,7 +188,7 @@ describe('SlEditSourceTool — manifest-backed source without overlay', () => { it('returns a directed hint pointing at sl_write_source + overlay shape', async () => { const { tool, semanticLayerService } = makeTool({ semanticLayerService: { - readSourceFile: vi.fn().mockRejectedValue(new Error('ENOENT')), + readSourceFile: vi.fn().mockResolvedValue(null), isManifestBacked: vi.fn().mockResolvedValue(true), }, }); @@ -222,7 +222,7 @@ describe('SlEditSourceTool — manifest-backed source without overlay', () => { it('still returns the plain "Source not found" error for truly-missing names', async () => { const { tool, semanticLayerService } = makeTool({ semanticLayerService: { - readSourceFile: vi.fn().mockRejectedValue(new Error('ENOENT')), + readSourceFile: vi.fn().mockResolvedValue(null), isManifestBacked: vi.fn().mockResolvedValue(false), }, }); @@ -241,3 +241,20 @@ describe('SlEditSourceTool — manifest-backed source without overlay', () => { expect(semanticLayerService.writeSource).not.toHaveBeenCalled(); }); }); + +describe('SlEditSourceTool — name edits', () => { + it('rejects edits that change the in-file name', async () => { + const { tool, semanticLayerService } = makeTool(); + const result = await tool.call( + { + connectionId: '11111111-1111-1111-1111-111111111111', + sourceName: 'orders', + yaml_edits: [{ oldText: 'name: orders', newText: 'name: renamed_orders' }], + } as any, + baseContext, + ); + expect(result.structured.success).toBe(false); + expect(result.markdown).toMatch(/renaming is not supported/i); + expect(semanticLayerService.writeSource).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/cli/test/context/sl/tools/sl-rollback.tool.test.ts b/packages/cli/test/context/sl/tools/sl-rollback.tool.test.ts index 6d2d787a..c91a1d96 100644 --- a/packages/cli/test/context/sl/tools/sl-rollback.tool.test.ts +++ b/packages/cli/test/context/sl/tools/sl-rollback.tool.test.ts @@ -16,8 +16,15 @@ function makeSession(overrides: Partial = {}): ToolSession { configService: { writeFile: vi.fn().mockResolvedValue(undefined), deleteFile: vi.fn().mockResolvedValue(undefined), + // No live file for `orders` — revert recovers the preHead path from history. + listFiles: vi.fn().mockResolvedValue({ files: [] }), + readFile: vi.fn().mockRejectedValue(new Error('ENOENT')), + } as any, + gitService: { + // The source lived at its derived filename at preHead. + listFilesAtCommit: vi.fn().mockResolvedValue(['semantic-layer/conn-1/orders.yaml']), + getFileAtCommit: vi.fn().mockResolvedValue('name: orders\nmeasures: []\n'), } as any, - gitService: { getFileAtCommit: vi.fn().mockResolvedValue('pre: content') } as any, ...overrides, }; } @@ -65,4 +72,33 @@ describe('SlRollbackTool', () => { expect(hasTouchedSlSource(session.touchedSlSources, 'conn-1', 'orders')).toBe(false); expect(session.actions).toEqual([]); }); + + it('restores a deleted human-renamed source at the path it occupied at preHead', async () => { + // The source lived at a custom filename (≠ the writer-derived `orders.yaml`) + // and the session deleted it. Revert must recover the custom path from the + // preHead commit and restore there, not write/no-op against the derived path. + const slSourcesRepository = { deleteByConnectionAndName: vi.fn().mockResolvedValue(undefined) }; + const tool = new SlRollbackTool(slSourcesRepository as never, connections as never, 1); + const renamedContent = 'name: orders\ntable: public.orders\nmeasures: []\n'; + const session = makeSession({ + gitService: { + listFilesAtCommit: vi.fn().mockResolvedValue(['semantic-layer/conn-1/custom.yaml']), + getFileAtCommit: vi.fn().mockResolvedValue(renamedContent), + } as any, + }); + const context: ToolContext = { sourceId: 's', messageId: 'm', userId: 'u', session }; + + const result = await tool.call({ sourceName: 'orders' } as any, context); + + expect(result.structured.success).toBe(true); + expect((session.configService as any).writeFile).toHaveBeenCalledWith( + 'semantic-layer/conn-1/custom.yaml', + renamedContent, + expect.anything(), + expect.anything(), + expect.anything(), + expect.anything(), + ); + expect((session.configService as any).deleteFile).not.toHaveBeenCalled(); + }); }); diff --git a/packages/cli/test/context/sl/tools/sl-write-source.tool.test.ts b/packages/cli/test/context/sl/tools/sl-write-source.tool.test.ts index ab3ee308..d74d6b1e 100644 --- a/packages/cli/test/context/sl/tools/sl-write-source.tool.test.ts +++ b/packages/cli/test/context/sl/tools/sl-write-source.tool.test.ts @@ -13,7 +13,7 @@ function makeTool(overrides: Partial> = {}) { validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }), writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }), deleteSource: vi.fn().mockResolvedValue(undefined), - readSourceFile: vi.fn().mockRejectedValue(new Error('not found')), + readSourceFile: vi.fn().mockResolvedValue(null), ...overrides.semanticLayerService, }; const slSearchService = { @@ -66,7 +66,7 @@ describe('SlWriteSourceTool — session gating', () => { deleteSource: vi.fn().mockResolvedValue(undefined), listManifestSourceNames: vi.fn().mockResolvedValue([]), isManifestBacked: vi.fn().mockResolvedValue(false), - readSourceFile: vi.fn().mockRejectedValue(new Error('not found')), + readSourceFile: vi.fn().mockResolvedValue(null), findManifestEntryByTableRef: vi.fn().mockResolvedValue(null), } as any, wikiService: {} as any, @@ -248,7 +248,7 @@ describe('SlWriteSourceTool — session gating', () => { deleteSource: vi.fn().mockResolvedValue(undefined), listManifestSourceNames: vi.fn().mockResolvedValue(['mart_account_segments']), isManifestBacked: vi.fn().mockResolvedValue(false), - readSourceFile: vi.fn().mockRejectedValue(new Error('not found')), + readSourceFile: vi.fn().mockResolvedValue(null), findManifestEntryByTableRef: vi.fn().mockResolvedValue(null), } as any, }); @@ -377,3 +377,36 @@ describe('SlWriteSourceTool — standalone shadow guard', () => { expect(result.markdown).toMatch(/shadows an existing manifest entry|already exists/i); }); }); + +describe('SlWriteSourceTool — source name identity', () => { + it('accepts verbatim warehouse identifiers as sourceName', () => { + const { tool } = makeTool(); + const base = { connectionId: '11111111-1111-1111-1111-111111111111' }; + expect(tool.inputSchema.safeParse({ ...base, sourceName: 'SIGNED_UP' }).success).toBe(true); + expect(tool.inputSchema.safeParse({ ...base, sourceName: 'EVENT$LOG' }).success).toBe(true); + expect(tool.inputSchema.safeParse({ ...base, sourceName: 'orders' }).success).toBe(true); + expect(tool.inputSchema.safeParse({ ...base, sourceName: '' }).success).toBe(false); + }); + + it('rejects a source whose name does not match sourceName', async () => { + const { tool, semanticLayerService } = makeTool(); + const result = await tool.call( + { + connectionId: '11111111-1111-1111-1111-111111111111', + sourceName: 'orders', + source: { + name: 'other_orders', + sql: 'select 1 as id', + grain: ['id'], + columns: [{ name: 'id', type: 'string' }], + measures: [], + joins: [], + } as any, + } as any, + baseContext, + ); + expect(result.structured.success).toBe(false); + expect(result.markdown).toMatch(/does not match sourceName/); + expect(semanticLayerService.writeSource).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/cli/test/context/sql-analysis/dialect.test.ts b/packages/cli/test/context/sql-analysis/dialect.test.ts new file mode 100644 index 00000000..68a92af5 --- /dev/null +++ b/packages/cli/test/context/sql-analysis/dialect.test.ts @@ -0,0 +1,29 @@ +import { describe, expect, it } from 'vitest'; +import { sqlAnalysisDialectForDriver } from '../../../src/context/sql-analysis/dialect.js'; + +describe('sqlAnalysisDialectForDriver', () => { + it('maps ktx.yaml driver names to sqlglot dialects', () => { + expect(sqlAnalysisDialectForDriver('postgres')).toBe('postgres'); + expect(sqlAnalysisDialectForDriver('bigquery')).toBe('bigquery'); + expect(sqlAnalysisDialectForDriver('snowflake')).toBe('snowflake'); + expect(sqlAnalysisDialectForDriver('mysql')).toBe('mysql'); + expect(sqlAnalysisDialectForDriver('sqlserver')).toBe('tsql'); + expect(sqlAnalysisDialectForDriver('sqlite')).toBe('sqlite'); + expect(sqlAnalysisDialectForDriver('duckdb')).toBe('duckdb'); + expect(sqlAnalysisDialectForDriver('clickhouse')).toBe('clickhouse'); + expect(sqlAnalysisDialectForDriver('databricks')).toBe('databricks'); + }); + + it('maps local connection-type spellings to sqlglot dialects', () => { + expect(sqlAnalysisDialectForDriver('POSTGRESQL')).toBe('postgres'); + expect(sqlAnalysisDialectForDriver('SQLSERVER')).toBe('tsql'); + expect(sqlAnalysisDialectForDriver('BIGQUERY')).toBe('bigquery'); + expect(sqlAnalysisDialectForDriver('SQLITE')).toBe('sqlite'); + }); + + it('defaults to postgres for unknown or missing drivers', () => { + expect(sqlAnalysisDialectForDriver(undefined)).toBe('postgres'); + expect(sqlAnalysisDialectForDriver('')).toBe('postgres'); + expect(sqlAnalysisDialectForDriver('unknown')).toBe('postgres'); + }); +});