fix: read semantic sources safely

This commit is contained in:
Andrey Avtomonov 2026-06-10 01:06:52 +02:00
parent 65de75ebd7
commit de1f1a8d5e
8 changed files with 292 additions and 66 deletions

View file

@ -97,30 +97,6 @@ function sqlitePathFromUrl(url: string): string {
return url; return url;
} }
function stripLeadingSqlComments(sql: string): string {
let index = 0;
while (index < sql.length) {
while (/\s/.test(sql[index] ?? '')) {
index += 1;
}
if (sql.startsWith('--', index)) {
const end = sql.indexOf('\n', index + 2);
index = end === -1 ? sql.length : end + 1;
continue;
}
if (sql.startsWith('/*', index)) {
const end = sql.indexOf('*/', index + 2);
if (end === -1) {
return sql.slice(index);
}
index = end + 2;
continue;
}
break;
}
return sql.slice(index);
}
export function isKtxSqliteConnectionConfig( export function isKtxSqliteConnectionConfig(
connection: KtxSqliteConnectionConfig | undefined, connection: KtxSqliteConnectionConfig | undefined,
): connection is KtxSqliteConnectionConfig { ): connection is KtxSqliteConnectionConfig {
@ -255,7 +231,7 @@ export class KtxSqliteScanConnector implements KtxScanConnector {
async executeReadOnly(input: KtxSqliteReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> { async executeReadOnly(input: KtxSqliteReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
this.assertConnection(input.connectionId); this.assertConnection(input.connectionId);
const result = this.query(limitSqlForExecution(stripLeadingSqlComments(input.sql), input.maxRows), input.params); const result = this.query(limitSqlForExecution(input.sql, input.maxRows), input.params);
return { ...result, rowCount: result.rows.length }; return { ...result, rowCount: result.rows.length };
} }

View file

@ -2,8 +2,36 @@ const MUTATING_SQL =
/^\s*(insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh)\b/i; /^\s*(insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh)\b/i;
const READ_SQL = /^\s*(select|with)\b/i; const READ_SQL = /^\s*(select|with)\b/i;
// Agents (and the daemon's sqlglot validator, which ignores comments) routinely
// emit read-only queries prefixed with `-- ...` or `/* ... */`. Strip leading
// comments so the prefix check sees the real statement; otherwise valid SELECT/WITH
// SQL is rejected here while the parser-backed validator accepts it.
function stripLeadingSqlComments(sql: string): string {
let index = 0;
while (index < sql.length) {
while (/\s/.test(sql[index] ?? '')) {
index += 1;
}
if (sql.startsWith('--', index)) {
const end = sql.indexOf('\n', index + 2);
index = end === -1 ? sql.length : end + 1;
continue;
}
if (sql.startsWith('/*', index)) {
const end = sql.indexOf('*/', index + 2);
if (end === -1) {
return sql.slice(index);
}
index = end + 2;
continue;
}
break;
}
return sql.slice(index);
}
export function assertReadOnlySql(sql: string): string { export function assertReadOnlySql(sql: string): string {
const trimmed = sql.trim(); const trimmed = stripLeadingSqlComments(sql).trim();
if (!READ_SQL.test(trimmed) || MUTATING_SQL.test(trimmed)) { if (!READ_SQL.test(trimmed) || MUTATING_SQL.test(trimmed)) {
throw new Error('Only read-only SELECT/WITH queries can be executed locally.'); throw new Error('Only read-only SELECT/WITH queries can be executed locally.');
} }

View file

@ -10,6 +10,7 @@ import { createKtxDiscoverDataService } from '../../context/search/discover.js';
import type { SqlAnalysisDialect, SqlAnalysisPort } from '../../context/sql-analysis/ports.js'; import type { SqlAnalysisDialect, SqlAnalysisPort } from '../../context/sql-analysis/ports.js';
import { compileLocalSlQuery } from '../../context/sl/local-query.js'; import { compileLocalSlQuery } from '../../context/sl/local-query.js';
import { createKtxDictionarySearchService } from '../../context/sl/dictionary-search.js'; import { createKtxDictionarySearchService } from '../../context/sl/dictionary-search.js';
import { readLocalSlSource } from '../../context/sl/local-sl.js';
import { readLocalKnowledgePage, searchLocalKnowledgePages } from '../wiki/local-knowledge.js'; import { readLocalKnowledgePage, searchLocalKnowledgePages } from '../wiki/local-knowledge.js';
import type { KtxMcpContextPorts, KtxMcpProgressCallback, KtxSqlExecutionResponse } from './types.js'; import type { KtxMcpContextPorts, KtxMcpProgressCallback, KtxSqlExecutionResponse } from './types.js';
@ -62,23 +63,12 @@ function assertSafeConnectionId(connectionId: string): string {
return assertSafePathToken('connection id', connectionId); return assertSafePathToken('connection id', connectionId);
} }
function assertSafeSourceName(sourceName: string): string {
if (!/^[a-z0-9][a-z0-9_]*$/.test(sourceName)) {
throw new Error(`Unsafe semantic-layer source name: ${sourceName}`);
}
return assertSafePathToken('semantic-layer source name', sourceName);
}
async function cleanupConnector(connector: KtxScanConnector | null): Promise<void> { async function cleanupConnector(connector: KtxScanConnector | null): Promise<void> {
if (connector?.cleanup) { if (connector?.cleanup) {
await connector.cleanup(); await connector.cleanup();
} }
} }
function slPath(connectionId: string, sourceName: string): string {
return `semantic-layer/${assertSafeConnectionId(connectionId)}/${assertSafeSourceName(sourceName)}.yaml`;
}
async function executeValidatedReadOnlySql( async function executeValidatedReadOnlySql(
project: KtxLocalProject, project: KtxLocalProject,
options: CreateLocalProjectMcpContextPortsOptions, options: CreateLocalProjectMcpContextPortsOptions,
@ -188,13 +178,11 @@ export function createLocalProjectMcpContextPorts(
}, },
semanticLayer: { semanticLayer: {
async readSource(input) { async readSource(input) {
const path = slPath(input.connectionId, input.sourceName); const source = await readLocalSlSource(project, {
try { connectionId: input.connectionId,
const result = await project.fileStore.readFile(path); sourceName: input.sourceName,
return { sourceName: input.sourceName, yaml: result.content }; });
} catch { return source ? { sourceName: source.name, yaml: source.yaml } : null;
return null;
}
}, },
async query(input, executionOptions) { async query(input, executionOptions) {
if (!options.semanticLayerCompute) { if (!options.semanticLayerCompute) {

View file

@ -97,8 +97,17 @@ function isSafeConnectionId(connectionId: string | undefined): connectionId is s
return typeof connectionId === 'string' && /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId); return typeof connectionId === 'string' && /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId);
} }
// Standalone files live at a path derived from the source name, so only
// conservative filename-safe names may reach `slPath`. Source names themselves
// mirror the warehouse identifier verbatim (Snowflake's uppercase `SIGNED_UP`
// or `EVENT$LOG`); manifest-backed sources are matched by name in memory and
// must never be gated on this filename rule.
function isSafeSourceName(sourceName: string): boolean {
return /^[A-Za-z0-9][A-Za-z0-9_]*$/.test(sourceName);
}
function assertSafeSourceName(sourceName: string): string { function assertSafeSourceName(sourceName: string): string {
if (!/^[a-z0-9][a-z0-9_]*$/.test(sourceName)) { if (!isSafeSourceName(sourceName)) {
throw new Error(`Unsafe semantic-layer source name: ${sourceName}`); throw new Error(`Unsafe semantic-layer source name: ${sourceName}`);
} }
return assertSafePathToken('semantic-layer source name', sourceName); return assertSafePathToken('semantic-layer source name', sourceName);
@ -220,7 +229,12 @@ export async function loadLocalSlSourceRecords(
for (const path of paths.filter((file) => file.startsWith(`${schemaDir}/`))) { for (const path of paths.filter((file) => file.startsWith(`${schemaDir}/`))) {
const raw = await project.fileStore.readFile(path); const raw = await project.fileStore.readFile(path);
const tables = manifestTables(parseYamlRecord(raw.content)); let tables: Record<string, ManifestTableEntry> | null;
try {
tables = manifestTables(parseYamlRecord(raw.content));
} catch (error) {
throw new Error(`${path}: ${error instanceof Error ? error.message : String(error)}`);
}
if (!tables) { if (!tables) {
continue; continue;
} }
@ -237,7 +251,26 @@ export async function loadLocalSlSourceRecords(
for (const path of paths.filter((file) => !file.startsWith(`${schemaDir}/`))) { for (const path of paths.filter((file) => !file.startsWith(`${schemaDir}/`))) {
const raw = await project.fileStore.readFile(path); const raw = await project.fileStore.readFile(path);
const parsed = parseYamlRecord(raw.content); let parsed: Record<string, unknown>;
try {
parsed = parseYamlRecord(raw.content);
} catch {
// A source mid-edit (e.g. an agent saved half-written YAML) must not take
// down reads, listings, or search for its siblings. Keep the file visible
// under its filename and surface the raw content for repair.
const brokenName = sourceNameFromPath(path);
sources.set(brokenName, {
connectionId,
name: brokenName,
path,
columnCount: 0,
measureCount: 0,
joinCount: 0,
yaml: raw.content,
source: { name: brokenName, grain: [], columns: [], joins: [], measures: [] },
});
continue;
}
const name = typeof parsed.name === 'string' && parsed.name.length > 0 ? parsed.name : sourceNameFromPath(path); const name = typeof parsed.name === 'string' && parsed.name.length > 0 ? parsed.name : sourceNameFromPath(path);
if (parsed.table || parsed.sql) { if (parsed.table || parsed.sql) {
const source = parsedStandaloneSource(parsed, name); const source = parsedStandaloneSource(parsed, name);
@ -317,25 +350,35 @@ export async function writeLocalSlSource(
); );
} }
/** @internal */
export async function readLocalSlSource( export async function readLocalSlSource(
project: KtxLocalProject, project: KtxLocalProject,
input: { connectionId: string; sourceName: string }, input: { connectionId: string; sourceName: string },
): Promise<LocalSlSource | null> { ): Promise<LocalSlSource | null> {
const path = slPath(input.connectionId, input.sourceName); // Only filename-safe names can have a standalone file; manifest-backed
try { // sources mirror raw warehouse identifiers (e.g. `EVENT$LOG`) and are found
const result = await project.fileStore.readFile(path); // by the record lookup below, which never derives a path from the name.
return { if (isSafeSourceName(input.sourceName)) {
...summarizeSource({ connectionId: input.connectionId, path, raw: result.content }), const path = slPath(input.connectionId, input.sourceName);
yaml: result.content, try {
}; const result = await project.fileStore.readFile(path);
} catch { return {
const records = await loadLocalSlSourceRecords(project, { ...summarizeSource({ connectionId: input.connectionId, path, raw: result.content }),
connectionId: input.connectionId, yaml: result.content,
}); };
const record = records.find((source) => source.name === input.sourceName); } catch {
return record ? { ...record } : null; // Missing or mid-edit standalone file — the record loader covers both:
// it skips absent files and represents a file whose YAML no longer
// parses verbatim, so readers — `ktx sl read`, `ktx sl validate`, and
// the `sl_read_source` MCP tool — can surface the broken content for
// repair instead of failing on it.
}
} }
const records = await loadLocalSlSourceRecords(project, {
connectionId: input.connectionId,
});
const record = records.find((source) => source.name === input.sourceName);
return record ? { ...record } : null;
} }
export async function resolveLocalSlSource( export async function resolveLocalSlSource(

View file

@ -15,6 +15,19 @@ describe('assertReadOnlySql', () => {
'Only read-only SELECT/WITH queries can be executed locally', 'Only read-only SELECT/WITH queries can be executed locally',
); );
}); });
it('accepts read-only queries that begin with leading comments', () => {
expect(assertReadOnlySql('-- signups per day\nselect count(*) from public.signed_up')).toBe(
'select count(*) from public.signed_up',
);
expect(assertReadOnlySql('/* block */\n with paid as (select 1) select * from paid')).toContain('with paid');
});
it('still rejects mutating statements hidden behind leading comments', () => {
expect(() => assertReadOnlySql('-- harmless\n delete from orders')).toThrow(
'Only read-only SELECT/WITH queries can be executed locally',
);
});
}); });
describe('limitSqlForExecution', () => { describe('limitSqlForExecution', () => {
@ -27,4 +40,10 @@ describe('limitSqlForExecution', () => {
it('returns the trimmed SQL when no maxRows value is provided', () => { it('returns the trimmed SQL when no maxRows value is provided', () => {
expect(limitSqlForExecution('select * from orders; ', undefined)).toBe('select * from orders'); expect(limitSqlForExecution('select * from orders; ', undefined)).toBe('select * from orders');
}); });
it('strips leading comments before wrapping with a row limit', () => {
expect(limitSqlForExecution('-- top customers\nselect * from public.orders', 25)).toBe(
'select * from (select * from public.orders) as ktx_query_result limit 25',
);
});
}); });

View file

@ -690,7 +690,54 @@ describe('createLocalProjectMcpContextPorts', () => {
}); });
}); });
it('rejects path traversal keys before touching the project directory', async () => { it('reads manifest-backed sources with uppercase warehouse identifiers', async () => {
const project = await initKtxProject({ projectDir: tempDir });
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/PUBLIC.yaml',
[
'tables:',
' SIGNED_UP:',
' table: PUBLIC.SIGNED_UP',
' columns:',
' - name: ID',
' type: number',
' pk: true',
'',
].join('\n'),
'ktx',
'ktx@example.com',
'seed uppercase manifest shard',
);
const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null });
await expect(
ports.semanticLayer?.readSource({ connectionId: 'warehouse', sourceName: 'SIGNED_UP' }),
).resolves.toMatchObject({
sourceName: 'SIGNED_UP',
yaml: expect.stringContaining('table: PUBLIC.SIGNED_UP'),
});
});
it('returns a standalone source verbatim even when its YAML is currently broken', async () => {
const project = await initKtxProject({ projectDir: tempDir });
await project.fileStore.writeFile(
'semantic-layer/warehouse/orders.yaml',
'name: orders\nmeasures:\n - name: revenue\n expr: [unterminated\n',
'ktx',
'ktx@example.com',
'seed broken source mid-edit',
);
const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null });
await expect(
ports.semanticLayer?.readSource({ connectionId: 'warehouse', sourceName: 'orders' }),
).resolves.toMatchObject({
sourceName: 'orders',
yaml: expect.stringContaining('[unterminated'),
});
});
it('keeps path-traversal keys away from the project directory', async () => {
const project = await initKtxProject({ projectDir: tempDir }); const project = await initKtxProject({ projectDir: tempDir });
const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null }); const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null });
@ -701,12 +748,14 @@ describe('createLocalProjectMcpContextPorts', () => {
}), }),
).rejects.toThrow('Invalid wiki key "../outside". Wiki keys must be flat; use "outside".'); ).rejects.toThrow('Invalid wiki key "../outside". Wiki keys must be flat; use "outside".');
// Source reads never derive a file path from the name; a traversal-style
// name simply matches no record.
await expect( await expect(
ports.semanticLayer?.readSource({ ports.semanticLayer?.readSource({
connectionId: 'warehouse', connectionId: 'warehouse',
sourceName: '../orders', sourceName: '../orders',
}), }),
).rejects.toThrow('Unsafe semantic-layer source name'); ).resolves.toBeNull();
}); });
it('uses semantic compute for compile-only sl_query when supplied', async () => { it('uses semantic compute for compile-only sl_query when supplied', async () => {

View file

@ -261,6 +261,119 @@ describe('local semantic-layer helpers', () => {
); );
}); });
it('reads manifest-backed scan sources whose warehouse identifiers are uppercase', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/PUBLIC.yaml',
`tables:
SIGNED_UP:
table: PUBLIC.SIGNED_UP
columns:
- name: ID
type: number
pk: true
- name: EMAIL
type: string
`,
'ktx',
'ktx@example.com',
'Add uppercase manifest shard',
);
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'SIGNED_UP' })).resolves.toEqual(
expect.objectContaining({
connectionId: 'warehouse',
name: 'SIGNED_UP',
path: 'semantic-layer/warehouse/_schema/PUBLIC.yaml#SIGNED_UP',
yaml: expect.stringContaining('table: PUBLIC.SIGNED_UP'),
}),
);
});
it('reads manifest-backed sources whose names are not filename-safe', async () => {
// Snowflake and Postgres unquoted identifiers allow `$`; manifest keys
// carry the warehouse name verbatim, so the lookup must accept it.
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/PUBLIC.yaml',
`tables:
EVENT$LOG:
table: PUBLIC.EVENT$LOG
columns:
- name: ID
type: number
pk: true
`,
'ktx',
'ktx@example.com',
'Add manifest shard with dollar-sign table name',
);
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'EVENT$LOG' })).resolves.toEqual(
expect.objectContaining({
connectionId: 'warehouse',
name: 'EVENT$LOG',
path: 'semantic-layer/warehouse/_schema/PUBLIC.yaml#EVENT$LOG',
yaml: expect.stringContaining('table: PUBLIC.EVENT$LOG'),
}),
);
});
it('reads a manifest-backed source while a sibling standalone file has broken YAML', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/PUBLIC.yaml',
`tables:
SIGNED_UP:
table: PUBLIC.SIGNED_UP
columns:
- name: ID
type: number
pk: true
`,
'ktx',
'ktx@example.com',
'Add manifest shard',
);
await project.fileStore.writeFile(
'semantic-layer/warehouse/orders.yaml',
'name: orders\nmeasures:\n - name: revenue\n expr: [unterminated\n',
'ktx',
'ktx@example.com',
'seed a sibling source mid-edit with broken YAML',
);
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'SIGNED_UP' })).resolves.toEqual(
expect.objectContaining({
name: 'SIGNED_UP',
yaml: expect.stringContaining('table: PUBLIC.SIGNED_UP'),
}),
);
// The broken sibling stays visible in listings instead of hiding or
// failing the whole connection.
await expect(listLocalSlSources(project, { connectionId: 'warehouse' })).resolves.toEqual([
expect.objectContaining({ name: 'orders', columnCount: 0 }),
expect.objectContaining({ name: 'SIGNED_UP', columnCount: 1 }),
]);
});
it('returns the raw YAML of a standalone source whose content no longer parses', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/orders.yaml',
'name: orders\nmeasures:\n - name: revenue\n expr: [unterminated\n',
'ktx',
'ktx@example.com',
'seed a source mid-edit with broken YAML',
);
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'orders' })).resolves.toEqual(
expect.objectContaining({
connectionId: 'warehouse',
name: 'orders',
path: 'semantic-layer/warehouse/orders.yaml',
yaml: expect.stringContaining('[unterminated'),
}),
);
});
it('expands manifest-backed scan sources when listing all connections', async () => { it('expands manifest-backed scan sources when listing all connections', async () => {
await project.fileStore.writeFile( await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml', 'semantic-layer/warehouse/_schema/public.yaml',
@ -506,12 +619,22 @@ describe('local semantic-layer helpers', () => {
}); });
}); });
it('rejects unsafe source paths', async () => { it('never derives a file path from a traversal-style source name', async () => {
// Reads match names against loaded records, so a traversal-style name is
// simply not found; writes build a file path from the name and must throw.
await expect( await expect(
readLocalSlSource(project, { readLocalSlSource(project, {
connectionId: 'warehouse', connectionId: 'warehouse',
sourceName: '../orders', sourceName: '../orders',
}), }),
).resolves.toBeNull();
await expect(
writeLocalSlSource(project, {
connectionId: 'warehouse',
sourceName: '../orders',
yaml: ORDERS_YAML.replace('name: orders', 'name: ../orders'),
}),
).rejects.toThrow('Unsafe semantic-layer source name'); ).rejects.toThrow('Unsafe semantic-layer source name');
}); });
}); });

4
uv.lock generated
View file

@ -466,7 +466,7 @@ wheels = [
[[package]] [[package]]
name = "ktx-daemon" name = "ktx-daemon"
version = "0.10.0" version = "0.11.0"
source = { editable = "python/ktx-daemon" } source = { editable = "python/ktx-daemon" }
dependencies = [ dependencies = [
{ name = "fastapi" }, { name = "fastapi" },
@ -523,7 +523,7 @@ dev = [
[[package]] [[package]]
name = "ktx-sl" name = "ktx-sl"
version = "0.10.0" version = "0.11.0"
source = { editable = "python/ktx-sl" } source = { editable = "python/ktx-sl" }
dependencies = [ dependencies = [
{ name = "pydantic" }, { name = "pydantic" },