mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-10 08:05:14 +02:00
fix: read semantic sources safely
This commit is contained in:
parent
65de75ebd7
commit
de1f1a8d5e
8 changed files with 292 additions and 66 deletions
|
|
@ -97,30 +97,6 @@ function sqlitePathFromUrl(url: string): string {
|
|||
return url;
|
||||
}
|
||||
|
||||
function stripLeadingSqlComments(sql: string): string {
|
||||
let index = 0;
|
||||
while (index < sql.length) {
|
||||
while (/\s/.test(sql[index] ?? '')) {
|
||||
index += 1;
|
||||
}
|
||||
if (sql.startsWith('--', index)) {
|
||||
const end = sql.indexOf('\n', index + 2);
|
||||
index = end === -1 ? sql.length : end + 1;
|
||||
continue;
|
||||
}
|
||||
if (sql.startsWith('/*', index)) {
|
||||
const end = sql.indexOf('*/', index + 2);
|
||||
if (end === -1) {
|
||||
return sql.slice(index);
|
||||
}
|
||||
index = end + 2;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return sql.slice(index);
|
||||
}
|
||||
|
||||
export function isKtxSqliteConnectionConfig(
|
||||
connection: KtxSqliteConnectionConfig | undefined,
|
||||
): connection is KtxSqliteConnectionConfig {
|
||||
|
|
@ -255,7 +231,7 @@ export class KtxSqliteScanConnector implements KtxScanConnector {
|
|||
|
||||
async executeReadOnly(input: KtxSqliteReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
|
||||
this.assertConnection(input.connectionId);
|
||||
const result = this.query(limitSqlForExecution(stripLeadingSqlComments(input.sql), input.maxRows), input.params);
|
||||
const result = this.query(limitSqlForExecution(input.sql, input.maxRows), input.params);
|
||||
return { ...result, rowCount: result.rows.length };
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,36 @@ const MUTATING_SQL =
|
|||
/^\s*(insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh)\b/i;
|
||||
const READ_SQL = /^\s*(select|with)\b/i;
|
||||
|
||||
// Agents (and the daemon's sqlglot validator, which ignores comments) routinely
|
||||
// emit read-only queries prefixed with `-- ...` or `/* ... */`. Strip leading
|
||||
// comments so the prefix check sees the real statement; otherwise valid SELECT/WITH
|
||||
// SQL is rejected here while the parser-backed validator accepts it.
|
||||
function stripLeadingSqlComments(sql: string): string {
|
||||
let index = 0;
|
||||
while (index < sql.length) {
|
||||
while (/\s/.test(sql[index] ?? '')) {
|
||||
index += 1;
|
||||
}
|
||||
if (sql.startsWith('--', index)) {
|
||||
const end = sql.indexOf('\n', index + 2);
|
||||
index = end === -1 ? sql.length : end + 1;
|
||||
continue;
|
||||
}
|
||||
if (sql.startsWith('/*', index)) {
|
||||
const end = sql.indexOf('*/', index + 2);
|
||||
if (end === -1) {
|
||||
return sql.slice(index);
|
||||
}
|
||||
index = end + 2;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return sql.slice(index);
|
||||
}
|
||||
|
||||
export function assertReadOnlySql(sql: string): string {
|
||||
const trimmed = sql.trim();
|
||||
const trimmed = stripLeadingSqlComments(sql).trim();
|
||||
if (!READ_SQL.test(trimmed) || MUTATING_SQL.test(trimmed)) {
|
||||
throw new Error('Only read-only SELECT/WITH queries can be executed locally.');
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import { createKtxDiscoverDataService } from '../../context/search/discover.js';
|
|||
import type { SqlAnalysisDialect, SqlAnalysisPort } from '../../context/sql-analysis/ports.js';
|
||||
import { compileLocalSlQuery } from '../../context/sl/local-query.js';
|
||||
import { createKtxDictionarySearchService } from '../../context/sl/dictionary-search.js';
|
||||
import { readLocalSlSource } from '../../context/sl/local-sl.js';
|
||||
import { readLocalKnowledgePage, searchLocalKnowledgePages } from '../wiki/local-knowledge.js';
|
||||
import type { KtxMcpContextPorts, KtxMcpProgressCallback, KtxSqlExecutionResponse } from './types.js';
|
||||
|
||||
|
|
@ -62,23 +63,12 @@ function assertSafeConnectionId(connectionId: string): string {
|
|||
return assertSafePathToken('connection id', connectionId);
|
||||
}
|
||||
|
||||
function assertSafeSourceName(sourceName: string): string {
|
||||
if (!/^[a-z0-9][a-z0-9_]*$/.test(sourceName)) {
|
||||
throw new Error(`Unsafe semantic-layer source name: ${sourceName}`);
|
||||
}
|
||||
return assertSafePathToken('semantic-layer source name', sourceName);
|
||||
}
|
||||
|
||||
async function cleanupConnector(connector: KtxScanConnector | null): Promise<void> {
|
||||
if (connector?.cleanup) {
|
||||
await connector.cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
function slPath(connectionId: string, sourceName: string): string {
|
||||
return `semantic-layer/${assertSafeConnectionId(connectionId)}/${assertSafeSourceName(sourceName)}.yaml`;
|
||||
}
|
||||
|
||||
async function executeValidatedReadOnlySql(
|
||||
project: KtxLocalProject,
|
||||
options: CreateLocalProjectMcpContextPortsOptions,
|
||||
|
|
@ -188,13 +178,11 @@ export function createLocalProjectMcpContextPorts(
|
|||
},
|
||||
semanticLayer: {
|
||||
async readSource(input) {
|
||||
const path = slPath(input.connectionId, input.sourceName);
|
||||
try {
|
||||
const result = await project.fileStore.readFile(path);
|
||||
return { sourceName: input.sourceName, yaml: result.content };
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
const source = await readLocalSlSource(project, {
|
||||
connectionId: input.connectionId,
|
||||
sourceName: input.sourceName,
|
||||
});
|
||||
return source ? { sourceName: source.name, yaml: source.yaml } : null;
|
||||
},
|
||||
async query(input, executionOptions) {
|
||||
if (!options.semanticLayerCompute) {
|
||||
|
|
|
|||
|
|
@ -97,8 +97,17 @@ function isSafeConnectionId(connectionId: string | undefined): connectionId is s
|
|||
return typeof connectionId === 'string' && /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId);
|
||||
}
|
||||
|
||||
// Standalone files live at a path derived from the source name, so only
|
||||
// conservative filename-safe names may reach `slPath`. Source names themselves
|
||||
// mirror the warehouse identifier verbatim (Snowflake's uppercase `SIGNED_UP`
|
||||
// or `EVENT$LOG`); manifest-backed sources are matched by name in memory and
|
||||
// must never be gated on this filename rule.
|
||||
function isSafeSourceName(sourceName: string): boolean {
|
||||
return /^[A-Za-z0-9][A-Za-z0-9_]*$/.test(sourceName);
|
||||
}
|
||||
|
||||
function assertSafeSourceName(sourceName: string): string {
|
||||
if (!/^[a-z0-9][a-z0-9_]*$/.test(sourceName)) {
|
||||
if (!isSafeSourceName(sourceName)) {
|
||||
throw new Error(`Unsafe semantic-layer source name: ${sourceName}`);
|
||||
}
|
||||
return assertSafePathToken('semantic-layer source name', sourceName);
|
||||
|
|
@ -220,7 +229,12 @@ export async function loadLocalSlSourceRecords(
|
|||
|
||||
for (const path of paths.filter((file) => file.startsWith(`${schemaDir}/`))) {
|
||||
const raw = await project.fileStore.readFile(path);
|
||||
const tables = manifestTables(parseYamlRecord(raw.content));
|
||||
let tables: Record<string, ManifestTableEntry> | null;
|
||||
try {
|
||||
tables = manifestTables(parseYamlRecord(raw.content));
|
||||
} catch (error) {
|
||||
throw new Error(`${path}: ${error instanceof Error ? error.message : String(error)}`);
|
||||
}
|
||||
if (!tables) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -237,7 +251,26 @@ export async function loadLocalSlSourceRecords(
|
|||
|
||||
for (const path of paths.filter((file) => !file.startsWith(`${schemaDir}/`))) {
|
||||
const raw = await project.fileStore.readFile(path);
|
||||
const parsed = parseYamlRecord(raw.content);
|
||||
let parsed: Record<string, unknown>;
|
||||
try {
|
||||
parsed = parseYamlRecord(raw.content);
|
||||
} catch {
|
||||
// A source mid-edit (e.g. an agent saved half-written YAML) must not take
|
||||
// down reads, listings, or search for its siblings. Keep the file visible
|
||||
// under its filename and surface the raw content for repair.
|
||||
const brokenName = sourceNameFromPath(path);
|
||||
sources.set(brokenName, {
|
||||
connectionId,
|
||||
name: brokenName,
|
||||
path,
|
||||
columnCount: 0,
|
||||
measureCount: 0,
|
||||
joinCount: 0,
|
||||
yaml: raw.content,
|
||||
source: { name: brokenName, grain: [], columns: [], joins: [], measures: [] },
|
||||
});
|
||||
continue;
|
||||
}
|
||||
const name = typeof parsed.name === 'string' && parsed.name.length > 0 ? parsed.name : sourceNameFromPath(path);
|
||||
if (parsed.table || parsed.sql) {
|
||||
const source = parsedStandaloneSource(parsed, name);
|
||||
|
|
@ -317,25 +350,35 @@ export async function writeLocalSlSource(
|
|||
);
|
||||
}
|
||||
|
||||
/** @internal */
|
||||
export async function readLocalSlSource(
|
||||
project: KtxLocalProject,
|
||||
input: { connectionId: string; sourceName: string },
|
||||
): Promise<LocalSlSource | null> {
|
||||
const path = slPath(input.connectionId, input.sourceName);
|
||||
try {
|
||||
const result = await project.fileStore.readFile(path);
|
||||
return {
|
||||
...summarizeSource({ connectionId: input.connectionId, path, raw: result.content }),
|
||||
yaml: result.content,
|
||||
};
|
||||
} catch {
|
||||
const records = await loadLocalSlSourceRecords(project, {
|
||||
connectionId: input.connectionId,
|
||||
});
|
||||
const record = records.find((source) => source.name === input.sourceName);
|
||||
return record ? { ...record } : null;
|
||||
// Only filename-safe names can have a standalone file; manifest-backed
|
||||
// sources mirror raw warehouse identifiers (e.g. `EVENT$LOG`) and are found
|
||||
// by the record lookup below, which never derives a path from the name.
|
||||
if (isSafeSourceName(input.sourceName)) {
|
||||
const path = slPath(input.connectionId, input.sourceName);
|
||||
try {
|
||||
const result = await project.fileStore.readFile(path);
|
||||
return {
|
||||
...summarizeSource({ connectionId: input.connectionId, path, raw: result.content }),
|
||||
yaml: result.content,
|
||||
};
|
||||
} catch {
|
||||
// Missing or mid-edit standalone file — the record loader covers both:
|
||||
// it skips absent files and represents a file whose YAML no longer
|
||||
// parses verbatim, so readers — `ktx sl read`, `ktx sl validate`, and
|
||||
// the `sl_read_source` MCP tool — can surface the broken content for
|
||||
// repair instead of failing on it.
|
||||
}
|
||||
}
|
||||
|
||||
const records = await loadLocalSlSourceRecords(project, {
|
||||
connectionId: input.connectionId,
|
||||
});
|
||||
const record = records.find((source) => source.name === input.sourceName);
|
||||
return record ? { ...record } : null;
|
||||
}
|
||||
|
||||
export async function resolveLocalSlSource(
|
||||
|
|
|
|||
|
|
@ -15,6 +15,19 @@ describe('assertReadOnlySql', () => {
|
|||
'Only read-only SELECT/WITH queries can be executed locally',
|
||||
);
|
||||
});
|
||||
|
||||
it('accepts read-only queries that begin with leading comments', () => {
|
||||
expect(assertReadOnlySql('-- signups per day\nselect count(*) from public.signed_up')).toBe(
|
||||
'select count(*) from public.signed_up',
|
||||
);
|
||||
expect(assertReadOnlySql('/* block */\n with paid as (select 1) select * from paid')).toContain('with paid');
|
||||
});
|
||||
|
||||
it('still rejects mutating statements hidden behind leading comments', () => {
|
||||
expect(() => assertReadOnlySql('-- harmless\n delete from orders')).toThrow(
|
||||
'Only read-only SELECT/WITH queries can be executed locally',
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('limitSqlForExecution', () => {
|
||||
|
|
@ -27,4 +40,10 @@ describe('limitSqlForExecution', () => {
|
|||
it('returns the trimmed SQL when no maxRows value is provided', () => {
|
||||
expect(limitSqlForExecution('select * from orders; ', undefined)).toBe('select * from orders');
|
||||
});
|
||||
|
||||
it('strips leading comments before wrapping with a row limit', () => {
|
||||
expect(limitSqlForExecution('-- top customers\nselect * from public.orders', 25)).toBe(
|
||||
'select * from (select * from public.orders) as ktx_query_result limit 25',
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -690,7 +690,54 @@ describe('createLocalProjectMcpContextPorts', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('rejects path traversal keys before touching the project directory', async () => {
|
||||
it('reads manifest-backed sources with uppercase warehouse identifiers', async () => {
|
||||
const project = await initKtxProject({ projectDir: tempDir });
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/PUBLIC.yaml',
|
||||
[
|
||||
'tables:',
|
||||
' SIGNED_UP:',
|
||||
' table: PUBLIC.SIGNED_UP',
|
||||
' columns:',
|
||||
' - name: ID',
|
||||
' type: number',
|
||||
' pk: true',
|
||||
'',
|
||||
].join('\n'),
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'seed uppercase manifest shard',
|
||||
);
|
||||
const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null });
|
||||
|
||||
await expect(
|
||||
ports.semanticLayer?.readSource({ connectionId: 'warehouse', sourceName: 'SIGNED_UP' }),
|
||||
).resolves.toMatchObject({
|
||||
sourceName: 'SIGNED_UP',
|
||||
yaml: expect.stringContaining('table: PUBLIC.SIGNED_UP'),
|
||||
});
|
||||
});
|
||||
|
||||
it('returns a standalone source verbatim even when its YAML is currently broken', async () => {
|
||||
const project = await initKtxProject({ projectDir: tempDir });
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/orders.yaml',
|
||||
'name: orders\nmeasures:\n - name: revenue\n expr: [unterminated\n',
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'seed broken source mid-edit',
|
||||
);
|
||||
const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null });
|
||||
|
||||
await expect(
|
||||
ports.semanticLayer?.readSource({ connectionId: 'warehouse', sourceName: 'orders' }),
|
||||
).resolves.toMatchObject({
|
||||
sourceName: 'orders',
|
||||
yaml: expect.stringContaining('[unterminated'),
|
||||
});
|
||||
});
|
||||
|
||||
it('keeps path-traversal keys away from the project directory', async () => {
|
||||
const project = await initKtxProject({ projectDir: tempDir });
|
||||
const ports = createLocalProjectMcpContextPorts(project, { embeddingService: null });
|
||||
|
||||
|
|
@ -701,12 +748,14 @@ describe('createLocalProjectMcpContextPorts', () => {
|
|||
}),
|
||||
).rejects.toThrow('Invalid wiki key "../outside". Wiki keys must be flat; use "outside".');
|
||||
|
||||
// Source reads never derive a file path from the name; a traversal-style
|
||||
// name simply matches no record.
|
||||
await expect(
|
||||
ports.semanticLayer?.readSource({
|
||||
connectionId: 'warehouse',
|
||||
sourceName: '../orders',
|
||||
}),
|
||||
).rejects.toThrow('Unsafe semantic-layer source name');
|
||||
).resolves.toBeNull();
|
||||
});
|
||||
|
||||
it('uses semantic compute for compile-only sl_query when supplied', async () => {
|
||||
|
|
|
|||
|
|
@ -261,6 +261,119 @@ describe('local semantic-layer helpers', () => {
|
|||
);
|
||||
});
|
||||
|
||||
it('reads manifest-backed scan sources whose warehouse identifiers are uppercase', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/PUBLIC.yaml',
|
||||
`tables:
|
||||
SIGNED_UP:
|
||||
table: PUBLIC.SIGNED_UP
|
||||
columns:
|
||||
- name: ID
|
||||
type: number
|
||||
pk: true
|
||||
- name: EMAIL
|
||||
type: string
|
||||
`,
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Add uppercase manifest shard',
|
||||
);
|
||||
|
||||
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'SIGNED_UP' })).resolves.toEqual(
|
||||
expect.objectContaining({
|
||||
connectionId: 'warehouse',
|
||||
name: 'SIGNED_UP',
|
||||
path: 'semantic-layer/warehouse/_schema/PUBLIC.yaml#SIGNED_UP',
|
||||
yaml: expect.stringContaining('table: PUBLIC.SIGNED_UP'),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('reads manifest-backed sources whose names are not filename-safe', async () => {
|
||||
// Snowflake and Postgres unquoted identifiers allow `$`; manifest keys
|
||||
// carry the warehouse name verbatim, so the lookup must accept it.
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/PUBLIC.yaml',
|
||||
`tables:
|
||||
EVENT$LOG:
|
||||
table: PUBLIC.EVENT$LOG
|
||||
columns:
|
||||
- name: ID
|
||||
type: number
|
||||
pk: true
|
||||
`,
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Add manifest shard with dollar-sign table name',
|
||||
);
|
||||
|
||||
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'EVENT$LOG' })).resolves.toEqual(
|
||||
expect.objectContaining({
|
||||
connectionId: 'warehouse',
|
||||
name: 'EVENT$LOG',
|
||||
path: 'semantic-layer/warehouse/_schema/PUBLIC.yaml#EVENT$LOG',
|
||||
yaml: expect.stringContaining('table: PUBLIC.EVENT$LOG'),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('reads a manifest-backed source while a sibling standalone file has broken YAML', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/PUBLIC.yaml',
|
||||
`tables:
|
||||
SIGNED_UP:
|
||||
table: PUBLIC.SIGNED_UP
|
||||
columns:
|
||||
- name: ID
|
||||
type: number
|
||||
pk: true
|
||||
`,
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Add manifest shard',
|
||||
);
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/orders.yaml',
|
||||
'name: orders\nmeasures:\n - name: revenue\n expr: [unterminated\n',
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'seed a sibling source mid-edit with broken YAML',
|
||||
);
|
||||
|
||||
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'SIGNED_UP' })).resolves.toEqual(
|
||||
expect.objectContaining({
|
||||
name: 'SIGNED_UP',
|
||||
yaml: expect.stringContaining('table: PUBLIC.SIGNED_UP'),
|
||||
}),
|
||||
);
|
||||
|
||||
// The broken sibling stays visible in listings instead of hiding or
|
||||
// failing the whole connection.
|
||||
await expect(listLocalSlSources(project, { connectionId: 'warehouse' })).resolves.toEqual([
|
||||
expect.objectContaining({ name: 'orders', columnCount: 0 }),
|
||||
expect.objectContaining({ name: 'SIGNED_UP', columnCount: 1 }),
|
||||
]);
|
||||
});
|
||||
|
||||
it('returns the raw YAML of a standalone source whose content no longer parses', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/orders.yaml',
|
||||
'name: orders\nmeasures:\n - name: revenue\n expr: [unterminated\n',
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'seed a source mid-edit with broken YAML',
|
||||
);
|
||||
|
||||
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'orders' })).resolves.toEqual(
|
||||
expect.objectContaining({
|
||||
connectionId: 'warehouse',
|
||||
name: 'orders',
|
||||
path: 'semantic-layer/warehouse/orders.yaml',
|
||||
yaml: expect.stringContaining('[unterminated'),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('expands manifest-backed scan sources when listing all connections', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
|
|
@ -506,12 +619,22 @@ describe('local semantic-layer helpers', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('rejects unsafe source paths', async () => {
|
||||
it('never derives a file path from a traversal-style source name', async () => {
|
||||
// Reads match names against loaded records, so a traversal-style name is
|
||||
// simply not found; writes build a file path from the name and must throw.
|
||||
await expect(
|
||||
readLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: '../orders',
|
||||
}),
|
||||
).resolves.toBeNull();
|
||||
|
||||
await expect(
|
||||
writeLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: '../orders',
|
||||
yaml: ORDERS_YAML.replace('name: orders', 'name: ../orders'),
|
||||
}),
|
||||
).rejects.toThrow('Unsafe semantic-layer source name');
|
||||
});
|
||||
});
|
||||
|
|
|
|||
4
uv.lock
generated
4
uv.lock
generated
|
|
@ -466,7 +466,7 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "ktx-daemon"
|
||||
version = "0.10.0"
|
||||
version = "0.11.0"
|
||||
source = { editable = "python/ktx-daemon" }
|
||||
dependencies = [
|
||||
{ name = "fastapi" },
|
||||
|
|
@ -523,7 +523,7 @@ dev = [
|
|||
|
||||
[[package]]
|
||||
name = "ktx-sl"
|
||||
version = "0.10.0"
|
||||
version = "0.11.0"
|
||||
source = { editable = "python/ktx-sl" }
|
||||
dependencies = [
|
||||
{ name = "pydantic" },
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue