mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-25 08:48:08 +02:00
Initial open-source release
This commit is contained in:
commit
1a42152e6f
1199 changed files with 257054 additions and 0 deletions
34
packages/context/src/sl/descriptions.ts
Normal file
34
packages/context/src/sl/descriptions.ts
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
/** Description origins recognised out of the box, in default lookup order. */
const DESCRIPTION_SOURCES = ['user', 'ai', 'dbt', 'db'] as const;

/** Union of the built-in origin keys (`'user' | 'ai' | 'dbt' | 'db'`). */
type DescriptionSource = (typeof DESCRIPTION_SOURCES)[number];

/** Description text keyed by the origin that produced it. */
type DescriptionSources = { [source: string]: string };

/** Settings that decide which description wins during resolution. */
type DescriptionResolutionConfig = {
  /** Origin keys to try, most preferred first. */
  priority: string[];
};

/** Default lookup order: a mutable copy of the built-in origins. */
export const DEFAULT_PRIORITY: DescriptionSource[] = Array.from(DESCRIPTION_SOURCES);
|
||||
|
||||
/**
 * Resolves which description to surface based on a priority list.
 *
 * Walks `config.priority` in order and returns the first non-empty string
 * stored under one of those keys. When none of the prioritized keys yields a
 * description, falls back to the first non-empty string stored under any key
 * (covers sources that are not part of the priority list yet).
 *
 * Only own, string-valued entries are considered, so a priority key such as
 * `toString` or `constructor` can never resolve to a member inherited from
 * `Object.prototype`, and stray non-string values are skipped instead of
 * being returned as if they were text.
 *
 * @param descriptions - Description text keyed by source; may be `undefined`.
 * @param config - Resolution settings; `priority` lists source keys, most preferred first.
 * @returns The selected description, or `null` when nothing usable exists.
 */
export function resolveDescription(
  descriptions: DescriptionSources | undefined,
  config: DescriptionResolutionConfig,
): string | null {
  if (!descriptions) {
    return null;
  }

  const isUsable = (value: unknown): value is string => typeof value === 'string' && value.length > 0;

  for (const source of config.priority) {
    // Skip inherited members: a plain object answers `toString`, `constructor`, ...
    if (!Object.prototype.hasOwnProperty.call(descriptions, source)) {
      continue;
    }
    const text: unknown = descriptions[source];
    if (isUsable(text)) {
      return text;
    }
  }

  // Fallback: first available value (for unknown future sources)
  const values: unknown[] = Object.values(descriptions);
  return values.find(isUsable) ?? null;
}
|
||||
32
packages/context/src/sl/index.ts
Normal file
32
packages/context/src/sl/index.ts
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
export type { SlValidationResult, SlValidatorPort } from './sl-validator.port.js';
|
||||
export type {
|
||||
SemanticLayerQueryExecutionResult,
|
||||
SemanticLayerQueryInput,
|
||||
SemanticLayerSource,
|
||||
SlDictionaryMatch,
|
||||
SlSearchLaneSummary,
|
||||
SlSearchMatchReason,
|
||||
SlSearchMetadata,
|
||||
} from './types.js';
|
||||
export type {
|
||||
KloConnectionInfo,
|
||||
KloQueryResult,
|
||||
SlConnectionCatalogPort,
|
||||
SlPythonPort,
|
||||
SlSourcesIndexPort,
|
||||
} from './ports.js';
|
||||
export { DEFAULT_PRIORITY, resolveDescription } from './descriptions.js';
|
||||
export { isOverlaySource, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js';
|
||||
export {
|
||||
composeOverlay,
|
||||
enrichColumnsFromManifest,
|
||||
findDanglingSegmentRefs,
|
||||
SemanticLayerService,
|
||||
} from './semantic-layer.service.js';
|
||||
export { loadLatestSlDictionaryEntries } from './sl-dictionary-profile.js';
|
||||
export type { SlDictionaryEntry } from './sl-dictionary-profile.js';
|
||||
export { buildSemanticLayerSourceSearchText, SlSearchService } from './sl-search.service.js';
|
||||
export { SqliteSlSourcesIndex, type SqliteSlSourcesIndexOptions } from './sqlite-sl-sources-index.js';
|
||||
export * from './local-sl.js';
|
||||
export * from './local-query.js';
|
||||
export * from './tools/index.js';
|
||||
260
packages/context/src/sl/local-query.test.ts
Normal file
260
packages/context/src/sl/local-query.test.ts
Normal file
|
|
@ -0,0 +1,260 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import type { KloSemanticLayerComputePort } from '../daemon/index.js';
|
||||
import { initKloProject, type KloLocalProject } from '../project/index.js';
|
||||
import { compileLocalSlQuery } from './local-query.js';
|
||||
|
||||
// Vitest suite for compileLocalSlQuery. Each test runs against a throwaway project that has one
// Postgres connection (`warehouse`), two semantic-layer YAML files and a stubbed compute port.
describe('compileLocalSlQuery', () => {
|
||||
let tempDir: string;
|
||||
let project: KloLocalProject;
|
||||
let compute: KloSemanticLayerComputePort;
|
||||
|
||||
// Fresh temp project per test: registers the `warehouse` connection, then writes the `orders`
// source and an `orders_overlay` file (which declares no `table`) through the project file store.
beforeEach(async () => {
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'klo-local-query-'));
|
||||
project = await initKloProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' });
|
||||
project.config.connections.warehouse = { driver: 'postgres', readonly: true };
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/orders.yaml',
|
||||
`name: orders
|
||||
table: public.orders
|
||||
grain:
|
||||
- id
|
||||
columns:
|
||||
- name: id
|
||||
type: number
|
||||
- name: status
|
||||
type: string
|
||||
measures:
|
||||
- name: order_count
|
||||
expr: count(*)
|
||||
joins: []
|
||||
`,
|
||||
'klo',
|
||||
'klo@example.com',
|
||||
'Add orders source',
|
||||
);
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/orders_overlay.yaml',
|
||||
`name: orders_overlay
|
||||
inherits_columns_from: orders
|
||||
columns:
|
||||
- name: paid_at
|
||||
type: timestamp
|
||||
joins: []
|
||||
measures: []
|
||||
grain: []
|
||||
`,
|
||||
'klo',
|
||||
'klo@example.com',
|
||||
'Add overlay source',
|
||||
);
|
||||
|
||||
// Stubbed compute port: returns fixed SQL and echoes back the requested dialect, measures and dimensions.
compute = {
|
||||
query: vi.fn(async (input) => ({
|
||||
sql: 'select status, count(*) as order_count from public.orders group by status',
|
||||
dialect: input.dialect,
|
||||
columns: [{ name: 'orders.status' }, { name: 'orders.order_count' }],
|
||||
plan: { measures: input.query.measures, dimensions: input.query.dimensions },
|
||||
})),
|
||||
validateSources: vi.fn(),
|
||||
generateSources: vi.fn(),
|
||||
};
|
||||
});
|
||||
|
||||
// Remove the temp directory created in beforeEach.
afterEach(async () => {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
// Only `orders` is expected in `sources`: the overlay file must not be forwarded to the compute port.
// Without `execute`, the result carries no rows and a `compile_only` execution note.
it('compiles a local semantic-layer query with computable sources only', async () => {
|
||||
const result = await compileLocalSlQuery(project, {
|
||||
connectionId: 'warehouse',
|
||||
query: {
|
||||
measures: ['orders.order_count'],
|
||||
dimensions: ['orders.status'],
|
||||
limit: 25,
|
||||
},
|
||||
compute,
|
||||
});
|
||||
|
||||
expect(compute.query).toHaveBeenCalledWith({
|
||||
sources: [
|
||||
{
|
||||
name: 'orders',
|
||||
table: 'public.orders',
|
||||
grain: ['id'],
|
||||
columns: [
|
||||
{ name: 'id', type: 'number' },
|
||||
{ name: 'status', type: 'string' },
|
||||
],
|
||||
measures: [{ name: 'order_count', expr: 'count(*)' }],
|
||||
joins: [],
|
||||
},
|
||||
],
|
||||
dialect: 'postgres',
|
||||
query: {
|
||||
measures: ['orders.order_count'],
|
||||
dimensions: ['orders.status'],
|
||||
limit: 25,
|
||||
},
|
||||
});
|
||||
expect(result).toEqual({
|
||||
connectionId: 'warehouse',
|
||||
dialect: 'postgres',
|
||||
sql: 'select status, count(*) as order_count from public.orders group by status',
|
||||
headers: ['orders.status', 'orders.order_count'],
|
||||
rows: [],
|
||||
totalRows: 0,
|
||||
plan: {
|
||||
measures: ['orders.order_count'],
|
||||
dimensions: ['orders.status'],
|
||||
execution: {
|
||||
mode: 'compile_only',
|
||||
reason: 'Local semantic-layer query compiled SQL but no data-source execution adapter is configured.',
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
// A `_schema/public.yaml` manifest shard should surface `payments` as a source; the expected
// grain is the column flagged `pk: true` (presumably derived from it - confirm in local-sl.ts).
it('compiles a local semantic-layer query from manifest-backed scan sources', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
`tables:
|
||||
payments:
|
||||
table: public.payments
|
||||
columns:
|
||||
- name: payment_id
|
||||
type: number
|
||||
pk: true
|
||||
- name: amount
|
||||
type: number
|
||||
`,
|
||||
'klo',
|
||||
'klo@example.com',
|
||||
'Add manifest shard',
|
||||
);
|
||||
|
||||
await compileLocalSlQuery(project, {
|
||||
connectionId: 'warehouse',
|
||||
query: {
|
||||
measures: ['sum(payments.amount)'],
|
||||
dimensions: [],
|
||||
},
|
||||
compute,
|
||||
});
|
||||
|
||||
expect(compute.query).toHaveBeenLastCalledWith({
|
||||
sources: expect.arrayContaining([
|
||||
{
|
||||
name: 'payments',
|
||||
table: 'public.payments',
|
||||
grain: ['payment_id'],
|
||||
columns: [
|
||||
{
|
||||
name: 'payment_id',
|
||||
type: 'number',
|
||||
role: undefined,
|
||||
descriptions: undefined,
|
||||
constraints: undefined,
|
||||
enum_values: undefined,
|
||||
tests: undefined,
|
||||
},
|
||||
{
|
||||
name: 'amount',
|
||||
type: 'number',
|
||||
role: undefined,
|
||||
descriptions: undefined,
|
||||
constraints: undefined,
|
||||
enum_values: undefined,
|
||||
tests: undefined,
|
||||
},
|
||||
],
|
||||
joins: [],
|
||||
measures: [],
|
||||
},
|
||||
]),
|
||||
dialect: 'postgres',
|
||||
query: {
|
||||
measures: ['sum(payments.amount)'],
|
||||
dimensions: [],
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
// With a single configured connection, omitting connectionId still resolves to its Postgres dialect.
it('resolves the only configured connection when connectionId is omitted', async () => {
|
||||
await compileLocalSlQuery(project, {
|
||||
query: { measures: ['orders.order_count'], dimensions: [] },
|
||||
compute,
|
||||
});
|
||||
|
||||
expect(compute.query).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
dialect: 'postgres',
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
// `execute: true` hands the compiled SQL to the supplied executor; an explicit `maxRows` (10)
// takes precedence over the query's own `limit` (25).
it('executes compiled SQL through a local query executor when requested', async () => {
|
||||
const queryExecutor = {
|
||||
execute: vi.fn(async () => ({
|
||||
headers: ['status', 'order_count'],
|
||||
rows: [['paid', 2]],
|
||||
totalRows: 1,
|
||||
command: 'SELECT',
|
||||
rowCount: 1,
|
||||
})),
|
||||
};
|
||||
|
||||
const result = await compileLocalSlQuery(project, {
|
||||
connectionId: 'warehouse',
|
||||
query: {
|
||||
measures: ['orders.order_count'],
|
||||
dimensions: ['orders.status'],
|
||||
limit: 25,
|
||||
},
|
||||
compute,
|
||||
execute: true,
|
||||
maxRows: 10,
|
||||
queryExecutor,
|
||||
});
|
||||
|
||||
expect(queryExecutor.execute).toHaveBeenCalledWith({
|
||||
connectionId: 'warehouse',
|
||||
projectDir: project.projectDir,
|
||||
connection: { driver: 'postgres', readonly: true },
|
||||
sql: 'select status, count(*) as order_count from public.orders group by status',
|
||||
maxRows: 10,
|
||||
});
|
||||
expect(result.rows).toEqual([['paid', 2]]);
|
||||
expect(result.totalRows).toBe(1);
|
||||
expect(result.plan.execution).toEqual({
|
||||
mode: 'executed',
|
||||
driver: 'postgres',
|
||||
maxRows: 10,
|
||||
rowCount: 1,
|
||||
});
|
||||
});
|
||||
|
||||
// `execute: true` without a `queryExecutor` must reject.
it('requires a query executor for executed mode', async () => {
|
||||
await expect(
|
||||
compileLocalSlQuery(project, {
|
||||
connectionId: 'warehouse',
|
||||
query: { measures: ['orders.order_count'], dimensions: [] },
|
||||
compute,
|
||||
execute: true,
|
||||
}),
|
||||
).rejects.toThrow('Local semantic-layer execution requires a query executor.');
|
||||
});
|
||||
|
||||
// Adding a second connection makes the implicit choice ambiguous, so the call must reject.
it('requires connectionId when multiple connections are configured', async () => {
|
||||
project.config.connections.analytics = { driver: 'bigquery', readonly: true };
|
||||
|
||||
await expect(
|
||||
compileLocalSlQuery(project, {
|
||||
query: { measures: ['orders.order_count'], dimensions: [] },
|
||||
compute,
|
||||
}),
|
||||
).rejects.toThrow('connectionId is required when the local project has zero or multiple connections.');
|
||||
});
|
||||
});
|
||||
150
packages/context/src/sl/local-query.ts
Normal file
150
packages/context/src/sl/local-query.ts
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
import type { KloSqlQueryExecutorPort } from '../connections/index.js';
|
||||
import type { KloSemanticLayerComputePort } from '../daemon/index.js';
|
||||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { loadLocalSlSourceRecords } from './local-sl.js';
|
||||
import type { SemanticLayerQueryExecutionResult, SemanticLayerQueryInput } from './types.js';
|
||||
|
||||
/** Explanation attached to the plan whenever a query is compiled but not executed. */
const COMPILE_ONLY_REASON =
  'Local semantic-layer query compiled SQL but no data-source execution adapter is configured.';

/** Inputs accepted by {@link compileLocalSlQuery}. */
export interface CompileLocalSlQueryOptions {
  /** Connection to compile against; may be omitted when the project has exactly one connection. */
  connectionId?: string;
  /** Semantic-layer query (measures, dimensions, ...) handed to the compute port. */
  query: SemanticLayerQueryInput;
  /** Port that turns sources plus query into SQL. */
  compute: KloSemanticLayerComputePort;
  /** When truthy, the compiled SQL is also run through `queryExecutor`. */
  execute?: boolean;
  /** Row cap forwarded to the executor; falls back to `query.limit` when omitted. */
  maxRows?: number;
  /** Executor used in execute mode; required whenever `execute` is truthy. */
  queryExecutor?: KloSqlQueryExecutorPort;
}

/** Execution result extended with the connection and dialect that were used. */
export interface CompileLocalSlQueryResult extends SemanticLayerQueryExecutionResult {
  /** Resolved connection id. */
  connectionId: string;
  /** SQL dialect reported by the compute port. */
  dialect: string;
}
|
||||
|
||||
/**
 * Returns `value` unchanged when it is safe to embed as a path segment.
 *
 * A value is rejected when it is blank, contains `..`, a backslash or `//`,
 * or starts with `/` or `.`.
 *
 * @param kind - Human-readable label used in the error message.
 * @param value - Candidate path token.
 * @returns The same `value`.
 * @throws Error `Unsafe <kind>: <value>` when any rule is violated.
 */
function assertSafePathToken(kind: string, value: string): string {
  const isBlank = value.trim().length === 0;
  const hasForbiddenSequence = ['..', '\\', '//'].some((sequence) => value.includes(sequence));
  const hasForbiddenPrefix = value.startsWith('/') || value.startsWith('.');

  if (isBlank || hasForbiddenSequence || hasForbiddenPrefix) {
    throw new Error(`Unsafe ${kind}: ${value}`);
  }
  return value;
}
|
||||
|
||||
/**
 * Validates a connection id before it is used to build file paths.
 *
 * The id must start with an ASCII letter or digit and continue with letters,
 * digits, `_` or `-`; it is then additionally run through
 * `assertSafePathToken`.
 *
 * @param connectionId - Candidate connection id.
 * @returns The same id when valid.
 * @throws Error `Unsafe connection id: <id>` when the pattern does not match.
 */
function assertSafeConnectionId(connectionId: string): string {
  const matchesPattern = /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId);
  if (matchesPattern) {
    return assertSafePathToken('connection id', connectionId);
  }
  throw new Error(`Unsafe connection id: ${connectionId}`);
}
|
||||
|
||||
/**
 * Maps a connection driver name to the SQL dialect name passed to the compute port.
 *
 * Matching is case-insensitive. A missing or unrecognised driver yields
 * `'postgres'`.
 *
 * @param driver - Driver name from the connection config, if any.
 * @returns The dialect identifier.
 */
function dialectForDriver(driver: string | undefined): string {
  switch ((driver ?? 'postgres').toUpperCase()) {
    case 'BIGQUERY':
      return 'bigquery';
    case 'SNOWFLAKE':
      return 'snowflake';
    case 'MYSQL':
      return 'mysql';
    case 'SQLSERVER':
    case 'MSSQL':
      return 'tsql';
    case 'SQLITE':
      return 'sqlite';
    case 'DUCKDB':
      return 'duckdb';
    case 'CLICKHOUSE':
      return 'clickhouse';
    case 'REDSHIFT':
      return 'redshift';
    case 'DATABRICKS':
      return 'databricks';
    case 'POSTGRESQL':
    case 'POSTGRES':
    default:
      // Unknown drivers deliberately share the Postgres default.
      return 'postgres';
  }
}
|
||||
|
||||
/**
 * Picks the connection id to use for a local query.
 *
 * An explicitly requested id wins (after validation). Otherwise the project
 * must define exactly one connection, which is then used.
 *
 * @param project - Local project whose `config.connections` is inspected.
 * @param requested - Caller-supplied connection id, if any.
 * @returns A validated connection id.
 * @throws Error when no id was requested and the project has zero or several connections,
 *   or when the chosen id fails validation.
 */
function resolveLocalConnectionId(project: KloLocalProject, requested: string | undefined): string {
  if (!requested) {
    const configured = Object.keys(project.config.connections);
    if (configured.length !== 1) {
      throw new Error('connectionId is required when the local project has zero or multiple connections.');
    }
    return assertSafeConnectionId(configured[0]);
  }
  return assertSafeConnectionId(requested);
}
|
||||
|
||||
/**
 * Loads the connection's semantic-layer sources and keeps those that can be compiled.
 *
 * A source counts as computable when it has a truthy `table` or `sql`.
 * Each kept source is a shallow copy of the loaded record's `source`.
 *
 * @param project - Local project to read from.
 * @param connectionId - Connection whose sources are loaded (validated again here).
 * @returns Shallow copies of the computable sources, in load order.
 */
async function loadComputableSources(
  project: KloLocalProject,
  connectionId: string,
): Promise<Record<string, unknown>[]> {
  const records = await loadLocalSlSourceRecords(project, {
    connectionId: assertSafeConnectionId(connectionId),
  });

  const computable: Record<string, unknown>[] = [];
  for (const record of records) {
    const source = { ...record.source };
    if (source.table || source.sql) {
      computable.push(source);
    }
  }
  return computable;
}
|
||||
|
||||
/**
 * Extracts result headers from compiled column descriptors.
 *
 * @param columns - Column descriptors; only the `name` property is read.
 * @returns Every `name` that is a non-empty string, in input order.
 */
function headersFromColumns(columns: Array<Record<string, unknown>>): string[] {
  const headers: string[] = [];
  for (const column of columns) {
    const candidate = column.name;
    if (typeof candidate === 'string' && candidate.length > 0) {
      headers.push(candidate);
    }
  }
  return headers;
}
|
||||
|
||||
/**
 * Compiles a semantic-layer query for a local project and optionally executes it.
 *
 * Steps:
 * 1. Resolve and validate the connection id.
 * 2. In execute mode, verify that a query executor was supplied before any
 *    source loading or compute call happens, so a misconfigured request fails
 *    without doing that work first.
 * 3. Load the connection's computable sources and ask the compute port for SQL.
 * 4. Either return a compile-only result (no rows) or run the SQL through the
 *    executor and return its rows.
 *
 * @param project - Local project providing connection config and sources.
 * @param options - Query, compute port and optional execution settings.
 * @returns SQL, headers, rows and the compute plan extended with an `execution` section.
 * @throws Error when the connection id cannot be resolved or is unsafe.
 * @throws Error `Local semantic-layer execution requires a query executor.` when
 *   `options.execute` is truthy but `options.queryExecutor` is missing.
 */
export async function compileLocalSlQuery(
  project: KloLocalProject,
  options: CompileLocalSlQueryOptions,
): Promise<CompileLocalSlQueryResult> {
  const connectionId = resolveLocalConnectionId(project, options.connectionId);

  // Fail fast: defined only when execution was requested and an executor is available.
  let queryExecutor: KloSqlQueryExecutorPort | undefined;
  if (options.execute) {
    if (!options.queryExecutor) {
      throw new Error('Local semantic-layer execution requires a query executor.');
    }
    queryExecutor = options.queryExecutor;
  }

  // May be undefined when the id names a connection that is not in the project config.
  const connection = project.config.connections[connectionId];
  const dialect = dialectForDriver(connection?.driver);
  const response = await options.compute.query({
    sources: await loadComputableSources(project, connectionId),
    dialect,
    query: options.query,
  });

  if (!queryExecutor) {
    return {
      connectionId,
      dialect: response.dialect,
      sql: response.sql,
      headers: headersFromColumns(response.columns),
      rows: [],
      totalRows: 0,
      plan: {
        ...response.plan,
        execution: {
          mode: 'compile_only',
          reason: COMPILE_ONLY_REASON,
        },
      },
    };
  }

  // An explicit maxRows takes precedence over the query's own limit.
  const maxRows = options.maxRows ?? options.query.limit;
  const execution = await queryExecutor.execute({
    connectionId,
    projectDir: project.projectDir,
    connection,
    sql: response.sql,
    maxRows,
  });

  return {
    connectionId,
    dialect: response.dialect,
    sql: response.sql,
    headers: execution.headers,
    rows: execution.rows,
    totalRows: execution.totalRows,
    plan: {
      ...response.plan,
      execution: {
        mode: 'executed',
        driver: connection?.driver ?? 'unknown',
        maxRows,
        rowCount: execution.rowCount,
      },
    },
  };
}
|
||||
321
packages/context/src/sl/local-sl.test.ts
Normal file
321
packages/context/src/sl/local-sl.test.ts
Normal file
|
|
@ -0,0 +1,321 @@
|
|||
import { access, mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKloProject, type KloLocalProject } from '../project/index.js';
|
||||
import {
|
||||
listLocalSlSources,
|
||||
readLocalSlSource,
|
||||
searchLocalSlSources,
|
||||
validateLocalSlSource,
|
||||
writeLocalSlSource,
|
||||
} from './local-sl.js';
|
||||
|
||||
// Fixture: an `orders` source with two columns and one measure, terminated by a newline.
// NOTE(review): nested keys carry a single leading space here (` type:` lines sit at the same
// indent as the `- name:` items) - confirm against upstream that indentation was not collapsed.
const ORDERS_YAML =
  'name: orders\n' +
  'table: public.orders\n' +
  'grain:\n' +
  ' - order_id\n' +
  'columns:\n' +
  ' - name: order_id\n' +
  ' type: string\n' +
  ' - name: revenue\n' +
  ' type: number\n' +
  'measures:\n' +
  ' - name: total_revenue\n' +
  ' expr: sum(revenue)\n';
|
||||
|
||||
// Fixture: a `tickets` source with a description, two columns and one measure, terminated by a newline.
// NOTE(review): nested keys carry a single leading space here (` type:` lines sit at the same
// indent as the `- name:` items) - confirm against upstream that indentation was not collapsed.
const SUPPORT_YAML =
  'name: tickets\n' +
  'description: Support tickets grouped by priority.\n' +
  'table: public.tickets\n' +
  'grain:\n' +
  ' - ticket_id\n' +
  'columns:\n' +
  ' - name: ticket_id\n' +
  ' type: string\n' +
  ' - name: priority\n' +
  ' type: string\n' +
  'measures:\n' +
  ' - name: ticket_count\n' +
  ' expr: count(*)\n';
|
||||
|
||||
// Vitest suite for the local semantic-layer helpers (write / read / list / search / validate),
// exercised against a throwaway project created in a temp directory.
describe('local semantic-layer helpers', () => {
|
||||
let tempDir: string;
|
||||
let project: KloLocalProject;
|
||||
|
||||
// Each test gets a freshly initialised project inside its own temp directory.
beforeEach(async () => {
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'klo-local-sl-'));
|
||||
project = await initKloProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' });
|
||||
});
|
||||
|
||||
// Remove the temp directory created in beforeEach.
afterEach(async () => {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
// Round trip: a written source is read back verbatim, shows up in the listing with its
// column / measure / join counts, and passes validation.
it('writes, reads, lists, and validates semantic-layer sources', async () => {
|
||||
const write = await writeLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: 'orders',
|
||||
yaml: ORDERS_YAML,
|
||||
});
|
||||
|
||||
expect(write.path).toBe('semantic-layer/warehouse/orders.yaml');
|
||||
|
||||
await expect(
|
||||
readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'orders' }),
|
||||
).resolves.toMatchObject({
|
||||
connectionId: 'warehouse',
|
||||
name: 'orders',
|
||||
path: 'semantic-layer/warehouse/orders.yaml',
|
||||
yaml: ORDERS_YAML,
|
||||
});
|
||||
|
||||
await expect(listLocalSlSources(project, { connectionId: 'warehouse' })).resolves.toEqual([
|
||||
{
|
||||
columnCount: 2,
|
||||
connectionId: 'warehouse',
|
||||
joinCount: 0,
|
||||
measureCount: 1,
|
||||
name: 'orders',
|
||||
path: 'semantic-layer/warehouse/orders.yaml',
|
||||
},
|
||||
]);
|
||||
|
||||
await expect(validateLocalSlSource(ORDERS_YAML)).resolves.toEqual({ valid: true, errors: [] });
|
||||
});
|
||||
|
||||
// A `_schema/public.yaml` manifest shard is surfaced as a `payments` source whose path
// carries a `#payments` suffix, both when listing and when reading.
it('lists and reads manifest-backed scan sources as queryable sources', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
`tables:
|
||||
payments:
|
||||
table: public.payments
|
||||
columns:
|
||||
- name: payment_id
|
||||
type: number
|
||||
pk: true
|
||||
- name: amount
|
||||
type: number
|
||||
`,
|
||||
'klo',
|
||||
'klo@example.com',
|
||||
'Add manifest shard',
|
||||
);
|
||||
|
||||
await expect(listLocalSlSources(project, { connectionId: 'warehouse' })).resolves.toEqual([
|
||||
{
|
||||
columnCount: 2,
|
||||
connectionId: 'warehouse',
|
||||
joinCount: 0,
|
||||
measureCount: 0,
|
||||
name: 'payments',
|
||||
path: 'semantic-layer/warehouse/_schema/public.yaml#payments',
|
||||
},
|
||||
]);
|
||||
|
||||
await expect(readLocalSlSource(project, { connectionId: 'warehouse', sourceName: 'payments' })).resolves.toEqual(
|
||||
expect.objectContaining({
|
||||
columnCount: 2,
|
||||
connectionId: 'warehouse',
|
||||
joinCount: 0,
|
||||
measureCount: 0,
|
||||
name: 'payments',
|
||||
path: 'semantic-layer/warehouse/_schema/public.yaml#payments',
|
||||
yaml: expect.stringContaining('table: public.payments'),
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
// Same manifest expansion is expected when no connectionId filter is passed.
it('expands manifest-backed scan sources when listing all connections', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
`tables:
|
||||
payments:
|
||||
table: public.payments
|
||||
columns:
|
||||
- name: payment_id
|
||||
type: number
|
||||
pk: true
|
||||
- name: amount
|
||||
type: number
|
||||
`,
|
||||
'klo',
|
||||
'klo@example.com',
|
||||
'Add manifest shard',
|
||||
);
|
||||
|
||||
await expect(listLocalSlSources(project)).resolves.toEqual([
|
||||
{
|
||||
columnCount: 2,
|
||||
connectionId: 'warehouse',
|
||||
joinCount: 0,
|
||||
measureCount: 0,
|
||||
name: 'payments',
|
||||
path: 'semantic-layer/warehouse/_schema/public.yaml#payments',
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
// Only `orders` should match 'total revenue'; the search is also expected to leave
// `.klo/db.sqlite` on disk inside the project directory.
it('searches local semantic-layer source text through SQLite FTS', async () => {
|
||||
await writeLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: 'orders',
|
||||
yaml: ORDERS_YAML,
|
||||
});
|
||||
await writeLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: 'tickets',
|
||||
yaml: SUPPORT_YAML,
|
||||
});
|
||||
|
||||
const results = await searchLocalSlSources(project, { connectionId: 'warehouse', query: 'total revenue' });
|
||||
|
||||
expect(results).toEqual([
|
||||
expect.objectContaining({
|
||||
connectionId: 'warehouse',
|
||||
name: 'orders',
|
||||
path: 'semantic-layer/warehouse/orders.yaml',
|
||||
score: expect.any(Number),
|
||||
}),
|
||||
]);
|
||||
expect(results[0]?.score).toBeGreaterThan(0);
|
||||
await expect(access(join(project.projectDir, '.klo/db.sqlite'))).resolves.toBeUndefined();
|
||||
});
|
||||
|
||||
// Without a connectionId, same-named sources from both connections are returned;
// the expected order puts `finance/orders` ahead of `warehouse/orders`.
it('searches all connections with one global hybrid ranking pass', async () => {
|
||||
await writeLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: 'orders',
|
||||
yaml: ORDERS_YAML,
|
||||
});
|
||||
await writeLocalSlSource(project, {
|
||||
connectionId: 'finance',
|
||||
sourceName: 'orders',
|
||||
yaml: [
|
||||
'name: orders',
|
||||
'description: Finance orders used for invoice reconciliation.',
|
||||
'table: finance.orders',
|
||||
'grain:',
|
||||
' - order_id',
|
||||
'columns:',
|
||||
' - name: order_id',
|
||||
' type: string',
|
||||
' - name: invoice_status',
|
||||
' type: string',
|
||||
'',
|
||||
].join('\n'),
|
||||
});
|
||||
|
||||
const results = await searchLocalSlSources(project, { query: 'orders' });
|
||||
|
||||
expect(results.map((result) => `${result.connectionId}/${result.name}`)).toEqual([
|
||||
'finance/orders',
|
||||
'warehouse/orders',
|
||||
]);
|
||||
expect(results[0]).toMatchObject({
|
||||
score: expect.any(Number),
|
||||
matchReasons: expect.arrayContaining(['lexical']),
|
||||
lanes: expect.arrayContaining([expect.objectContaining({ lane: 'lexical', status: 'available' })]),
|
||||
});
|
||||
});
|
||||
|
||||
// Seeds a relationship profile whose sample values include 'refunded'; the match must be
// attributed to the dictionary lane only and name the matching column and value.
it('returns dictionary evidence when collected sample values explain a match', async () => {
|
||||
await writeLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: 'orders',
|
||||
yaml: ORDERS_YAML,
|
||||
});
|
||||
await project.fileStore.writeFile(
|
||||
'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
|
||||
`${JSON.stringify(
|
||||
{
|
||||
connectionId: 'warehouse',
|
||||
driver: 'postgres',
|
||||
sqlAvailable: true,
|
||||
queryCount: 2,
|
||||
tables: [],
|
||||
columns: {
|
||||
'orders.status': {
|
||||
table: { catalog: null, db: 'public', name: 'orders' },
|
||||
column: 'status',
|
||||
nativeType: 'text',
|
||||
normalizedType: 'string',
|
||||
rowCount: 10,
|
||||
nullCount: 0,
|
||||
distinctCount: 2,
|
||||
uniquenessRatio: 0.2,
|
||||
nullRate: 0,
|
||||
sampleValues: ['paid', 'refunded'],
|
||||
minTextLength: 4,
|
||||
maxTextLength: 8,
|
||||
},
|
||||
},
|
||||
warnings: [],
|
||||
},
|
||||
null,
|
||||
2,
|
||||
)}\n`,
|
||||
'klo',
|
||||
'klo@example.com',
|
||||
'Seed dictionary profile',
|
||||
);
|
||||
|
||||
const results = await searchLocalSlSources(project, { connectionId: 'warehouse', query: 'refunded' });
|
||||
|
||||
expect(results).toEqual([
|
||||
expect.objectContaining({
|
||||
connectionId: 'warehouse',
|
||||
name: 'orders',
|
||||
matchReasons: ['dictionary'],
|
||||
dictionaryMatches: [{ column: 'status', values: ['refunded'] }],
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
||||
// The query 'orders---' carries trailing punctuation; the top result must still
// include a `token` match reason.
it('adds the token lane alongside lexical matches for normalized query terms', async () => {
|
||||
await writeLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: 'orders',
|
||||
yaml: ORDERS_YAML,
|
||||
});
|
||||
|
||||
const results = await searchLocalSlSources(project, { connectionId: 'warehouse', query: 'orders---' });
|
||||
|
||||
expect(results[0]).toMatchObject({
|
||||
connectionId: 'warehouse',
|
||||
name: 'orders',
|
||||
matchReasons: expect.arrayContaining(['token']),
|
||||
});
|
||||
});
|
||||
|
||||
// YAML without `grain` must fail validation with an error mentioning it, and writing it is rejected.
it('reports schema validation errors without writing invalid YAML', async () => {
|
||||
const invalidYaml = ['name: broken', 'table: public.orders', 'columns: []', ''].join('\n');
|
||||
|
||||
await expect(validateLocalSlSource(invalidYaml)).resolves.toMatchObject({
|
||||
valid: false,
|
||||
errors: [expect.stringContaining('grain')],
|
||||
});
|
||||
|
||||
await expect(
|
||||
writeLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: 'broken',
|
||||
yaml: invalidYaml,
|
||||
}),
|
||||
).rejects.toThrow('Invalid semantic-layer source');
|
||||
});
|
||||
|
||||
// A source name containing `../` must be rejected before any file access.
it('rejects unsafe source paths', async () => {
|
||||
await expect(
|
||||
readLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
sourceName: '../orders',
|
||||
}),
|
||||
).rejects.toThrow('Unsafe semantic-layer source name');
|
||||
});
|
||||
});
|
||||
595
packages/context/src/sl/local-sl.ts
Normal file
595
packages/context/src/sl/local-sl.ts
Normal file
|
|
@ -0,0 +1,595 @@
|
|||
import { join } from 'node:path';
|
||||
import YAML from 'yaml';
|
||||
import { z } from 'zod';
|
||||
import type { KloEmbeddingPort, KloFileWriteResult } from '../core/index.js';
|
||||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { HybridSearchCore, type SearchCandidateGenerator } from '../search/index.js';
|
||||
import { DEFAULT_PRIORITY, resolveDescription } from './descriptions.js';
|
||||
import { sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js';
|
||||
import { composeOverlay, type ManifestTableEntry, projectManifestEntry } from './semantic-layer.service.js';
|
||||
import type { PgliteSlSearchPrototypeOwnerOptions } from './pglite-sl-search-prototype.js';
|
||||
import { loadLatestSlDictionaryEntries } from './sl-dictionary-profile.js';
|
||||
import { buildSemanticLayerSourceSearchText, SlSearchService } from './sl-search.service.js';
|
||||
import { SqliteSlSourcesIndex } from './sqlite-sl-sources-index.js';
|
||||
import type { SemanticLayerSource, SlDictionaryMatch, SlSearchLaneSummary, SlSearchMatchReason } from './types.js';
|
||||
|
||||
/** Listing entry describing one semantic-layer source. */
export interface LocalSlSourceSummary {
  /** Connection the source belongs to. */
  connectionId: string;
  /** Source name. */
  name: string;
  /** Project-relative path of the file the source was read from. */
  path: string;
  /** Resolved description; omitted when none is available. */
  description?: string;
  /** Number of declared columns. */
  columnCount: number;
  /** Number of declared measures. */
  measureCount: number;
  /** Number of declared joins. */
  joinCount: number;
}

/** A source summary annotated with search ranking details. */
export interface LocalSlSourceSearchResult extends LocalSlSourceSummary {
  /** Relevance score assigned by the search. */
  score: number;
  /** Reasons the source matched, when reported. */
  matchReasons?: SlSearchMatchReason[];
  /** Dictionary values that explain the match, when reported. */
  dictionaryMatches?: SlDictionaryMatch[];
  /** Per-lane search summaries, when reported. */
  lanes?: SlSearchLaneSummary[];
}

/** Parameters for searching local semantic-layer sources. */
export interface LocalSlSearchInput {
  /** Optional connection id. */
  connectionId?: string;
  /** Free-text search query. */
  query: string;
  /** Optional embedding port; `null` is accepted. */
  embeddingService?: KloEmbeddingPort | null;
  /** Optional result limit - presumably a maximum result count; confirm against the search implementation. */
  limit?: number;
  /** Optional alternative search backend selector. */
  backend?: 'pglite-owner-prototype';
  /** Options for the PGlite prototype backend. */
  pglite?: PgliteSlSearchPrototypeOwnerOptions;
}

/** A source summary together with its YAML text. */
export interface LocalSlSource extends LocalSlSourceSummary {
  /** YAML text of the source. */
  yaml: string;
}

/** A local source together with its parsed definition. */
export interface LocalSlSourceRecord extends LocalSlSource {
  /** Parsed semantic-layer source. */
  source: SemanticLayerSource;
}

/** Outcome of validating semantic-layer source YAML. */
export interface LocalSlValidationResult {
  /** Whether validation succeeded. */
  valid: boolean;
  /** Validation error messages. */
  errors: string[];
}

/** Author name declared for local writes. */
const LOCAL_AUTHOR = 'klo';

/** Author e-mail declared for local writes. */
const LOCAL_AUTHOR_EMAIL = 'klo@example.com';
|
||||
|
||||
/**
 * Guards a value that is about to become part of a project-relative path.
 *
 * Rejected values: blank strings, anything starting with `/` or `.`, and
 * anything containing `..`, a backslash or `//`.
 *
 * @param kind - Label describing the value, used in the error message.
 * @param value - Candidate path token.
 * @returns `value`, unchanged, when it passes every check.
 * @throws Error `Unsafe <kind>: <value>` on the first failed check.
 */
function assertSafePathToken(kind: string, value: string): string {
  const reject = (): never => {
    throw new Error(`Unsafe ${kind}: ${value}`);
  };

  if (value.trim() === '') {
    reject();
  }
  if (value.startsWith('/') || value.startsWith('.')) {
    reject();
  }
  for (const fragment of ['..', '\\', '//']) {
    if (value.includes(fragment)) {
      reject();
    }
  }
  return value;
}
|
||||
|
||||
/**
 * Validates a connection id before it is used in a path.
 *
 * Accepts ids that start with an ASCII letter or digit followed by letters,
 * digits, `_` or `-`, then runs the id through `assertSafePathToken`.
 *
 * @param connectionId - Candidate connection id.
 * @returns The same id when valid.
 * @throws Error `Unsafe connection id: <id>` when the pattern does not match.
 */
function assertSafeConnectionId(connectionId: string): string {
  const wellFormed = /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId);
  if (wellFormed) {
    return assertSafePathToken('connection id', connectionId);
  }
  throw new Error(`Unsafe connection id: ${connectionId}`);
}
|
||||
|
||||
/**
 * Non-throwing check that a value is a well-formed connection id.
 *
 * @param connectionId - Candidate id, possibly `undefined`.
 * @returns `true` when the value is a string that starts with an ASCII letter or digit
 *   and continues with letters, digits, `_` or `-`.
 */
function isSafeConnectionId(connectionId: string | undefined): connectionId is string {
  if (typeof connectionId !== 'string') {
    return false;
  }
  return /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(connectionId);
}
|
||||
|
||||
/**
 * Validates a semantic-layer source name before it is used as a file name.
 *
 * Accepts names that start with a lowercase ASCII letter or digit followed by
 * lowercase letters, digits or `_`, then runs the name through
 * `assertSafePathToken`.
 *
 * @param sourceName - Candidate source name.
 * @returns The same name when valid.
 * @throws Error `Unsafe semantic-layer source name: <name>` when the pattern does not match.
 */
function assertSafeSourceName(sourceName: string): string {
  const wellFormed = /^[a-z0-9][a-z0-9_]*$/.test(sourceName);
  if (wellFormed) {
    return assertSafePathToken('semantic-layer source name', sourceName);
  }
  throw new Error(`Unsafe semantic-layer source name: ${sourceName}`);
}
|
||||
|
||||
/**
 * Type guard for non-null, non-array objects.
 *
 * @param value - Any value.
 * @returns `true` when `value` is an object other than `null` or an array.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
|
||||
|
||||
/**
 * Builds the project-relative YAML path of a semantic-layer source.
 *
 * @param connectionId - Connection id; validated before use.
 * @param sourceName - Source name; validated before use.
 * @returns `semantic-layer/<connectionId>/<sourceName>.yaml`.
 * @throws Error when either argument fails validation.
 */
function slPath(connectionId: string, sourceName: string): string {
  const safeConnection = assertSafeConnectionId(connectionId);
  const safeSource = assertSafeSourceName(sourceName);
  return ['semantic-layer', safeConnection, `${safeSource}.yaml`].join('/');
}
|
||||
|
||||
/**
 * Derives a source name from a file path.
 *
 * @param path - Slash-separated path.
 * @returns The text after the last `/`, with a trailing `.yaml` / `.yml` extension removed.
 */
function sourceNameFromPath(path: string): string {
  const fileName = path.slice(path.lastIndexOf('/') + 1);
  return fileName.replace(/\.ya?ml$/, '');
}
|
||||
|
||||
/**
 * Parses YAML text and requires the document root to be an object.
 *
 * @param raw - YAML text.
 * @returns The parsed root object.
 * @throws Error when the parsed root is not a non-array object; parse errors
 *   from `YAML.parse` are not caught here.
 */
function parseYamlRecord(raw: string): Record<string, unknown> {
  const document: unknown = YAML.parse(raw);
  if (isRecord(document)) {
    return document;
  }
  throw new Error('Semantic-layer source YAML must contain an object');
}
|
||||
|
||||
/**
 * Collects the description texts declared on a parsed source.
 *
 * Entries of `value.descriptions` are kept when their text is a non-blank
 * string. A flat `value.description` is stored under `user` unless a `user`
 * entry was already collected.
 *
 * @param value - Parsed source object.
 * @returns Description text keyed by origin, or `undefined` when nothing was found.
 */
function descriptionMap(value: Record<string, unknown>): Record<string, string> | undefined {
  const hasText = (candidate: unknown): candidate is string =>
    typeof candidate === 'string' && candidate.trim().length > 0;

  const collected: Record<string, string> = {};

  const nested = value.descriptions;
  if (isRecord(nested)) {
    for (const key of Object.keys(nested)) {
      const text = nested[key];
      if (hasText(text)) {
        collected[key] = text;
      }
    }
  }

  // The flat field only fills in for a missing `user` entry; it never overrides one.
  const flat = value.description;
  if (!collected.user && hasText(flat)) {
    collected.user = flat;
  }

  return Object.keys(collected).length === 0 ? undefined : collected;
}
|
||||
|
||||
/**
 * Converts a thrown value into a list of human-readable messages.
 *
 * @param error - Any caught value.
 * @returns One `<path>: <message>` entry per Zod issue (`<root>` for an empty path),
 *   otherwise a single entry holding the error message or the stringified value.
 */
function validationErrors(error: unknown): string[] {
  if (!(error instanceof z.ZodError)) {
    const message = error instanceof Error ? error.message : String(error);
    return [message];
  }

  return error.issues.map((issue) => {
    const location = issue.path.join('.') || '<root>';
    return `${location}: ${issue.message}`;
  });
}
|
||||
|
||||
/**
 * Builds a listing summary from raw source YAML.
 *
 * The name comes from the YAML `name` field when it is a non-empty string and
 * from the file name otherwise. Counts are `0` for fields that are not arrays.
 *
 * @param args - Connection id, file path and raw YAML text of the source.
 * @returns The summary; `description` is present only when one resolves.
 * @throws Error when the YAML root is not an object.
 */
function summarizeSource(args: { connectionId: string; path: string; raw: string }): LocalSlSourceSummary {
  const { connectionId, path, raw } = args;
  const doc = parseYamlRecord(raw);
  const countOf = (field: unknown): number => (Array.isArray(field) ? field.length : 0);

  const declaredName = doc.name;
  const name =
    typeof declaredName === 'string' && declaredName.length > 0 ? declaredName : sourceNameFromPath(path);
  const description = resolveDescription(descriptionMap(doc), { priority: DEFAULT_PRIORITY }) ?? undefined;

  return {
    connectionId,
    name,
    path,
    ...(description ? { description } : {}),
    columnCount: countOf(doc.columns),
    measureCount: countOf(doc.measures),
    joinCount: countOf(doc.joins),
  };
}
|
||||
|
||||
/**
 * Serializes a semantic-layer source to YAML using a 2-space indent and
 * `lineWidth: 0`.
 * NOTE(review): `lineWidth: 0` presumably disables line folding — confirm against the YAML library docs.
 */
function sourceToYaml(source: SemanticLayerSource): string {
  const options = { indent: 2, lineWidth: 0 };
  return YAML.stringify(source, options);
}
|
||||
|
||||
/**
 * Builds a summary from an already materialized `SemanticLayerSource`.
 *
 * The description is resolved from `source.descriptions` with
 * `DEFAULT_PRIORITY` and the key is omitted when nothing resolves; the counts
 * are the lengths of the source's `columns`, `measures` and `joins` arrays.
 */
function summarizeSemanticSource(args: {
  connectionId: string;
  path: string;
  source: SemanticLayerSource;
}): LocalSlSourceSummary {
  const { connectionId, path, source } = args;
  const resolved = resolveDescription(source.descriptions, { priority: DEFAULT_PRIORITY });
  const descriptionField = resolved ? { description: resolved } : {};

  return {
    connectionId,
    name: source.name,
    path,
    ...descriptionField,
    columnCount: source.columns.length,
    measureCount: source.measures.length,
    joinCount: source.joins.length,
  };
}
|
||||
|
||||
/**
 * Extracts the `tables` mapping from a parsed manifest document.
 *
 * Only the container is checked (`tables` must be a non-array object); the
 * individual entries are cast to `ManifestTableEntry` without validation.
 *
 * @returns The `tables` object, or `null` when it is missing or not an object.
 */
function manifestTables(value: Record<string, unknown>): Record<string, ManifestTableEntry> | null {
  const { tables } = value;
  if (!isRecord(tables)) {
    return null;
  }
  return tables as Record<string, ManifestTableEntry>;
}
|
||||
|
||||
/**
 * Turns a parsed standalone YAML document into a `SemanticLayerSource`.
 *
 * All fields of `parsed` are carried over unvalidated; then `name` is forced to
 * the given value and the list fields are normalized:
 * - `grain`: only the string items of the array are kept, `[]` when not an array;
 * - `columns`, `joins`, `measures`: used as-is when arrays, `[]` otherwise.
 */
function parsedStandaloneSource(parsed: Record<string, unknown>, name: string): SemanticLayerSource {
  const { grain, columns, joins, measures } = parsed;

  const stringGrain: string[] = Array.isArray(grain)
    ? grain.filter((item): item is string => typeof item === 'string')
    : [];

  return {
    ...(parsed as Partial<SemanticLayerSource>),
    name,
    grain: stringGrain,
    columns: Array.isArray(columns) ? (columns as SemanticLayerSource['columns']) : [],
    joins: Array.isArray(joins) ? (joins as SemanticLayerSource['joins']) : [],
    measures: Array.isArray(measures) ? (measures as SemanticLayerSource['measures']) : [],
  };
}
|
||||
|
||||
/**
 * Loads every semantic-layer source of one connection from the project file store.
 *
 * YAML files under `semantic-layer/<connectionId>` are processed in sorted path
 * order, in two passes:
 * 1. Files under `_schema/` are read as manifests; each entry of their `tables`
 *    object is projected into a source whose path is `<manifestPath>#<table>`.
 * 2. Every other file is either a standalone source (it has a truthy `table` or
 *    `sql` field) that replaces any record of the same name, or an overlay that
 *    is composed onto the existing record of the same name. Overlays without a
 *    matching base record are ignored.
 *
 * @returns The resulting records sorted by name.
 * @throws When a file cannot be read or does not parse to a YAML object.
 */
export async function loadLocalSlSourceRecords(
  project: KloLocalProject,
  input: { connectionId: string },
): Promise<LocalSlSourceRecord[]> {
  const connectionId = assertSafeConnectionId(input.connectionId);
  const connectionDir = `semantic-layer/${connectionId}`;
  const manifestPrefix = `${connectionDir}/_schema/`;
  const isManifestPath = (file: string): boolean => file.startsWith(manifestPrefix);

  const { files } = await project.fileStore.listFiles(connectionDir);
  const yamlPaths = files.filter((file) => file.endsWith('.yaml') || file.endsWith('.yml')).sort();
  const recordsByName = new Map<string, LocalSlSourceRecord>();

  // Pass 1: manifest files provide the base sources.
  for (const manifestPath of yamlPaths.filter(isManifestPath)) {
    const { content } = await project.fileStore.readFile(manifestPath);
    const tables = manifestTables(parseYamlRecord(content));
    if (tables === null) {
      continue;
    }
    for (const [tableName, entry] of Object.entries(tables)) {
      const source = projectManifestEntry(tableName, entry);
      recordsByName.set(tableName, {
        ...summarizeSemanticSource({ connectionId, path: `${manifestPath}#${tableName}`, source }),
        yaml: sourceToYaml(source),
        source,
      });
    }
  }

  // Pass 2: standalone definitions replace, overlays refine.
  for (const filePath of yamlPaths.filter((file) => !isManifestPath(file))) {
    const { content } = await project.fileStore.readFile(filePath);
    const document = parseYamlRecord(content);
    const declaredName = document.name;
    const name =
      typeof declaredName === 'string' && declaredName.length > 0 ? declaredName : sourceNameFromPath(filePath);

    if (document.table || document.sql) {
      const source = parsedStandaloneSource(document, name);
      recordsByName.set(name, {
        ...summarizeSource({ connectionId, path: filePath, raw: content }),
        yaml: content,
        source,
      });
      continue;
    }

    const base = recordsByName.get(name);
    if (base) {
      const source = composeOverlay(base.source, document);
      recordsByName.set(name, {
        ...summarizeSemanticSource({ connectionId, path: filePath, source }),
        yaml: sourceToYaml(source),
        source,
      });
    }
  }

  const records = [...recordsByName.values()];
  records.sort((left, right) => left.name.localeCompare(right.name));
  return records;
}
|
||||
|
||||
/**
 * Validates raw semantic-layer YAML without touching the file store.
 *
 * Documents with a truthy `table` or `sql` field are checked against
 * `sourceDefinitionSchema`; everything else is treated as an overlay and
 * checked against `sourceOverlaySchema`.
 *
 * @returns `{ valid: true, errors: [] }` on success; otherwise `valid: false`
 *   with the parse/validation messages. This function does not throw for
 *   invalid input.
 */
export async function validateLocalSlSource(rawYaml: string): Promise<LocalSlValidationResult> {
  try {
    const document = parseYamlRecord(rawYaml);
    const isDefinition = Boolean(document.table || document.sql);
    const schema = isDefinition ? sourceDefinitionSchema : sourceOverlaySchema;
    schema.parse(document);
  } catch (error) {
    return { valid: false, errors: validationErrors(error) };
  }
  return { valid: true, errors: [] };
}
|
||||
|
||||
/**
 * Validates and writes one semantic-layer source file through the project file store.
 *
 * The YAML must pass `validateLocalSlSource`, and when it declares a string
 * `name` that name must equal `input.sourceName`. The content is written to
 * `semantic-layer/<connectionId>/<sourceName>.yaml` with a trailing newline
 * added when missing.
 *
 * @returns The result of the file store write.
 * @throws Error when validation fails or the declared name does not match;
 *   also whatever the path helpers throw for unsafe identifiers.
 */
export async function writeLocalSlSource(
  project: KloLocalProject,
  input: { connectionId: string; sourceName: string; yaml: string },
): Promise<KloFileWriteResult> {
  const { connectionId, sourceName, yaml } = input;

  const { valid, errors } = await validateLocalSlSource(yaml);
  if (!valid) {
    throw new Error(`Invalid semantic-layer source: ${errors.join('; ')}`);
  }

  // The file name is derived from `sourceName`, so a different declared name would be inconsistent.
  const declaredName = parseYamlRecord(yaml).name;
  if (typeof declaredName === 'string' && declaredName !== sourceName) {
    throw new Error(`Semantic-layer source name "${declaredName}" does not match requested path "${sourceName}"`);
  }

  const targetPath = slPath(connectionId, sourceName);
  const content = yaml.endsWith('\n') ? yaml : `${yaml}\n`;
  const commitMessage = `Write semantic-layer source: ${connectionId}/${sourceName}`;

  return project.fileStore.writeFile(targetPath, content, LOCAL_AUTHOR, LOCAL_AUTHOR_EMAIL, commitMessage);
}
|
||||
|
||||
/**
 * Reads one semantic-layer source by connection and name.
 *
 * The standalone file `semantic-layer/<connectionId>/<sourceName>.yaml` is tried
 * first and, when readable, summarized from its raw YAML. If reading or
 * summarizing that file fails for any reason, the full record set of the
 * connection is loaded instead and searched by name.
 *
 * @returns The source, or `null` when neither lookup finds it.
 * @throws Errors from the path helpers (outside the `try`) and from the
 *   fallback record loading.
 */
export async function readLocalSlSource(
  project: KloLocalProject,
  input: { connectionId: string; sourceName: string },
): Promise<LocalSlSource | null> {
  const { connectionId, sourceName } = input;
  const path = slPath(connectionId, sourceName);

  try {
    const { content } = await project.fileStore.readFile(path);
    const summary = summarizeSource({ connectionId, path, raw: content });
    return { ...summary, yaml: content };
  } catch {
    // Any failure above falls through to the composed records of the connection.
    const records = await loadLocalSlSourceRecords(project, { connectionId });
    for (const record of records) {
      if (record.name === sourceName) {
        return { ...record };
      }
    }
    return null;
  }
}
|
||||
|
||||
/**
 * Lists source summaries for one connection or, when `connectionId` is absent
 * or empty, for every connection found under `semantic-layer/`.
 *
 * Connection ids are taken from the second path segment of the listed files
 * and kept only when `isSafeConnectionId` accepts them. Connections are loaded
 * one after another.
 *
 * @returns Summaries without the `source` and `yaml` payload; the
 *   all-connections result is sorted by connection id, then name.
 */
export async function listLocalSlSources(
  project: KloLocalProject,
  input: { connectionId?: string } = {},
): Promise<LocalSlSourceSummary[]> {
  const withoutPayload = (record: LocalSlSourceRecord): LocalSlSourceSummary => {
    const { source: _source, yaml: _yaml, ...summary } = record;
    return summary;
  };
  const summariesFor = async (connectionId: string): Promise<LocalSlSourceSummary[]> => {
    const records = await loadLocalSlSourceRecords(project, { connectionId });
    return records.map((record) => withoutPayload(record));
  };

  if (input.connectionId) {
    return summariesFor(input.connectionId);
  }

  const { files } = await project.fileStore.listFiles('semantic-layer');
  const discovered = new Set(files.map((path) => path.split('/')[1]).filter(isSafeConnectionId));

  const summaries: LocalSlSourceSummary[] = [];
  for (const connectionId of [...discovered].sort()) {
    summaries.push(...(await summariesFor(connectionId)));
  }

  return summaries.sort((left, right) => {
    const byConnection = left.connectionId.localeCompare(right.connectionId);
    return byConnection || left.name.localeCompare(right.name);
  });
}
|
||||
|
||||
/** One searchable semantic-layer source as used by the local search lanes. */
interface LocalSlSearchCandidate {
  /** Summary fields that are copied into the search result. */
  summary: LocalSlSourceSummary;
  /** The full source the summary was derived from. */
  source: SemanticLayerSource;
  /** Text built by `buildSemanticLayerSourceSearchText` for this source; matched case-insensitively by the token lane. */
  searchText: string;
}
|
||||
|
||||
/** Returns the path of the SQLite database file: `<projectDir>/.klo/db.sqlite`. */
function sqliteSlDbPath(project: KloLocalProject): string {
  const segments = [project.projectDir, '.klo', 'db.sqlite'];
  return join(...segments);
}
|
||||
|
||||
/**
 * Loads search candidates for one connection or, when `connectionId` is absent
 * or empty, for every safe connection id found under `semantic-layer/`.
 *
 * Each candidate carries the record's summary fields, its full source, and the
 * search text built from that source. The all-connections result is sorted by
 * connection id, then name.
 */
async function loadLocalSlSearchCandidates(
  project: KloLocalProject,
  input: { connectionId?: string } = {},
): Promise<LocalSlSearchCandidate[]> {
  const { connectionId } = input;

  if (!connectionId) {
    const { files } = await project.fileStore.listFiles('semantic-layer');
    const discovered = new Set(files.map((path) => path.split('/')[1]).filter(isSafeConnectionId));

    const collected: LocalSlSearchCandidate[] = [];
    for (const discoveredId of [...discovered].sort()) {
      const perConnection = await loadLocalSlSearchCandidates(project, { connectionId: discoveredId });
      collected.push(...perConnection);
    }
    return collected.sort((left, right) => {
      const byConnection = left.summary.connectionId.localeCompare(right.summary.connectionId);
      return byConnection || left.summary.name.localeCompare(right.summary.name);
    });
  }

  const records = await loadLocalSlSourceRecords(project, { connectionId });
  return records.map((record) => {
    // Copy only the summary fields; `yaml` stays out of the candidate summary.
    const summary = {
      connectionId: record.connectionId,
      name: record.name,
      path: record.path,
      ...(record.description ? { description: record.description } : {}),
      columnCount: record.columnCount,
      measureCount: record.measureCount,
      joinCount: record.joinCount,
    };
    return {
      summary,
      source: record.source,
      searchText: buildSemanticLayerSourceSearchText(record.source),
    };
  });
}
|
||||
|
||||
/** Returns the identifier used to match fused results back to candidates: `<connectionId>/<name>`. */
function candidateKey(summary: LocalSlSourceSummary): string {
  const { connectionId, name } = summary;
  return `${connectionId}/${name}`;
}
|
||||
|
||||
/**
 * Scores candidates by plain substring matching.
 *
 * A candidate's score is the fraction of `terms` that occur in its lower-cased
 * search text; candidates matching no term are dropped. The terms themselves
 * are compared as given (they are not lower-cased here).
 *
 * @returns Scored candidates sorted by score (descending), then connection id,
 *   then name. Empty when `terms` is empty.
 */
function tokenLaneCandidates(candidates: LocalSlSearchCandidate[], terms: readonly string[]) {
  if (terms.length === 0) {
    return [];
  }

  const scored: { candidate: LocalSlSearchCandidate; score: number }[] = [];
  for (const candidate of candidates) {
    const haystack = candidate.searchText.toLowerCase();
    let hits = 0;
    for (const term of terms) {
      if (haystack.includes(term)) {
        hits += 1;
      }
    }
    if (hits > 0) {
      scored.push({ candidate, score: hits / terms.length });
    }
  }

  return scored.sort((left, right) => {
    const byScore = right.score - left.score;
    if (byScore) {
      return byScore;
    }
    const leftSummary = left.candidate.summary;
    const rightSummary = right.candidate.summary;
    return (
      leftSummary.connectionId.localeCompare(rightSummary.connectionId) ||
      leftSummary.name.localeCompare(rightSummary.name)
    );
  });
}
|
||||
|
||||
/**
 * Brings the SQLite search index in line with the given candidates.
 *
 * Candidates are grouped by connection id; then, per connection:
 * - with an embedding service, the sources are handed to
 *   `SlSearchService.indexSources`;
 * - without one, the search texts are upserted with `embedding: null` and
 *   `deleteStale` is called with the current source names.
 *
 * Finally the latest dictionary entries are loaded for all seen connections
 * and each connection's dictionary entries are replaced.
 *
 * Connections that have no candidate in `input.candidates` are not touched.
 */
async function refreshHybridSlIndexes(input: {
  index: SqliteSlSourcesIndex;
  project: KloLocalProject;
  candidates: LocalSlSearchCandidate[];
  embeddingService?: KloEmbeddingPort | null;
}): Promise<void> {
  // Group in place: appending to the existing array keeps this linear instead
  // of re-copying the whole group for every candidate.
  const candidatesByConnection = new Map<string, LocalSlSearchCandidate[]>();
  for (const candidate of input.candidates) {
    const { connectionId } = candidate.summary;
    const group = candidatesByConnection.get(connectionId);
    if (group) {
      group.push(candidate);
    } else {
      candidatesByConnection.set(connectionId, [candidate]);
    }
  }

  for (const [connectionId, group] of candidatesByConnection) {
    if (input.embeddingService) {
      const service = new SlSearchService(input.embeddingService, input.index);
      await service.indexSources(
        connectionId,
        group.map((candidate) => candidate.source),
      );
    } else {
      await input.index.upsertSources(
        connectionId,
        group.map((candidate) => ({
          sourceName: candidate.summary.name,
          searchText: candidate.searchText,
          embedding: null,
        })),
      );
      await input.index.deleteStale(
        connectionId,
        group.map((candidate) => candidate.summary.name),
      );
    }
  }

  const connectionIds = [...candidatesByConnection.keys()];
  const dictionaryEntries = await loadLatestSlDictionaryEntries(input.project, connectionIds);
  for (const connectionId of connectionIds) {
    await input.index.replaceDictionaryEntries(
      connectionId,
      dictionaryEntries.filter((entry) => entry.connectionId === connectionId),
    );
  }
}
|
||||
|
||||
/**
 * Searches local semantic-layer sources.
 *
 * Paths, in order:
 * 1. Blank query: every source (optionally of one connection) with score `1`.
 * 2. `backend === 'pglite-owner-prototype'`: delegates to the lazily imported
 *    PGlite prototype; requires `input.pglite`.
 * 3. Project search storage other than `'sqlite-fts5'`: in-memory token
 *    matching only; the score is the fraction of whitespace-separated query
 *    terms found in the candidate's search text.
 * 4. Otherwise: refreshes the SQLite index and fuses the lexical, dictionary,
 *    token and semantic lanes through `HybridSearchCore`.
 *
 * @returns Matching sources with score and match reasons; hybrid results also
 *   carry the lane summaries and any dictionary matches.
 * @throws Error when the PGlite backend is requested without `input.pglite`.
 */
export async function searchLocalSlSources(
  project: KloLocalProject,
  input: LocalSlSearchInput,
): Promise<LocalSlSourceSearchResult[]> {
  const query = input.query.trim();
  if (!query) {
    return (await listLocalSlSources(project, { connectionId: input.connectionId })).map((source) => ({
      ...source,
      score: 1,
    }));
  }

  if (input.backend === 'pglite-owner-prototype') {
    if (!input.pglite) {
      throw new Error('PGlite semantic-layer search prototype requires pglite owner-process options.');
    }
    const { searchLocalSlSourcesWithPglitePrototype } = await import('./pglite-sl-search-prototype.js');
    return searchLocalSlSourcesWithPglitePrototype(project, {
      connectionId: input.connectionId,
      query,
      embeddingService: input.embeddingService ?? null,
      limit: input.limit,
      pglite: input.pglite,
    });
  }

  const candidates = await loadLocalSlSearchCandidates(project, { connectionId: input.connectionId });

  if (project.config.storage.search !== 'sqlite-fts5') {
    // The query terms are the same for every candidate, so tokenize once.
    const terms = query
      .toLowerCase()
      .split(/\s+/)
      .map((term) => term.trim())
      .filter(Boolean);

    // NOTE(review): `input.limit` is not applied on this fallback path (nor for
    // blank queries) — confirm whether callers rely on receiving every match.
    return candidates
      .map((candidate) => {
        const haystack = candidate.searchText.toLowerCase();
        return {
          candidate,
          score: terms.length === 0 ? 0 : terms.filter((term) => haystack.includes(term)).length / terms.length,
        };
      })
      .filter((result) => result.score > 0)
      .map((result) => ({
        ...result.candidate.summary,
        score: result.score,
        matchReasons: ['token'],
      }))
      .sort(
        (left, right) =>
          right.score - left.score ||
          left.connectionId.localeCompare(right.connectionId) ||
          left.path.localeCompare(right.path),
      );
  }

  const index = new SqliteSlSourcesIndex({ dbPath: sqliteSlDbPath(project) });
  await refreshHybridSlIndexes({ index, project, candidates, embeddingService: input.embeddingService ?? null });

  const candidateById = new Map(candidates.map((candidate) => [candidateKey(candidate.summary), candidate]));
  const connectionIds = input.connectionId ? [input.connectionId] : undefined;
  // Without an explicit limit every candidate may be returned.
  const finalLimit = input.limit ?? candidates.length;
  const core = new HybridSearchCore();
  // Filled by the dictionary lane as a side effect; read again during hydration.
  const dictionaryEvidence = new Map<string, SlDictionaryMatch[]>();

  const generators: SearchCandidateGenerator[] = [
    {
      lane: 'lexical',
      async generate(args) {
        const rows = await index.searchLexicalCandidates({
          connectionIds,
          queryText: args.queryText,
          limit: args.laneCandidatePoolLimit,
        });
        return {
          candidates: rows.map((row) => ({ id: row.id, rank: row.rank, rawScore: row.rawScore })),
        };
      },
    },
    {
      lane: 'dictionary',
      async generate(args) {
        const rows = await index.searchDictionaryCandidates({
          connectionIds,
          queryText: args.queryText,
          limit: args.laneCandidatePoolLimit,
        });
        for (const row of rows) {
          dictionaryEvidence.set(row.id, row.matches);
        }
        return {
          candidates: rows.map((row) => ({
            id: row.id,
            rank: row.rank,
            rawScore: row.rawScore,
            evidence: row.matches,
          })),
        };
      },
    },
    {
      lane: 'token',
      async generate(args) {
        const rows = tokenLaneCandidates(candidates, args.normalizedQuery.terms).slice(0, args.laneCandidatePoolLimit);
        return {
          // `position` (not `index`) so the outer SQLite index is not shadowed.
          candidates: rows.map((row, position) => ({
            id: candidateKey(row.candidate.summary),
            rank: position + 1,
            rawScore: row.score,
          })),
        };
      },
    },
    {
      lane: 'semantic',
      async generate(args) {
        if (!input.embeddingService) {
          return { status: 'skipped', candidates: [], reason: 'embedding_unconfigured' };
        }
        try {
          const queryEmbedding = await input.embeddingService.computeEmbedding(args.queryText);
          const rows = await index.searchSemanticCandidates({
            connectionIds,
            queryEmbedding,
            limit: args.laneCandidatePoolLimit,
          });
          return {
            candidates: rows.map((row) => ({ id: row.id, rank: row.rank, rawScore: row.rawScore })),
          };
        } catch (error) {
          // An embedding or vector-search failure skips this lane instead of failing the search.
          return {
            status: 'skipped',
            candidates: [],
            reason: `embedding_unhealthy:${error instanceof Error ? error.message : String(error)}`,
          };
        }
      },
    },
  ];

  const result = await core.search({ queryText: query, limit: finalLimit, generators });

  const hydrated: LocalSlSourceSearchResult[] = [];
  for (const fused of result.results) {
    const candidate = candidateById.get(fused.id);
    if (!candidate) {
      // The index returned an id that is not among the currently loaded sources.
      continue;
    }
    const dictionaryMatches = dictionaryEvidence.get(fused.id);
    hydrated.push({
      ...candidate.summary,
      score: fused.score,
      matchReasons: fused.matchReasons as SlSearchMatchReason[],
      ...(dictionaryMatches && dictionaryMatches.length > 0 ? { dictionaryMatches } : {}),
      lanes: result.lanes,
    });
  }
  return hydrated;
}
|
||||
268
packages/context/src/sl/pglite-sl-search-prototype.test.ts
Normal file
268
packages/context/src/sl/pglite-sl-search-prototype.test.ts
Normal file
|
|
@ -0,0 +1,268 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { createServer } from 'node:net';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKloProject, type KloLocalProject } from '../project/index.js';
|
||||
import { assertSearchBackendConformanceCase } from '../search/index.js';
|
||||
import { searchLocalSlSources, writeLocalSlSource, type LocalSlSourceSearchResult } from './local-sl.js';
|
||||
import { searchLocalSlSourcesWithPglitePrototype } from './pglite-sl-search-prototype.js';
|
||||
|
||||
// YAML fixtures for the seeded semantic-layer sources. Every listed line is
// terminated with '\n', so each document ends with a single trailing newline.

/** `orders` source written for the `warehouse` connection. */
const ORDERS_YAML = [
  'name: orders',
  'description: Orders with paid revenue and refund status.',
  'table: public.orders',
  'grain:',
  ' - order_id',
  'columns:',
  ' - name: order_id',
  ' type: string',
  ' - name: status',
  ' type: string',
  ' - name: revenue',
  ' type: number',
  'measures:',
  ' - name: total_revenue',
  ' expr: sum(revenue)',
]
  .map((line) => `${line}\n`)
  .join('');

/** `orders` source written for the `finance` connection. */
const FINANCE_ORDERS_YAML = [
  'name: orders',
  'description: Finance orders used for invoice reconciliation.',
  'table: finance.orders',
  'grain:',
  ' - order_id',
  'columns:',
  ' - name: order_id',
  ' type: string',
  ' - name: invoice_status',
  ' type: string',
]
  .map((line) => `${line}\n`)
  .join('');

/** `customers` source written for the `warehouse` connection. */
const CUSTOMERS_YAML = [
  'name: customers',
  'description: Customer lifecycle accounts by region.',
  'table: public.customers',
  'grain:',
  ' - customer_id',
  'columns:',
  ' - name: customer_id',
  ' type: string',
  ' - name: region',
  ' type: string',
]
  .map((line) => `${line}\n`)
  .join('');
|
||||
|
||||
/**
 * Deterministic embedding stub producing 3-dimensional vectors from keywords
 * in the lower-cased text:
 * - "semantic revenue" or "orders with paid revenue" -> `[1, 0, 0]`
 * - otherwise "finance orders" -> `[0.72, 0.28, 0]`
 * - anything else -> `[0, 1, 0]`
 */
class FakeEmbeddingPort {
  readonly maxBatchSize = 16;

  /** Returns the keyword-based vector for a single text. */
  async computeEmbedding(text: string): Promise<number[]> {
    const lowered = text.toLowerCase();
    const mentions = (needle: string): boolean => lowered.includes(needle);

    if (mentions('semantic revenue') || mentions('orders with paid revenue')) {
      return [1, 0, 0];
    }
    return mentions('finance orders') ? [0.72, 0.28, 0] : [0, 1, 0];
  }

  /** Embeds every text with `computeEmbedding`, preserving input order. */
  async computeEmbeddingsBulk(texts: string[]): Promise<number[][]> {
    const pending: Promise<number[]>[] = [];
    for (const text of texts) {
      pending.push(this.computeEmbedding(text));
    }
    return Promise.all(pending);
  }
}
|
||||
|
||||
/**
 * Finds a currently free TCP port on 127.0.0.1 by binding a throwaway server
 * to port 0, reading the assigned port, and closing the server again.
 *
 * The port is only known to have been free at the time of the probe; another
 * process could still claim it before the caller binds to it.
 *
 * @returns The port number that was assigned to the probe server.
 * @throws When listening or closing fails, or when the server does not report
 *   an address object.
 */
async function allocatePort(): Promise<number> {
  const server = createServer();

  await new Promise<void>((resolve, reject) => {
    // Turn a bind failure into a rejection instead of an unhandled 'error' event.
    server.once('error', reject);
    server.listen(0, '127.0.0.1', () => {
      server.off('error', reject);
      resolve();
    });
  });

  const address = server.address();

  // Close before validating the address so the probe server is never left open.
  await new Promise<void>((resolve, reject) => {
    server.close((error) => {
      if (error) {
        reject(error);
        return;
      }
      resolve();
    });
  });

  if (typeof address !== 'object' || address === null) {
    throw new Error('Expected TCP server address while allocating a PGlite SL prototype port.');
  }
  return address.port;
}
|
||||
|
||||
/**
 * Maps a search result to the shape passed to the conformance assertions:
 * the id is `<connectionId>/<name>` and missing `matchReasons` become `[]`.
 */
function toConformanceResult(result: LocalSlSourceSearchResult) {
  const { connectionId, name, score, matchReasons, lanes, dictionaryMatches } = result;
  return {
    id: `${connectionId}/${name}`,
    score,
    matchReasons: matchReasons ?? [],
    lanes,
    dictionaryMatches,
  };
}
|
||||
|
||||
/**
 * Seeds the test project with three semantic-layer sources
 * (`warehouse/orders`, `finance/orders`, `warehouse/customers`) and a
 * relationship-profile JSON file for the `warehouse` connection containing
 * sample values for `orders.status` and `customers.region`.
 */
async function seedSemanticLayerProject(project: KloLocalProject): Promise<void> {
  const seedSources = [
    { connectionId: 'warehouse', sourceName: 'orders', yaml: ORDERS_YAML },
    { connectionId: 'finance', sourceName: 'orders', yaml: FINANCE_ORDERS_YAML },
    { connectionId: 'warehouse', sourceName: 'customers', yaml: CUSTOMERS_YAML },
  ];
  // Written one after another, in this order.
  for (const seed of seedSources) {
    await writeLocalSlSource(project, seed);
  }

  // Profile of a text column in the `public` schema; only the stats differ per column.
  const textColumnProfile = (
    tableName: string,
    column: string,
    stats: {
      distinctCount: number;
      uniquenessRatio: number;
      sampleValues: string[];
      minTextLength: number;
      maxTextLength: number;
    },
  ) => ({
    table: { catalog: null, db: 'public', name: tableName },
    column,
    nativeType: 'text',
    normalizedType: 'string',
    rowCount: 10,
    nullCount: 0,
    distinctCount: stats.distinctCount,
    uniquenessRatio: stats.uniquenessRatio,
    nullRate: 0,
    sampleValues: stats.sampleValues,
    minTextLength: stats.minTextLength,
    maxTextLength: stats.maxTextLength,
  });

  const relationshipProfile = {
    connectionId: 'warehouse',
    driver: 'postgres',
    sqlAvailable: true,
    queryCount: 2,
    tables: [],
    columns: {
      'orders.status': textColumnProfile('orders', 'status', {
        distinctCount: 2,
        uniquenessRatio: 0.2,
        sampleValues: ['paid', 'refunded'],
        minTextLength: 4,
        maxTextLength: 8,
      }),
      'customers.region': textColumnProfile('customers', 'region', {
        distinctCount: 3,
        uniquenessRatio: 0.3,
        sampleValues: ['emea', 'amer', 'apac'],
        minTextLength: 4,
        maxTextLength: 4,
      }),
    },
    warnings: [],
  };

  await project.fileStore.writeFile(
    'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json',
    `${JSON.stringify(relationshipProfile, null, 2)}\n`,
    'klo',
    'klo@example.com',
    'Seed PGlite dictionary profile',
  );
}
|
||||
|
||||
// Exercises the PGlite prototype backend against a freshly seeded temp project.
// Every test gets its own project directory, PGlite data directory and TCP port.
describe('PGlite semantic-layer search prototype', () => {
  let tempDir: string;
  let project: KloLocalProject;
  let pgliteDataDir: string;
  let port: number;

  // Owner-process options for the current test; evaluated lazily because the
  // values are assigned in `beforeEach`.
  const ownerOptions = () => ({ dataDir: pgliteDataDir, host: '127.0.0.1', port });

  // Runs the conformance assertion with the fields shared by all cases filled in.
  const expectConformance = (
    caseName: string,
    results: LocalSlSourceSearchResult[],
    expectations: Omit<
      Parameters<typeof assertSearchBackendConformanceCase>[0],
      'backendName' | 'surface' | 'caseName' | 'results'
    >,
  ): void => {
    assertSearchBackendConformanceCase({
      backendName: 'pglite-owner-prototype',
      surface: 'semantic-layer',
      caseName,
      results: results.map(toConformanceResult),
      ...expectations,
    });
  };

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-sl-prototype-'));
    project = await initKloProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' });
    // Matches the 3-dimensional vectors produced by FakeEmbeddingPort.
    project.config.ingest.embeddings.dimensions = 3;
    pgliteDataDir = join(tempDir, 'pglite-search');
    port = await allocatePort();
    await seedSemanticLayerProject(project);
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('returns lexical semantic-layer matches through PGlite FTS', async () => {
    const results = await searchLocalSlSourcesWithPglitePrototype(project, {
      query: 'paid revenue',
      limit: 5,
      pglite: ownerOptions(),
    });

    expectConformance('pglite lexical source ranking', results, {
      expectedTopIds: ['warehouse/orders'],
      expectedReasonsById: {
        'warehouse/orders': ['lexical'],
      },
      expectedLanes: {
        lexical: { status: 'available' },
        semantic: { status: 'skipped', reason: 'embedding_unconfigured' },
      },
    });
  });

  it('returns dictionary evidence through PGlite pg_trgm and exact matching', async () => {
    const results = await searchLocalSlSourcesWithPglitePrototype(project, {
      connectionId: 'warehouse',
      query: 'refund',
      limit: 5,
      pglite: ownerOptions(),
    });

    expectConformance('pglite dictionary source evidence', results, {
      expectedTopIds: ['warehouse/orders'],
      expectedReasonsById: {
        'warehouse/orders': ['dictionary'],
      },
      expectedLanes: {
        dictionary: { status: 'available' },
        semantic: { status: 'skipped', reason: 'embedding_unconfigured' },
      },
      expectedDictionaryMatchesById: {
        'warehouse/orders': [{ column: 'status', values: ['refunded'] }],
      },
    });
  });

  it('returns semantic matches through PGlite vector ordering when embeddings are configured', async () => {
    const results = await searchLocalSlSourcesWithPglitePrototype(project, {
      query: 'semantic revenue',
      limit: 5,
      embeddingService: new FakeEmbeddingPort(),
      pglite: ownerOptions(),
    });

    expectConformance('pglite semantic source ranking', results, {
      expectedTopIds: ['warehouse/orders'],
      expectedReasonsById: {
        'warehouse/orders': ['semantic'],
      },
      expectedLanes: {
        semantic: { status: 'available' },
      },
    });
  });

  it('routes through PGlite only when the private local search input opts in', async () => {
    // 'refnd' is a misspelling; the expected hit is the dictionary value 'refunded'.
    const results = await searchLocalSlSources(project, {
      query: 'refnd',
      limit: 5,
      backend: 'pglite-owner-prototype',
      pglite: ownerOptions(),
    });

    const [topResult] = results;
    expect(topResult).toMatchObject({
      connectionId: 'warehouse',
      name: 'orders',
      matchReasons: expect.arrayContaining(['dictionary']),
      dictionaryMatches: [{ column: 'status', values: ['refunded'] }],
    });
  });
});
|
||||
569
packages/context/src/sl/pglite-sl-search-prototype.ts
Normal file
569
packages/context/src/sl/pglite-sl-search-prototype.ts
Normal file
|
|
@ -0,0 +1,569 @@
|
|||
import { mkdir } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import type { KloEmbeddingPort } from '../core/index.js';
|
||||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { HybridSearchCore, type SearchCandidateGenerator } from '../search/index.js';
|
||||
import { KloPGliteOwnerProcess } from '../search/pglite-owner-process.js';
|
||||
import {
|
||||
listLocalSlSources,
|
||||
loadLocalSlSourceRecords,
|
||||
type LocalSlSourceSearchResult,
|
||||
type LocalSlSourceSummary,
|
||||
} from './local-sl.js';
|
||||
import { loadLatestSlDictionaryEntries, type SlDictionaryEntry } from './sl-dictionary-profile.js';
|
||||
import { buildSemanticLayerSourceSearchText } from './sl-search.service.js';
|
||||
import type { SemanticLayerSource, SlDictionaryMatch, SlSearchMatchReason } from './types.js';
|
||||
|
||||
/** Options for the PGlite owner process used by the search prototype. */
export interface PgliteSlSearchPrototypeOwnerOptions {
  /** PGlite data directory; defaults to `<projectDir>/.klo/pglite-search-prototype` when omitted. */
  dataDir?: string;
  /** Host for the owner process. NOTE(review): presumably the listen/connect address — confirm in KloPGliteOwnerProcess. */
  host: string;
  /** TCP port for the owner process. */
  port: number;
}

/** Input of the PGlite-backed semantic-layer search prototype. */
export interface PgliteSlSearchPrototypeInput {
  /** Restricts the search to one connection; all connections are searched when omitted. */
  connectionId?: string;
  /** The search query text. */
  query: string;
  /** Embedding service; when missing or `null` no embeddings are computed. */
  embeddingService?: KloEmbeddingPort | null;
  /** Maximum number of results. */
  limit?: number;
  /** Owner-process options for the PGlite instance. */
  pglite: PgliteSlSearchPrototypeOwnerOptions;
}

/** One searchable semantic-layer source together with its search text. */
interface LocalSlSearchCandidate {
  summary: LocalSlSourceSummary;
  source: SemanticLayerSource;
  /** Text built by `buildSemanticLayerSourceSearchText` for this source. */
  searchText: string;
}

/** Row shape returned by the lane queries against the prototype tables. */
interface PgliteLaneRow {
  id: string;
  connection_id: string;
  source_name: string;
  /** NOTE(review): typed `number | string`, presumably because numeric columns may arrive as strings — confirm against the query code. */
  score: number | string;
}

/** Lane row extended with the matched dictionary column and value. */
interface PgliteDictionaryRow extends PgliteLaneRow {
  column_name: string;
  value: string;
}
|
||||
|
||||
/** Returns the candidate identifier `<connectionId>/<name>` for a source summary. */
function candidateKey(summary: LocalSlSourceSummary): string {
  const { connectionId, name } = summary;
  return `${connectionId}/${name}`;
}
|
||||
|
||||
/**
 * Resolves the PGlite data directory: the explicitly configured `dataDir`, or
 * `<projectDir>/.klo/pglite-search-prototype` when none is given.
 */
function pgliteDataDir(project: KloLocalProject, input: PgliteSlSearchPrototypeOwnerOptions): string {
  const { dataDir } = input;
  if (dataDir != null) {
    return dataDir;
  }
  return join(project.projectDir, '.klo', 'pglite-search-prototype');
}
|
||||
|
||||
/**
 * Reads the configured embedding dimension from the project config.
 *
 * @returns `project.config.ingest.embeddings.dimensions` when it is a positive integer.
 * @throws Error for any other value (non-integer, zero, negative, missing).
 */
function vectorDimensions(project: KloLocalProject): number {
  const configured = project.config.ingest.embeddings.dimensions;
  if (Number.isInteger(configured) && configured > 0) {
    return configured;
  }
  throw new Error(`PGlite SL search prototype needs a positive embedding dimension, got ${String(configured)}.`);
}
|
||||
|
||||
/**
 * Returns the connection filter for a search: a one-element list for a
 * non-empty `connectionId`, or `null` when no connection was requested.
 */
function connectionIdsForSearch(input: { connectionId?: string }): string[] | null {
  const { connectionId } = input;
  if (!connectionId) {
    return null;
  }
  return [connectionId];
}
|
||||
|
||||
/**
 * Loads search candidates for one connection or, when `connectionId` is absent
 * or empty, for every connection directory under `semantic-layer/` whose name
 * starts with an alphanumeric character and otherwise contains only
 * alphanumerics, `_` and `-`.
 *
 * The all-connections result is sorted by connection id, then name.
 */
async function loadCandidates(
  project: KloLocalProject,
  input: { connectionId?: string } = {},
): Promise<LocalSlSearchCandidate[]> {
  const { connectionId } = input;

  if (!connectionId) {
    const isConnectionId = (segment: string | undefined): segment is string =>
      typeof segment === 'string' && /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/.test(segment);

    const { files } = await project.fileStore.listFiles('semantic-layer');
    const discovered = new Set(files.map((path) => path.split('/')[1]).filter(isConnectionId));

    const collected: LocalSlSearchCandidate[] = [];
    for (const discoveredId of [...discovered].sort()) {
      const perConnection = await loadCandidates(project, { connectionId: discoveredId });
      collected.push(...perConnection);
    }
    return collected.sort((left, right) => {
      const byConnection = left.summary.connectionId.localeCompare(right.summary.connectionId);
      return byConnection || left.summary.name.localeCompare(right.summary.name);
    });
  }

  const records = await loadLocalSlSourceRecords(project, { connectionId });
  return records.map((record) => {
    // Copy only the summary fields; `yaml` stays out of the candidate summary.
    const summary = {
      connectionId: record.connectionId,
      name: record.name,
      path: record.path,
      ...(record.description ? { description: record.description } : {}),
      columnCount: record.columnCount,
      measureCount: record.measureCount,
      joinCount: record.joinCount,
    };
    return {
      summary,
      source: record.source,
      searchText: buildSemanticLayerSourceSearchText(record.source),
    };
  });
}
|
||||
|
||||
/**
 * Scores candidates by plain substring matching.
 *
 * A candidate's score is the fraction of `terms` that occur in its lower-cased
 * search text; candidates matching no term are dropped. The terms themselves
 * are compared as given (they are not lower-cased here).
 *
 * @returns Scored candidates sorted by score (descending), then connection id,
 *   then name. Empty when `terms` is empty.
 */
function tokenLaneCandidates(candidates: LocalSlSearchCandidate[], terms: readonly string[]) {
  if (terms.length === 0) {
    return [];
  }

  const scored: { candidate: LocalSlSearchCandidate; score: number }[] = [];
  for (const candidate of candidates) {
    const haystack = candidate.searchText.toLowerCase();
    let hits = 0;
    for (const term of terms) {
      if (haystack.includes(term)) {
        hits += 1;
      }
    }
    if (hits > 0) {
      scored.push({ candidate, score: hits / terms.length });
    }
  }

  return scored.sort((left, right) => {
    const byScore = right.score - left.score;
    if (byScore) {
      return byScore;
    }
    const leftSummary = left.candidate.summary;
    const rightSummary = right.candidate.summary;
    return (
      leftSummary.connectionId.localeCompare(rightSummary.connectionId) ||
      leftSummary.name.localeCompare(rightSummary.name)
    );
  });
}
|
||||
|
||||
/**
 * Builds an OR-combined query string (`term | term | ...`) from free text.
 *
 * The text is lower-cased and split into maximal runs of `a-z`, `0-9` and `_`;
 * everything else acts as a separator. Duplicate terms are dropped, keeping
 * the first occurrence.
 *
 * @returns The terms joined with `' | '`, or `''` when the text contains no term.
 */
function postgresqlOrTsQuery(query: string): string {
  const tokens = query.toLowerCase().match(/[a-z0-9_]+/gu) ?? [];

  const seen = new Set<string>();
  const uniqueTerms: string[] = [];
  for (const token of tokens) {
    if (!seen.has(token)) {
      seen.add(token);
      uniqueTerms.push(token);
    }
  }
  return uniqueTerms.join(' | ');
}
|
||||
|
||||
/**
 * Drops and recreates the prototype search tables and their indexes.
 *
 * Creates `prototype_sl_sources` (with a full-text GIN index and an ivfflat
 * cosine index on `embedding`) and `prototype_sl_dictionary_values` (with a
 * trigram GIN index on `value`). Everything is sent as one multi-statement query.
 *
 * NOTE(review): the `vector` type, `ivfflat` and `gin_trgm_ops` presumably come
 * from the pgvector and pg_trgm extensions being loaded by the owner process —
 * confirm against `KloPGliteOwnerProcess`.
 *
 * @param owner PGlite owner process used to run the DDL.
 * @param dimensions Width of the `embedding` vector column; must be a positive integer.
 * @throws Error when `dimensions` is not a positive integer. The value is
 *   interpolated into the DDL text, so it is validated before any SQL is sent.
 */
async function resetPrototypeSchema(owner: KloPGliteOwnerProcess, dimensions: number): Promise<void> {
  // `dimensions` cannot be bound as a parameter inside a type modifier, so it is
  // spliced into the statement; refuse anything that is not a plain positive integer.
  if (!Number.isInteger(dimensions) || dimensions <= 0) {
    throw new Error(
      `PGlite SL search prototype requires a positive integer embedding dimension, got ${dimensions}.`,
    );
  }

  await owner.query(`
    DROP TABLE IF EXISTS prototype_sl_dictionary_values;
    DROP TABLE IF EXISTS prototype_sl_sources;

    CREATE TABLE prototype_sl_sources (
      connection_id TEXT NOT NULL,
      source_name TEXT NOT NULL,
      path TEXT NOT NULL,
      description TEXT,
      column_count INTEGER NOT NULL,
      measure_count INTEGER NOT NULL,
      join_count INTEGER NOT NULL,
      search_text TEXT NOT NULL,
      embedding vector(${dimensions}),
      PRIMARY KEY (connection_id, source_name)
    );

    CREATE INDEX prototype_sl_sources_fts_idx
      ON prototype_sl_sources
      USING GIN (to_tsvector('english', search_text));

    CREATE INDEX prototype_sl_sources_vector_idx
      ON prototype_sl_sources
      USING ivfflat (embedding vector_cosine_ops)
      WITH (lists = 1);

    CREATE TABLE prototype_sl_dictionary_values (
      connection_id TEXT NOT NULL,
      source_name TEXT NOT NULL,
      column_name TEXT NOT NULL,
      value TEXT NOT NULL,
      value_lower TEXT NOT NULL,
      cardinality INTEGER,
      PRIMARY KEY (connection_id, source_name, column_name, value)
    );

    CREATE INDEX prototype_sl_dictionary_values_trgm_idx
      ON prototype_sl_dictionary_values
      USING GIN (value gin_trgm_ops);
  `);
}
|
||||
|
||||
/**
 * Computes one embedding per candidate, keyed by `candidateKey(candidate.summary)`.
 *
 * @param input.candidates Candidates whose `searchText` is embedded.
 * @param input.embeddingService Embedding backend; when absent the function returns `null`.
 * @param input.dimensions Expected length of every returned embedding.
 * @returns `null` when no embedding service is configured, otherwise a map from
 *   candidate key to embedding (empty when there are no candidates).
 * @throws Error when any returned embedding does not have `input.dimensions` entries.
 */
async function sourceEmbeddings(input: {
  candidates: LocalSlSearchCandidate[];
  embeddingService?: KloEmbeddingPort | null;
  dimensions: number;
}): Promise<Map<string, number[]> | null> {
  const { candidates, embeddingService, dimensions } = input;
  if (!embeddingService) {
    return null;
  }

  const byId = new Map<string, number[]>();
  // Nothing to embed: skip the service round-trip instead of sending an empty batch.
  if (candidates.length === 0) {
    return byId;
  }

  const embeddings = await embeddingService.computeEmbeddingsBulk(
    candidates.map((candidate) => candidate.searchText),
  );
  embeddings.forEach((embedding, index) => {
    if (embedding.length !== dimensions) {
      throw new Error(
        `PGlite SL search prototype expected ${dimensions} embedding dimensions, got ${embedding.length}.`,
      );
    }
    // Embeddings are matched to candidates by position; surplus entries are ignored.
    const candidate = candidates[index];
    if (candidate) {
      byId.set(candidateKey(candidate.summary), embedding);
    }
  });
  return byId;
}
|
||||
|
||||
/**
 * Inserts one `prototype_sl_sources` row per candidate, one statement at a time.
 *
 * @param input.owner PGlite owner process used to run the inserts.
 * @param input.candidates Candidates to persist.
 * @param input.embeddings Optional map from candidate key to embedding; a
 *   candidate without an entry (or a `null` map) is stored with a NULL embedding.
 */
async function insertSourceRows(input: {
  owner: KloPGliteOwnerProcess;
  candidates: LocalSlSearchCandidate[];
  embeddings: Map<string, number[]> | null;
}): Promise<void> {
  const insertSql = `
        INSERT INTO prototype_sl_sources (
          connection_id,
          source_name,
          path,
          description,
          column_count,
          measure_count,
          join_count,
          search_text,
          embedding
        )
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9::vector)
      `;

  for (const { summary, searchText } of input.candidates) {
    const vector = input.embeddings?.get(candidateKey(summary));
    // The embedding is sent as its JSON array text and cast to `vector` by the statement.
    const vectorLiteral = vector ? JSON.stringify(vector) : null;
    await input.owner.query(insertSql, [
      summary.connectionId,
      summary.name,
      summary.path,
      summary.description ?? null,
      summary.columnCount,
      summary.measureCount,
      summary.joinCount,
      searchText,
      vectorLiteral,
    ]);
  }
}
|
||||
|
||||
/**
 * Inserts one `prototype_sl_dictionary_values` row per dictionary entry, one
 * statement at a time. `value_lower` is derived in SQL from the bound value.
 *
 * @param owner PGlite owner process used to run the inserts.
 * @param entries Dictionary entries to persist; a missing `cardinality` is stored as NULL.
 */
async function insertDictionaryRows(owner: KloPGliteOwnerProcess, entries: SlDictionaryEntry[]): Promise<void> {
  const insertSql = `
        INSERT INTO prototype_sl_dictionary_values (
          connection_id,
          source_name,
          column_name,
          value,
          value_lower,
          cardinality
        )
        VALUES ($1, $2, $3, $4, lower($4), $5)
      `;

  for (const { connectionId, sourceName, columnName, value, cardinality } of entries) {
    await owner.query(insertSql, [connectionId, sourceName, columnName, value, cardinality ?? null]);
  }
}
|
||||
|
||||
/**
 * Collapses per-value dictionary rows into one ranked candidate per source.
 *
 * Rows are grouped by `row.id`. Within a group, values are bucketed by column
 * (columns and values in `localeCompare` order); each column keeps at most five
 * values and reports the remainder as `overflowCount`. A group's `rawScore` is
 * the number of values kept across its columns. Groups are ordered by
 * `rawScore` (descending), number of matching columns (descending), connection
 * id, then source name, truncated to `max(1, limit)` and given 1-based ranks.
 *
 * @param rows Dictionary rows as returned by the dictionary lane query.
 * @param limit Maximum number of sources to return (values below 1 are treated as 1).
 * @returns Ranked per-source candidates with their dictionary matches.
 */
function groupDictionaryRows(rows: PgliteDictionaryRow[], limit: number) {
  const maxValuesPerColumn = 5;

  // Append in place: re-spreading the accumulated array for every row is quadratic.
  const grouped = new Map<string, PgliteDictionaryRow[]>();
  for (const row of rows) {
    const bucket = grouped.get(row.id);
    if (bucket) {
      bucket.push(row);
    } else {
      grouped.set(row.id, [row]);
    }
  }

  return [...grouped.entries()]
    .map(([id, group]) => {
      const first = group[0];

      // `group` is local to this function, so sorting it in place is safe.
      group.sort(
        (left, right) => left.column_name.localeCompare(right.column_name) || left.value.localeCompare(right.value),
      );
      const byColumn = new Map<string, string[]>();
      for (const row of group) {
        const values = byColumn.get(row.column_name);
        if (values) {
          values.push(row.value);
        } else {
          byColumn.set(row.column_name, [row.value]);
        }
      }

      const matches: SlDictionaryMatch[] = [...byColumn.entries()].map(([column, values]) => ({
        column,
        values: values.slice(0, maxValuesPerColumn),
        ...(values.length > maxValuesPerColumn ? { overflowCount: values.length - maxValuesPerColumn } : {}),
      }));

      return {
        id,
        connectionId: first?.connection_id ?? '',
        sourceName: first?.source_name ?? '',
        rawScore: matches.reduce((total, match) => total + match.values.length, 0),
        matches,
      };
    })
    .sort(
      (left, right) =>
        right.rawScore - left.rawScore ||
        right.matches.length - left.matches.length ||
        left.connectionId.localeCompare(right.connectionId) ||
        left.sourceName.localeCompare(right.sourceName),
    )
    .slice(0, Math.max(1, limit))
    .map((candidate, index) => ({ ...candidate, rank: index + 1 }));
}
|
||||
|
||||
/**
 * Lexical lane: full-text search over `prototype_sl_sources.search_text`.
 *
 * The query text is converted with `postgresqlOrTsQuery`; when that yields no
 * terms the lane returns no candidates without touching the database. Rows are
 * ordered by `ts_rank_cd` score (descending), then connection id and source name.
 *
 * @param input.owner PGlite owner process used to run the query.
 * @param input.queryText Raw query text.
 * @param input.connectionIds Connection ids to restrict to, or `null` for no restriction.
 * @param input.limit Maximum number of rows (values below 1 are treated as 1).
 * @returns Candidates with 1-based `rank` in result order and numeric `rawScore`.
 */
async function queryLexicalCandidates(input: {
  owner: KloPGliteOwnerProcess;
  queryText: string;
  connectionIds: string[] | null;
  limit: number;
}) {
  const { owner, queryText, connectionIds, limit } = input;

  const tsQuery = postgresqlOrTsQuery(queryText);
  if (tsQuery.length === 0) {
    return [];
  }

  const lexicalSql = `
      SELECT
        connection_id || '/' || source_name AS id,
        connection_id,
        source_name,
        ts_rank_cd(to_tsvector('english', search_text), to_tsquery('english', $1)) AS score
      FROM prototype_sl_sources
      WHERE to_tsvector('english', search_text) @@ to_tsquery('english', $1)
        AND ($2::text[] IS NULL OR connection_id = ANY($2::text[]))
      ORDER BY score DESC, connection_id ASC, source_name ASC
      LIMIT $3
    `;
  const { rows } = await owner.query<PgliteLaneRow>(lexicalSql, [tsQuery, connectionIds, Math.max(1, limit)]);

  return rows.map((row, position) => ({
    id: row.id,
    connectionId: row.connection_id,
    sourceName: row.source_name,
    rank: position + 1,
    rawScore: Number(row.score),
  }));
}
|
||||
|
||||
/**
 * Semantic lane: nearest-neighbour search over stored source embeddings.
 *
 * The lane reports itself as skipped (with a `reason`) when no embedding
 * service is configured, when the query embedding has the wrong length, or
 * when anything inside the embedding/query step throws. On success only
 * `candidates` is returned, ordered by cosine distance to the query embedding.
 *
 * @param input.owner PGlite owner process used to run the query.
 * @param input.queryText Text to embed.
 * @param input.connectionIds Connection ids to restrict to, or `null` for no restriction.
 * @param input.embeddingService Embedding backend, if any.
 * @param input.dimensions Expected embedding length.
 * @param input.limit Maximum number of rows (values below 1 are treated as 1).
 */
async function querySemanticCandidates(input: {
  owner: KloPGliteOwnerProcess;
  queryText: string;
  connectionIds: string[] | null;
  embeddingService?: KloEmbeddingPort | null;
  dimensions: number;
  limit: number;
}) {
  const { owner, queryText, connectionIds, embeddingService, dimensions, limit } = input;
  const skipped = (reason: string) => ({ status: 'skipped' as const, candidates: [], reason });

  if (!embeddingService) {
    return skipped('embedding_unconfigured');
  }

  const semanticSql = `
        SELECT
          connection_id || '/' || source_name AS id,
          connection_id,
          source_name,
          1 - (embedding <=> $1::vector) AS score
        FROM prototype_sl_sources
        WHERE embedding IS NOT NULL
          AND ($2::text[] IS NULL OR connection_id = ANY($2::text[]))
        ORDER BY embedding <=> $1::vector, connection_id ASC, source_name ASC
        LIMIT $3
      `;

  try {
    const queryEmbedding = await embeddingService.computeEmbedding(queryText);
    if (queryEmbedding.length !== dimensions) {
      return skipped(`embedding_unhealthy:expected ${dimensions} dimensions, got ${queryEmbedding.length}`);
    }

    const { rows } = await owner.query<PgliteLaneRow>(semanticSql, [
      JSON.stringify(queryEmbedding),
      connectionIds,
      Math.max(1, limit),
    ]);

    return {
      candidates: rows.map((row, position) => ({
        id: row.id,
        connectionId: row.connection_id,
        sourceName: row.source_name,
        rank: position + 1,
        rawScore: Number(row.score),
      })),
    };
  } catch (error) {
    // Failures of the embedding call and of the database query are both reported here.
    const detail = error instanceof Error ? error.message : String(error);
    return skipped(`embedding_unhealthy:${detail}`);
  }
}
|
||||
|
||||
/**
 * Dictionary lane: matches the query against stored column values.
 *
 * A value matches when it has non-zero trigram similarity to the query, equals
 * it case-insensitively, or contains it case-insensitively. The per-row score is
 * the greatest of: the similarity, 1 for an exact match, 0.75 for a substring
 * match. Up to `max(25, limit * 4)` rows are fetched and then folded into
 * per-source candidates by `groupDictionaryRows`.
 *
 * The substring test uses `strpos` rather than `LIKE '%' || … || '%'` so that
 * `%`, `_` and `\` in the user's query are treated as literal characters instead
 * of pattern wildcards/escapes.
 *
 * @param input.owner PGlite owner process used to run the query.
 * @param input.queryText Raw query text; blank text yields no candidates.
 * @param input.connectionIds Connection ids to restrict to, or `null` for no restriction.
 * @param input.limit Maximum number of sources to return.
 * @returns Ranked per-source dictionary candidates.
 */
async function queryDictionaryCandidates(input: {
  owner: KloPGliteOwnerProcess;
  queryText: string;
  connectionIds: string[] | null;
  limit: number;
}) {
  const query = input.queryText.trim();
  if (!query) {
    return [];
  }

  const result = await input.owner.query<PgliteDictionaryRow>(
    `
      SELECT
        connection_id || '/' || source_name AS id,
        connection_id,
        source_name,
        column_name,
        value,
        GREATEST(
          similarity(value, $1),
          CASE WHEN value_lower = lower($1) THEN 1 ELSE 0 END,
          CASE WHEN strpos(value_lower, lower($1)) > 0 THEN 0.75 ELSE 0 END
        ) AS score
      FROM prototype_sl_dictionary_values
      WHERE (
          similarity(value, $1) > 0
          OR value_lower = lower($1)
          OR strpos(value_lower, lower($1)) > 0
        )
        AND ($2::text[] IS NULL OR connection_id = ANY($2::text[]))
      ORDER BY score DESC, connection_id ASC, source_name ASC, column_name ASC, value ASC
      LIMIT $3
    `,
    // Over-fetch rows: several value rows collapse into one source after grouping.
    [query, input.connectionIds, Math.max(25, input.limit * 4)],
  );

  return groupDictionaryRows(result.rows, input.limit);
}
|
||||
|
||||
/**
 * Prototype hybrid search over local semantic-layer sources backed by PGlite.
 *
 * A blank query lists the sources for the requested connection, each with
 * score 1. Otherwise the function loads candidates, starts a PGlite owner
 * process, rebuilds the prototype tables from scratch (sources, embeddings when
 * an embedding service is supplied, dictionary values), runs the lexical,
 * dictionary, token and semantic lanes through `HybridSearchCore`, and hydrates
 * the fused results with the candidates' summaries. The owner process is always
 * stopped, whether or not the search succeeds.
 *
 * @param project Local project to search.
 * @param input Query, optional connection filter and limit, PGlite settings and
 *   optional embedding service.
 * @returns Search results in fused order; fused ids with no matching candidate are skipped.
 */
export async function searchLocalSlSourcesWithPglitePrototype(
  project: KloLocalProject,
  input: PgliteSlSearchPrototypeInput,
): Promise<LocalSlSourceSearchResult[]> {
  const query = input.query.trim();
  if (query.length === 0) {
    const sources = await listLocalSlSources(project, { connectionId: input.connectionId });
    return sources.map((source) => ({ ...source, score: 1 }));
  }

  const candidates = await loadCandidates(project, { connectionId: input.connectionId });
  const dimensions = vectorDimensions(project);
  const dataDir = pgliteDataDir(project, input.pglite);
  await mkdir(dataDir, { recursive: true });

  const owner = await KloPGliteOwnerProcess.start({
    dataDir,
    host: input.pglite.host,
    port: input.pglite.port,
  });

  try {
    // Rebuild the prototype index from scratch for this search.
    const embeddings = await sourceEmbeddings({
      candidates,
      embeddingService: input.embeddingService ?? null,
      dimensions,
    });
    await resetPrototypeSchema(owner, dimensions);
    await insertSourceRows({ owner, candidates, embeddings });

    const candidateConnectionIds = [...new Set(candidates.map((entry) => entry.summary.connectionId))].sort();
    const dictionaryEntries = await loadLatestSlDictionaryEntries(project, candidateConnectionIds);
    await insertDictionaryRows(owner, dictionaryEntries);

    const candidateById = new Map(candidates.map((entry) => [candidateKey(entry.summary), entry]));
    const connectionIds = connectionIdsForSearch(input);
    const finalLimit = input.limit ?? candidates.length;
    // Filled by the dictionary lane so the matches can be attached to hydrated results.
    const dictionaryEvidence = new Map<string, SlDictionaryMatch[]>();
    const core = new HybridSearchCore();

    const lexicalLane: SearchCandidateGenerator = {
      lane: 'lexical',
      async generate(args) {
        const rows = await queryLexicalCandidates({
          owner,
          queryText: args.queryText,
          connectionIds,
          limit: args.laneCandidatePoolLimit,
        });
        return {
          candidates: rows.map(({ id, rank, rawScore }) => ({ id, rank, rawScore })),
        };
      },
    };

    const dictionaryLane: SearchCandidateGenerator = {
      lane: 'dictionary',
      async generate(args) {
        const rows = await queryDictionaryCandidates({
          owner,
          queryText: args.queryText,
          connectionIds,
          limit: args.laneCandidatePoolLimit,
        });
        for (const row of rows) {
          dictionaryEvidence.set(row.id, row.matches);
        }
        return {
          candidates: rows.map((row) => ({
            id: row.id,
            rank: row.rank,
            rawScore: row.rawScore,
            evidence: row.matches,
          })),
        };
      },
    };

    const tokenLane: SearchCandidateGenerator = {
      lane: 'token',
      async generate(args) {
        const rows = tokenLaneCandidates(candidates, args.normalizedQuery.terms).slice(
          0,
          args.laneCandidatePoolLimit,
        );
        return {
          candidates: rows.map((row, position) => ({
            id: candidateKey(row.candidate.summary),
            rank: position + 1,
            rawScore: row.score,
          })),
        };
      },
    };

    const semanticLane: SearchCandidateGenerator = {
      lane: 'semantic',
      async generate(args) {
        return querySemanticCandidates({
          owner,
          queryText: args.queryText,
          connectionIds,
          embeddingService: input.embeddingService ?? null,
          dimensions,
          limit: args.laneCandidatePoolLimit,
        });
      },
    };

    const generators: SearchCandidateGenerator[] = [lexicalLane, dictionaryLane, tokenLane, semanticLane];

    const fused = await core.search({ queryText: query, limit: finalLimit, generators });

    const hydrated: LocalSlSourceSearchResult[] = [];
    for (const hit of fused.results) {
      const match = candidateById.get(hit.id);
      if (match) {
        const dictionaryMatches = dictionaryEvidence.get(hit.id);
        hydrated.push({
          ...match.summary,
          score: hit.score,
          matchReasons: hit.matchReasons as SlSearchMatchReason[],
          ...(dictionaryMatches && dictionaryMatches.length > 0 ? { dictionaryMatches } : {}),
          lanes: fused.lanes,
        });
      }
    }
    return hydrated;
  } finally {
    await owner.stop();
  }
}
|
||||
53
packages/context/src/sl/ports.ts
Normal file
53
packages/context/src/sl/ports.ts
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
import type { SemanticLayerQueryInput, SemanticLayerSource } from './types.js';
|
||||
|
||||
/** Identity of a data connection as seen by the semantic layer. */
export interface KloConnectionInfo {
  /** Connection identifier. */
  id: string;
  /** Connection name. */
  name: string;
  /** Connection type, as a free-form string. */
  connectionType: string;
}
|
||||
|
||||
/** Tabular result of a query; every field is optional. */
export interface KloQueryResult {
  /** Column headers. */
  headers?: string[];
  /** Result rows, each an array of untyped cell values. */
  rows?: unknown[][];
  /** Row count — NOTE(review): presumably the total before any truncation; confirm with implementers. */
  totalRows?: number;
}
|
||||
|
||||
/** Port through which the semantic layer looks up connections and runs SQL against them. */
export interface SlConnectionCatalogPort {
  /**
   * Lists connections for the given ids.
   * NOTE(review): the name suggests only enabled connections are returned — confirm with implementers.
   */
  listEnabledConnections(ids: string[]): Promise<KloConnectionInfo[]>;

  /** Resolves a single connection, or `null` when there is none for `connectionId`. */
  getConnectionById(connectionId: string): Promise<KloConnectionInfo | null>;

  /** Runs `sql` against the connection identified by `connectionId`. */
  executeQuery(connectionId: string, sql: string): Promise<KloQueryResult>;
}
|
||||
|
||||
/**
 * Port to the Python side of the semantic layer.
 *
 * Both calls resolve to an envelope carrying an optional `data` payload and an
 * optional `error`.
 */
export interface SlPythonPort {
  /**
   * Validates a set of sources for a SQL dialect.
   *
   * The payload may carry global `errors` and `warnings` plus
   * `per_source_warnings`, a record of string arrays.
   * NOTE(review): `per_source_warnings` is presumably keyed by source name, and
   * `recently_touched` presumably lists source names — confirm with the Python service.
   */
  validateSources(input: {
    sources: SemanticLayerSource[];
    dialect: string;
    recently_touched?: string[];
  }): Promise<{
    data?: { errors?: string[]; warnings?: string[]; per_source_warnings?: Record<string, string[]> } | null;
    error?: unknown;
  }>;

  /**
   * Runs a semantic-layer query against the given sources for a SQL dialect.
   * The payload may carry the resulting `sql` text and a free-form `plan`.
   */
  query(input: {
    sources: SemanticLayerSource[];
    query: SemanticLayerQueryInput;
    dialect: string;
  }): Promise<{ data?: { sql?: string; plan?: Record<string, unknown> } | null; error?: unknown }>;
}
|
||||
|
||||
/** Port to the per-connection search index of semantic-layer sources. */
export interface SlSourcesIndexPort {
  /** Inserts or updates index entries for a connection; `embedding` may be `null`. */
  upsertSources(
    connectionId: string,
    sources: Array<{ sourceName: string; searchText: string; embedding: number[] | null; contentHash?: string | null }>,
  ): Promise<void>;

  /**
   * Returns the indexed search text, and whether an embedding is stored, for a connection.
   * NOTE(review): the map is presumably keyed by source name — confirm with implementers.
   */
  getExistingSearchTexts(connectionId: string): Promise<Map<string, { searchText: string; hasEmbedding: boolean }>>;

  /**
   * Removes stale entries for a connection.
   * NOTE(review): presumably deletes every entry whose name is not in `keepNames` — confirm.
   */
  deleteStale(connectionId: string, keepNames: string[]): Promise<void>;

  /** Removes index entries by connection. */
  deleteByConnection(connectionId: string): Promise<void>;

  /** Removes index entries by connection and source name. */
  deleteByConnectionAndName(connectionId: string, sourceName: string): Promise<void>;

  /**
   * Searches a connection's index by text and, when available, by query embedding.
   * NOTE(review): `rrfScore` / `minRrfScore` presumably refer to reciprocal rank
   * fusion scores, with `minRrfScore` as a lower cut-off — confirm with implementers.
   */
  search(
    connectionId: string,
    queryEmbedding: number[] | null,
    queryText: string,
    limit: number,
    minRrfScore?: number,
  ): Promise<Array<{ sourceName: string; rrfScore: number }>>;
}
|
||||
149
packages/context/src/sl/schemas.ts
Normal file
149
packages/context/src/sl/schemas.ts
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
import { z } from 'zod';
|
||||
|
||||
// Literal vocabularies — kept in lockstep with the Python Pydantic model at
|
||||
// python-service/klo-sl/semantic_layer/models.py (SourceColumn / ColumnRole /
|
||||
// ColumnVisibility / JoinDeclaration). If these diverge, YAMLs can pass
|
||||
// TypeScript validation at ingest time but fail Python loading at query time.
|
||||
/** Allowed values for a column's `type`. */
const columnTypeValues = ['string', 'number', 'time', 'boolean'] as const;

/** Allowed values for a column's `role`. */
const columnRoleValues = ['time', 'default'] as const;

/** Allowed values for a column's `visibility`. */
const columnVisibilityValues = ['public', 'internal', 'hidden'] as const;

/** Allowed values for a join's `relationship`. */
const joinRelationshipValues = ['many_to_one', 'one_to_many', 'one_to_one'] as const;
|
||||
|
||||
/**
 * Measure declared on a source: a non-empty `name` and `expr`, plus an optional
 * `filter`, an optional list of non-empty segment names and an optional description.
 */
const slMeasureDefinitionSchema = z.object({
  name: z.string().min(1),
  expr: z.string().min(1),
  filter: z.string().optional(),
  segments: z.array(z.string().min(1)).optional(),
  description: z.string().optional(),
});
|
||||
|
||||
/** Named segment: a non-empty `name` and `expr`, with an optional description. */
const segmentDefinitionSchema = z.object({
  name: z.string().min(1),
  expr: z.string().min(1),
  description: z.string().optional(),
});
|
||||
|
||||
// --- Metadata keyed by origin. Every schema below nests its payload under an
// --- optional `dbt` key (or describes the payload stored under that key).

/** Default time dimension, as an optional string under `dbt`. */
const defaultTimeDimensionDbtSchema = z.object({
  dbt: z.string().optional(),
});

/** Column constraint flags: optional `not_null` and `unique` booleans. */
const dbtColumnConstraintsSchema = z.object({
  not_null: z.boolean().optional(),
  unique: z.boolean().optional(),
});

/** Reference to a data test: non-empty `name` and `package`, optional free-form `kwargs`. */
const dbtDataTestRefSchema = z.object({
  name: z.string().min(1),
  package: z.string().min(1),
  kwargs: z.record(z.string(), z.unknown()).optional(),
});

/**
 * Tests attached to a column: an optional list of test references under `dbt`
 * and an optional `dbt_by_package` record of non-empty string lists.
 */
const dbtColumnTestsSchema = z.object({
  dbt: z.array(dbtDataTestRefSchema).optional(),
  dbt_by_package: z.record(z.string(), z.array(z.string().min(1))).optional(),
});

/** Optional list of non-empty strings under `dbt`. */
const sourceKeyedStringArraySchema = z.object({
  dbt: z.array(z.string().min(1)).optional(),
});

/** Optional column constraint flags under `dbt`. */
const sourceKeyedColumnConstraintsSchema = z.object({
  dbt: dbtColumnConstraintsSchema.optional(),
});

/** Freshness payload: optional untyped `raw` value and an optional, nullable `loaded_at_field`. */
const freshnessDbtSchema = z.object({
  raw: z.unknown().optional(),
  loaded_at_field: z.string().nullable().optional(),
});

/** Optional freshness payload under `dbt`. */
const sourceFreshnessSchema = z.object({
  dbt: freshnessDbtSchema.optional(),
});
|
||||
|
||||
/**
 * Join declared on a source: non-empty `to` and `on`, a `relationship` drawn
 * from `joinRelationshipValues`, and an optional `alias`.
 */
const joinDeclarationSchema = z.object({
  to: z.string().min(1),
  on: z.string().min(1),
  relationship: z.enum(joinRelationshipValues),
  alias: z.string().optional(),
});
|
||||
|
||||
/** Column of a standalone source. Only `name` is mandatory at the schema level. */
const sourceColumnSchema = z.object({
  name: z.string().min(1),
  // `type` and `description` may be omitted on standalone sources: compose-time
  // enrichment fills them from the manifest entry named in `inherits_columns_from`.
  // When `inherits_columns_from` is not set, or the column is missing from the
  // manifest, `type` must be present — that is surfaced by sl_validate.
  type: z.enum(columnTypeValues).optional(),
  role: z.enum(columnRoleValues).optional(),
  visibility: z.enum(columnVisibilityValues).optional(),
  description: z.string().optional(),
  expr: z.string().optional(),
  constraints: sourceKeyedColumnConstraintsSchema.optional(),
  enum_values: sourceKeyedStringArraySchema.optional(),
  tests: dbtColumnTestsSchema.optional(),
});
|
||||
|
||||
/**
 * Column of an overlay source. A `type` is only accepted together with a
 * non-empty `expr`: structural column types are inherited from the manifest, so
 * only computed columns may declare one.
 */
const overlayColumnSchema = z
  .object({
    name: z.string().min(1),
    type: z.enum(columnTypeValues).optional(),
    role: z.enum(columnRoleValues).optional(),
    visibility: z.enum(columnVisibilityValues).optional(),
    description: z.string().optional(),
    expr: z.string().optional(),
  })
  .refine((column) => column.type === undefined || Boolean(column.expr), {
    message: "Overlay column with 'type' must also have 'expr' (only computed columns may specify a type)",
  });
|
||||
|
||||
/**
 * Standalone source: exactly one of `table` or `sql`, plus a non-empty `grain`.
 * `columns`, `joins` and `measures` default to empty arrays. Unknown top-level
 * keys are rejected (`strict`).
 */
export const sourceDefinitionSchema = z
  .object({
    name: z.string().min(1),
    description: z.string().optional(),
    // Accepted for documentation parity with the Python spec; behavior is driven
    // by the `table` / `sql` fields, not by this discriminator.
    source_type: z.enum(['table', 'sql']).optional(),
    table: z.string().optional(),
    sql: z.string().optional(),
    // Manifest key (e.g. "CONSIGNMENTS") whose column metadata fills any blank
    // type/descriptions/role on this source's columns at compose time, so the
    // agent can write `columns: [{name: FOO}]` instead of redeclaring known fields.
    // Lookup is fuzzy: bare key, fully-qualified table path, or any suffix all match.
    inherits_columns_from: z.string().optional(),
    grain: z.array(z.string()).min(1),
    columns: z.array(sourceColumnSchema).default([]),
    joins: z.array(joinDeclarationSchema).default([]),
    measures: z.array(slMeasureDefinitionSchema).default([]),
    segments: z.array(segmentDefinitionSchema).optional(),
    default_time_dimension: defaultTimeDimensionDbtSchema.optional(),
    tags: sourceKeyedStringArraySchema.optional(),
    freshness: sourceFreshnessSchema.optional(),
  })
  .strict()
  // Exclusive-or: one of the two must be non-empty, never both.
  .refine((source) => Boolean(source.table) !== Boolean(source.sql), {
    message: "Standalone source must have exactly one of 'table' or 'sql' (not both)",
  });
|
||||
|
||||
/**
 * Overlay source: carries no `table` / `sql`; every field except `name` is
 * optional. Unknown top-level keys are rejected (`strict`).
 */
export const sourceOverlaySchema = z
  .object({
    name: z.string().min(1),
    description: z.string().optional(),
    descriptions: z.record(z.string(), z.string()).optional(),
    grain: z.array(z.string()).optional(),
    columns: z.array(overlayColumnSchema).optional(),
    joins: z.array(joinDeclarationSchema).optional(),
    measures: z.array(slMeasureDefinitionSchema).optional(),
    segments: z.array(segmentDefinitionSchema).optional(),
    exclude_columns: z.array(z.string()).optional(),
    disable_joins: z.array(z.string()).optional(),
    default_time_dimension: defaultTimeDimensionDbtSchema.optional(),
  })
  .strict();
|
||||
|
||||
/**
 * Tells whether raw source data is an overlay, i.e. has neither a truthy
 * `table` nor a truthy `sql` field.
 *
 * @param source Raw (unvalidated) source record.
 * @returns `true` for an overlay, `false` for a standalone source.
 */
export function isOverlaySource(source: Record<string, unknown>): boolean {
  const hasPhysicalDefinition = Boolean(source.table) || Boolean(source.sql);
  return !hasPhysicalDefinition;
}
|
||||
678
packages/context/src/sl/semantic-layer.service.test.ts
Normal file
678
packages/context/src/sl/semantic-layer.service.test.ts
Normal file
|
|
@ -0,0 +1,678 @@
|
|||
import type { Mock } from 'vitest';
|
||||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import {
|
||||
composeOverlay,
|
||||
enrichColumnsFromManifest,
|
||||
findDanglingSegmentRefs,
|
||||
SemanticLayerService,
|
||||
} from './semantic-layer.service.js';
|
||||
import { sourceDefinitionSchema } from './schemas.js';
|
||||
import type { SemanticLayerSource } from './types.js';
|
||||
|
||||
// Stand-in for the Python port: every method is a bare vitest mock.
const pythonPort = {
  validateSources: vi.fn(),
  generateSources: vi.fn(),
  query: vi.fn(),
};
|
||||
|
||||
/**
 * Builds a mocked connection catalog: no enabled connections, and a single
 * `conn-1` connection of the given type returned for every lookup by id.
 */
function connectionCatalog(connectionType = 'SNOWFLAKE') {
  const connection = { id: 'conn-1', name: 'conn-1', connectionType };
  return {
    listEnabledConnections: vi.fn().mockResolvedValue([]),
    getConnectionById: vi.fn().mockResolvedValue(connection),
    executeQuery: vi.fn(),
  };
}
|
||||
|
||||
// Base table source shared by the composeOverlay tests: three string columns,
// no joins, no measures.
const baseTable: SemanticLayerSource = {
  name: 'fct_labs',
  grain: ['lab_order_id'],
  table: 'analytics.fct_labs',
  columns: [
    { name: 'lab_order_id', type: 'string' },
    { name: 'admin_user_id', type: 'string' },
    { name: 'lab_type', type: 'string' },
  ],
  joins: [],
  measures: [],
};
|
||||
|
||||
// composeOverlay: how an overlay is merged onto a base source.
describe('composeOverlay', () => {
  // Base source that already declares a segment of its own.
  const withExistingSegment = (): SemanticLayerSource => ({
    ...baseTable,
    segments: [{ name: 'pre_existing', expr: 'is_paid = true' }],
  });

  it('carries top-level segments from overlay into the composed source', () => {
    const result = composeOverlay(baseTable, {
      name: 'fct_labs',
      segments: [{ name: 'byol', expr: "lab_type = 'byol'", description: 'BYOL cohort' }],
    });
    expect(result.segments).toHaveLength(1);
    expect(result.segments?.[0].name).toBe('byol');
    expect(result.segments?.[0].expr).toBe("lab_type = 'byol'");
  });

  it('preserves measure-level segments references', () => {
    const result = composeOverlay(baseTable, {
      name: 'fct_labs',
      segments: [{ name: 'byol', expr: "lab_type = 'byol'" }],
      measures: [
        {
          name: 'byol_subscriber_count',
          expr: 'count(distinct admin_user_id)',
          segments: ['byol'],
          description: 'BYOL subscribers',
        },
      ],
    });
    expect(result.measures).toHaveLength(1);
    expect(result.measures[0].segments).toEqual(['byol']);
  });

  it('leaves base segments unchanged when overlay does not specify segments', () => {
    const result = composeOverlay(withExistingSegment(), { name: 'fct_labs', description: 'no segments here' });
    expect(result.segments).toEqual([{ name: 'pre_existing', expr: 'is_paid = true' }]);
  });

  it('replaces base segments when overlay provides its own (even an empty array)', () => {
    const result = composeOverlay(withExistingSegment(), { name: 'fct_labs', segments: [] });
    expect(result.segments).toEqual([]);
  });

  it('throws on unknown top-level overlay keys with a pointed error', () => {
    const invalidOverlay = { name: 'fct_labs', frobnicate: true };
    expect(() => composeOverlay(baseTable, invalidOverlay)).toThrow(
      /overlay for 'fct_labs' has unhandled keys \[frobnicate\]/,
    );
  });

  it('lists every unknown key in the error message, not just the first', () => {
    const invalidOverlay = { name: 'fct_labs', foo: 1, bar: 2 };
    expect(() => composeOverlay(baseTable, invalidOverlay)).toThrow(/foo, bar/);
  });

  it('still handles existing known keys without regression', () => {
    const result = composeOverlay(baseTable, {
      name: 'fct_labs',
      description: 'patient lab orders',
      exclude_columns: ['admin_user_id'],
      columns: [{ name: 'is_byol', type: 'boolean', expr: "lab_type = 'byol'" }],
      measures: [{ name: 'count_all', expr: 'count(*)' }],
    });
    const columnNamed = (name: string) => result.columns.find((column) => column.name === name);
    expect(columnNamed('admin_user_id')).toBeUndefined();
    expect(columnNamed('is_byol')).toBeDefined();
    expect(result.measures).toHaveLength(1);
  });

  it('merges overlay descriptions (plural) with base descriptions keyed by source', () => {
    const describedBase: SemanticLayerSource = {
      ...baseTable,
      descriptions: { db: 'scan-derived description', ai: 'AI description' },
    };
    const result = composeOverlay(describedBase, {
      name: 'fct_labs',
      descriptions: { dbt: 'dbt description', ai: 'AI description (overridden)' },
    });
    expect(result.descriptions).toEqual({
      db: 'scan-derived description',
      ai: 'AI description (overridden)',
      dbt: 'dbt description',
    });
  });
});
|
||||
|
||||
describe('enrichColumnsFromManifest', () => {
|
||||
const manifest: SemanticLayerSource = {
|
||||
name: 'CONSIGNMENTS',
|
||||
table: 'ANALYTICS.MARTS.CONSIGNMENTS',
|
||||
grain: ['CONSIGNED_ITEM_ID'],
|
||||
columns: [
|
||||
{
|
||||
name: 'CONSIGNED_ITEM_ID',
|
||||
type: 'string',
|
||||
descriptions: { ai: 'Unique identifier for the consigned item record.' },
|
||||
},
|
||||
{
|
||||
name: 'CASH_ADV_AMOUNT',
|
||||
type: 'number',
|
||||
descriptions: { ai: 'Amount of cash advance disbursed to consigners.' },
|
||||
},
|
||||
{
|
||||
name: 'CONSIGNMENT_CREATED_AT',
|
||||
type: 'time',
|
||||
role: 'time',
|
||||
descriptions: { ai: 'Timestamp when the consignment was created.' },
|
||||
},
|
||||
],
|
||||
joins: [],
|
||||
measures: [],
|
||||
};
|
||||
|
||||
it('fills blank type and descriptions on source columns from the manifest', () => {
|
||||
const source: SemanticLayerSource = {
|
||||
name: 'aav_consignments',
|
||||
sql: 'SELECT CONSIGNED_ITEM_ID, CASH_ADV_AMOUNT FROM MARTS.CONSIGNMENTS WHERE ...',
|
||||
inherits_columns_from: 'CONSIGNMENTS',
|
||||
grain: ['CONSIGNED_ITEM_ID'],
|
||||
columns: [
|
||||
{ name: 'CONSIGNED_ITEM_ID', type: '' },
|
||||
{ name: 'CASH_ADV_AMOUNT', type: '' },
|
||||
],
|
||||
joins: [],
|
||||
measures: [],
|
||||
};
|
||||
const enriched = enrichColumnsFromManifest(source, manifest);
|
||||
expect(enriched.columns[0]).toEqual({
|
||||
name: 'CONSIGNED_ITEM_ID',
|
||||
type: 'string',
|
||||
descriptions: { ai: 'Unique identifier for the consigned item record.' },
|
||||
});
|
||||
expect(enriched.columns[1]).toEqual({
|
||||
name: 'CASH_ADV_AMOUNT',
|
||||
type: 'number',
|
||||
descriptions: { ai: 'Amount of cash advance disbursed to consigners.' },
|
||||
});
|
||||
});
|
||||
|
||||
it('preserves a local description if the source already declared one', () => {
|
||||
const source: SemanticLayerSource = {
|
||||
name: 'aav_consignments',
|
||||
sql: 'SELECT CONSIGNED_ITEM_ID FROM ...',
|
||||
inherits_columns_from: 'CONSIGNMENTS',
|
||||
grain: ['CONSIGNED_ITEM_ID'],
|
||||
columns: [
|
||||
{
|
||||
name: 'CONSIGNED_ITEM_ID',
|
||||
type: 'string',
|
||||
descriptions: { ai: 'AAV-specific note: always non-null in this filtered view.' },
|
||||
},
|
||||
],
|
||||
joins: [],
|
||||
measures: [],
|
||||
};
|
||||
const enriched = enrichColumnsFromManifest(source, manifest);
|
||||
expect(enriched.columns[0].descriptions).toEqual({
|
||||
ai: 'AAV-specific note: always non-null in this filtered view.',
|
||||
});
|
||||
});
|
||||
|
||||
it('passes through columns absent from the manifest unchanged', () => {
|
||||
const source: SemanticLayerSource = {
|
||||
name: 'aav_consignments',
|
||||
sql: 'SELECT ALT_VALUE_COMBINED, my_derived FROM ...',
|
||||
inherits_columns_from: 'CONSIGNMENTS',
|
||||
grain: ['CONSIGNED_ITEM_ID'],
|
||||
columns: [{ name: 'my_derived', type: 'number', expr: 'CASH_ADV_AMOUNT * 2' }],
|
||||
joins: [],
|
||||
measures: [],
|
||||
};
|
||||
const enriched = enrichColumnsFromManifest(source, manifest);
|
||||
expect(enriched.columns[0]).toEqual({
|
||||
name: 'my_derived',
|
||||
type: 'number',
|
||||
expr: 'CASH_ADV_AMOUNT * 2',
|
||||
});
|
||||
});
|
||||
|
||||
it('copies role from the manifest when the source omits it', () => {
  // The column is declared with a blank type and no role; both are expected
  // to come back as 'time' after enrichment (manifest fixture is defined
  // outside this test).
  const standalone: SemanticLayerSource = {
    name: 'aav_consignments',
    sql: 'SELECT CONSIGNMENT_CREATED_AT FROM ...',
    inherits_columns_from: 'CONSIGNMENTS',
    grain: ['CONSIGNED_ITEM_ID'],
    columns: [{ name: 'CONSIGNMENT_CREATED_AT', type: '' }],
    joins: [],
    measures: [],
  };

  const createdAt = enrichColumnsFromManifest(standalone, manifest).columns[0];

  expect(createdAt.role).toBe('time');
  expect(createdAt.type).toBe('time');
});
|
||||
|
||||
it('returns the source unchanged when manifestEntry is null/undefined', () => {
  const untouched: SemanticLayerSource = {
    name: 'aav_consignments',
    sql: 'SELECT FOO FROM ...',
    grain: ['FOO'],
    columns: [{ name: 'FOO', type: '' }],
    joins: [],
    measures: [],
  };

  // With no manifest entry there is nothing to inherit from.
  const result = enrichColumnsFromManifest(untouched, null);

  expect(result).toEqual(untouched);
});
|
||||
});
|
||||
|
||||
describe('sourceDefinitionSchema', () => {
  it('preserves dbt structural metadata fields used by manifest-backed SL readers', () => {
    // dbt-derived metadata that must survive a schema round-trip untouched.
    const columnMetadata = {
      constraints: { dbt: { not_null: true, unique: true } },
      enum_values: { dbt: ['placed', 'shipped'] },
      tests: {
        dbt: [{ name: 'accepted_values', package: 'dbt' }],
        dbt_by_package: { dbt: ['accepted_values'] },
      },
    };

    const parsed = sourceDefinitionSchema.safeParse({
      name: 'orders',
      table: 'public.orders',
      grain: ['id'],
      columns: [{ name: 'status', type: 'string', ...columnMetadata }],
      joins: [],
      measures: [],
      tags: { dbt: ['mart', 'finance'] },
      freshness: { dbt: { loaded_at_field: 'updated_at', raw: { warn_after: { count: 12, period: 'hour' } } } },
      default_time_dimension: { dbt: 'updated_at' },
    });

    expect(parsed.success).toBe(true);
    if (!parsed.success) {
      // Narrows the union so `parsed.data` is accessible below.
      return;
    }

    const { columns, tags, freshness } = parsed.data;
    expect(columns[0]).toMatchObject({
      constraints: { dbt: { not_null: true, unique: true } },
      enum_values: { dbt: ['placed', 'shipped'] },
      tests: {
        dbt: [{ name: 'accepted_values', package: 'dbt' }],
        dbt_by_package: { dbt: ['accepted_values'] },
      },
    });
    expect(tags).toEqual({ dbt: ['mart', 'finance'] });
    expect(freshness).toEqual({
      dbt: { loaded_at_field: 'updated_at', raw: { warn_after: { count: 12, period: 'hour' } } },
    });
  });
});
|
||||
|
||||
describe('findManifestEntryByTableRef', () => {
  type ListFiles = (dir: string, recursive?: boolean) => Promise<{ files: string[] }>;
  type ReadFile = (path: string) => Promise<{ content: string }>;

  let configService: {
    listFiles: Mock<ListFiles>;
    readFile: Mock<ReadFile>;
  };
  let service: SemanticLayerService;

  // Single-table manifest served for every readFile call.
  const manifestYaml = [
    'tables:',
    ' CONSIGNMENTS:',
    ' table: ANALYTICS.MARTS.CONSIGNMENTS',
    ' columns:',
    ' - { name: CONSIGNED_ITEM_ID, type: string, pk: true }',
  ].join('\n');

  beforeEach(() => {
    const listFiles = vi.fn<ListFiles>().mockResolvedValue({
      files: ['semantic-layer/conn-1/_schema/marts.yaml'],
    });
    const readFile = vi.fn<ReadFile>().mockResolvedValue({ content: manifestYaml });
    configService = { listFiles, readFile };
    service = new SemanticLayerService(configService as never, connectionCatalog(), pythonPort);
  });

  /** Resolves a table reference on `conn-1` and returns the matched entry's name, if any. */
  const resolveName = async (tableRef: string): Promise<string | undefined> => {
    const entry = await service.findManifestEntryByTableRef('conn-1', tableRef);
    return entry?.name;
  };

  it('finds by exact bare manifest key', async () => {
    expect(await resolveName('CONSIGNMENTS')).toBe('CONSIGNMENTS');
  });

  it('finds by fully-qualified table path', async () => {
    expect(await resolveName('ANALYTICS.MARTS.CONSIGNMENTS')).toBe('CONSIGNMENTS');
  });

  it('finds by schema-qualified suffix', async () => {
    expect(await resolveName('MARTS.CONSIGNMENTS')).toBe('CONSIGNMENTS');
  });

  it('matches case-insensitively on table path', async () => {
    expect(await resolveName('analytics.marts.consignments')).toBe('CONSIGNMENTS');
  });

  it('returns null when nothing matches', async () => {
    const missing = await service.findManifestEntryByTableRef('conn-1', 'NOT_A_TABLE');
    expect(missing).toBeNull();
  });
});
|
||||
|
||||
describe('loadAllSources — standalone enrichment via inherits_columns_from', () => {
  type ListFiles = (dir: string, recursive?: boolean) => Promise<{ files: string[] }>;
  type ReadFile = (path: string) => Promise<{ content: string }>;

  const connectionDir = 'semantic-layer/conn-1';
  const schemaDir = 'semantic-layer/conn-1/_schema';
  const schemaPath = 'semantic-layer/conn-1/_schema/marts.yaml';
  const standalonePath = 'semantic-layer/conn-1/aav_consignments.yaml';

  let configService: {
    listFiles: Mock<ListFiles>;
    readFile: Mock<ReadFile>;
  };
  let service: SemanticLayerService;

  /** Joins fixture lines into one newline-separated document. */
  const lines = (...rows: string[]): string => rows.join('\n');

  /** Makes listFiles answer from a directory -> files table; unknown directories are empty. */
  const stubListing = (listing: ReadonlyMap<string, readonly string[]>): void => {
    configService.listFiles.mockImplementation((dir: string) =>
      Promise.resolve({ files: [...(listing.get(dir) ?? [])] }),
    );
  };

  beforeEach(() => {
    configService = {
      listFiles: vi.fn<ListFiles>(),
      readFile: vi.fn<ReadFile>(),
    };
    service = new SemanticLayerService(configService as never, connectionCatalog(), pythonPort);
  });

  it('preserves dbt metadata when projecting manifest-backed sources', async () => {
    stubListing(
      new Map([
        [connectionDir, [schemaPath]],
        [schemaDir, [schemaPath]],
      ]),
    );
    configService.readFile.mockResolvedValue({
      content: lines(
        'tables:',
        ' orders:',
        ' table: public.orders',
        ' tags: { dbt: [mart] }',
        ' freshness:',
        ' dbt:',
        ' loaded_at_field: updated_at',
        ' columns:',
        ' - name: status',
        ' type: string',
        ' constraints: { dbt: { not_null: true } }',
        ' enum_values: { dbt: [placed, shipped] }',
        ' tests:',
        ' dbt:',
        ' - { name: accepted_values, package: dbt }',
      ),
    });

    const [first] = await service.loadAllSources('conn-1');

    expect(first).toMatchObject({
      name: 'orders',
      tags: { dbt: ['mart'] },
      freshness: { dbt: { loaded_at_field: 'updated_at' } },
      columns: [
        {
          name: 'status',
          constraints: { dbt: { not_null: true } },
          enum_values: { dbt: ['placed', 'shipped'] },
          tests: { dbt: [{ name: 'accepted_values', package: 'dbt' }] },
        },
      ],
    });
  });

  it('fills blank columns on a standalone source from the manifest entry it points at', async () => {
    stubListing(
      new Map([
        [connectionDir, [schemaPath, standalonePath]],
        [schemaDir, [schemaPath]],
      ]),
    );

    const manifestContent = lines(
      'tables:',
      ' CONSIGNMENTS:',
      ' table: ANALYTICS.MARTS.CONSIGNMENTS',
      ' columns:',
      ' - name: CONSIGNED_ITEM_ID',
      ' type: string',
      ' descriptions: { ai: "Unique consigned-item id." }',
      ' - name: CASH_ADV_AMOUNT',
      ' type: number',
      ' descriptions: { ai: "Cash advance amount." }',
    );
    const standaloneContent = lines(
      'name: aav_consignments',
      'sql: |',
      ' SELECT CONSIGNED_ITEM_ID, CASH_ADV_AMOUNT FROM ANALYTICS.MARTS.CONSIGNMENTS WHERE x',
      'inherits_columns_from: CONSIGNMENTS',
      'grain: [CONSIGNED_ITEM_ID]',
      'columns:',
      ' - { name: CONSIGNED_ITEM_ID }',
      ' - { name: CASH_ADV_AMOUNT }',
    );
    configService.readFile.mockImplementation((path: string) => {
      switch (path) {
        case schemaPath:
          return Promise.resolve({ content: manifestContent });
        case standalonePath:
          return Promise.resolve({ content: standaloneContent });
        default:
          // Any other read means the service looked somewhere unexpected.
          return Promise.reject(new Error(`Unexpected readFile: ${path}`));
      }
    });

    const loaded = await service.loadAllSources('conn-1');
    const standalone = loaded.find((candidate) => candidate.name === 'aav_consignments');

    expect(standalone).toBeDefined();
    expect(standalone?.columns).toEqual([
      { name: 'CONSIGNED_ITEM_ID', type: 'string', descriptions: { ai: 'Unique consigned-item id.' } },
      { name: 'CASH_ADV_AMOUNT', type: 'number', descriptions: { ai: 'Cash advance amount.' } },
    ]);
  });

  it('accepts a fully-qualified path in inherits_columns_from', async () => {
    stubListing(
      new Map([
        [connectionDir, [schemaPath, standalonePath]],
        [schemaDir, [schemaPath]],
      ]),
    );

    const manifestContent = lines(
      'tables:',
      ' CONSIGNMENTS:',
      ' table: ANALYTICS.MARTS.CONSIGNMENTS',
      ' columns:',
      ' - { name: CONSIGNED_ITEM_ID, type: string }',
    );
    const standaloneContent = lines(
      'name: aav_consignments',
      'sql: SELECT 1',
      'inherits_columns_from: ANALYTICS.MARTS.CONSIGNMENTS',
      'grain: [CONSIGNED_ITEM_ID]',
      'columns:',
      ' - { name: CONSIGNED_ITEM_ID }',
    );
    // Every path other than the manifest is answered with the standalone source.
    configService.readFile.mockImplementation((path: string) =>
      Promise.resolve({ content: path === schemaPath ? manifestContent : standaloneContent }),
    );

    const loaded = await service.loadAllSources('conn-1');
    const standalone = loaded.find((candidate) => candidate.name === 'aav_consignments');

    expect(standalone?.columns[0].type).toBe('string');
  });

  it('passes the source through unchanged if inherits_columns_from misses', async () => {
    // No manifest files are listed, so the inherits_columns_from target cannot resolve.
    stubListing(new Map([[connectionDir, [standalonePath]]]));
    configService.readFile.mockResolvedValue({
      content: lines(
        'name: aav_consignments',
        'sql: SELECT 1',
        'inherits_columns_from: NO_SUCH_TABLE',
        'grain: [FOO]',
        'columns:',
        ' - { name: FOO, type: string }',
      ),
    });

    const loaded = await service.loadAllSources('conn-1');
    const standalone = loaded.find((candidate) => candidate.name === 'aav_consignments');

    expect(standalone?.columns).toEqual([{ name: 'FOO', type: 'string' }]);
  });
});
|
||||
|
||||
describe('validateWithProposedSource', () => {
  type ListFiles = (dir: string, recursive?: boolean) => Promise<{ files: string[] }>;
  type ReadFile = (path: string) => Promise<{ content: string }>;

  let configService: {
    listFiles: Mock<ListFiles>;
    readFile: Mock<ReadFile>;
  };
  let service: SemanticLayerService;

  beforeEach(() => {
    // The python port mock is shared across tests; clear calls and stubbed results.
    pythonPort.validateSources.mockReset();
    const listFiles = vi.fn<ListFiles>().mockResolvedValue({ files: [] });
    const readFile = vi.fn<ReadFile>();
    configService = { listFiles, readFile };
    service = new SemanticLayerService(configService as never, connectionCatalog('BIGQUERY'), pythonPort);
  });

  it('uses the connection warehouse dialect, not hardcoded postgres', async () => {
    pythonPort.validateSources.mockResolvedValue({
      data: { errors: [], warnings: [] },
    });
    const proposed = {
      name: 'std',
      table: 'analytics.std',
      grain: ['id'],
      columns: [{ name: 'id', type: 'number' }],
      joins: [],
      measures: [],
    };

    await service.validateWithProposedSource('conn-1', proposed);

    const expectedRequest = expect.objectContaining({ dialect: 'bigquery' });
    expect(pythonPort.validateSources).toHaveBeenCalledWith(expectedRequest);
  });

  it('composes a bare overlay with its manifest base before validating', async () => {
    const schemaPath = 'semantic-layer/conn-1/_schema/core.yaml';
    const overlayPath = 'semantic-layer/conn-1/fct_orders.yaml';
    const manifestContent = [
      'tables:',
      ' fct_orders:',
      ' table: analytics.fct_orders',
      ' columns:',
      ' - { name: id, type: string, pk: true }',
      ' - { name: amount, type: number }',
    ].join('\n');

    configService.listFiles.mockImplementation((dir: string): Promise<{ files: string[] }> => {
      switch (dir) {
        case 'semantic-layer/conn-1':
          return Promise.resolve({ files: [schemaPath, overlayPath] });
        case 'semantic-layer/conn-1/_schema':
          return Promise.resolve({ files: [schemaPath] });
        default:
          return Promise.resolve({ files: [] });
      }
    });
    configService.readFile.mockImplementation((path: string): Promise<{ content: string }> => {
      switch (path) {
        case schemaPath:
          return Promise.resolve({ content: manifestContent });
        case overlayPath:
          return Promise.resolve({ content: 'name: fct_orders\nmeasures: []\n' });
        default:
          return Promise.reject(new Error(`Unexpected readFile: ${path}`));
      }
    });
    pythonPort.validateSources.mockResolvedValue({
      data: { errors: [], warnings: [] },
    });

    // The overlay declares no table; the expectations below require the
    // validated source to carry the manifest's table.
    const overlay: SemanticLayerSource = {
      name: 'fct_orders',
      grain: ['id'],
      columns: [],
      joins: [],
      measures: [{ name: 'total_amount', expr: 'sum(amount)' }],
    };

    await service.validateWithProposedSource('conn-1', overlay);

    expect(pythonPort.validateSources).toHaveBeenCalledTimes(1);
    const [firstCall] = pythonPort.validateSources.mock.calls;
    const submitted = (firstCall[0]?.sources ?? []) as Array<Record<string, unknown>>;
    const composed = submitted.find((candidate) => candidate.name === 'fct_orders');
    expect(composed).toBeDefined();
    expect(composed?.table).toBe('analytics.fct_orders');
    expect(composed?.measures).toEqual([{ name: 'total_amount', expr: 'sum(amount)' }]);
  });

  it('returns a pointed error when a bare overlay has no manifest base', async () => {
    configService.listFiles.mockResolvedValue({ files: [] });
    const orphan: SemanticLayerSource = {
      name: 'orphan',
      grain: [],
      columns: [],
      joins: [],
      measures: [],
    };

    const { errors } = await service.validateWithProposedSource('conn-1', orphan);

    expect(errors[0]).toMatch(/Overlay 'orphan' has no matching manifest entry/);
    // The validator must not be invoked at all for an orphan overlay.
    expect(pythonPort.validateSources).not.toHaveBeenCalled();
  });
});
|
||||
|
||||
describe('findDanglingSegmentRefs', () => {
  it('returns empty when every measure segment resolves', () => {
    const dangling = findDanglingSegmentRefs({
      segments: [{ name: 'byol' }, { name: 'paid' }],
      measures: [
        { name: 'byol_count', segments: ['byol'] },
        { name: 'paid_count', segments: ['paid', 'byol'] },
      ],
    });

    expect(dangling).toEqual([]);
  });

  it('flags measures whose segment reference does not exist on the source', () => {
    // 'byol' resolves, 'missing' does not, so exactly one message is expected.
    const dangling = findDanglingSegmentRefs({
      segments: [{ name: 'byol' }],
      measures: [{ name: 'broken', segments: ['byol', 'missing'] }],
    });

    expect(dangling).toHaveLength(1);
    expect(dangling[0]).toMatch(/measure 'broken' references unknown segment 'missing'/);
  });

  it('flags when a source has zero segments but measures reference one', () => {
    // The source declares no `segments` key at all.
    const dangling = findDanglingSegmentRefs({
      measures: [{ name: 'broken', segments: ['byol'] }],
    });

    expect(dangling).toHaveLength(1);
    expect(dangling[0]).toMatch(/unknown segment 'byol'/);
  });

  it('is a no-op for sources with no measures or no segment references', () => {
    const withoutSegmentRefs = { measures: [{ name: 'simple', expr: 'count(*)' }] };

    expect(findDanglingSegmentRefs(withoutSegmentRefs)).toEqual([]);
    expect(findDanglingSegmentRefs({})).toEqual([]);
  });
});
|
||||
1130
packages/context/src/sl/semantic-layer.service.ts
Normal file
1130
packages/context/src/sl/semantic-layer.service.ts
Normal file
File diff suppressed because it is too large
Load diff
115
packages/context/src/sl/sl-dictionary-profile.test.ts
Normal file
115
packages/context/src/sl/sl-dictionary-profile.test.ts
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { initKloProject, type KloLocalProject } from '../project/index.js';
|
||||
import { loadLatestSlDictionaryEntries } from './sl-dictionary-profile.js';
|
||||
|
||||
describe('loadLatestSlDictionaryEntries', () => {
  let tempDir: string;
  let project: KloLocalProject;

  /** Per-column statistics that vary between fixtures; key order matches the serialized artifact. */
  interface ColumnStats {
    distinctCount: number;
    uniquenessRatio: number;
    nullRate: number;
    sampleValues: string[];
    minTextLength: number;
    maxTextLength: number;
  }

  /** Builds a text column of `public.orders` with 20 rows and no nulls. */
  const ordersTextColumn = (column: string, stats: ColumnStats): Record<string, unknown> => ({
    table: { catalog: null, db: 'public', name: 'orders' },
    column,
    nativeType: 'text',
    normalizedType: 'string',
    rowCount: 20,
    nullCount: 0,
    ...stats,
  });

  /**
   * Writes a relationship-profile artifact for the `warehouse` connection
   * under the given sync directory, serialized as pretty-printed JSON with a
   * trailing newline.
   */
  const seedProfile = async (
    syncId: string,
    columns: Record<string, unknown>,
    message: string,
  ): Promise<void> => {
    const artifact = {
      connectionId: 'warehouse',
      driver: 'postgres',
      sqlAvailable: true,
      queryCount: 4,
      tables: [],
      columns,
      warnings: [],
    };
    await project.fileStore.writeFile(
      `raw-sources/warehouse/live-database/${syncId}/enrichment/relationship-profile.json`,
      `${JSON.stringify(artifact, null, 2)}\n`,
      'klo',
      'klo@example.com',
      message,
    );
  };

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-sl-dictionary-profile-'));
    project = await initKloProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' });
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('loads latest relationship-profile sample values for dictionary candidate columns', async () => {
    // Older sync: its values must not appear in the result.
    await seedProfile(
      'sync-1',
      {
        'orders.status': ordersTextColumn('status', {
          distinctCount: 3,
          uniquenessRatio: 0.15,
          nullRate: 0,
          sampleValues: ['paid', 'refunded', 'pending'],
          minTextLength: 4,
          maxTextLength: 8,
        }),
        'orders.customer_id': ordersTextColumn('customer_id', {
          distinctCount: 20,
          uniquenessRatio: 1,
          nullRate: 0,
          sampleValues: ['cus_1'],
          minTextLength: 5,
          maxTextLength: 5,
        }),
      },
      'Seed profile',
    );

    // Newer sync: the only profile expected to be read.
    await seedProfile(
      'sync-2',
      {
        'orders.status': ordersTextColumn('status', {
          distinctCount: 2,
          uniquenessRatio: 0.1,
          nullRate: 0,
          sampleValues: ['settled', 'voided'],
          minTextLength: 6,
          maxTextLength: 7,
        }),
      },
      'Seed newer profile',
    );

    await expect(loadLatestSlDictionaryEntries(project, ['warehouse'])).resolves.toEqual([
      { connectionId: 'warehouse', sourceName: 'orders', columnName: 'status', value: 'settled', cardinality: 2 },
      { connectionId: 'warehouse', sourceName: 'orders', columnName: 'status', value: 'voided', cardinality: 2 },
    ]);
  });

  it('returns an empty list when no relationship profile exists', async () => {
    await expect(loadLatestSlDictionaryEntries(project, ['warehouse'])).resolves.toEqual([]);
  });
});
|
||||
120
packages/context/src/sl/sl-dictionary-profile.ts
Normal file
120
packages/context/src/sl/sl-dictionary-profile.ts
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
import type { KloLocalProject } from '../project/index.js';
|
||||
import { defaultKloDataDictionarySettings, isKloDataDictionaryCandidate } from '../scan/index.js';
|
||||
|
||||
/**
 * One sampled value of one column, flattened out of a relationship-profile
 * artifact so it can be matched against dictionary lookups.
 */
export interface SlDictionaryEntry {
  /** Connection the value belongs to. */
  connectionId: string;
  /** Name of the table the column was profiled on. */
  sourceName: string;
  /** Name of the profiled column. */
  columnName: string;
  /** A single sample value, stringified and trimmed. */
  value: string;
  /** Distinct-value count reported by the profile, or `null` when it is not a number. */
  cardinality: number | null;
}
|
||||
|
||||
/**
 * Subset of a per-column record in a relationship-profile artifact that this
 * module reads. Every field is optional because the artifact is parsed from
 * JSON without schema validation.
 */
interface RelationshipProfileColumn {
  table?: { name?: string };
  column?: string;
  nativeType?: string;
  normalizedType?: string;
  distinctCount?: number;
  sampleValues?: unknown[];
}

/** Subset of the relationship-profile artifact (top level) that this module reads. */
interface RelationshipProfileArtifact {
  connectionId?: string;
  columns?: Record<string, RelationshipProfileColumn>;
}

/** Type guard: true for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Parses the raw text of a relationship-profile artifact.
 *
 * @param raw - File content expected to hold a JSON object.
 * @returns The parsed artifact, or `null` when the content is not valid JSON
 *   or its top-level value is not a plain object. Field shapes are not
 *   validated here.
 */
function parseProfile(raw: string): RelationshipProfileArtifact | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // A truncated or corrupt artifact is treated like any other unusable
    // profile instead of aborting the whole dictionary load.
    return null;
  }
  return isRecord(parsed) ? (parsed as RelationshipProfileArtifact) : null;
}
|
||||
|
||||
/**
 * Converts raw sample values into a sorted list of distinct display strings.
 *
 * - `null` / `undefined` samples are skipped, so they never surface as the
 *   literal strings "null" / "undefined".
 * - Every other value is stringified with `String()` and trimmed; values that
 *   are empty after trimming are dropped.
 * - Duplicates are detected case-insensitively and the first-seen spelling is kept.
 *
 * @param values - Sample values from a profile column; may be absent.
 * @returns Distinct trimmed strings ordered by `localeCompare`.
 */
function normalizedValues(values: unknown[] | undefined): string[] {
  const seen = new Set<string>();
  const result: string[] = [];
  for (const value of values ?? []) {
    if (value === null || value === undefined) {
      continue;
    }
    const text = String(value).trim();
    const key = text.toLowerCase();
    if (text.length === 0 || seen.has(key)) {
      continue;
    }
    seen.add(key);
    result.push(text);
  }
  // `result` is a local array, so sorting in place does not touch the input.
  return result.sort((left, right) => left.localeCompare(right));
}
|
||||
|
||||
/**
 * Expands one profiled column into dictionary entries, one per distinct
 * sample value.
 *
 * Returns an empty list when the column lacks a table or column name, when
 * `isKloDataDictionaryCandidate` rejects its type/name, or when its distinct
 * count exceeds the default cardinality threshold.
 *
 * @param connectionId - Connection id stamped on every produced entry.
 * @param column - Column record from a relationship-profile artifact.
 */
function columnEntries(connectionId: string, column: RelationshipProfileColumn): SlDictionaryEntry[] {
  const { table, column: columnName, normalizedType, nativeType, distinctCount, sampleValues } = column;
  const sourceName = table?.name;
  if (!sourceName || !columnName) {
    return [];
  }

  // Prefer the normalized type and fall back to the native one.
  const typeForCandidacy = normalizedType ?? nativeType ?? '';
  if (!isKloDataDictionaryCandidate(typeForCandidacy, columnName)) {
    return [];
  }

  // A missing or non-numeric distinct count is treated as unknown, not as over the limit.
  const cardinality = typeof distinctCount === 'number' ? distinctCount : null;
  const exceedsThreshold =
    cardinality !== null && cardinality > defaultKloDataDictionarySettings.cardinalityThreshold;
  if (exceedsThreshold) {
    return [];
  }

  const produced: SlDictionaryEntry[] = [];
  for (const value of normalizedValues(sampleValues)) {
    produced.push({ connectionId, sourceName, columnName, value, cardinality });
  }
  return produced;
}
|
||||
|
||||
/**
 * Finds the relationship-profile artifact to read for a connection.
 *
 * Lists `raw-sources/<connectionId>/live-database`, keeps the paths that end
 * in `/enrichment/relationship-profile.json`, and returns the one that sorts
 * last under `localeCompare`.
 * NOTE(review): "latest" is therefore purely lexicographic on the path —
 * confirm sync directory names sort chronologically.
 *
 * @returns The chosen path, or `null` when listing fails or nothing matches.
 */
async function latestProfilePath(project: KloLocalProject, connectionId: string): Promise<string | null> {
  const profileSuffix = '/enrichment/relationship-profile.json';
  const root = `raw-sources/${connectionId}/live-database`;

  let listed: string[];
  try {
    const listing = await project.fileStore.listFiles(root);
    listed = listing.files;
  } catch {
    // Any listing failure is reported as "no profile" for this connection.
    return null;
  }

  const candidates: string[] = [];
  for (const path of listed) {
    if (path.endsWith(profileSuffix)) {
      candidates.push(path);
    }
  }
  candidates.sort((left, right) => left.localeCompare(right));

  const latest = candidates.at(-1);
  return latest ?? null;
}
|
||||
|
||||
/**
 * Loads dictionary entries from the latest relationship profile of each
 * requested connection.
 *
 * Connection ids are de-duplicated and processed one at a time in sorted
 * order; connections without a profile are skipped. Entries are stamped with
 * the `connectionId` recorded inside the profile when present, otherwise with
 * the requested id.
 *
 * @param project - Project whose file store holds the profile artifacts.
 * @param connectionIds - Connections to load; duplicates are ignored.
 * @returns All entries sorted by connection, source, column, then value.
 */
export async function loadLatestSlDictionaryEntries(
  project: KloLocalProject,
  connectionIds: readonly string[],
): Promise<SlDictionaryEntry[]> {
  const collected: SlDictionaryEntry[] = [];
  const uniqueIds = Array.from(new Set(connectionIds));
  uniqueIds.sort();

  for (const requestedId of uniqueIds) {
    const profilePath = await latestProfilePath(project, requestedId);
    if (!profilePath) {
      continue;
    }

    const { content } = await project.fileStore.readFile(profilePath);
    const profile = parseProfile(content);
    const ownerId = profile?.connectionId ?? requestedId;
    const columns = Object.values(profile?.columns ?? {});
    collected.push(...columns.flatMap((column) => columnEntries(ownerId, column)));
  }

  // Order by each key in turn; a later key only matters when all earlier ones tie.
  return collected.sort((left, right) => {
    const byConnection = left.connectionId.localeCompare(right.connectionId);
    if (byConnection !== 0) {
      return byConnection;
    }
    const bySource = left.sourceName.localeCompare(right.sourceName);
    if (bySource !== 0) {
      return bySource;
    }
    const byColumn = left.columnName.localeCompare(right.columnName);
    if (byColumn !== 0) {
      return byColumn;
    }
    return left.value.localeCompare(right.value);
  });
}
|
||||
165
packages/context/src/sl/sl-search.service.test.ts
Normal file
165
packages/context/src/sl/sl-search.service.test.ts
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { buildSemanticLayerSourceSearchText, SlSearchService } from './sl-search.service.js';
|
||||
import type { SemanticLayerSource } from './types.js';
|
||||
|
||||
describe('SlSearchService', () => {
  /**
   * Creates a service whose embedding port and sources-index port are inert
   * vitest stubs; these tests only call `buildSearchText`.
   */
  const createService = (): SlSearchService =>
    new SlSearchService(
      { maxBatchSize: 16, computeEmbedding: vi.fn(), computeEmbeddingsBulk: vi.fn() },
      {
        upsertSources: vi.fn(),
        getExistingSearchTexts: vi.fn(),
        deleteStale: vi.fn(),
        deleteByConnection: vi.fn(),
        deleteByConnectionAndName: vi.fn(),
        search: vi.fn(),
      },
    );

  it('builds search text from source, columns, measures, and joins', () => {
    const service = createService();
    const source: SemanticLayerSource = {
      name: 'orders',
      descriptions: { user: 'Customer orders' },
      table: 'public.orders',
      grain: ['id'],
      columns: [
        { name: 'id', type: 'string' },
        { name: 'amount', type: 'number', descriptions: { user: 'Order amount' } },
      ],
      measures: [{ name: 'revenue', expr: 'sum(amount)', description: 'Gross revenue' }],
      joins: [{ to: 'customers', on: 'orders.customer_id = customers.id', relationship: 'many_to_one' }],
    };

    const text = service.buildSearchText(source);

    expect(text).toContain('orders');
    expect(text).toContain('Customer orders');
    expect(text).toContain('amount (number) Order amount');
    expect(text).toContain('measure: revenue sum(amount) Gross revenue');
    expect(text).toContain('join: customers (many_to_one)');
  });

  it('exports the same canonical search text builder used by SlSearchService', () => {
    const service = createService();
    const source: SemanticLayerSource = {
      name: 'orders',
      descriptions: { user: 'Customer orders' },
      table: 'public.orders',
      grain: ['id'],
      columns: [
        {
          name: 'status',
          type: 'string',
          enum_values: { dbt: ['paid', 'refunded'] },
          constraints: { dbt: { not_null: true } },
        },
      ],
      joins: [{ to: 'customers', on: 'orders.customer_id = customers.id', relationship: 'many_to_one' }],
      measures: [{ name: 'total_revenue', expr: 'sum(revenue)', description: 'Gross revenue' }],
      tags: { dbt: ['finance'] },
    };

    const canonical = buildSemanticLayerSourceSearchText(source);

    // The standalone export and the service method must agree exactly.
    expect(canonical).toBe(service.buildSearchText(source));
    expect(canonical).toContain('dbt values: paid, refunded');
    expect(canonical).toContain('measure: total_revenue sum(revenue) Gross revenue');
    expect(canonical).toContain('dbt tags: finance');
  });

  it('includes dbt enum, not_null, and unique tokens for columns', () => {
    const service = createService();
    const source: SemanticLayerSource = {
      name: 'src_orders',
      table: 'public.orders',
      grain: [],
      columns: [
        {
          name: 'status',
          type: 'string',
          descriptions: {},
          enum_values: { dbt: ['a', 'b'] },
          constraints: { dbt: { not_null: true, unique: true } },
        },
      ],
      joins: [],
      measures: [],
    };

    const text = service.buildSearchText(source);

    expect(text).toContain('dbt values: a, b');
    expect(text).toContain('not_null');
    expect(text).toContain('unique');
  });

  it('includes dbt default time token for MetricFlow agg_time_dimension', () => {
    const service = createService();
    const source: SemanticLayerSource = {
      name: 'orders',
      table: 'public.orders',
      grain: ['id'],
      columns: [{ name: 'id', type: 'number' }],
      joins: [],
      measures: [],
      default_time_dimension: { dbt: 'order_date' },
    };

    expect(service.buildSearchText(source)).toContain('dbt default time: order_date');
  });

  it('includes dbt table tags and freshness from manifest-backed source', () => {
    const service = createService();
    const source: SemanticLayerSource = {
      name: 'customers',
      table: 'jaffle.customers',
      grain: ['id'],
      columns: [{ name: 'id', type: 'number' }],
      joins: [],
      measures: [],
      tags: { dbt: ['raw', 'core'] },
      freshness: {
        dbt: {
          loaded_at_field: 'updated_at',
          raw: { warn_after: { count: 12, period: 'hour' } },
        },
      },
    };

    const text = service.buildSearchText(source);

    expect(text).toContain('dbt tags: raw, core');
    expect(text).toContain('dbt freshness:');
    expect(text).toContain('loaded_at=updated_at');
    expect(text).toContain('warn_after');
  });
});
|
||||
168
packages/context/src/sl/sl-search.service.ts
Normal file
168
packages/context/src/sl/sl-search.service.ts
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
import type { KloEmbeddingPort, KloLogger } from '../core/index.js';
|
||||
import { noopLogger } from '../core/index.js';
|
||||
import { DEFAULT_PRIORITY, resolveDescription } from './descriptions.js';
|
||||
import type { SlSourcesIndexPort } from './ports.js';
|
||||
import type { SemanticLayerSource } from './types.js';
|
||||
|
||||
/**
 * Flattens a semantic-layer source into the single text blob that gets indexed for search.
 *
 * Segments are emitted in a fixed order and joined with `'. '`:
 * source name (underscores replaced by spaces), resolved source description, table,
 * dbt default time dimension, one segment per column, one per measure, one per join,
 * dbt tags, and finally dbt freshness.
 *
 * @param source - The source definition to summarise.
 * @param priority - Description-source priority handed to `resolveDescription`.
 * @returns The joined search text.
 */
export function buildSemanticLayerSourceSearchText(
  source: SemanticLayerSource,
  priority: string[] = DEFAULT_PRIORITY,
): string {
  const resolution = { priority };
  const segments: string[] = [source.name.replace(/_/g, ' ')];

  const sourceDescription = resolveDescription(source.descriptions, resolution);
  if (sourceDescription) {
    segments.push(sourceDescription);
  }

  if (source.table) {
    segments.push(`table: ${source.table}`);
  }

  const dbtTimeDimension = source.default_time_dimension?.dbt;
  if (dbtTimeDimension) {
    segments.push(`dbt default time: ${dbtTimeDimension}`);
  }

  for (const column of source.columns ?? []) {
    const description = resolveDescription(column.descriptions, resolution);
    const dbtConstraints = column.constraints?.dbt;

    // Optional dbt annotations, appended in this order after the description.
    const annotations = [
      column.enum_values?.dbt?.length ? ` [dbt values: ${column.enum_values.dbt.join(', ')}]` : '',
      dbtConstraints?.not_null ? ' not_null' : '',
      dbtConstraints?.unique ? ' unique' : '',
    ].join('');

    const descriptionSuffix = description ? ` ${description}` : '';
    segments.push(`${column.name} (${column.type})${descriptionSuffix}${annotations}`);
  }

  for (const measure of source.measures ?? []) {
    const descriptionSuffix = measure.description ? ` ${measure.description}` : '';
    segments.push(`measure: ${measure.name} ${measure.expr}${descriptionSuffix}`);
  }

  for (const join of source.joins ?? []) {
    segments.push(`join: ${join.to} (${join.relationship})`);
  }

  if (source.tags?.dbt?.length) {
    segments.push(`dbt tags: ${source.tags.dbt.join(', ')}`);
  }

  const dbtFreshness = source.freshness?.dbt;
  if (dbtFreshness) {
    const details: string[] = [];
    if (dbtFreshness.loaded_at_field) {
      details.push(`loaded_at=${dbtFreshness.loaded_at_field}`);
    }
    if (dbtFreshness.raw !== undefined) {
      // Raw freshness config is serialised and capped at 120 characters (117 + "...").
      const serialized = JSON.stringify(dbtFreshness.raw);
      details.push(serialized.length > 120 ? `${serialized.slice(0, 117)}...` : serialized);
    }
    if (details.length > 0) {
      segments.push(`dbt freshness: ${details.join(' ')}`);
    }
  }

  return segments.join('. ');
}
|
||||
|
||||
/**
 * Keeps the semantic-layer source index in sync with the current source definitions
 * and runs searches against it.
 */
export class SlSearchService {
  constructor(
    private readonly embeddingService: KloEmbeddingPort,
    private readonly slSourcesRepository: SlSourcesIndexPort,
    private readonly logger: KloLogger = noopLogger,
  ) {}

  /**
   * Re-indexes the sources of one connection.
   *
   * A source is re-embedded and upserted only when it is new, its search text differs
   * from the indexed one, or the indexed row has no embedding. Index rows whose name is
   * not in `sources` are removed. An empty `sources` array clears the whole connection.
   *
   * Embedding failures are logged and the affected rows are stored with a `null`
   * embedding, so they are picked up again on the next run.
   *
   * @param connectionId - Connection whose index is being refreshed.
   * @param sources - The complete, current list of sources for that connection.
   */
  async indexSources(connectionId: string, sources: SemanticLayerSource[]): Promise<void> {
    if (sources.length === 0) {
      await this.slSourcesRepository.deleteByConnection(connectionId);
      return;
    }

    // Detect which sources actually changed by comparing search_text
    const existing = await this.slSourcesRepository.getExistingSearchTexts(connectionId);
    const searchTexts = sources.map((s) => this.buildSearchText(s));
    const keepNames = sources.map((s) => s.name);

    const changedIndices: number[] = [];
    for (let i = 0; i < sources.length; i++) {
      const prev = existing.get(sources[i].name);
      if (!prev || prev.searchText !== searchTexts[i] || !prev.hasEmbedding) {
        changedIndices.push(i);
      }
    }

    if (changedIndices.length === 0) {
      // Still clean up stale sources even if nothing changed
      await this.slSourcesRepository.deleteStale(connectionId, keepNames);
      this.logger.log(`SL sources for connection ${connectionId}: all ${sources.length} up to date, 0 reindexed`);
      return;
    }

    // Compute embeddings only for changed sources
    const changedEmbeddings = await this.embedTexts(changedIndices.map((i) => searchTexts[i]));

    const rows = changedIndices.map((srcIdx, i) => ({
      sourceName: sources[srcIdx].name,
      searchText: searchTexts[srcIdx],
      embedding: changedEmbeddings[i] ?? null,
    }));

    await this.slSourcesRepository.upsertSources(connectionId, rows);

    // Remove sources that no longer exist in YAML
    await this.slSourcesRepository.deleteStale(connectionId, keepNames);

    this.logger.log(
      `SL sources for connection ${connectionId}: ${changedIndices.length}/${sources.length} reindexed, ${sources.length - changedIndices.length} unchanged`,
    );
  }

  /**
   * Searches the index of one connection.
   *
   * If the query embedding cannot be computed, a warning is logged and the repository
   * is called with a `null` embedding instead of failing the search.
   *
   * @param connectionId - Connection to search in.
   * @param query - Free-text query.
   * @param limit - Maximum number of results requested from the repository.
   * @param minRrfScore - Minimum score forwarded to the repository.
   * @returns Source names with the score reported by the repository.
   */
  async search(
    connectionId: string,
    query: string,
    limit = 15,
    minRrfScore = 0,
  ): Promise<Array<{ sourceName: string; score: number }>> {
    let queryEmbedding: number[] | null = null;
    try {
      queryEmbedding = await this.embeddingService.computeEmbedding(query);
    } catch (error) {
      this.logger.warn(
        `Failed to compute query embedding, falling back to FTS + trigram: ${error instanceof Error ? error.message : String(error)}`,
      );
    }

    const results = await this.slSourcesRepository.search(connectionId, queryEmbedding, query, limit, minRrfScore);
    return results.map((r) => ({ sourceName: r.sourceName, score: r.rrfScore }));
  }

  /** Builds the text that is indexed for `source`; see `buildSemanticLayerSourceSearchText`. */
  buildSearchText(source: SemanticLayerSource, priority: string[] = DEFAULT_PRIORITY): string {
    return buildSemanticLayerSourceSearchText(source, priority);
  }

  /**
   * Embeds `texts` in batches and returns one entry per input text, in input order.
   *
   * Any thrown error is logged and turns every entry into `null`.
   */
  private async embedTexts(texts: string[]): Promise<(number[] | null)[]> {
    try {
      // A batch size below 1 (or NaN) would keep the loop from ever advancing.
      const configuredBatchSize = this.embeddingService.maxBatchSize;
      const batchSize = configuredBatchSize >= 1 ? configuredBatchSize : 1;

      const embeddings: (number[] | null)[] = [];
      for (let start = 0; start < texts.length; start += batchSize) {
        const batch = texts.slice(start, start + batchSize);
        const batchEmbeddings = await this.embeddingService.computeEmbeddingsBulk(batch);
        // Take exactly one slot per text so a short or long response from the
        // provider cannot shift the embeddings of later batches onto the wrong source.
        for (let offset = 0; offset < batch.length; offset++) {
          embeddings.push(batchEmbeddings[offset] ?? null);
        }
      }
      return embeddings;
    } catch (error) {
      this.logger.warn(
        `Failed to compute SL source embeddings: ${error instanceof Error ? error.message : String(error)}`,
      );
      return texts.map(() => null);
    }
  }
}
|
||||
8
packages/context/src/sl/sl-validator.port.ts
Normal file
8
packages/context/src/sl/sl-validator.port.ts
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
/** Outcome of a validation run: the error strings and warning strings it produced. */
export interface SlValidationResult {
  errors: string[];
  warnings: string[];
}

/**
 * Port for validating a single semantic-layer source.
 *
 * `TDeps` is whatever dependency bundle the implementation needs; this interface
 * passes it through untouched.
 */
export interface SlValidatorPort<TDeps = unknown> {
  /**
   * Validates the source named `sourceName` of connection `connectionId`.
   *
   * @returns The errors and warnings found for that source.
   */
  validateSingleSource(deps: TDeps, connectionId: string, sourceName: string): Promise<SlValidationResult>;
}
|
||||
164
packages/context/src/sl/sqlite-sl-sources-index.test.ts
Normal file
164
packages/context/src/sl/sqlite-sl-sources-index.test.ts
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
import { access, mkdtemp, rm } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { SqliteSlSourcesIndex } from './sqlite-sl-sources-index.js';
|
||||
|
||||
describe('SqliteSlSourcesIndex', () => {
  let tempDir: string;
  let dbPath: string;

  /** Opens an index over the per-test database file. */
  const openIndex = (): SqliteSlSourcesIndex => new SqliteSlSourcesIndex({ dbPath });

  /** Upsert row without an embedding. */
  const textRow = (sourceName: string, searchText: string) => ({ sourceName, searchText, embedding: null });

  /** Dictionary entry belonging to the `warehouse` connection. */
  const warehouseEntry = (sourceName: string, columnName: string, value: string, cardinality: number) => ({
    connectionId: 'warehouse',
    sourceName,
    columnName,
    value,
    cardinality,
  });

  // Every test works on its own database inside a fresh temp directory.
  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-sqlite-sl-index-'));
    dbPath = join(tempDir, 'db.sqlite');
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('creates SQLite tables and searches indexed source text', async () => {
    const index = openIndex();

    await index.upsertSources('warehouse', [
      textRow('orders', 'orders table: public.orders measure: total_revenue sum(revenue) gross revenue'),
      textRow('tickets', 'tickets table: public.tickets measure: ticket_count count(*) support queue'),
    ]);

    // The database file must exist on disk after the first write.
    await expect(access(dbPath)).resolves.toBeUndefined();

    const hits = await index.search('warehouse', null, 'gross revenue', 10);
    expect(hits).toEqual([
      expect.objectContaining({
        sourceName: 'orders',
        rrfScore: expect.any(Number),
      }),
    ]);
  });

  it('reports existing search text and embedding presence', async () => {
    const index = openIndex();

    await index.upsertSources('warehouse', [
      { sourceName: 'orders', searchText: 'orders gross revenue', embedding: [0.1, 0.2, 0.3] },
      textRow('tickets', 'tickets support queue'),
    ]);

    const expected = new Map([
      ['orders', { searchText: 'orders gross revenue', hasEmbedding: true }],
      ['tickets', { searchText: 'tickets support queue', hasEmbedding: false }],
    ]);
    await expect(index.getExistingSearchTexts('warehouse')).resolves.toEqual(expected);
  });

  it('deletes stale, named, and connection-scoped rows from the FTS index', async () => {
    const index = openIndex();

    await index.upsertSources('warehouse', [
      textRow('orders', 'orders revenue'),
      textRow('tickets', 'tickets support'),
    ]);
    await index.upsertSources('finance', [textRow('invoices', 'invoices revenue')]);

    // Stale cleanup keeps only `orders` in `warehouse` and leaves `finance` alone.
    await index.deleteStale('warehouse', ['orders']);
    await expect(index.search('warehouse', null, 'support', 10)).resolves.toEqual([]);
    await expect(index.search('warehouse', null, 'revenue', 10)).resolves.toEqual([
      expect.objectContaining({ sourceName: 'orders' }),
    ]);
    await expect(index.search('finance', null, 'revenue', 10)).resolves.toEqual([
      expect.objectContaining({ sourceName: 'invoices' }),
    ]);

    await index.deleteByConnectionAndName('warehouse', 'orders');
    await expect(index.search('warehouse', null, 'revenue', 10)).resolves.toEqual([]);

    await index.deleteByConnection('finance');
    await expect(index.search('finance', null, 'revenue', 10)).resolves.toEqual([]);
  });

  it('returns lane candidates with stable connection-scoped IDs', async () => {
    const index = openIndex();

    // Same source name under two connections, with orthogonal embeddings.
    await index.upsertSources('warehouse', [
      { sourceName: 'orders', searchText: 'orders gross revenue paid status', embedding: [1, 0] },
    ]);
    await index.upsertSources('finance', [
      { sourceName: 'orders', searchText: 'finance orders invoices', embedding: [0, 1] },
    ]);

    const lexical = await index.searchLexicalCandidates({ queryText: 'gross revenue', limit: 25 });
    expect(lexical).toEqual([
      expect.objectContaining({
        id: 'warehouse/orders',
        connectionId: 'warehouse',
        sourceName: 'orders',
        rank: 1,
        rawScore: expect.any(Number),
      }),
    ]);

    const semantic = await index.searchSemanticCandidates({ queryEmbedding: [0, 1], limit: 25 });
    expect(semantic).toEqual([
      expect.objectContaining({ id: 'finance/orders', connectionId: 'finance', sourceName: 'orders', rank: 1 }),
      expect.objectContaining({ id: 'warehouse/orders', connectionId: 'warehouse', sourceName: 'orders', rank: 2 }),
    ]);
  });

  it('aggregates dictionary matches to one source-level lane candidate', async () => {
    const index = openIndex();

    await index.replaceDictionaryEntries('warehouse', [
      warehouseEntry('orders', 'status', 'paid', 3),
      warehouseEntry('orders', 'status', 'refunded', 3),
      warehouseEntry('orders', 'channel', 'paid search', 4),
      warehouseEntry('tickets', 'priority', 'paid support', 5),
    ]);

    const candidates = await index.searchDictionaryCandidates({ queryText: 'paid', limit: 25 });
    expect(candidates).toEqual([
      expect.objectContaining({
        id: 'warehouse/orders',
        connectionId: 'warehouse',
        sourceName: 'orders',
        rank: 1,
        matches: [
          { column: 'channel', values: ['paid search'] },
          { column: 'status', values: ['paid'] },
        ],
      }),
      expect.objectContaining({
        id: 'warehouse/tickets',
        connectionId: 'warehouse',
        sourceName: 'tickets',
        rank: 2,
        matches: [{ column: 'priority', values: ['paid support'] }],
      }),
    ]);
  });

  it('returns an empty result for blank or punctuation-only queries', async () => {
    const index = openIndex();
    await index.upsertSources('warehouse', [textRow('orders', 'orders revenue')]);

    for (const query of [' ', '---']) {
      expect(await index.search('warehouse', null, query, 10)).toEqual([]);
    }
  });
});
|
||||
549
packages/context/src/sl/sqlite-sl-sources-index.ts
Normal file
549
packages/context/src/sl/sqlite-sl-sources-index.ts
Normal file
|
|
@ -0,0 +1,549 @@
|
|||
import { mkdirSync } from 'node:fs';
|
||||
import { dirname } from 'node:path';
|
||||
import Database from 'better-sqlite3';
|
||||
import type { SlSourcesIndexPort } from './ports.js';
|
||||
import type { SlDictionaryEntry } from './sl-dictionary-profile.js';
|
||||
import type { SlDictionaryMatch } from './types.js';
|
||||
|
||||
/** Construction options for the SQLite-backed sources index. */
export interface SqliteSlSourcesIndexOptions {
  /** Path of the SQLite database file; its parent directory is created when missing. */
  dbPath: string;
}

/** Row shape read from `local_sl_sources` when listing what is already indexed. */
type ExistingRow = {
  source_name: string;
  search_text: string;
  /** JSON-encoded embedding, or `null` when the row was stored without one. */
  embedding_json: string | null;
};

/** Row shape returned by the FTS queries over `local_sl_sources_fts`. */
type SearchRow = {
  /** Only selected by the cross-connection lexical lane query. */
  connection_id?: string;
  source_name: string;
  /** `bm25(...)` value of the match. */
  rank: number;
};

/** One ranked search hit for a source, as produced by a single search lane. */
export interface SlSqliteLaneCandidate {
  /** `<connectionId>/<sourceName>`. */
  id: string;
  connectionId: string;
  sourceName: string;
  /** 1-based position within the lane's result list. */
  rank: number;
  /** Lane-specific score: bm25 value, cosine similarity, or number of matched values. */
  rawScore: number;
}

/** Dictionary-lane candidate, which additionally lists the matched column values. */
export interface SlSqliteDictionaryCandidate extends SlSqliteLaneCandidate {
  matches: SlDictionaryMatch[];
}

/** Row shape read from `local_sl_sources` for the semantic (embedding) lane. */
type IndexedSourceRow = {
  connection_id: string;
  source_name: string;
  embedding_json: string | null;
};

/** Row shape shared by the FTS and substring queries over the dictionary tables. */
type DictionarySearchRow = {
  connection_id: string;
  source_name: string;
  column_name: string;
  value: string;
  /** bm25 value for FTS hits; `null` for rows found by the substring query. */
  rank: number | null;
};
|
||||
|
||||
/** Builds the candidate identifier `<connectionId>/<sourceName>`. */
function candidateId(connectionId: string, sourceName: string): string {
  return [connectionId, sourceName].join('/');
}
|
||||
|
||||
/**
 * Cosine similarity of two numeric vectors.
 *
 * Returns 0 when the vectors are empty, differ in length, or either has zero magnitude.
 */
function cosineSimilarity(left: number[], right: number[]): number {
  const dimensions = left.length;
  if (dimensions === 0 || dimensions !== right.length) {
    return 0;
  }

  let dotProduct = 0;
  let leftSquares = 0;
  let rightSquares = 0;
  left.forEach((rawLeft, position) => {
    // Holes / undefined entries count as 0.
    const a = rawLeft ?? 0;
    const b = right[position] ?? 0;
    dotProduct += a * b;
    leftSquares += a * a;
    rightSquares += b * b;
  });

  const hasZeroVector = leftSquares === 0 || rightSquares === 0;
  return hasZeroVector ? 0 : dotProduct / (Math.sqrt(leftSquares) * Math.sqrt(rightSquares));
}
|
||||
|
||||
/**
 * Turns free text into an FTS5 `MATCH` expression.
 *
 * The text is lower-cased and split into terms on every run of characters that is
 * not a letter, combining mark, digit or underscore. Letters and digits are matched
 * by Unicode category, so non-ASCII words such as "café" stay whole instead of being
 * cut at the first non-ASCII character. Duplicate terms are dropped, each term is
 * wrapped in double quotes, and the terms are combined with `OR`.
 *
 * @param query - Raw user query.
 * @returns The `MATCH` expression, or an empty string when the query has no terms.
 */
function normalizeFtsQuery(query: string): string {
  const terms = query
    .toLowerCase()
    .split(/[^\p{L}\p{M}\p{N}_]+/u)
    .filter((term) => term.length > 0);

  // Terms cannot contain a double quote after the split; the doubling is kept as a
  // guard so a change to the split pattern can never break out of the quoted phrase.
  return [...new Set(terms)].map((term) => `"${term.replace(/"/g, '""')}"`).join(' OR ');
}
|
||||
|
||||
/**
 * Maps an FTS rank to a score via `1 / (1 + |rank|)`, rounded to six decimal places.
 * A rank of 0 gives 1; the score shrinks as the rank's magnitude grows.
 */
function scoreFromRank(rank: number): number {
  const magnitude = Math.abs(rank);
  const score = 1 / (1 + magnitude);
  return Number.parseFloat(score.toFixed(6));
}
|
||||
|
||||
/**
 * SQLite-backed index of semantic-layer sources.
 *
 * Each source is stored twice: a row in `local_sl_sources` (search text, optional
 * JSON-encoded embedding, content hash) and a mirror row in the FTS5 table
 * `local_sl_sources_fts`. Dictionary values follow the same pattern with
 * `local_sl_dictionary_values` and `local_sl_dictionary_values_fts`. Every write
 * updates a base table and its FTS mirror inside one transaction.
 */
export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
  private readonly db: Database.Database;

  /**
   * Opens (creating if needed) the database at `options.dbPath`, including its parent
   * directory, enables WAL journaling and foreign keys, and creates the tables.
   */
  constructor(options: SqliteSlSourcesIndexOptions) {
    mkdirSync(dirname(options.dbPath), { recursive: true });
    this.db = new Database(options.dbPath);
    this.db.pragma('journal_mode = WAL');
    this.db.pragma('foreign_keys = ON');
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS local_sl_sources (
        connection_id TEXT NOT NULL,
        source_name TEXT NOT NULL,
        search_text TEXT NOT NULL,
        embedding_json TEXT,
        content_hash TEXT,
        updated_at TEXT NOT NULL,
        PRIMARY KEY (connection_id, source_name)
      );

      CREATE VIRTUAL TABLE IF NOT EXISTS local_sl_sources_fts USING fts5(
        connection_id UNINDEXED,
        source_name UNINDEXED,
        search_text
      );

      CREATE TABLE IF NOT EXISTS local_sl_dictionary_values (
        connection_id TEXT NOT NULL,
        source_name TEXT NOT NULL,
        column_name TEXT NOT NULL,
        value TEXT NOT NULL,
        value_lower TEXT NOT NULL,
        cardinality INTEGER,
        updated_at TEXT NOT NULL,
        PRIMARY KEY (connection_id, source_name, column_name, value)
      );

      CREATE VIRTUAL TABLE IF NOT EXISTS local_sl_dictionary_values_fts USING fts5(
        connection_id UNINDEXED,
        source_name UNINDEXED,
        column_name UNINDEXED,
        value
      );
    `);
  }

  /**
   * Inserts or updates the given sources for one connection in a single transaction.
   *
   * An existing `content_hash` is kept when the incoming row does not provide one.
   * The FTS mirror row is deleted and re-inserted for every source.
   */
  async upsertSources(
    connectionId: string,
    sources: Array<{ sourceName: string; searchText: string; embedding: number[] | null; contentHash?: string | null }>,
  ): Promise<void> {
    if (sources.length === 0) {
      return;
    }

    const upsertRow = this.db.prepare(`
      INSERT INTO local_sl_sources (
        connection_id,
        source_name,
        search_text,
        embedding_json,
        content_hash,
        updated_at
      )
      VALUES (
        @connectionId,
        @sourceName,
        @searchText,
        @embeddingJson,
        @contentHash,
        @updatedAt
      )
      ON CONFLICT(connection_id, source_name) DO UPDATE SET
        search_text = excluded.search_text,
        embedding_json = excluded.embedding_json,
        content_hash = COALESCE(excluded.content_hash, local_sl_sources.content_hash),
        updated_at = excluded.updated_at
    `);
    const deleteFts = this.db.prepare(`
      DELETE FROM local_sl_sources_fts
      WHERE connection_id = @connectionId
        AND source_name = @sourceName
    `);
    const insertFts = this.db.prepare(`
      INSERT INTO local_sl_sources_fts (connection_id, source_name, search_text)
      VALUES (@connectionId, @sourceName, @searchText)
    `);

    const transaction = this.db.transaction(
      (
        rows: Array<{
          sourceName: string;
          searchText: string;
          embedding: number[] | null;
          contentHash?: string | null;
        }>,
      ) => {
        // One timestamp for the whole batch.
        const updatedAt = new Date().toISOString();
        for (const source of rows) {
          const row = {
            connectionId,
            sourceName: source.sourceName,
            searchText: source.searchText,
            embeddingJson: source.embedding ? JSON.stringify(source.embedding) : null,
            contentHash: source.contentHash ?? null,
            updatedAt,
          };
          upsertRow.run(row);
          deleteFts.run(row);
          insertFts.run(row);
        }
      },
    );

    transaction(sources);
  }

  /**
   * Lists what is indexed for a connection.
   *
   * @returns Map from source name to its stored search text and whether an embedding is stored.
   */
  async getExistingSearchTexts(
    connectionId: string,
  ): Promise<Map<string, { searchText: string; hasEmbedding: boolean }>> {
    const rows = this.db
      .prepare(
        `
          SELECT source_name, search_text, embedding_json
          FROM local_sl_sources
          WHERE connection_id = ?
          ORDER BY source_name ASC
        `,
      )
      .all(connectionId) as ExistingRow[];

    return new Map(
      rows.map((row) => [row.source_name, { searchText: row.search_text, hasEmbedding: row.embedding_json !== null }]),
    );
  }

  /**
   * Deletes every source of `connectionId` whose name is not in `keepNames`.
   * An empty `keepNames` removes all sources of the connection.
   *
   * The keep-list is compared in memory rather than bound into a `NOT IN (...)` clause,
   * so its size is not limited by SQLite's maximum number of bound parameters, and the
   * lookup runs inside the same transaction as the deletes.
   */
  async deleteStale(connectionId: string, keepNames: string[]): Promise<void> {
    if (keepNames.length === 0) {
      await this.deleteByConnection(connectionId);
      return;
    }

    const keep = new Set(keepNames);
    const selectNames = this.db.prepare(`
      SELECT source_name
      FROM local_sl_sources
      WHERE connection_id = ?
    `);
    const deleteFts = this.db.prepare(`
      DELETE FROM local_sl_sources_fts
      WHERE connection_id = ?
        AND source_name = ?
    `);
    const deleteRow = this.db.prepare(`
      DELETE FROM local_sl_sources
      WHERE connection_id = ?
        AND source_name = ?
    `);

    const remove = this.db.transaction(() => {
      const indexed = selectNames.all(connectionId) as Array<{ source_name: string }>;
      for (const { source_name: sourceName } of indexed) {
        if (keep.has(sourceName)) {
          continue;
        }
        deleteFts.run(connectionId, sourceName);
        deleteRow.run(connectionId, sourceName);
      }
    });

    remove();
  }

  /** Deletes all sources of a connection from the base table and the FTS mirror. */
  async deleteByConnection(connectionId: string): Promise<void> {
    const remove = this.db.transaction(() => {
      this.db.prepare('DELETE FROM local_sl_sources_fts WHERE connection_id = ?').run(connectionId);
      this.db.prepare('DELETE FROM local_sl_sources WHERE connection_id = ?').run(connectionId);
    });
    remove();
  }

  /** Deletes one source of a connection from the base table and the FTS mirror. */
  async deleteByConnectionAndName(connectionId: string, sourceName: string): Promise<void> {
    this.deleteByConnectionAndNameSync(connectionId, sourceName);
  }

  /**
   * Replaces all dictionary values of `connectionId` with `entries`.
   *
   * Entries whose `connectionId` differs from the argument are ignored. Delete and
   * insert run in ONE transaction: if an insert fails (for example on a duplicate
   * `(source, column, value)` key) the previous entries are rolled back into place
   * instead of being lost.
   */
  async replaceDictionaryEntries(connectionId: string, entries: SlDictionaryEntry[]): Promise<void> {
    const deleteFts = this.db.prepare('DELETE FROM local_sl_dictionary_values_fts WHERE connection_id = ?');
    const deleteRows = this.db.prepare('DELETE FROM local_sl_dictionary_values WHERE connection_id = ?');
    const insertRow = this.db.prepare(`
      INSERT INTO local_sl_dictionary_values (
        connection_id,
        source_name,
        column_name,
        value,
        value_lower,
        cardinality,
        updated_at
      )
      VALUES (
        @connectionId,
        @sourceName,
        @columnName,
        @value,
        @valueLower,
        @cardinality,
        @updatedAt
      )
    `);
    const insertFts = this.db.prepare(`
      INSERT INTO local_sl_dictionary_values_fts (connection_id, source_name, column_name, value)
      VALUES (@connectionId, @sourceName, @columnName, @value)
    `);

    const replace = this.db.transaction((rows: SlDictionaryEntry[]) => {
      deleteFts.run(connectionId);
      deleteRows.run(connectionId);

      const updatedAt = new Date().toISOString();
      for (const entry of rows) {
        if (entry.connectionId !== connectionId) {
          continue;
        }
        const row = {
          connectionId: entry.connectionId,
          sourceName: entry.sourceName,
          columnName: entry.columnName,
          value: entry.value,
          valueLower: entry.value.toLowerCase(),
          cardinality: entry.cardinality,
          updatedAt,
        };
        insertRow.run(row);
        insertFts.run(row);
      }
    });

    replace(entries);
  }

  /**
   * Full-text lane: ranks sources by bm25 over their search text.
   *
   * @param input.connectionIds - Optional connection filter; empty or missing means all connections.
   * @param input.queryText - Free-text query; returns `[]` when it contains no terms.
   * @param input.limit - Maximum number of candidates (at least 1 is requested).
   * @returns Candidates with 1-based `rank` and the bm25 value as `rawScore`.
   */
  async searchLexicalCandidates(input: {
    connectionIds?: readonly string[];
    queryText: string;
    limit: number;
  }): Promise<SlSqliteLaneCandidate[]> {
    const ftsQuery = normalizeFtsQuery(input.queryText);
    if (!ftsQuery) {
      return [];
    }

    const connectionIds = [...new Set(input.connectionIds ?? [])].sort();
    const connectionPredicate =
      connectionIds.length > 0 ? `AND connection_id IN (${connectionIds.map(() => '?').join(', ')})` : '';
    const rows = this.db
      .prepare(
        `
          SELECT connection_id, source_name, bm25(local_sl_sources_fts) AS rank
          FROM local_sl_sources_fts
          WHERE local_sl_sources_fts MATCH ?
            ${connectionPredicate}
          ORDER BY rank ASC, connection_id ASC, source_name ASC
          LIMIT ?
        `,
      )
      .all(ftsQuery, ...connectionIds, Math.max(1, input.limit)) as Array<SearchRow & { connection_id: string }>;

    return rows.map((row, index) => ({
      id: candidateId(row.connection_id, row.source_name),
      connectionId: row.connection_id,
      sourceName: row.source_name,
      rank: index + 1,
      rawScore: Number(row.rank),
    }));
  }

  /**
   * Embedding lane: ranks sources by cosine similarity to `queryEmbedding`.
   *
   * All rows of the selected connections are loaded and scored in memory. Rows without
   * an embedding, or whose stored JSON is not an array of numbers, are skipped.
   *
   * @returns Candidates sorted by descending similarity, with 1-based `rank`.
   */
  async searchSemanticCandidates(input: {
    connectionIds?: readonly string[];
    queryEmbedding: number[];
    limit: number;
  }): Promise<SlSqliteLaneCandidate[]> {
    const connectionIds = [...new Set(input.connectionIds ?? [])].sort();
    const connectionPredicate =
      connectionIds.length > 0 ? `WHERE connection_id IN (${connectionIds.map(() => '?').join(', ')})` : '';
    const rows = this.db
      .prepare(
        `
          SELECT connection_id, source_name, embedding_json
          FROM local_sl_sources
          ${connectionPredicate}
          ORDER BY connection_id ASC, source_name ASC
        `,
      )
      .all(...connectionIds) as IndexedSourceRow[];

    return rows
      .flatMap((row) => {
        if (!row.embedding_json) {
          return [];
        }
        try {
          const embedding = JSON.parse(row.embedding_json) as unknown;
          if (!Array.isArray(embedding) || !embedding.every((value) => typeof value === 'number')) {
            return [];
          }
          return [
            {
              id: candidateId(row.connection_id, row.source_name),
              connectionId: row.connection_id,
              sourceName: row.source_name,
              rank: 0,
              rawScore: cosineSimilarity(input.queryEmbedding, embedding),
            },
          ];
        } catch {
          // Unparseable stored embedding: treat the row as having none.
          return [];
        }
      })
      .sort(
        (left, right) =>
          right.rawScore - left.rawScore ||
          left.connectionId.localeCompare(right.connectionId) ||
          left.sourceName.localeCompare(right.sourceName),
      )
      .slice(0, Math.max(1, input.limit))
      .map((candidate, index) => ({ ...candidate, rank: index + 1 }));
  }

  /**
   * Dictionary lane: finds sources whose profiled column values match the query.
   *
   * Two queries are combined: an FTS match on the values and a case-insensitive
   * substring match on the whole trimmed query. Hits are de-duplicated, grouped per
   * source and column (at most 5 values per column), and ranked by the number of
   * listed values, then by the number of matched columns.
   *
   * The substring pattern escapes `%`, `_` and `\`, so those characters in the query
   * are matched literally instead of acting as `LIKE` wildcards.
   */
  async searchDictionaryCandidates(input: {
    connectionIds?: readonly string[];
    queryText: string;
    limit: number;
  }): Promise<SlSqliteDictionaryCandidate[]> {
    const ftsQuery = normalizeFtsQuery(input.queryText);
    const normalizedQuery = input.queryText.trim().toLowerCase();
    if (!ftsQuery && !normalizedQuery) {
      return [];
    }

    const connectionIds = [...new Set(input.connectionIds ?? [])].sort();
    const connectionPredicate =
      connectionIds.length > 0 ? `AND connection_id IN (${connectionIds.map(() => '?').join(', ')})` : '';
    // Over-fetch rows because several rows collapse into one source-level candidate.
    const rowLimit = Math.max(25, input.limit * 4);

    const ftsRows = ftsQuery
      ? (this.db
          .prepare(
            `
              SELECT connection_id, source_name, column_name, value, bm25(local_sl_dictionary_values_fts) AS rank
              FROM local_sl_dictionary_values_fts
              WHERE local_sl_dictionary_values_fts MATCH ?
                ${connectionPredicate}
              ORDER BY rank ASC, connection_id ASC, source_name ASC, column_name ASC, value ASC
              LIMIT ?
            `,
          )
          .all(ftsQuery, ...connectionIds, rowLimit) as DictionarySearchRow[])
      : [];

    const substringRows = normalizedQuery
      ? (this.db
          .prepare(
            `
              SELECT connection_id, source_name, column_name, value, NULL AS rank
              FROM local_sl_dictionary_values
              WHERE value_lower LIKE ? ESCAPE '\\'
                ${connectionPredicate}
              ORDER BY connection_id ASC, source_name ASC, column_name ASC, value ASC
              LIMIT ?
            `,
          )
          .all(
            `%${SqliteSlSourcesIndex.escapeLikePattern(normalizedQuery)}%`,
            ...connectionIds,
            rowLimit,
          ) as DictionarySearchRow[])
      : [];

    // De-duplicate rows found by both queries; FTS rows come first and win.
    const rowsByKey = new Map<string, DictionarySearchRow>();
    for (const row of [...ftsRows, ...substringRows]) {
      // JSON-encoding the tuple keeps keys distinct even when a part contains '/'.
      const key = JSON.stringify([row.connection_id, row.source_name, row.column_name, row.value]);
      if (!rowsByKey.has(key)) {
        rowsByKey.set(key, row);
      }
    }

    const grouped = new Map<string, DictionarySearchRow[]>();
    for (const row of rowsByKey.values()) {
      const id = candidateId(row.connection_id, row.source_name);
      const bucket = grouped.get(id);
      if (bucket) {
        bucket.push(row);
      } else {
        grouped.set(id, [row]);
      }
    }

    return [...grouped.entries()]
      .map(([id, rows]) => {
        const [first] = rows;
        const byColumn = new Map<string, string[]>();
        const ordered = [...rows].sort(
          (left, right) => left.column_name.localeCompare(right.column_name) || left.value.localeCompare(right.value),
        );
        for (const row of ordered) {
          const values = byColumn.get(row.column_name);
          if (values) {
            values.push(row.value);
          } else {
            byColumn.set(row.column_name, [row.value]);
          }
        }
        const matches = [...byColumn.entries()].map(([column, values]) => ({ column, values: values.slice(0, 5) }));
        return {
          id,
          connectionId: first?.connection_id ?? '',
          sourceName: first?.source_name ?? '',
          rank: 0,
          rawScore: matches.reduce((total, match) => total + match.values.length, 0),
          matches,
        };
      })
      .sort(
        (left, right) =>
          right.rawScore - left.rawScore ||
          right.matches.length - left.matches.length ||
          left.connectionId.localeCompare(right.connectionId) ||
          left.sourceName.localeCompare(right.sourceName),
      )
      .slice(0, Math.max(1, input.limit))
      .map((candidate, index) => ({ ...candidate, rank: index + 1 }));
  }

  /**
   * Full-text search within one connection.
   *
   * The embedding argument is accepted for interface compatibility but not used here;
   * ranking is bm25 only, converted to a score with `scoreFromRank`.
   *
   * @returns Hits whose score is at least `minRrfScore`, best first.
   */
  async search(
    connectionId: string,
    _queryEmbedding: number[] | null,
    queryText: string,
    limit: number,
    minRrfScore = 0,
  ): Promise<Array<{ sourceName: string; rrfScore: number }>> {
    const ftsQuery = normalizeFtsQuery(queryText);
    if (!ftsQuery) {
      return [];
    }

    const rows = this.db
      .prepare(
        `
          SELECT source_name, bm25(local_sl_sources_fts) AS rank
          FROM local_sl_sources_fts
          WHERE connection_id = ?
            AND local_sl_sources_fts MATCH ?
          ORDER BY rank ASC, source_name ASC
          LIMIT ?
        `,
      )
      .all(connectionId, ftsQuery, Math.max(1, limit)) as SearchRow[];

    return rows
      .map((row) => ({ sourceName: row.source_name, rrfScore: scoreFromRank(row.rank) }))
      .filter((row) => row.rrfScore >= minRrfScore);
  }

  /** Escapes `\`, `%` and `_` for use in a `LIKE ... ESCAPE '\'` pattern. */
  private static escapeLikePattern(text: string): string {
    return text.replace(/[\\%_]/g, '\\$&');
  }

  /** Synchronous core of `deleteByConnectionAndName`: removes the FTS row, then the base row. */
  private deleteByConnectionAndNameSync(connectionId: string, sourceName: string): void {
    const remove = this.db.transaction(() => {
      this.db
        .prepare(
          `
            DELETE FROM local_sl_sources_fts
            WHERE connection_id = ?
              AND source_name = ?
          `,
        )
        .run(connectionId, sourceName);
      this.db
        .prepare(
          `
            DELETE FROM local_sl_sources
            WHERE connection_id = ?
              AND source_name = ?
          `,
        )
        .run(connectionId, sourceName);
    });
    remove();
  }
}
|
||||
154
packages/context/src/sl/tools/base-semantic-layer.tool.ts
Normal file
154
packages/context/src/sl/tools/base-semantic-layer.tool.ts
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
import type { ZodType } from 'zod';
|
||||
import type { GitAuthorResolverPort, ToolContext, ToolOutput } from '../../tools/index.js';
|
||||
import { BaseTool } from '../../tools/index.js';
|
||||
import { sourceDefinitionSchema } from '../schemas.js';
|
||||
import { SemanticLayerService } from '../semantic-layer.service.js';
|
||||
import { SlSearchService } from '../sl-search.service.js';
|
||||
|
||||
export { sourceDefinitionSchema };
|
||||
|
||||
// ── Shared output types ──
|
||||
|
||||
/**
 * Machine-readable result shared by the semantic-layer write/edit tools.
 * `BaseSemanticLayerTool.buildOutput` produces this shape.
 */
export interface SemanticLayerStructured {
  /** Whether the operation succeeded. */
  success: boolean;
  /** Name of the source the operation targeted. */
  sourceName: string;
  /** YAML content of the source, when the caller supplied it. */
  yaml?: string;
  /** Commit hash of the change, when the caller supplied it. */
  commitHash?: string;
  /** Operation errors; `buildOutput` sets this only when the list is non-empty. */
  errors?: string[];
  /** Validation errors; `buildOutput` sets this only when the list is non-empty. */
  validationErrors?: string[];
  /** Validation warnings; `buildOutput` sets this only when the list is non-empty. */
  validationWarnings?: string[];
  /** Warnings rendered under "Action required"; set only when the list is non-empty. */
  actionRequiredWarnings?: string[];
}
|
||||
|
||||
/** Collaborators injected into every `BaseSemanticLayerTool` subclass. */
export interface BaseSemanticLayerToolDeps {
  /** Default service for source files; a session-scoped service may override it per call. */
  semanticLayerService: SemanticLayerService;
  /** Search service over semantic layer sources. */
  slSearchService: SlSearchService;
  /** Resolves the author name/email used when a tool writes changes. */
  authorResolver: GitAuthorResolverPort;
}
|
||||
|
||||
// ── Abstract base class ──
|
||||
|
||||
/**
 * Common base for semantic-layer tools.
 *
 * Holds the injected services and provides helpers for reading a source's
 * YAML and for rendering a consistent markdown + structured tool result.
 */
export abstract class BaseSemanticLayerTool<TInput extends ZodType = ZodType> extends BaseTool<TInput> {
  protected readonly semanticLayerService: SemanticLayerService;
  protected readonly slSearchService: SlSearchService;
  protected readonly authorResolver: GitAuthorResolverPort;

  constructor(deps: BaseSemanticLayerToolDeps) {
    super();
    const { semanticLayerService, slSearchService, authorResolver } = deps;
    this.semanticLayerService = semanticLayerService;
    this.slSearchService = slSearchService;
    this.authorResolver = authorResolver;
  }

  /**
   * Reads the raw YAML of a source.
   *
   * Uses the session's semantic layer service when the context carries one,
   * otherwise the injected default.
   *
   * @returns The file content, or `null` when reading fails for any reason.
   */
  protected async readSourceYaml(
    connectionId: string,
    sourceName: string,
    context?: ToolContext,
  ): Promise<string | null> {
    const service = context?.session?.semanticLayerService ?? this.semanticLayerService;

    try {
      const file = await service.readSourceFile(connectionId, sourceName);
      return file.content;
    } catch {
      // Any read failure is reported to the caller as "no YAML available".
      return null;
    }
  }

  /**
   * Renders the human-readable summary of a write/edit operation.
   *
   * Section order: status line (plus one bullet per error on failure), commit
   * hash, action-required warnings, validation errors, validation warnings,
   * and finally the YAML (truncated to 2000 characters).
   *
   * @param success - Whether the operation succeeded.
   * @param errors - Errors listed under the status line when `success` is false.
   * @param sourceName - Source name shown in the status line.
   * @param extra - Optional details; `editCount` switches the success verb from
   *   "saved" to "applied N edit(s) to".
   */
  protected buildMarkdown(
    success: boolean,
    errors: string[],
    sourceName: string,
    extra?: {
      yaml?: string;
      commitHash?: string;
      validationErrors?: string[];
      validationWarnings?: string[];
      actionRequiredWarnings?: string[];
      editCount?: number;
    },
  ): string {
    const lines: string[] = [];

    // Appends one "- item" bullet per entry.
    const pushBullets = (items: string[]): void => {
      for (const item of items) {
        lines.push(`- ${item}`);
      }
    };
    // Appends a heading followed by bullets, skipping absent or empty lists.
    const pushSection = (heading: string, items: string[] | undefined): void => {
      if (!items || items.length === 0) {
        return;
      }
      lines.push(heading);
      pushBullets(items);
    };

    if (!success) {
      lines.push(`Source **${sourceName}** update completed with ${errors.length} error(s):`);
      pushBullets(errors);
    } else {
      let verb = 'saved';
      if (extra?.editCount != null) {
        verb = `applied ${extra.editCount} edit(s) to`;
      }
      lines.push(`Source **${sourceName}** ${verb} successfully.`);
    }

    if (extra?.commitHash) {
      lines.push(`Commit: \`${extra.commitHash}\``);
    }

    pushSection('\n**Action required:**', extra?.actionRequiredWarnings);
    pushSection('\n**Validation errors:**', extra?.validationErrors);
    pushSection('\n**Validation warnings:**', extra?.validationWarnings);

    const yaml = extra?.yaml;
    if (yaml) {
      const MAX_YAML = 2000;
      const isTruncated = yaml.length > MAX_YAML;
      lines.push(
        isTruncated
          ? `\n**YAML** (${yaml.length} chars, truncated):\n\`\`\`yaml\n${yaml.slice(0, MAX_YAML)}...\n\`\`\``
          : `\n**YAML**:\n\`\`\`yaml\n${yaml}\n\`\`\``,
      );
    }

    return lines.join('\n');
  }

  /**
   * Builds the full tool output: markdown from `buildMarkdown` plus the
   * structured payload. List fields are attached to the payload only when
   * they contain at least one entry.
   */
  protected buildOutput(
    success: boolean,
    errors: string[],
    sourceName: string,
    extra?: {
      yaml?: string;
      commitHash?: string;
      validationErrors?: string[];
      validationWarnings?: string[];
      actionRequiredWarnings?: string[];
      editCount?: number;
    },
  ): ToolOutput<SemanticLayerStructured> {
    const structured: SemanticLayerStructured = {
      success,
      sourceName,
      yaml: extra?.yaml,
      commitHash: extra?.commitHash,
    };

    if (errors.length > 0) {
      structured.errors = errors;
    }

    const validationErrors = extra?.validationErrors;
    if (validationErrors && validationErrors.length > 0) {
      structured.validationErrors = validationErrors;
    }

    const validationWarnings = extra?.validationWarnings;
    if (validationWarnings && validationWarnings.length > 0) {
      structured.validationWarnings = validationWarnings;
    }

    const actionRequiredWarnings = extra?.actionRequiredWarnings;
    if (actionRequiredWarnings && actionRequiredWarnings.length > 0) {
      structured.actionRequiredWarnings = actionRequiredWarnings;
    }

    return {
      markdown: this.buildMarkdown(success, errors, sourceName, extra),
      structured,
    };
  }
}
|
||||
18
packages/context/src/sl/tools/connection-id-schema.test.ts
Normal file
18
packages/context/src/sl/tools/connection-id-schema.test.ts
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { slToolConnectionIdSchema } from './connection-id-schema.js';
|
||||
|
||||
// Covers which connection id shapes the SL tool schema lets through.
describe('slToolConnectionIdSchema', () => {
  it('accepts app UUIDs and local project connection ids', () => {
    const accepted = ['00000000-0000-4000-8000-000000000001', 'warehouse', 'warehouse_prod-1'];

    for (const id of accepted) {
      // A valid id must be returned unchanged by parse().
      expect(slToolConnectionIdSchema.parse(id)).toBe(id);
    }
  });

  it('rejects empty, path-like, and hidden connection ids', () => {
    const rejected = ['', '../warehouse', 'warehouse/prod', '.warehouse', 'warehouse prod'];

    rejected.forEach((id) => {
      expect(() => slToolConnectionIdSchema.parse(id)).toThrow();
    });
  });
});
|
||||
6
packages/context/src/sl/tools/connection-id-schema.ts
Normal file
6
packages/context/src/sl/tools/connection-id-schema.ts
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
import { z } from 'zod';
|
||||
|
||||
// First character must be alphanumeric; the rest may also contain `_` or `-`.
const CONNECTION_ID_PATTERN = /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/;
const CONNECTION_ID_MESSAGE = 'Connection id must be alphanumeric and may contain _ or -';

/**
 * Schema for the `connectionId` argument of semantic-layer tools: a non-empty
 * string matching `CONNECTION_ID_PATTERN`.
 */
export const slToolConnectionIdSchema = z.string().min(1).regex(CONNECTION_ID_PATTERN, CONNECTION_ID_MESSAGE);
|
||||
11
packages/context/src/sl/tools/index.ts
Normal file
11
packages/context/src/sl/tools/index.ts
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
export type { BaseSemanticLayerToolDeps, SemanticLayerStructured } from './base-semantic-layer.tool.js';
|
||||
export { BaseSemanticLayerTool, sourceDefinitionSchema } from './base-semantic-layer.tool.js';
|
||||
export type { SlDiscoverySettings } from './sl-discover.tool.js';
|
||||
export { SlDiscoverTool } from './sl-discover.tool.js';
|
||||
export { SlEditSourceTool } from './sl-edit-source.tool.js';
|
||||
export { SlReadSourceTool } from './sl-read-source.tool.js';
|
||||
export { SlRollbackTool } from './sl-rollback.tool.js';
|
||||
export { SlValidateTool, validateSemanticLayerEndpoint } from './sl-validate.tool.js';
|
||||
export { SlWriteSourceTool } from './sl-write-source.tool.js';
|
||||
export type { SlValidationDeps, SourceValidationResult } from './sl-warehouse-validation.js';
|
||||
export { revertSourceToPreHead, validateSingleSource } from './sl-warehouse-validation.js';
|
||||
337
packages/context/src/sl/tools/sl-discover.tool.ts
Normal file
337
packages/context/src/sl/tools/sl-discover.tool.ts
Normal file
|
|
@ -0,0 +1,337 @@
|
|||
import { z } from 'zod';
|
||||
import { DEFAULT_PRIORITY, resolveDescription } from '../descriptions.js';
|
||||
import type { SemanticLayerSource } from '../types.js';
|
||||
import type { ToolContext, ToolOutput } from '../../tools/index.js';
|
||||
import { BaseSemanticLayerTool, type BaseSemanticLayerToolDeps } from './base-semantic-layer.tool.js';
|
||||
import { slToolConnectionIdSchema } from './connection-id-schema.js';
|
||||
|
||||
/** Tuning knobs for `SlDiscoverTool`. */
export interface SlDiscoverySettings {
  /** Result limit passed to the search service when filtering by query. */
  maxSources: number;
  /** Minimum score passed to the search service when filtering by query. */
  minRrfScore: number;
  /** How many leading sources are rendered in full detail when a query is given. */
  maxDetailedSources: number;
}
|
||||
|
||||
// Builds an optional string field carrying the given description.
const optionalText = (description: string) => z.string().optional().describe(description);

/** Input accepted by `sl_discover`; every field is optional. */
const slDiscoverInputSchema = z.object({
  connectionId: slToolConnectionIdSchema
    .optional()
    .describe('Data source connection ID (omit to discover across all data sources)'),
  query: optionalText('Search query to filter sources/columns/measures by name or description'),
  sourceName: optionalText(
    'Inspect a specific source in full detail (requires connectionId if multiple data sources)',
  ),
});

/** Parsed form of `slDiscoverInputSchema`. */
type SlDiscoverInput = z.infer<typeof slDiscoverInputSchema>;
|
||||
|
||||
/** Structured payload returned by `sl_discover`. */
interface SlDiscoverStructured {
  /** One summary entry per source included in the result. */
  sources: Array<{
    /** ID of the connection the source belongs to. */
    connectionId: string;
    /** Name reported for that connection. */
    connectionName: string;
    /** Source name. */
    name: string;
    /** Resolved description, when one is available. */
    description?: string;
    /** Number of columns on the source. */
    columnCount: number;
    /** Number of measures on the source. */
    measureCount: number;
    /** Number of joins on the source. */
    joinCount: number;
  }>;
  /** Optional free-form detail object. */
  detail?: Record<string, unknown>;
  /** Number of sources represented in this result. */
  totalSources: number;
}
|
||||
|
||||
/**
 * `sl_discover` tool: lists semantic layer sources, optionally filtered by a
 * query, or renders one source in detail.
 *
 * Resolution of the target connection:
 * - `connectionId` given: that connection is used.
 * - omitted, no connections: an empty result is returned.
 * - omitted, exactly one connection: it is selected automatically.
 * - omitted, several connections: sources are aggregated across all of them,
 *   unless `sourceName` is set, in which case the caller is asked to pick one.
 */
export class SlDiscoverTool extends BaseSemanticLayerTool<typeof slDiscoverInputSchema> {
  readonly name = 'sl_discover';

  constructor(
    deps: BaseSemanticLayerToolDeps,
    private readonly discoverySettings: SlDiscoverySettings,
  ) {
    super(deps);
  }

  get description(): string {
    return `<purpose>
Discover available semantic layer sources, columns, measures, and joins.
When called without a connectionId, discovers sources across ALL data sources — grouped by data source name and ID.
Use this to understand what data is available before writing a semantic_query.
</purpose>

<when_to_use>
- Before querying: understand available sources across all data sources
- To inspect a specific source in detail (columns, joins, measures, grain) — requires connectionId when multiple data sources exist
- To search for sources related to a concept (e.g., "revenue", "customers") across all data sources
</when_to_use>`;
  }

  get inputSchema() {
    return slDiscoverInputSchema;
  }

  /**
   * Entry point. See the class comment for how the connection is resolved.
   *
   * @param input - Parsed tool input (`connectionId`, `query`, `sourceName`, all optional).
   * @param _context - Not read by this tool.
   * @returns Markdown for the model plus a structured list of source summaries.
   */
  async call(input: SlDiscoverInput, _context: ToolContext): Promise<ToolOutput<SlDiscoverStructured>> {
    const { query, sourceName } = input;

    let connectionId = input.connectionId;
    // Display name of the connection; only known once the connection list has been fetched.
    let connectionName: string | undefined;

    if (!connectionId) {
      const connections = await this.semanticLayerService.listConnectionIdsWithNames();
      if (connections.length === 0) {
        return {
          markdown: 'No semantic layer sources found. Run a schema scan first.',
          structured: { sources: [], totalSources: 0 },
        };
      }

      if (connections.length > 1) {
        if (!sourceName) {
          return this.discoverAcrossConnections(connections, query);
        }
        // A source name is ambiguous across connections: ask the caller to choose one.
        const connectionList = connections
          .map((c) => `- **${c.name}** (${c.connectionType}): \`${c.id}\``)
          .join('\n');
        return {
          markdown: `Multiple data sources have semantic layer sources. Specify a connectionId to inspect source "${sourceName}":\n\n${connectionList}`,
          structured: { sources: [], totalSources: 0 },
        };
      }

      // Exactly one connection: select it and keep its name so it is not looked up again.
      connectionId = connections[0].id;
      connectionName = connections[0].name;
    }

    // Inspecting a specific source shows the SL interface (columns, measures, joins)
    // without the raw SQL. Use `sl_read_source` to see the full YAML including SQL.
    if (sourceName) {
      const sources = await this.semanticLayerService.loadAllSources(connectionId);
      const source = sources.find((s) => s.name === sourceName);
      if (!source) {
        return {
          markdown: `Source **${sourceName}** not found for this connection.`,
          structured: { sources: [], totalSources: 0 },
        };
      }

      const parts: string[] = [];
      this.appendSourceDetail(parts, source);
      if (source.grain?.length) {
        parts.push(`Grain: ${source.grain.join(', ')}`);
      }

      return {
        markdown: parts.join('\n'),
        structured: {
          // The ID stands in for the name when the name was not resolved above;
          // no extra lookup is made on this path.
          sources: [this.toSourceSummary(connectionId, connectionName ?? connectionId, source)],
          totalSources: 1,
        },
      };
    }

    // Single connection: list all sources. Resolve the display name if not already known.
    if (connectionName === undefined) {
      const connections = await this.semanticLayerService.listConnectionIdsWithNames();
      connectionName = connections.find((c) => c.id === connectionId)?.name ?? connectionId;
    }
    return this.discoverForConnection(connectionId, connectionName, query);
  }

  /** Builds the structured summary entry for one source, tolerating missing member lists. */
  private toSourceSummary(
    connectionId: string,
    connectionName: string,
    source: SemanticLayerSource,
  ): SlDiscoverStructured['sources'][number] {
    return {
      connectionId,
      connectionName,
      name: source.name,
      description: resolveDescription(source.descriptions, { priority: DEFAULT_PRIORITY }) ?? undefined,
      columnCount: (source.columns ?? []).length,
      measureCount: (source.measures ?? []).length,
      joinCount: (source.joins ?? []).length,
    };
  }

  /**
   * Loads (and, with a query, filters) the sources of every connection in
   * parallel and renders them grouped by connection. Connections without any
   * remaining sources are omitted from the output.
   */
  private async discoverAcrossConnections(
    connections: Array<{ id: string; name: string; connectionType: string }>,
    query?: string,
  ): Promise<ToolOutput<SlDiscoverStructured>> {
    const results = await Promise.all(
      connections.map(async (conn) => {
        const sources = await this.semanticLayerService.loadAllSources(conn.id);
        const filtered = query ? await this.filterByQuery(conn.id, sources, query) : sources;
        return { conn, sources: filtered };
      }),
    );

    const nonEmpty = results.filter((r) => r.sources.length > 0);
    if (nonEmpty.length === 0) {
      return {
        markdown: query
          ? `No semantic layer sources found matching "${query}".`
          : 'No semantic layer sources found. Run a schema scan first, or create sources with sl_write_source.',
        structured: { sources: [], totalSources: 0 },
      };
    }

    const allSummaries: SlDiscoverStructured['sources'] = [];
    const parts: string[] = [];
    let totalSources = 0;

    for (const { conn, sources } of nonEmpty) {
      totalSources += sources.length;

      parts.push(`## ${conn.name} (${conn.connectionType}) — \`${conn.id}\``);
      parts.push('');

      for (const s of sources) {
        allSummaries.push(this.toSourceSummary(conn.id, conn.name, s));
      }

      this.appendTieredSources(parts, sources, !!query);
    }

    const header = `**${totalSources} source(s) found across ${nonEmpty.length} data source(s)**${query ? ` matching "${query}"` : ''}:\n`;

    return {
      markdown: [header, ...parts].join('\n'),
      structured: { sources: allSummaries, totalSources },
    };
  }

  /** Lists the sources of a single connection, optionally filtered by `query`. */
  private async discoverForConnection(
    connectionId: string,
    connectionName: string,
    query?: string,
  ): Promise<ToolOutput<SlDiscoverStructured>> {
    const sources = await this.semanticLayerService.loadAllSources(connectionId);

    if (sources.length === 0) {
      return {
        markdown: 'No semantic layer sources found. Run a schema scan first, or create sources with sl_write_source.',
        structured: { sources: [], totalSources: 0 },
      };
    }

    const filtered = query ? await this.filterByQuery(connectionId, sources, query) : sources;
    const summaries = filtered.map((s) => this.toSourceSummary(connectionId, connectionName, s));

    const parts: string[] = [`**${filtered.length} source(s) found**${query ? ` matching "${query}"` : ''}:\n`];
    this.appendTieredSources(parts, filtered, !!query);

    return {
      markdown: parts.join('\n'),
      structured: { sources: summaries, totalSources: filtered.length },
    };
  }

  /**
   * Narrows `sources` to those relevant to `query`.
   *
   * The search service is asked first; its hits determine both membership and
   * ordering. When it returns nothing, a plain substring match over names and
   * descriptions is used instead.
   */
  private async filterByQuery(
    connectionId: string,
    sources: SemanticLayerSource[],
    query: string,
  ): Promise<SemanticLayerSource[]> {
    const config = this.discoverySettings;
    const searchResults = await this.slSearchService.search(connectionId, query, config.maxSources, config.minRrfScore);
    if (searchResults.length === 0) {
      return this.fallbackTermMatch(sources, query);
    }

    // Position of each hit in the search results; also serves as the membership test.
    const nameOrder = new Map(searchResults.map((r, i) => [r.sourceName, i]));
    return sources
      .filter((s) => nameOrder.has(s.name))
      .sort((a, b) => (nameOrder.get(a.name) ?? 0) - (nameOrder.get(b.name) ?? 0));
  }

  /**
   * Case-insensitive substring matching: each whitespace-separated query term
   * is looked up in the source's name, description, column names/descriptions
   * and measure names/descriptions. Sources are ordered by number of matching
   * terms (descending); sources with no match are dropped.
   */
  private fallbackTermMatch(sources: SemanticLayerSource[], query: string): SemanticLayerSource[] {
    const config = { priority: DEFAULT_PRIORITY };
    const terms = query.toLowerCase().split(/\s+/).filter(Boolean);

    return sources
      .map((s) => {
        const searchText = [
          s.name,
          resolveDescription(s.descriptions, config) ?? '',
          ...(s.columns ?? []).map((c) => `${c.name} ${resolveDescription(c.descriptions, config) ?? ''}`),
          ...(s.measures ?? []).map((m) => `${m.name} ${m.description ?? ''}`),
        ]
          .join(' ')
          .toLowerCase();
        const matchCount = terms.filter((term) => searchText.includes(term)).length;
        return { source: s, matchCount };
      })
      .filter((x) => x.matchCount > 0)
      .sort((a, b) => b.matchCount - a.matchCount)
      .map((x) => x.source);
  }

  /**
   * Render sources in two tiers:
   * - With a query, the first `maxDetailedSources` sources get full detail.
   * - All remaining sources (every source when there is no query) get a
   *   one-liner with name, description, measure count and column count.
   */
  private appendTieredSources(parts: string[], sources: SemanticLayerSource[], hasQuery: boolean): void {
    const detailLimit = hasQuery ? this.discoverySettings.maxDetailedSources : 0;
    const detailed = sources.slice(0, detailLimit);
    const rest = sources.slice(detailLimit);

    for (const s of detailed) {
      this.appendSourceDetail(parts, s);
    }

    if (rest.length === 0) {
      return;
    }

    if (detailed.length > 0) {
      parts.push('**Other sources** (pass `sourceName` to inspect):');
    }

    const defaultConfig = { priority: DEFAULT_PRIORITY };
    for (const s of rest) {
      const resolvedDesc = resolveDescription(s.descriptions, defaultConfig);
      const desc = resolvedDesc ? ` — ${resolvedDesc}` : '';
      const measureCount = (s.measures ?? []).length;
      const columnCount = (s.columns ?? []).length;
      const stats = [measureCount > 0 ? `${measureCount} measures` : null, `${columnCount} cols`]
        .filter(Boolean)
        .join(', ');
      parts.push(`- **${s.name}**${desc} (${stats})`);
    }
    parts.push('');
  }

  /** Full detail for a single source: metadata, measures, joins, all non-hidden columns. */
  private appendSourceDetail(parts: string[], s: SemanticLayerSource): void {
    // Guard the member lists the same way the summaries do, so a source with an
    // absent list renders as empty instead of throwing.
    const columns = s.columns ?? [];
    const measures = s.measures ?? [];
    const joins = s.joins ?? [];

    const detailDesc = resolveDescription(s.descriptions, { priority: DEFAULT_PRIORITY });
    parts.push(`### ${s.name}${detailDesc ? ` — ${detailDesc}` : ''}`);
    parts.push(
      `Type: ${s.sql ? 'sql' : 'table'} | Columns: ${columns.length} | Measures: ${measures.length} | Joins: ${joins.length}`,
    );

    if (measures.length > 0) {
      parts.push(`Measures: ${measures.map((m) => `\`${m.name}\` (${m.expr})`).join(', ')}`);
    }

    if (joins.length > 0) {
      parts.push(`Joins: ${joins.map((j) => `→ ${j.to} (${j.relationship})`).join(', ')}`);
    }

    const publicCols = columns.filter((c) => c.visibility !== 'hidden');
    if (publicCols.length > 0) {
      parts.push(`Columns: ${publicCols.map((c) => `\`${s.name}.${c.name}\` (${c.type})`).join(', ')}`);
    }

    parts.push('');
  }
}
|
||||
187
packages/context/src/sl/tools/sl-edit-source.tool.test.ts
Normal file
187
packages/context/src/sl/tools/sl-edit-source.tool.test.ts
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { ToolSession } from '../../tools/index.js';
|
||||
import { createTouchedSlSources, hasTouchedSlSource, type ToolContext } from '../../tools/index.js';
|
||||
import { SlEditSourceTool } from './sl-edit-source.tool.js';
|
||||
|
||||
/**
 * Builds an `SlEditSourceTool` wired to mocked services.
 *
 * `overrides.semanticLayerService` / `overrides.slSearchService` are spread
 * over the default mocks, so individual methods can be replaced per test.
 * Returns the tool together with both mock services for assertions.
 */
function makeTool(overrides: any = {}) {
  const ordersYaml =
    'name: orders\ntable: public.orders\ngrain: [id]\ncolumns:\n  - name: id\n    type: string\nmeasures: []\njoins: []\n';

  const semanticLayerService = {
    readSourceFile: vi.fn().mockResolvedValue({ content: ordersYaml }),
    validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
    writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }),
    loadAllSources: vi.fn().mockResolvedValue([]),
    deleteSource: vi.fn().mockResolvedValue(undefined),
    isManifestBacked: vi.fn().mockResolvedValue(false),
    ...overrides.semanticLayerService,
  };

  const slSearchService = {
    indexSources: vi.fn().mockResolvedValue(undefined),
    ...overrides.slSearchService,
  };

  const authorResolver = { resolve: vi.fn().mockResolvedValue({ name: 'T U', email: 't@u.com' }) };

  const tool = new SlEditSourceTool({
    semanticLayerService: semanticLayerService as never,
    slSearchService: slSearchService as never,
    authorResolver,
  });

  return { tool, semanticLayerService, slSearchService };
}
|
||||
|
||||
// Minimal tool context without a session; tests add `session` where needed.
const baseContext: ToolContext = {
  sourceId: 's',
  messageId: 'm',
  userId: 'u',
};
|
||||
|
||||
/**
 * Builds a worktree-scoped `ToolSession` with a mocked semantic layer service
 * and empty touched-source / action tracking. Any field can be replaced via
 * `overrides`, which are spread last.
 */
function makeSession(overrides: Partial<ToolSession> = {}): ToolSession {
  const ordersYaml =
    'name: orders\ntable: public.orders\ngrain: [id]\ncolumns:\n  - name: id\n    type: string\nmeasures: []\njoins: []\n';

  const sessionSemanticLayerService = {
    readSourceFile: vi.fn().mockResolvedValue({ content: ordersYaml }),
    validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
    writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }),
    loadAllSources: vi.fn().mockResolvedValue([]),
  } as any;

  return {
    connectionId: '11111111-1111-1111-1111-111111111111',
    isWorktreeScoped: true,
    preHead: 'base',
    touchedSlSources: createTouchedSlSources(),
    actions: [],
    semanticLayerService: sessionSemanticLayerService,
    wikiService: {} as any,
    configService: {} as any,
    gitService: {} as any,
    ...overrides,
  };
}
|
||||
|
||||
// Verifies how the presence (and scope) of a session changes indexing,
// action tracking and which semantic layer service is used.
describe('SlEditSourceTool — session gating', () => {
  // Edit request whose single edit replaces `measures: []` with itself.
  const noopEditInput = (connectionId: unknown) =>
    ({
      connectionId,
      sourceName: 'orders',
      yaml_edits: [{ oldText: 'measures: []', newText: 'measures: []' }],
    }) as any;

  it('skips slSearchService.indexSources when session is worktree-scoped', async () => {
    const { tool, slSearchService } = makeTool();
    const session = makeSession();
    const context: ToolContext = { ...baseContext, session };

    const result = await tool.call(noopEditInput(session.connectionId), context);

    expect(result.structured.success).toBe(true);
    expect(slSearchService.indexSources).not.toHaveBeenCalled();
    expect(hasTouchedSlSource(session.touchedSlSources, session.connectionId!, 'orders')).toBe(true);
    expect(session.actions).toContainEqual(expect.objectContaining({ target: 'sl', key: 'orders' }));
  });

  it('records cross-connection SL edits with targetConnectionId', async () => {
    const { tool } = makeTool();
    const warehouseConnectionId = '22222222-2222-4222-8222-222222222222';
    const session = makeSession({ connectionId: '11111111-1111-4111-8111-111111111111' });
    const context: ToolContext = { ...baseContext, session };

    const result = await tool.call(noopEditInput(warehouseConnectionId), context);

    expect(result.structured.success).toBe(true);
    expect(hasTouchedSlSource(session.touchedSlSources, warehouseConnectionId, 'orders')).toBe(true);
    expect(session.actions).toContainEqual(
      expect.objectContaining({
        target: 'sl',
        type: 'updated',
        key: 'orders',
        targetConnectionId: warehouseConnectionId,
      }),
    );
  });

  it('indexes normally when no session is present', async () => {
    const { tool, slSearchService } = makeTool();

    const result = await tool.call(noopEditInput('11111111-1111-1111-1111-111111111111'), baseContext);

    expect(result.structured.success).toBe(true);
    expect(slSearchService.indexSources).toHaveBeenCalledTimes(1);
  });

  it('uses session.semanticLayerService when session is present', async () => {
    const { tool } = makeTool();
    const session = makeSession();
    const context: ToolContext = { ...baseContext, session };

    await tool.call(noopEditInput(session.connectionId), context);

    expect((session.semanticLayerService as any).writeSource).toHaveBeenCalled();
  });
});
|
||||
|
||||
// Covers the two "no YAML file" outcomes: a manifest-backed source gets a
// directed hint, a truly missing source gets the plain not-found error.
describe('SlEditSourceTool — manifest-backed source without overlay', () => {
  const CONNECTION_ID = '11111111-1111-1111-1111-111111111111';

  // Tool whose source file read always fails, with a fixed manifest answer.
  const makeToolWithMissingFile = (manifestBacked: boolean) =>
    makeTool({
      semanticLayerService: {
        readSourceFile: vi.fn().mockRejectedValue(new Error('ENOENT')),
        isManifestBacked: vi.fn().mockResolvedValue(manifestBacked),
      },
    });

  it('returns a directed hint pointing at sl_write_source + overlay shape', async () => {
    const { tool, semanticLayerService } = makeToolWithMissingFile(true);

    const result = await tool.call(
      {
        connectionId: CONNECTION_ID,
        sourceName: 'CONSIGNMENTS',
        yaml_edits: [{ oldText: 'measures: []', newText: 'measures:\n  - name: aav_count\n    expr: count(*)' }],
      } as any,
      baseContext,
    );

    expect(result.structured.success).toBe(false);
    expect(semanticLayerService.isManifestBacked).toHaveBeenCalledWith(CONNECTION_ID, 'CONSIGNMENTS');
    expect(semanticLayerService.writeSource).not.toHaveBeenCalled();

    const joinedErrors = (result.structured.errors ?? []).join('\n');
    // Overlay shape: only name + measures/segments/description
    const expectedFragments = ['CONSIGNMENTS', 'manifest', 'sl_write_source', 'overlay', 'measures', 'segments'];
    for (const fragment of expectedFragments) {
      expect(joinedErrors).toContain(fragment);
    }
  });

  it('still returns the plain "Source not found" error for truly-missing names', async () => {
    const { tool, semanticLayerService } = makeToolWithMissingFile(false);

    const result = await tool.call(
      {
        connectionId: CONNECTION_ID,
        sourceName: 'does_not_exist',
        yaml_edits: [{ oldText: 'x', newText: 'y' }],
      } as any,
      baseContext,
    );

    expect(result.structured.success).toBe(false);
    expect(result.structured.errors).toEqual(['Source not found. Use sl_write_source to create it.']);
    expect(semanticLayerService.isManifestBacked).toHaveBeenCalledTimes(1);
    expect(semanticLayerService.writeSource).not.toHaveBeenCalled();
  });
});
|
||||
200
packages/context/src/sl/tools/sl-edit-source.tool.ts
Normal file
200
packages/context/src/sl/tools/sl-edit-source.tool.ts
Normal file
|
|
@ -0,0 +1,200 @@
|
|||
import YAML from 'yaml';
|
||||
import { z } from 'zod';
|
||||
import { addTouchedSlSource, type ToolContext, type ToolOutput } from '../../tools/index.js';
|
||||
import { applySqlEdits } from '../../tools/sql-edit-replacer.js';
|
||||
import type { SemanticLayerSource } from '../types.js';
|
||||
import {
|
||||
BaseSemanticLayerTool,
|
||||
type BaseSemanticLayerToolDeps,
|
||||
type SemanticLayerStructured,
|
||||
} from './base-semantic-layer.tool.js';
|
||||
import { slToolConnectionIdSchema } from './connection-id-schema.js';
|
||||
|
||||
// One exact-match search/replace operation on the raw YAML text.
const yamlEditEntrySchema = z.object({
  oldText: z.string().describe('Exact text to find in the current YAML. Must match exactly (byte-for-byte).'),
  newText: z.string().describe('Replacement text. Use empty string to delete.'),
  reason: z.string().optional().describe('Brief reason for this edit.'),
});

/**
 * Input accepted by `sl_edit_source`: the target source plus either a list of
 * YAML edits or the `delete` flag (both optional in the schema).
 */
const slEditSourceInputSchema = z.object({
  connectionId: slToolConnectionIdSchema.describe('Data source connection ID'),
  sourceName: z.string().describe('Name of the source to edit'),
  yaml_edits: z
    .array(yamlEditEntrySchema)
    .optional()
    .describe('Targeted exact-match search/replace edits on the raw YAML content.'),
  delete: z.boolean().optional().describe('Set to true to delete this source entirely'),
});

/** Parsed form of `slEditSourceInputSchema`. */
type SlEditSourceInput = z.infer<typeof slEditSourceInputSchema>;
|
||||
|
||||
/**
 * Decides whether a recorded action needs an explicit target connection.
 *
 * @param runConnectionId - Connection the current run/session is bound to, if any.
 * @param actionConnectionId - Connection the action was actually applied to.
 * @returns `actionConnectionId` when the run has a connection and it differs
 *   from the action's; otherwise `null`.
 */
function actionTargetConnectionId(
  runConnectionId: string | null | undefined,
  actionConnectionId: string,
): string | null {
  // No run-level connection (null, undefined or empty): nothing to contrast against.
  if (!runConnectionId) {
    return null;
  }
  // Same connection as the run: the target is implicit.
  if (runConnectionId === actionConnectionId) {
    return null;
  }
  return actionConnectionId;
}
|
||||
|
||||
/**
 * `sl_edit_source` tool: applies exact-match search/replace edits to the YAML of an existing
 * semantic layer source, or deletes the source when `delete` is set.
 *
 * The edited source is validated with `validateWithProposedSource` before anything is written;
 * a source that fails validation is never saved.
 */
export class SlEditSourceTool extends BaseSemanticLayerTool<typeof slEditSourceInputSchema> {
  readonly name = 'sl_edit_source';

  constructor(deps: BaseSemanticLayerToolDeps) {
    super(deps);
  }

  /** Prompt-facing description of the tool. */
  get description(): string {
    return `<purpose>
Make targeted edits to an existing semantic layer source using exact-match search/replace on YAML content.
If no source exists yet, use sl_write_source instead — this tool will reject the call.
</purpose>

<when_to_use>
- Adding/removing a measure on an existing source
- Adding/updating a join relationship
- Updating column descriptions
- Removing an obsolete source (set delete: true)
- Consolidation: delete redundant sources, edit the surviving one
</when_to_use>

<edit_guidelines>
- yaml_edits: exact-match search/replace on raw YAML. oldText must match byte-for-byte (no whitespace normalization or fuzzy matching).
Include enough surrounding context in oldText for a unique match.
- Read the source first with sl_read_source to copy the exact text you want to replace.
- Keep edits scoped to the user's request — don't proactively regenerate all measures.
</edit_guidelines>`;
  }

  get inputSchema() {
    return slEditSourceInputSchema;
  }

  /**
   * Runs the tool.
   *
   * Flow: resolve the author, then either delete the source (`input.delete`) or
   * read the current YAML, apply `yaml_edits`, parse, validate, write, refresh the search
   * index (skipped for worktree-scoped sessions) and record the action on the session.
   *
   * @param input - Connection id, source name and either `yaml_edits` or `delete`.
   * @param context - Tool context; when `context.session` is present its
   *                  `semanticLayerService` is used instead of the tool's default one.
   * @returns A tool output whose `success` is false when the source is missing, the edited
   *          YAML cannot be parsed into a mapping, validation fails, the write throws, or
   *          any edit failed to apply.
   */
  async call(input: SlEditSourceInput, context: ToolContext): Promise<ToolOutput<SemanticLayerStructured>> {
    const { connectionId, sourceName } = input;
    const { name: author, email: authorEmail } = await this.authorResolver.resolve(context.userId);

    const semanticLayerService = context.session?.semanticLayerService ?? this.semanticLayerService;
    const skipIndex = context.session?.isWorktreeScoped === true;

    // Handle delete
    if (input.delete) {
      try {
        await semanticLayerService.deleteSource(connectionId, sourceName, author, authorEmail);
        if (context.session) {
          addTouchedSlSource(context.session.touchedSlSources, connectionId, sourceName);
          context.session.actions.push({
            target: 'sl',
            type: 'removed',
            key: sourceName,
            detail: 'Deleted source',
            targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId),
          });
        }
        return this.buildOutput(true, [], sourceName, { yaml: undefined, commitHash: undefined });
      } catch (error) {
        return this.buildOutput(false, [error instanceof Error ? error.message : String(error)], sourceName);
      }
    }

    // Read existing source; any read failure is treated the same as "no file".
    let currentYaml: string | null = null;
    try {
      const { content } = await semanticLayerService.readSourceFile(connectionId, sourceName);
      currentYaml = content;
    } catch {
      currentYaml = null;
    }
    if (!currentYaml) {
      const manifestBacked = await semanticLayerService.isManifestBacked(connectionId, sourceName);
      if (manifestBacked) {
        return this.buildOutput(
          false,
          [
            [
              `Source "${sourceName}" exists in the schema manifest but has no overlay file yet — sl_edit_source cannot edit it directly.`,
              `Bootstrap an overlay with sl_write_source, then re-run sl_edit_source on subsequent changes:`,
              `  name: ${sourceName}`,
              `  measures:`,
              `    - name: <measure_name>`,
              `      expr: "<expression>"`,
              `      description: "<what it measures>"`,
              `Overlay shape: "name:" plus any of "measures:", "segments:", "description:". Do NOT include "sql:", "table:", "grain:", "columns:", or "joins:" — those are inherited from the manifest.`,
            ].join('\n'),
          ],
          sourceName,
        );
      }
      return this.buildOutput(false, ['Source not found. Use sl_write_source to create it.'], sourceName);
    }

    const errors: string[] = [];
    let yaml = currentYaml;
    let editCount = 0;

    // Apply yaml_edits (text-level search/replace, exact-match only).
    // Edit failures are collected but do not stop the flow: whatever text the edit helper
    // returned is still validated and written, and the final output is reported as
    // unsuccessful with these errors attached.
    if (input.yaml_edits && input.yaml_edits.length > 0) {
      const editResult = applySqlEdits(yaml, input.yaml_edits, { exactOnly: true });
      yaml = editResult.sql;
      editCount = editResult.appliedEdits;
      if (!editResult.success) {
        errors.push(...editResult.errors);
      }
    }

    // Parse resulting YAML
    let parsed: unknown;
    try {
      parsed = YAML.parse(yaml);
    } catch (e) {
      return this.buildOutput(false, [`YAML parse error after edits: ${e}`], sourceName);
    }
    // YAML.parse happily returns null, scalars or arrays; none of those is a source definition,
    // so reject them here instead of handing garbage to validation.
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      return this.buildOutput(
        false,
        ['YAML parse error after edits: top-level content is not a mapping'],
        sourceName,
      );
    }
    const source = parsed as SemanticLayerSource;

    // Re-serialize and write
    const updatedYaml = YAML.stringify(source, { indent: 2, lineWidth: 0 });

    const { errors: validationErrors, warnings: validationWarnings } =
      await semanticLayerService.validateWithProposedSource(connectionId, source);
    if (validationErrors.length > 0) {
      return this.buildOutput(
        false,
        [...errors, 'Validation failed — edits were NOT saved:', ...validationErrors],
        sourceName,
        { yaml: updatedYaml, editCount, validationErrors, validationWarnings },
      );
    }

    const commitMessage = `Edit source ${sourceName}: ${
      input.yaml_edits ? `${input.yaml_edits.length} YAML edit(s)` : 'update'
    }`;

    try {
      const result = await semanticLayerService.writeSource(connectionId, source, author, authorEmail, commitMessage);

      if (!skipIndex) {
        // Best-effort index refresh. The write has already succeeded at this point, so a
        // failure to reload or re-index sources must not turn the result into a failure
        // or skip the session bookkeeping below.
        try {
          const allSources = await semanticLayerService.loadAllSources(connectionId);
          await this.slSearchService.indexSources(connectionId, allSources);
        } catch {
          // Intentionally ignored: search indexing is not part of the edit's outcome.
        }
      }

      if (context.session) {
        addTouchedSlSource(context.session.touchedSlSources, connectionId, sourceName);
        context.session.actions.push({
          target: 'sl',
          type: 'updated',
          key: sourceName,
          detail: `Applied ${editCount} edit(s)`,
          targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId),
        });
      }

      return this.buildOutput(errors.length === 0, errors, sourceName, {
        yaml: updatedYaml,
        commitHash: result.commitHash ?? undefined,
        editCount,
        validationErrors,
        validationWarnings,
      });
    } catch (error) {
      errors.push(error instanceof Error ? error.message : String(error));
      return this.buildOutput(false, errors, sourceName, { yaml: updatedYaml, editCount });
    }
  }
}
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { ToolSession } from '../../tools/index.js';
|
||||
import { createTouchedSlSources, type ToolContext } from '../../tools/index.js';
|
||||
import { SlReadSourceTool } from './sl-read-source.tool.js';
|
||||
|
||||
/**
 * Builds an `SlReadSourceTool` backed by a stubbed default semantic layer service.
 *
 * The stub's `readSourceFile` resolves to YAML containing `foo_default`; entries in
 * `overrides.semanticLayerService` replace or extend the stubbed methods.
 */
function makeTool(overrides: Partial<Record<string, any>> = {}) {
  const defaultRead = vi.fn().mockResolvedValue({ content: 'name: foo_default\n', path: 'default' });
  const semanticLayerService = {
    readSourceFile: defaultRead,
    ...overrides.semanticLayerService,
  };

  return {
    tool: new SlReadSourceTool({
      semanticLayerService: semanticLayerService as never,
      slSearchService: {} as never,
      authorResolver: { resolve: vi.fn() },
    }),
    semanticLayerService,
  };
}
|
||||
|
||||
/** Returns a minimal `ToolContext` with fixed ids, with `overrides` applied on top. */
function makeContext(overrides: Partial<ToolContext> = {}): ToolContext {
  const base: ToolContext = { sourceId: 'src', messageId: 'msg', userId: 'user' };
  return { ...base, ...overrides };
}
|
||||
|
||||
/**
 * Returns a worktree-scoped `ToolSession` whose semantic layer service resolves
 * `readSourceFile` to YAML containing `foo_session`; `overrides` win over the defaults.
 */
function makeSession(overrides: Partial<ToolSession> = {}): ToolSession {
  const sessionRead = vi.fn().mockResolvedValue({ content: 'name: foo_session\n', path: 'session' });
  const defaults: ToolSession = {
    connectionId: '11111111-1111-1111-1111-111111111111',
    isWorktreeScoped: true,
    preHead: 'base',
    touchedSlSources: createTouchedSlSources(),
    actions: [],
    semanticLayerService: { readSourceFile: sessionRead } as any,
    wikiService: {} as any,
    configService: {} as any,
    gitService: {} as any,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
// Verifies which semantic layer service SlReadSourceTool reads through: the session's
// service when a session is attached to the context, the tool's default service otherwise.
describe('SlReadSourceTool - session-scoped reads', () => {
  const connectionId = '11111111-1111-1111-1111-111111111111';

  it('reads through context.session.semanticLayerService when a session is present', async () => {
    const session = makeSession();
    const { tool, semanticLayerService } = makeTool();

    const result = await tool.call({ connectionId, sourceName: 'foo' }, makeContext({ session }));

    const sessionRead = (session.semanticLayerService as any).readSourceFile;
    expect(sessionRead).toHaveBeenCalledWith(connectionId, 'foo');
    expect(semanticLayerService.readSourceFile).not.toHaveBeenCalled();
    expect(result.structured.yaml).toContain('foo_session');
  });

  it('reads through the default service when no session is present', async () => {
    const { tool, semanticLayerService } = makeTool();

    const result = await tool.call({ connectionId, sourceName: 'foo' }, makeContext());

    expect(semanticLayerService.readSourceFile).toHaveBeenCalledWith(connectionId, 'foo');
    expect(result.structured.yaml).toContain('foo_default');
  });
});
|
||||
63
packages/context/src/sl/tools/sl-read-source.tool.ts
Normal file
63
packages/context/src/sl/tools/sl-read-source.tool.ts
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
import { z } from 'zod';
|
||||
import type { ToolContext, ToolOutput } from '../../tools/index.js';
|
||||
import { BaseSemanticLayerTool, type BaseSemanticLayerToolDeps } from './base-semantic-layer.tool.js';
|
||||
import { slToolConnectionIdSchema } from './connection-id-schema.js';
|
||||
|
||||
/** Zod schema for the `sl_read_source` tool input. */
const slReadSourceInputSchema = z.object({
  connectionId: slToolConnectionIdSchema.describe('Data source connection ID'),
  sourceName: z.string().describe('Name of the source to read'),
});

/** Input type inferred from `slReadSourceInputSchema`. */
type SlReadSourceInput = z.infer<typeof slReadSourceInputSchema>;

/** Structured payload returned by `sl_read_source`. */
interface SlReadSourceStructured {
  /** Name of the source that was requested. */
  sourceName: string;
  /** Raw YAML of the source; the tool returns an empty string when the source is not found. */
  yaml: string;
}
|
||||
|
||||
/**
 * `sl_read_source` tool: returns the raw YAML of one semantic layer source, both as a
 * fenced markdown block and as structured output.
 */
export class SlReadSourceTool extends BaseSemanticLayerTool<typeof slReadSourceInputSchema> {
  readonly name = 'sl_read_source';

  constructor(deps: BaseSemanticLayerToolDeps) {
    super(deps);
  }

  /** Prompt-facing description of the tool. */
  get description(): string {
    return `<purpose>
Read the raw YAML definition of a semantic layer source, including its SQL implementation.
Use this when you need to understand how a source is built — e.g., before editing it with sl_edit_source or sl_write_source.
</purpose>

<when_to_use>
- Before editing a source: understand its full definition (SQL, columns, measures, joins)
- When debugging a source: see the underlying SQL query
- When creating a new source based on an existing one
</when_to_use>

<when_not_to_use>
- To discover what sources/measures/dimensions are available for querying — use sl_discover instead
- To query data — use semantic_query or create_widget with slQuery
</when_not_to_use>`;
  }

  get inputSchema() {
    return slReadSourceInputSchema;
  }

  /**
   * Reads the source's YAML via `readSourceYaml` and formats it.
   *
   * @returns The YAML when found; otherwise a "not found" message with an empty `yaml`.
   */
  async call(input: SlReadSourceInput, context: ToolContext): Promise<ToolOutput<SlReadSourceStructured>> {
    const { connectionId, sourceName } = input;
    const yaml = await this.readSourceYaml(connectionId, sourceName, context);

    if (yaml) {
      const fenced = `\`\`\`yaml\n${yaml}\n\`\`\``;
      return {
        markdown: `## Source: ${sourceName}\n\n${fenced}`,
        structured: { sourceName, yaml },
      };
    }

    return {
      markdown: `Source **${sourceName}** not found for connection ${connectionId}.`,
      structured: { sourceName, yaml: '' },
    };
  }
}
|
||||
67
packages/context/src/sl/tools/sl-rollback.tool.test.ts
Normal file
67
packages/context/src/sl/tools/sl-rollback.tool.test.ts
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { ToolSession } from '../../tools/index.js';
|
||||
import { createTouchedSlSources, hasTouchedSlSource, type ToolContext } from '../../tools/index.js';
|
||||
import { SlRollbackTool } from './sl-rollback.tool.js';
|
||||
|
||||
/**
 * Returns a worktree-scoped session on `conn-1` that has already touched the `orders`
 * source and recorded one `sl` action for it. The git service stub resolves
 * `getFileAtCommit` to `'pre: content'`; `overrides` win over the defaults.
 */
function makeSession(overrides: Partial<ToolSession> = {}): ToolSession {
  const configService = {
    writeFile: vi.fn().mockResolvedValue(undefined),
    deleteFile: vi.fn().mockResolvedValue(undefined),
  };
  const gitService = { getFileAtCommit: vi.fn().mockResolvedValue('pre: content') };

  const defaults: ToolSession = {
    connectionId: 'conn-1',
    isWorktreeScoped: true,
    preHead: 'base',
    touchedSlSources: createTouchedSlSources([{ connectionId: 'conn-1', sourceName: 'orders' }]),
    actions: [{ target: 'sl', type: 'updated', key: 'orders', detail: 'x' }],
    semanticLayerService: {} as any,
    wikiService: {} as any,
    configService: configService as any,
    gitService: gitService as any,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
// Covers SlRollbackTool's guard clauses (no session, session without a connection) and
// the happy path that restores a source and clears the session's record of it.
describe('SlRollbackTool', () => {
  const connections = {
    getConnectionById: vi.fn(),
    listEnabledConnections: vi.fn(),
    executeQuery: vi.fn(),
  };
  const rollbackInput = { sourceName: 'orders' } as any;

  it('errors when context.session is absent', async () => {
    const tool = new SlRollbackTool({} as never, connections as never, 1);
    const context: ToolContext = { sourceId: 's', messageId: 'm', userId: 'u' };

    const result = await tool.call(rollbackInput, context);

    expect(result.structured.success).toBe(false);
    expect(result.markdown).toMatch(/session/i);
  });

  it('errors when session has no connectionId (wiki-only turn)', async () => {
    const tool = new SlRollbackTool({} as never, connections as never, 1);
    const session = makeSession({ connectionId: null });
    const context: ToolContext = { sourceId: 's', messageId: 'm', userId: 'u', session };

    const result = await tool.call(rollbackInput, context);

    expect(result.structured.success).toBe(false);
    expect(result.markdown).toMatch(/connection-scoped session/i);
    // Session state untouched
    expect(hasTouchedSlSource(session.touchedSlSources, 'conn-1', 'orders')).toBe(true);
    expect((session.gitService as any).getFileAtCommit).not.toHaveBeenCalled();
  });

  it('restores the source content from preHead, clears touched set, prunes actions', async () => {
    const slSourcesRepository = { deleteByConnectionAndName: vi.fn().mockResolvedValue(undefined) };
    const tool = new SlRollbackTool(slSourcesRepository as never, connections as never, 1);
    const session = makeSession();
    const context: ToolContext = { sourceId: 's', messageId: 'm', userId: 'u', session };

    const result = await tool.call(rollbackInput, context);

    expect(result.structured.success).toBe(true);
    const getFileAtCommit = (session.gitService as any).getFileAtCommit;
    expect(getFileAtCommit).toHaveBeenCalledWith(expect.stringContaining('orders.yaml'), 'base');
    expect((session.configService as any).writeFile).toHaveBeenCalled();
    expect(hasTouchedSlSource(session.touchedSlSources, 'conn-1', 'orders')).toBe(false);
    expect(session.actions).toEqual([]);
  });
});
|
||||
87
packages/context/src/sl/tools/sl-rollback.tool.ts
Normal file
87
packages/context/src/sl/tools/sl-rollback.tool.ts
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
import { z } from 'zod';
|
||||
import { BaseTool, deleteTouchedSlSource, type ToolContext, type ToolOutput } from '../../tools/index.js';
|
||||
import type { SlConnectionCatalogPort, SlSourcesIndexPort } from '../ports.js';
|
||||
import { revertSourceToPreHead } from './sl-warehouse-validation.js';
|
||||
|
||||
/** Zod schema for the `sl_rollback` tool input. */
const slRollbackInputSchema = z.object({
  sourceName: z.string().describe('Name of the source to roll back'),
});

/** Input type inferred from `slRollbackInputSchema`. */
type SlRollbackInput = z.infer<typeof slRollbackInputSchema>;

/** Structured payload returned by `sl_rollback`. */
interface SlRollbackStructured {
  /** False when the tool refused to run (no session, or session without a connection). */
  success: boolean;
  /** Name of the source the rollback was requested for. */
  sourceName: string;
  /** Value returned by `revertSourceToPreHead`; only set on success. */
  outcome?: string;
}
|
||||
|
||||
/**
 * `sl_rollback` tool: reverts one source to the session's `preHead` state and removes the
 * session's record of having touched it.
 */
export class SlRollbackTool extends BaseTool<typeof slRollbackInputSchema> {
  readonly name = 'sl_rollback';

  constructor(
    private readonly slSourcesRepository: SlSourcesIndexPort,
    private readonly connections: SlConnectionCatalogPort,
    private readonly probeRowCount: number,
  ) {
    super();
  }

  /** Prompt-facing description of the tool. */
  get description(): string {
    return `<purpose>
Abandon this-session changes to a source and restore it to its pre-session state.
Use when a write/edit failed validation in a way you cannot fix in-session (e.g. the source requires elevated warehouse permissions).
</purpose>`;
  }

  get inputSchema() {
    return slRollbackInputSchema;
  }

  /**
   * Rolls `input.sourceName` back via `revertSourceToPreHead`.
   *
   * Requires `context.session` with a connection id; otherwise returns an unsuccessful
   * output without touching anything. On success the source is dropped from the session's
   * touched set and every `sl` action for that source on the session's connection is
   * removed from `session.actions` (the array is mutated in place).
   */
  async call(input: SlRollbackInput, context: ToolContext): Promise<ToolOutput<SlRollbackStructured>> {
    const { sourceName } = input;
    const refuse = (markdown: string): ToolOutput<SlRollbackStructured> => ({
      markdown,
      structured: { success: false, sourceName },
    });

    const session = context.session;
    if (!session) {
      return refuse(
        'Error: sl_rollback requires an active session (ingest WU or memory-agent). Use git revert for interactive rollback.',
      );
    }
    const connectionId = session.connectionId;
    if (!connectionId) {
      return refuse(
        'Error: sl_rollback requires a connection-scoped session; this session has no warehouse connection.',
      );
    }

    const outcome = await revertSourceToPreHead(
      {
        semanticLayerService: session.semanticLayerService,
        connections: this.connections,
        configService: session.configService,
        gitService: session.gitService,
        slSourcesRepository: this.slSourcesRepository,
        probeRowCount: this.probeRowCount,
      },
      connectionId,
      session.preHead,
      sourceName,
    );

    deleteTouchedSlSource(session.touchedSlSources, connectionId, sourceName);

    // Walk backwards so splicing does not shift the indices still to be visited.
    const { actions } = session;
    let idx = actions.length;
    while (idx-- > 0) {
      const action = actions[idx];
      const isThisSource = action.target === 'sl' && action.key === sourceName;
      // Actions without an explicit target connection belong to the session's connection.
      if (isThisSource && (action.targetConnectionId ?? connectionId) === connectionId) {
        actions.splice(idx, 1);
      }
    }

    return {
      markdown: `Source "${sourceName}" rolled back: ${outcome}.`,
      structured: { success: true, sourceName, outcome },
    };
  }
}
|
||||
66
packages/context/src/sl/tools/sl-validate.tool.test.ts
Normal file
66
packages/context/src/sl/tools/sl-validate.tool.test.ts
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { ToolSession } from '../../tools/index.js';
|
||||
import { createTouchedSlSources, type ToolContext } from '../../tools/index.js';
|
||||
import type { SemanticLayerService } from '../semantic-layer.service.js';
|
||||
import type { SemanticLayerSource } from '../types.js';
|
||||
import { SlValidateTool, validateSemanticLayerEndpoint } from './sl-validate.tool.js';
|
||||
|
||||
// validateSemanticLayerEndpoint should delegate to the service with just the connection id
// and pass the service's report straight through.
describe('validateSemanticLayerEndpoint', () => {
  const makeServiceMock = () => ({
    validateSourcesForConnection: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
  });

  it('uses the connection warehouse dialect, not hardcoded postgres', async () => {
    const serviceMock = makeServiceMock();

    await validateSemanticLayerEndpoint('conn-1', serviceMock as unknown as SemanticLayerService);

    expect(serviceMock.validateSourcesForConnection).toHaveBeenCalledWith('conn-1');
  });

  it('short-circuits when there are no validatable sources', async () => {
    const serviceMock = makeServiceMock();

    const result = await validateSemanticLayerEndpoint('conn-1', serviceMock as unknown as SemanticLayerService);

    expect(result).toEqual({ errors: [], warnings: [] });
  });
});
|
||||
|
||||
// With a session that touched only `orders`, SlValidateTool should drop errors/warnings
// that do not mention `orders`.
describe('SlValidateTool — session-aware touched-set filtering', () => {
  it('when session present, only returns errors/warnings that mention touched sources', async () => {
    const makeSource = (name: string): SemanticLayerSource => ({
      name,
      table: `x.${name}`,
      grain: ['id'],
      columns: [],
      joins: [],
      measures: [],
    });
    const sources: SemanticLayerSource[] = [makeSource('orders'), makeSource('customers')];

    const report = {
      errors: ['orders: missing join target', 'customers: invalid grain'],
      warnings: ['orders: disconnected-components warning'],
    };
    const serviceMock = {
      loadAllSources: vi.fn().mockResolvedValue(sources),
      validateSourcesForConnection: vi.fn().mockResolvedValue(report),
    };

    const tool = new SlValidateTool({
      semanticLayerService: serviceMock as never,
      slSearchService: {} as never,
      authorResolver: { resolve: vi.fn() },
    });

    const session: ToolSession = {
      connectionId: 'conn-1',
      isWorktreeScoped: true,
      preHead: null,
      touchedSlSources: createTouchedSlSources([{ connectionId: 'conn-1', sourceName: 'orders' }]),
      actions: [],
      semanticLayerService: serviceMock as any,
      wikiService: {} as any,
      configService: {} as any,
      gitService: {} as any,
    };
    const context: ToolContext = { sourceId: 's', messageId: 'm', userId: 'u', session };

    const { structured } = await tool.call({ connectionId: 'conn-1' } as any, context);

    expect(structured.validationErrors).toEqual(['orders: missing join target']);
    expect(structured.validationWarnings).toEqual(['orders: disconnected-components warning']);
  });
});
|
||||
130
packages/context/src/sl/tools/sl-validate.tool.ts
Normal file
130
packages/context/src/sl/tools/sl-validate.tool.ts
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
import { z } from 'zod';
|
||||
import { type ToolContext, type ToolOutput, touchedSlSourceNamesForConnection } from '../../tools/index.js';
|
||||
import { SemanticLayerService } from '../semantic-layer.service.js';
|
||||
import {
|
||||
BaseSemanticLayerTool,
|
||||
type BaseSemanticLayerToolDeps,
|
||||
type SemanticLayerStructured,
|
||||
} from './base-semantic-layer.tool.js';
|
||||
import { slToolConnectionIdSchema } from './connection-id-schema.js';
|
||||
|
||||
/** Zod schema for the `sl_validate` tool input. */
const slValidateInputSchema = z.object({
  connectionId: slToolConnectionIdSchema.describe('Data source connection ID'),
});

/** Input type inferred from `slValidateInputSchema`. */
type SlValidateInput = z.infer<typeof slValidateInputSchema>;

/** Errors and warnings produced by validating a connection's sources. */
type ValidationReport = {
  errors: string[];
  warnings: string[];
};
|
||||
|
||||
/**
 * Validates every source of a connection through the given service.
 *
 * Never rejects: if `validateSourcesForConnection` throws, the failure is reported as a
 * single `Validation call failed: …` error with no warnings.
 *
 * @param connectionId - Connection whose sources are validated.
 * @param semanticLayerService - Service that performs the validation.
 */
export async function validateSemanticLayerEndpoint(
  connectionId: string,
  semanticLayerService: SemanticLayerService,
): Promise<ValidationReport> {
  let report: ValidationReport;
  try {
    report = await semanticLayerService.validateSourcesForConnection(connectionId);
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    report = { errors: [`Validation call failed: ${reason}`], warnings: [] };
  }
  return report;
}
|
||||
|
||||
/**
 * `sl_validate` tool: validates all sources of a connection and renders the result as
 * markdown plus structured errors/warnings.
 */
export class SlValidateTool extends BaseSemanticLayerTool<typeof slValidateInputSchema> {
  readonly name = 'sl_validate';

  constructor(deps: BaseSemanticLayerToolDeps) {
    super(deps);
  }

  /** Prompt-facing description of the tool. */
  get description(): string {
    return `<purpose>
Validate that all semantic layer sources for a connection form a consistent model.
Checks: all join targets exist, grain is valid, no missing references.
</purpose>

<when_to_use>
- After making edits with sl_write_source
- Before querying, to ensure the model is healthy
- When troubleshooting query failures
</when_to_use>`;
  }

  get inputSchema() {
    return slValidateInputSchema;
  }

  /**
   * Loads the connection's sources, validates them and builds the report.
   *
   * When the session has touched sources for this connection, errors and warnings are
   * narrowed to messages that contain one of those source names (substring match).
   * With no sources at all, a successful output carrying a "No sources found" note is
   * returned without running validation.
   */
  async call(input: SlValidateInput, context: ToolContext): Promise<ToolOutput<SemanticLayerStructured>> {
    const { connectionId } = input;
    const semanticLayerService = context.session?.semanticLayerService ?? this.semanticLayerService;

    const sources = await semanticLayerService.loadAllSources(connectionId);
    if (sources.length === 0) {
      return this.buildOutput(true, [], '(all)', {
        validationErrors: ['No sources found for this connection.'],
      });
    }

    const report = await validateSemanticLayerEndpoint(connectionId, semanticLayerService);

    // Names touched in this session for this connection; empty means "do not filter".
    const touched = context.session?.touchedSlSources;
    const touchedNames = touched && touched.size > 0 ? touchedSlSourceNamesForConnection(touched, connectionId) : [];
    const mentionsTouched = (message: string): boolean => touchedNames.some((n) => message.includes(n));
    const shouldFilter = touchedNames.length > 0;
    const errors = shouldFilter ? report.errors.filter(mentionsTouched) : report.errors;
    const warnings = shouldFilter ? report.warnings.filter(mentionsTouched) : report.warnings;

    const hasErrors = errors.length > 0;
    const hasWarnings = warnings.length > 0;

    const lines: string[] = [`**Semantic layer validation** for ${sources.length} source(s):`];
    if (!hasErrors && !hasWarnings) {
      lines.push('All sources are valid. Join graph is consistent.');
    } else {
      const counts = [
        ...(hasErrors ? [`${errors.length} error(s)`] : []),
        ...(hasWarnings ? [`${warnings.length} warning(s)`] : []),
      ];
      lines.push(`Found ${counts.join(' and ')}:`);
      if (hasErrors) {
        lines.push('', '**Errors:**', ...errors.map((err) => `- ${err}`));
      }
      if (hasWarnings) {
        lines.push('', '**Warnings:**', ...warnings.map((warn) => `- ${warn}`));
      }
    }

    // List sources summary
    lines.push(
      '\n**Sources:**',
      ...sources.map(
        (s) =>
          `- **${s.name}** (${s.sql ? 'sql' : 'table'}): ${s.columns.length} cols, ${s.measures.length} measures, ${s.joins.length} joins`,
      ),
    );

    return {
      markdown: lines.join('\n'),
      structured: {
        success: !hasErrors,
        sourceName: '(all)',
        validationErrors: hasErrors ? errors : undefined,
        validationWarnings: hasWarnings ? warnings : undefined,
      },
    };
  }
}
|
||||
120
packages/context/src/sl/tools/sl-warehouse-validation.test.ts
Normal file
120
packages/context/src/sl/tools/sl-warehouse-validation.test.ts
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { validateSingleSource } from './sl-warehouse-validation.js';
|
||||
|
||||
/**
 * Builds stubbed `validateSingleSource` dependencies.
 *
 * `readSourceFile` resolves to `opts.sourceYaml`, the source is reported as not
 * manifest-backed, and warehouse queries go through `opts.executeQuery` on a stub
 * connection of type `bigquery`.
 */
function makeDeps(opts: { sourceYaml: string; executeQuery: ReturnType<typeof vi.fn> }) {
  const { sourceYaml, executeQuery } = opts;

  const semanticLayerService = {
    readSourceFile: vi.fn().mockResolvedValue({ content: sourceYaml, path: 'x' }),
    isManifestBacked: vi.fn().mockResolvedValue(false),
    listManifestSourceNames: vi.fn().mockResolvedValue([]),
    loadSource: vi.fn().mockResolvedValue(null),
    loadAllSources: vi.fn().mockResolvedValue([]),
  };
  const connections = {
    executeQuery,
    getConnectionById: vi.fn().mockResolvedValue({ id: 'conn-1', name: 'conn-1', connectionType: 'bigquery' }),
    listEnabledConnections: vi.fn().mockResolvedValue([]),
  };
  const slSourcesRepository = { deleteByConnectionAndName: vi.fn().mockResolvedValue(undefined) };

  return {
    semanticLayerService: semanticLayerService as never,
    connections: connections as never,
    configService: {} as never,
    gitService: {} as never,
    slSourcesRepository: slSourcesRepository as never,
    probeRowCount: 1,
  };
}
|
||||
|
||||
// Exercises the warehouse probe of validateSingleSource: surfacing warehouse errors,
// detecting declared columns the probe did not return, the clean path, and the shape of
// the probe SQL.
describe('validateSingleSource warehouse dry-run', () => {
  /** Probe result stub for a successful query returning only headers. */
  const emptyResult = (headers: string[]) => ({ headers, rows: [], totalRows: 0, error: null });

  it('surfaces warehouse error when dry-run fails on unknown column', async () => {
    const yaml = `name: fct_arr_delta
source_type: sql
sql: |
  SELECT * FROM analytics.fct_arr_delta WHERE date_date < CURRENT_DATE()
grain: [date_date]
columns:
  - name: date_date
    type: time
measures:
  - name: count_delta_events
    expr: count(*)
joins: []
`;
    const executeQuery = vi.fn().mockRejectedValue(new Error('Unrecognized name: date_date at [1:42]'));
    const deps = makeDeps({ sourceYaml: yaml, executeQuery });

    const result = await validateSingleSource(deps, 'conn-1', 'fct_arr_delta');

    const joined = result.errors.join('\n');
    expect(joined).toMatch(/Unrecognized name: date_date/);
    expect(joined).toMatch(/embedded sql dry-run failed/);
  });

  it('flags declared columns missing from the dry-run result', async () => {
    const yaml = `name: fct_arr_delta
source_type: sql
sql: |
  SELECT date, customer_id FROM analytics.fct_arr_delta
columns:
  - name: date_date
    type: time
  - name: customer_id
    type: string
measures:
  - name: count_delta
    expr: count(*)
joins: []
grain: [customer_id]
`;
    const executeQuery = vi.fn().mockResolvedValue(emptyResult(['date', 'customer_id']));
    const deps = makeDeps({ sourceYaml: yaml, executeQuery });

    const result = await validateSingleSource(deps, 'conn-1', 'fct_arr_delta');

    const joined = result.errors.join('\n');
    expect(joined).toMatch(/declared columns absent from sql result — date_date/);
    expect(joined).toMatch(/warehouse returned:/);
  });

  it('passes cleanly when dry-run succeeds and declared columns match', async () => {
    const yaml = `name: lab_results
source_type: sql
sql: |
  SELECT lab_order_id, admin_user_id FROM analytics.raw_lab_results
grain: [lab_order_id]
columns:
  - name: lab_order_id
    type: string
  - name: admin_user_id
    type: string
measures:
  - name: count_lab_results
    expr: count(lab_order_id)
joins: []
`;
    const executeQuery = vi.fn().mockResolvedValue(emptyResult(['lab_order_id', 'admin_user_id']));
    const deps = makeDeps({ sourceYaml: yaml, executeQuery });

    const result = await validateSingleSource(deps, 'conn-1', 'lab_results');

    expect(result.errors).toEqual([]);
  });

  it('uses LIMIT 1 (not LIMIT 0) so runtime policies fire', async () => {
    const yaml = `name: foo
source_type: sql
sql: |
  SELECT a FROM analytics.bar
grain: [a]
columns:
  - {name: a, type: string}
measures: []
joins: []
`;
    const executeQuery = vi.fn().mockResolvedValue(emptyResult(['a']));
    const deps = makeDeps({ sourceYaml: yaml, executeQuery });

    await validateSingleSource(deps, 'conn-1', 'foo');

    // Second argument of the first executeQuery call is the probe SQL.
    const probeSql = executeQuery.mock.calls[0][1] as string;
    expect(probeSql).toMatch(/LIMIT 1\b/);
    expect(probeSql).not.toMatch(/LIMIT 0\b/);
  });
});
|
||||
325
packages/context/src/sl/tools/sl-warehouse-validation.ts
Normal file
325
packages/context/src/sl/tools/sl-warehouse-validation.ts
Normal file
|
|
@ -0,0 +1,325 @@
|
|||
import YAML from 'yaml';
|
||||
import type { GitService, KloFileStorePort } from '../../core/index.js';
|
||||
import { SYSTEM_GIT_AUTHOR } from '../../tools/index.js';
|
||||
import type { SlConnectionCatalogPort, SlSourcesIndexPort } from '../ports.js';
|
||||
import { sourceOverlaySchema } from '../schemas.js';
|
||||
import { SemanticLayerService } from '../semantic-layer.service.js';
|
||||
import { sourceDefinitionSchema } from './base-semantic-layer.tool.js';
|
||||
|
||||
/** Collaborators needed by the single-source validation helpers in this module. */
export interface SlValidationDeps {
  /** Reads source files and answers manifest questions for a connection. */
  semanticLayerService: SemanticLayerService;
  /** Connection catalog; also executes probe queries against the warehouse. */
  connections: SlConnectionCatalogPort;
  /** File store for semantic layer files. */
  configService: KloFileStorePort;
  /** Git access for reading files at a given commit. */
  gitService: GitService;
  /** Index of semantic layer sources. */
  slSourcesRepository: SlSourcesIndexPort;
  /** NOTE(review): presumably the number of rows the warehouse probe requests — confirm against usage. */
  probeRowCount: number;
}

/** Outcome of validating a single source: blocking errors plus non-blocking warnings. */
export interface SourceValidationResult {
  errors: string[];
  warnings: string[];
}
|
||||
|
||||
/** Repository-relative path of a source's YAML file: `semantic-layer/<connectionId>/<sourceName>.yaml`. */
const slSourcePath = (connectionId: string, sourceName: string): string => {
  const fileName = `${sourceName}.yaml`;
  return ['semantic-layer', connectionId, fileName].join('/');
};
|
||||
|
||||
/**
 * Maps a warehouse identifier to a SQL dialect via `SemanticLayerService.mapDialect`.
 *
 * @returns `null` when `warehouse` is null or empty; otherwise the mapped dialect.
 */
function resolveDialect(warehouse: string | null): string | null {
  return warehouse ? SemanticLayerService.mapDialect(warehouse) : null;
}
|
||||
|
||||
/**
 * Wraps `sql` as a subquery that returns no rows.
 * The `tsql` dialect gets `TOP 0`; every other dialect gets `LIMIT 0`.
 */
function wrapWithZeroRowQuery(sql: string, dialect: string): string {
  const usesTop = dialect === 'tsql';
  return usesTop ? `SELECT TOP 0 * FROM (${sql}) AS _discovery` : `SELECT * FROM (${sql}) AS _discovery LIMIT 0`;
}
|
||||
|
||||
/**
 * Wraps `sql` as a subquery limited to one row.
 * The `tsql` dialect gets `TOP 1`; every other dialect gets `LIMIT 1`.
 */
function wrapWithSingleRowQuery(sql: string, dialect: string): string {
  switch (dialect) {
    case 'tsql':
      return `SELECT TOP 1 * FROM (${sql}) AS _base`;
    default:
      return `SELECT * FROM (${sql}) AS _base LIMIT 1`;
  }
}
|
||||
|
||||
/**
 * Validate one SL source end-to-end: file read, YAML parse, manifest-shadow guard, Zod schema,
 * duplicate-measure detection, and a warehouse dry-run.
 *
 * The dry-run depends on the kind of source:
 * - a source with embedded `sql` is wrapped in a probe `SELECT` whose row limit follows
 *   `deps.probeRowCount`, and every declared column must appear in the returned headers
 *   (names are compared case-insensitively);
 * - an overlay (neither `table` nor `sql`) has its composed measures probed through the
 *   semantic-layer service;
 * - a `table`-backed standalone source is not probed.
 *
 * Returns errors and hint-style warnings. An empty errors array means every check that
 * applies to the source passed.
 *
 * @param deps - services and probe settings
 * @param connectionId - connection the source belongs to
 * @param sourceName - name of the source; messages refer to it as `<sourceName>.yaml`
 * @returns the collected errors and warnings
 */
export async function validateSingleSource(
  deps: SlValidationDeps,
  connectionId: string,
  sourceName: string,
): Promise<SourceValidationResult> {
  const errors: string[] = [];
  const warnings: string[] = [];

  let content: string;
  try {
    const result = await deps.semanticLayerService.readSourceFile(connectionId, sourceName);
    content = result.content;
  } catch {
    // Any read failure is reported as a missing file.
    errors.push(`${sourceName}.yaml: file not found`);
    return { errors, warnings };
  }

  let parsed: Record<string, unknown>;
  try {
    parsed = YAML.parse(content);
  } catch (e) {
    errors.push(`${sourceName}.yaml: invalid YAML — ${e instanceof Error ? e.message : String(e)}`);
    return { errors, warnings };
  }
  if (!parsed || typeof parsed !== 'object') {
    errors.push(`${sourceName}.yaml: top-level content is not an object`);
    return { errors, warnings };
  }

  // An overlay declares neither a physical table nor embedded sql.
  const isOverlay = !parsed.table && !parsed.sql;
  if (!isOverlay) {
    const isManifestBacked = await deps.semanticLayerService.isManifestBacked(connectionId, sourceName);
    if (isManifestBacked) {
      errors.push(
        `${sourceName}.yaml: standalone source shadows an existing manifest entry — ` +
          `writing it as-is drops the manifest's columns and joins. ` +
          `Remove "sql:", "table:", "grain:", "columns:", and "joins:" and keep only ` +
          `"name:" plus "measures:"/"segments:"/"description:" to write an overlay ` +
          `that inherits the manifest schema. Call sl_describe_table to see it first.`,
      );
      return { errors, warnings };
    }
  }

  const schema = isOverlay ? sourceOverlaySchema : sourceDefinitionSchema;
  const result = schema.safeParse(parsed);
  if (!result.success) {
    const issues = result.error.issues.map((i) => `${i.path.join('.')}: ${i.message}`).join('; ');
    errors.push(`${sourceName}.yaml: schema — ${issues}`);
    // Hints are keyed on the top-level field each issue belongs to.
    const errorPaths = new Set(result.error.issues.map((i) => String(i.path[0])));
    if (errorPaths.has('joins')) {
      warnings.push(
        `${sourceName}.yaml: hint — join format: {to, on: 'local_col = TARGET.col', relationship: 'many_to_one|one_to_many|one_to_one'}`,
      );
    }
    // The computed-columns hint only applies to overlays; for a standalone source it
    // would wrongly tell the author to drop the base table columns.
    if (isOverlay && errorPaths.has('columns')) {
      warnings.push(
        `${sourceName}.yaml: hint — overlay columns must be computed: {name, expr, type}. Do NOT include base table columns.`,
      );
    }
    if (errorPaths.has('measures')) {
      warnings.push(
        `${sourceName}.yaml: hint — measure format: {name, expr, description (optional), filter (optional)}`,
      );
    }
    return { errors, warnings };
  }

  const measures = (parsed.measures as Array<{ name: string }> | undefined) ?? [];
  const seenMeasures = new Set<string>();
  const reportedDuplicates = new Set<string>();
  for (const m of measures) {
    // Report each duplicated name once, however many times it repeats.
    if (seenMeasures.has(m.name) && !reportedDuplicates.has(m.name)) {
      reportedDuplicates.add(m.name);
      errors.push(`${sourceName}.yaml: duplicate measure name "${m.name}"`);
    }
    seenMeasures.add(m.name);
  }

  // The connection type selects the probe dialect; a failed lookup falls back to the generic probe.
  let warehouse: string | null = null;
  try {
    const connection = await deps.connections.getConnectionById(connectionId);
    warehouse = connection?.connectionType ?? null;
  } catch {
    warehouse = null;
  }

  if (typeof parsed.sql === 'string' && parsed.sql.trim().length > 0) {
    // Trailing semicolons are removed so the statement can be nested as a subquery.
    const innerSql = parsed.sql.trim().replace(/;+\s*$/, '');
    const probeRowCount = deps.probeRowCount;
    const dialect = resolveDialect(warehouse);
    let probeSql: string;
    if (dialect) {
      probeSql =
        probeRowCount === 0 ? wrapWithZeroRowQuery(innerSql, dialect) : wrapWithSingleRowQuery(innerSql, dialect);
    } else {
      probeSql = `SELECT * FROM (${innerSql}) AS _probe LIMIT ${probeRowCount}`;
    }
    const sourceColumns = ((parsed.columns as Array<{ name?: string; type?: string }> | undefined) ?? [])
      .map((c) => ({ name: c.name ?? '', type: c.type ?? '' }))
      .filter((c) => c.name);
    try {
      const probe = await deps.connections.executeQuery(connectionId, probeSql);
      // Header and declared names are compared in lower case.
      const actual = new Set((probe.headers ?? []).map((h) => h.toLowerCase()));
      const missing = sourceColumns.map((c) => c.name).filter((n) => !actual.has(n.toLowerCase()));
      if (missing.length > 0) {
        errors.push(
          `${sourceName}.yaml: declared columns absent from sql result — ${missing.join(', ')} (warehouse returned: ${[...actual].slice(0, 10).join(', ')}${actual.size > 10 ? ', …' : ''})`,
        );
      }
    } catch (e) {
      errors.push(
        formatProbeError({
          sourceName,
          measureName: null,
          probeSql,
          warehouse,
          sourceColumns,
          error: e,
          headline: 'embedded sql dry-run failed',
        }),
      );
    }
  } else if (isOverlay) {
    const measureErrors = await probeOverlayMeasures(deps, connectionId, sourceName, warehouse);
    errors.push(...measureErrors);
  }

  return { errors, warnings };
}
|
||||
|
||||
/**
 * Renders a multi-line report for a failed probe.
 *
 * The first line names the source (and the measure, when one is given) followed by the
 * headline. Indented detail lines follow: the warehouse when known, the probe SQL, the
 * declared columns whose names occur in the probe SQL, and the error text.
 */
function formatProbeError(args: {
  sourceName: string;
  measureName: string | null;
  probeSql: string;
  warehouse: string | null;
  sourceColumns: Array<{ name: string; type: string }>;
  error: unknown;
  headline: string;
}): string {
  const errMsg = args.error instanceof Error ? args.error.message : String(args.error);
  const mentioned = args.sourceColumns.filter((col) => referencesColumn(args.probeSql, col.name));

  const subject = args.measureName
    ? `${args.sourceName}.yaml: measure "${args.measureName}" ${args.headline}.`
    : `${args.sourceName}.yaml: ${args.headline}.`;

  const report: string[] = [subject];
  if (args.warehouse) {
    report.push(` Warehouse: ${args.warehouse}`);
  }
  report.push(` Probe SQL: ${args.probeSql}`);
  if (mentioned.length > 0) {
    // Columns without a declared type are shown with "?".
    const rendered = mentioned.map((col) => `${col.name} (${col.type || '?'})`).join(', ');
    report.push(` Referenced columns: ${rendered}`);
  }
  report.push(` Error: ${errMsg}`);
  return report.join('\n');
}
|
||||
|
||||
/**
 * Reports whether `columnName` occurs in `sql` as a whole identifier.
 *
 * Matching is case-insensitive, in line with the case-insensitive column comparison used
 * when validating probe headers. The name must not be directly preceded or followed by a
 * word character, so `id` does not match inside `order_id` or `valid`.
 *
 * @param sql - SQL text to search
 * @param columnName - column name to look for; an empty name never matches
 * @returns true when the name is found
 */
function referencesColumn(sql: string, columnName: string): boolean {
  if (!columnName) {
    return false;
  }
  // Escape regex metacharacters so the name is matched literally.
  const escaped = columnName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // Lookarounds are used instead of \b because \b cannot match next to a name that
  // itself starts or ends with a non-word character (for example "total$").
  return new RegExp(`(?<!\\w)${escaped}(?!\\w)`, 'i').test(sql);
}
|
||||
|
||||
/**
 * Dry-runs every measure of the composed source named `sourceName`.
 *
 * The composed source is looked up among all sources loaded for the connection. Nothing
 * is probed when it is missing, has no `table`, or has no measures. Each measure is run
 * on its own through the semantic-layer service with a limit of one row; a failure becomes
 * one formatted error and the remaining measures are still probed.
 *
 * @returns the formatted errors, empty when every probe succeeded or nothing was probed
 */
async function probeOverlayMeasures(
  deps: SlValidationDeps,
  connectionId: string,
  sourceName: string,
  warehouse: string | null,
): Promise<string[]> {
  type ComposedSource = {
    name: string;
    table?: string;
    sql?: string;
    columns?: Array<{ name?: string; type?: string }>;
    measures: Array<{ name: string; expr: string; filter?: string; segments?: string[] }>;
    segments?: Array<{ name: string; expr: string }>;
  };

  const failures: string[] = [];

  let composed: ComposedSource | undefined;
  try {
    const sources = await deps.semanticLayerService.loadAllSources(connectionId);
    composed = sources.find((candidate) => candidate.name === sourceName);
  } catch (loadError) {
    const reason = loadError instanceof Error ? loadError.message : String(loadError);
    failures.push(`${sourceName}.yaml: failed to load composed source for probe — ${reason}`);
    return failures;
  }

  if (!composed?.table || composed.measures.length === 0) {
    return failures;
  }

  // Keep only columns that carry a name; a missing type becomes an empty string.
  const sourceColumns: Array<{ name: string; type: string }> = [];
  for (const column of composed.columns ?? []) {
    const name = column.name ?? '';
    if (name) {
      sourceColumns.push({ name, type: column.type ?? '' });
    }
  }

  for (const measure of composed.measures) {
    const measureRef = `${sourceName}.${measure.name}`;
    let probeSql = `<composed via semantic-layer engine for ${measureRef}>`;
    try {
      const outcome = await deps.semanticLayerService.executeQuery(connectionId, {
        measures: [measureRef],
        dimensions: [],
        filters: [],
        limit: 1,
      });
      // NOTE(review): probeSql is only read in the catch below, which is reached when
      // the call above throws — at that point it still holds the placeholder.
      probeSql = outcome.sql ?? probeSql;
    } catch (probeError) {
      failures.push(
        formatProbeError({
          sourceName,
          measureName: measure.name,
          probeSql,
          warehouse,
          sourceColumns,
          error: probeError,
          headline: 'dry-run failed',
        }),
      );
    }
  }

  return failures;
}
|
||||
|
||||
/**
 * Restore `sourceName` to the content it had at `preHead`, or delete it if it didn't
 * exist then. Used by sl_rollback (agent-driven) and the pre-squash revert gate
 * (automatic). Returns a short human-readable description of what happened.
 *
 * When the file is dropped, its entry is also removed from the sources index. The file
 * delete and the index delete are handled separately, so a failing index delete is
 * surfaced to the caller instead of being retried and reported as a no-op.
 *
 * @param deps - services used to read history, write/delete files and update the index
 * @param connectionId - connection the source belongs to
 * @param preHead - commit to restore from; null means there is no prior state to restore
 * @param sourceName - name of the source to revert
 * @returns one of 'restored to pre-session content', 'deleted (did not exist at session start)'
 *   or 'no-op (already absent)'
 */
export async function revertSourceToPreHead(
  deps: SlValidationDeps,
  connectionId: string,
  preHead: string | null,
  sourceName: string,
): Promise<string> {
  const relPath = slSourcePath(connectionId, sourceName);

  let preContent: string | null = null;
  if (preHead) {
    try {
      preContent = await deps.gitService.getFileAtCommit(relPath, preHead);
    } catch {
      // NOTE(review): every failure here is treated as "file absent at preHead", which
      // leads to the delete path below — confirm that is intended for non-not-found errors.
      preContent = null;
    }
  }

  if (preContent !== null) {
    await deps.configService.writeFile(
      relPath,
      preContent,
      SYSTEM_GIT_AUTHOR.name,
      SYSTEM_GIT_AUTHOR.email,
      `Revert SL source to pre-session state: ${sourceName}`,
      { skipLock: true },
    );
    return 'restored to pre-session content';
  }

  // A failed file delete is treated as "already absent"; the index entry is removed either way.
  let fileDeleted = true;
  try {
    await deps.configService.deleteFile(
      relPath,
      SYSTEM_GIT_AUTHOR.name,
      SYSTEM_GIT_AUTHOR.email,
      `Drop SL source (not present at session start): ${sourceName}`,
      { skipLock: true },
    );
  } catch {
    fileDeleted = false;
  }

  await deps.slSourcesRepository.deleteByConnectionAndName(connectionId, sourceName);
  return fileDeleted ? 'deleted (did not exist at session start)' : 'no-op (already absent)';
}
|
||||
267
packages/context/src/sl/tools/sl-write-source.tool.test.ts
Normal file
267
packages/context/src/sl/tools/sl-write-source.tool.test.ts
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { ToolSession } from '../../tools/index.js';
|
||||
import { createTouchedSlSources, hasTouchedSlSource, type ToolContext } from '../../tools/index.js';
|
||||
import { SlWriteSourceTool } from './sl-write-source.tool.js';
|
||||
|
||||
/**
 * Builds an SlWriteSourceTool wired to stubbed services.
 * Entries under `overrides.semanticLayerService` / `overrides.slSearchService` replace the
 * matching default stubs. Returns the tool together with both stub objects.
 */
function makeTool(overrides: Partial<Record<string, any>> = {}) {
  const { semanticLayerService: serviceOverrides, slSearchService: searchOverrides } = overrides;

  const semanticLayerService = {
    listManifestSourceNames: vi.fn().mockResolvedValue(['ACCOUNTS', 'ORDERS']),
    isManifestBacked: vi.fn().mockResolvedValue(false),
    loadSource: vi.fn().mockResolvedValue(null),
    loadAllSources: vi.fn().mockResolvedValue([]),
    validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
    writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }),
    deleteSource: vi.fn().mockResolvedValue(undefined),
    // By default no source file exists yet.
    readSourceFile: vi.fn().mockRejectedValue(new Error('not found')),
    ...serviceOverrides,
  };

  const slSearchService = {
    indexSources: vi.fn().mockResolvedValue(undefined),
    ...searchOverrides,
  };

  const authorResolver = { resolve: vi.fn().mockResolvedValue({ name: 'T U', email: 't@u.com' }) };

  const tool = new SlWriteSourceTool({
    semanticLayerService: semanticLayerService as never,
    slSearchService: slSearchService as never,
    authorResolver,
  });

  return { tool, semanticLayerService, slSearchService };
}
|
||||
|
||||
/** Session-less tool context shared by the cases in this file. */
const baseContext: ToolContext = {
  sourceId: 's',
  messageId: 'm',
  userId: 'u',
};
|
||||
|
||||
// An overlay (no table/sql) must target a name that exists in the manifest.
describe('SlWriteSourceTool — orphan overlay guard', () => {
  it('rejects overlay YAMLs targeting a name absent from the manifest', async () => {
    const orphanName = 'does_not_exist';
    const overlayInput = {
      connectionId: '11111111-1111-1111-1111-111111111111',
      sourceName: orphanName,
      source: {
        name: orphanName,
        measures: [{ name: 'count_rows', expr: 'count(*)' }],
      } as any,
    } as any;

    const { tool } = makeTool();
    const outcome = await tool.call(overlayInput, baseContext);

    expect(outcome.structured.success).toBe(false);
    expect(outcome.markdown).toMatch(/no manifest entry with that name exists/i);
    // The default stub manifest lists ACCOUNTS and ORDERS.
    expect(outcome.markdown).toMatch(/ACCOUNTS|ORDERS/);
  });
});
|
||||
|
||||
// Behaviour that depends on whether a tool session is attached to the context.
describe('SlWriteSourceTool — session gating', () => {
  /** Standalone SQL-backed source payload with a single `id` column. */
  const sqlSource = (name: string) =>
    ({
      name,
      sql: 'select 1 as id',
      grain: ['id'],
      columns: [{ name: 'id', type: 'string' }],
      measures: [],
      joins: [],
    }) as any;

  /** Worktree-scoped session with its own stubbed semantic-layer service. */
  function makeSession(overrides: Partial<ToolSession> = {}): ToolSession {
    const sessionService = {
      loadSource: vi.fn().mockResolvedValue(null),
      loadAllSources: vi.fn().mockResolvedValue([]),
      validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
      writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }),
      deleteSource: vi.fn().mockResolvedValue(undefined),
      listManifestSourceNames: vi.fn().mockResolvedValue([]),
      isManifestBacked: vi.fn().mockResolvedValue(false),
      readSourceFile: vi.fn().mockRejectedValue(new Error('not found')),
      findManifestEntryByTableRef: vi.fn().mockResolvedValue(null),
    } as any;

    return {
      connectionId: '11111111-1111-1111-1111-111111111111',
      isWorktreeScoped: true,
      preHead: 'base',
      touchedSlSources: createTouchedSlSources(),
      actions: [],
      semanticLayerService: sessionService,
      wikiService: {} as any,
      configService: {} as any,
      gitService: {} as any,
      ...overrides,
    };
  }

  it('skips slSearchService.indexSources when session is worktree-scoped', async () => {
    const { tool, slSearchService } = makeTool();
    const session = makeSession();
    const input = {
      connectionId: session.connectionId,
      sourceName: 'my_source',
      source: sqlSource('my_source'),
    } as any;

    const outcome = await tool.call(input, { ...baseContext, session });

    expect(outcome.structured.success).toBe(true);
    expect(slSearchService.indexSources).not.toHaveBeenCalled();
    expect(hasTouchedSlSource(session.touchedSlSources, session.connectionId!, 'my_source')).toBe(true);
    expect(session.actions).toContainEqual(expect.objectContaining({ target: 'sl', key: 'my_source' }));
  });

  it('records cross-connection SL writes with targetConnectionId', async () => {
    const warehouseConnectionId = '22222222-2222-4222-8222-222222222222';
    const { tool } = makeTool();
    const session = makeSession({ connectionId: '11111111-1111-4111-8111-111111111111' });
    const input = {
      connectionId: warehouseConnectionId,
      sourceName: 'mapped_orders',
      source: {
        name: 'mapped_orders',
        table: 'public.orders',
        grain: ['id'],
        columns: [{ name: 'id', type: 'string' }],
        measures: [],
        joins: [],
      } as any,
    } as any;

    const outcome = await tool.call(input, { ...baseContext, session });

    expect(outcome.structured.success).toBe(true);
    expect(hasTouchedSlSource(session.touchedSlSources, warehouseConnectionId, 'mapped_orders')).toBe(true);
    expect(session.actions).toContainEqual(
      expect.objectContaining({
        target: 'sl',
        key: 'mapped_orders',
        targetConnectionId: warehouseConnectionId,
      }),
    );
  });

  it('indexes normally when no session is present', async () => {
    const { tool, slSearchService } = makeTool();
    const input = {
      connectionId: '11111111-1111-1111-1111-111111111111',
      sourceName: 'my_source',
      source: sqlSource('my_source'),
    } as any;

    const outcome = await tool.call(input, baseContext);

    expect(outcome.structured.success).toBe(true);
    expect(slSearchService.indexSources).toHaveBeenCalledTimes(1);
  });

  it('uses session.semanticLayerService when session is present', async () => {
    const { tool } = makeTool();
    const session = makeSession();
    const input = {
      connectionId: session.connectionId,
      sourceName: 'my_source',
      source: sqlSource('my_source'),
    } as any;

    await tool.call(input, { ...baseContext, session });

    expect((session.semanticLayerService as any).writeSource).toHaveBeenCalled();
  });
});
|
||||
|
||||
// Validation warnings returned by the service must be visible in the tool's markdown.
describe('SlWriteSourceTool — disconnected-components warning in markdown', () => {
  const CONNECTION_ID = '11111111-1111-1111-1111-111111111111';

  /** Write input for a standalone SQL-backed source named `name`. */
  const writeInputFor = (name: string) =>
    ({
      connectionId: CONNECTION_ID,
      sourceName: name,
      source: {
        name,
        sql: 'select 1 as id',
        grain: ['id'],
        columns: [{ name: 'id', type: 'string' }],
        measures: [],
        joins: [],
      } as any,
    }) as any;

  it('surfaces validation warnings (including disconnected-components) in the markdown body', async () => {
    const validateWithProposedSource = vi.fn().mockResolvedValue({
      errors: [],
      warnings: ['orders: disconnected-components — no join path to ACCOUNTS'],
    });
    const { tool } = makeTool({ semanticLayerService: { validateWithProposedSource } });

    const outcome = await tool.call(writeInputFor('orders'), baseContext);

    expect(outcome.markdown).toMatch(/disconnected-components/i);
  });

  it('renders per-source warnings prominently when the just-written source becomes a singleton component', async () => {
    const validateWithProposedSource = vi.fn().mockResolvedValue({
      errors: [],
      warnings: ['Model has 2 disconnected components.'],
      perSourceWarnings: {
        foo: ["Source 'foo' is now a singleton component (no joins to any other source)."],
      },
    });
    const { tool } = makeTool({ semanticLayerService: { validateWithProposedSource } });

    const outcome = await tool.call(writeInputFor('foo'), baseContext);

    expect(outcome.markdown).toMatch(/Action required/i);
    expect(outcome.markdown).toContain("Source 'foo' is now a singleton component");
  });
});
|
||||
|
||||
// A standalone source (table/sql) must not reuse the name of a manifest-backed entry.
describe('SlWriteSourceTool — standalone shadow guard', () => {
  it('rejects standalone YAMLs that shadow a manifest entry', async () => {
    const isManifestBacked = vi.fn().mockResolvedValue(true);
    const { tool } = makeTool({ semanticLayerService: { isManifestBacked } });
    const shadowingInput = {
      connectionId: '11111111-1111-1111-1111-111111111111',
      sourceName: 'ACCOUNTS',
      source: {
        name: 'ACCOUNTS',
        table: 'raw.accounts',
        grain: ['id'],
        columns: [{ name: 'id', type: 'string' }],
        measures: [],
        joins: [],
      } as any,
    } as any;

    const outcome = await tool.call(shadowingInput, baseContext);

    expect(outcome.structured.success).toBe(false);
    expect(outcome.markdown).toMatch(/shadows an existing manifest entry|already exists/i);
  });
});
|
||||
380
packages/context/src/sl/tools/sl-write-source.tool.ts
Normal file
380
packages/context/src/sl/tools/sl-write-source.tool.ts
Normal file
|
|
@ -0,0 +1,380 @@
|
|||
import YAML from 'yaml';
|
||||
import { z } from 'zod';
|
||||
import { addTouchedSlSource, type ToolContext, type ToolOutput } from '../../tools/index.js';
|
||||
import { sourceOverlaySchema } from '../schemas.js';
|
||||
import type { SemanticLayerService } from '../semantic-layer.service.js';
|
||||
import type { SemanticLayerSource } from '../types.js';
|
||||
import {
|
||||
BaseSemanticLayerTool,
|
||||
type BaseSemanticLayerToolDeps,
|
||||
type SemanticLayerStructured,
|
||||
sourceDefinitionSchema,
|
||||
} from './base-semantic-layer.tool.js';
|
||||
import { slToolConnectionIdSchema } from './connection-id-schema.js';
|
||||
|
||||
/** A source payload is either a standalone definition or an overlay. */
const sourceInputSchema = z.union([sourceDefinitionSchema, sourceOverlaySchema]);

/** Source names are restricted to lowercase alphanumerics and underscores. */
const slWriteSourceNameSchema = z
  .string()
  .regex(/^[a-z0-9][a-z0-9_]*$/, 'Source name must be snake_case (lowercase alphanumeric and underscores)')
  .describe('Name of the source to create, edit, or delete');

/** Input accepted by sl_write_source: target connection and name, plus a payload or a delete flag. */
const slWriteSourceInputSchema = z.object({
  connectionId: slToolConnectionIdSchema.describe('Data source connection ID'),
  sourceName: slWriteSourceNameSchema,
  source: sourceInputSchema
    .optional()
    .describe('Source definition (standalone with table/sql) or overlay (measures, computed columns, etc.)'),
  delete: z.boolean().optional().describe('Set to true to delete this source entirely'),
});

type SlWriteSourceInput = z.infer<typeof slWriteSourceInputSchema>;
|
||||
|
||||
/**
 * Returns the connection an action applied to when it differs from the run's own
 * connection. Yields null when the run has no connection id or both ids are equal.
 */
function actionTargetConnectionId(
  runConnectionId: string | null | undefined,
  actionConnectionId: string,
): string | null {
  if (!runConnectionId) {
    return null;
  }
  if (runConnectionId === actionConnectionId) {
    return null;
  }
  return actionConnectionId;
}
|
||||
|
||||
export class SlWriteSourceTool extends BaseSemanticLayerTool<typeof slWriteSourceInputSchema> {
|
||||
readonly name = 'sl_write_source';
|
||||
|
||||
/** Forwards the shared semantic-layer tool dependencies to the base class. */
constructor(deps: BaseSemanticLayerToolDeps) {
  super(deps);
}
|
||||
|
||||
/**
 * Tool description text: five tagged sections (purpose, when to use, editing approach,
 * source definition, join requirements) separated by blank lines.
 */
get description(): string {
  const purpose = `<purpose>
Create a new semantic layer source or fully rewrite an existing one.
If the source already exists, this tool will overwrite it with the new definition.
</purpose>`;

  const whenToUse = `<when_to_use>
- First time creating a source definition
- When modeling a new SQL-backed source (e.g., churn risk view, ARR calculation)
- When the user asks to start over / fully rewrite a source
- Consolidating multiple sources into one (write merged definition)
- For targeted edits to existing sources (add/remove measures, update joins), prefer sl_edit_source instead
</when_to_use>`;

  const editingApproach = `<editing_approach>
- New source: provide \`source\` with full definition
- Full rewrite: provide \`source\` (overwrites existing)
- Targeted edits on an existing source: use sl_edit_source instead
- Delete: set \`delete: true\`
</editing_approach>`;

  const sourceDefinition = `<source_definition>
- name: Unique identifier for the source
- table: For physical table/view sources (e.g., "public.orders"). Mutually exclusive with sql.
- sql: For SQL-based sources (the SQL query). Mutually exclusive with table.
- grain: What one row represents (e.g., ["id"], ["customer_id", "product_id"])
- columns: All columns with type (string/number/time/boolean) and optional descriptions
- joins: Relationships to other sources (to, on, relationship: many_to_one/one_to_many/one_to_one)
- measures: Pre-defined aggregations (name, expr like "sum(amount)", optional filter, optional segments — bare names of segments defined on the same source, optional description)
- segments: Named, reusable boolean predicates scoped to this source (name, expr — a SQL boolean over this source's columns, optional description). A measure references one with \`segments: [name]\`; a query references one with the dotted form \`source.segment_name\`. Use when the same predicate appears on 3+ measures — e.g. extract \`is_paid = true and is_refunded = '0'\` as \`segments: [{name: paid_non_refunded, expr: "..."}]\` and have each measure use \`segments: [paid_non_refunded]\` instead of re-typing the predicate inside \`sum(case when ... then x end)\`. Segments are predicates only — they cannot be selected as dimensions or grouped by; if you need to group by the predicate, add a \`columns[]\` entry instead.
</source_definition>`;

  const joinRequirements = `<join_requirements>
Sources with joins: [] are disconnected from the semantic layer join graph and cannot be composed with other sources in semantic queries.
Before writing, use discover_data to check existing sources and their grain columns.
For each grain/key column in your source (e.g., account_id, item_id), find the matching dimension source (e.g., ACCOUNTS, ITEMS) and declare a many_to_one join.
Example: a source graining on [account_id] should declare:
joins:
- to: ACCOUNTS
on: source_name.account_id = ACCOUNTS.ACCOUNT_ID
relationship: many_to_one
The on condition format: local_column = TARGET_SOURCE.target_column (right side must include target source name).
Do NOT join back to a table that the SQL already aggregates from if the grain column is not in the output (the relationship is already baked into the SQL).
</join_requirements>`;

  // Sections are separated by exactly one blank line.
  return [purpose, whenToUse, editingApproach, sourceDefinition, joinRequirements].join('\n\n');
}
|
||||
|
||||
/** Zod schema describing the input accepted by this tool. */
get inputSchema() {
  return slWriteSourceInputSchema;
}
|
||||
|
||||
/**
 * Entry point of the tool: deletes the source when `input.delete` is set, otherwise
 * creates or fully rewrites it from `input.source`.
 *
 * When the context carries a session, the session's semantic-layer service is used and the
 * change is recorded on the session; a worktree-scoped session also skips search reindexing.
 * Reindexing is best-effort: once the delete has gone through, a failure to reload or
 * index sources does not turn the result into a failure.
 *
 * @param input - target connection and source name, plus either a payload or the delete flag
 * @param context - caller context; `userId` resolves the commit author
 * @returns a tool output whose success flag reflects whether the delete/write went through
 */
async call(input: SlWriteSourceInput, context: ToolContext): Promise<ToolOutput<SemanticLayerStructured>> {
  const { connectionId, sourceName } = input;
  const { name: author, email: authorEmail } = await this.authorResolver.resolve(context.userId);

  const semanticLayerService = context.session?.semanticLayerService ?? this.semanticLayerService;
  const skipIndex = context.session?.isWorktreeScoped === true;

  // Handle delete
  if (input.delete) {
    try {
      await semanticLayerService.deleteSource(connectionId, sourceName, author, authorEmail);

      // Record the change as soon as it is committed so the session always knows about it.
      if (context.session) {
        addTouchedSlSource(context.session.touchedSlSources, connectionId, sourceName);
        context.session.actions.push({
          target: 'sl',
          type: 'removed',
          key: sourceName,
          detail: 'Deleted source',
          targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId),
        });
      }

      if (!skipIndex) {
        try {
          const allSources = await semanticLayerService.loadAllSources(connectionId);
          await this.slSearchService.indexSources(connectionId, allSources);
        } catch {
          // Best-effort: the delete already succeeded; a stale search index is not a failure.
        }
      }

      return this.buildOutput(true, [], sourceName, { yaml: undefined, commitHash: undefined });
    } catch (error) {
      return this.buildOutput(false, [error instanceof Error ? error.message : String(error)], sourceName);
    }
  }

  // Require source for create/rewrite
  if (!input.source) {
    return this.buildOutput(
      false,
      ['Provide `source` to create or rewrite. For targeted edits, use sl_edit_source.'],
      sourceName,
    );
  }

  return this.writeFullSource(
    connectionId,
    input.source,
    sourceName,
    author,
    authorEmail,
    context,
    semanticLayerService,
    skipIndex,
  );
}
|
||||
|
||||
/**
 * Creates or fully rewrites a source.
 *
 * Steps: serialize the payload to YAML, reject orphan overlays and standalone sources
 * that shadow a manifest entry, validate the proposed source, write it, record the change
 * on the session (when present) and reindex search unless `skipIndex` is set.
 * Reindexing is best-effort: once the write has gone through, a failure to reload or
 * index sources does not turn the result into a failure.
 *
 * @param connectionId - connection the source is written to
 * @param source - standalone definition or overlay payload
 * @param sourceName - name used for the guards, messages and session bookkeeping
 * @param author - commit author name
 * @param authorEmail - commit author email
 * @param context - caller context; its session, if any, receives the recorded action
 * @param semanticLayerService - service used for all reads and writes in this call
 * @param skipIndex - when true, search reindexing is skipped
 * @returns a tool output describing success, or the guard/validation/write failure
 */
private async writeFullSource(
  connectionId: string,
  source: z.infer<typeof sourceInputSchema>,
  sourceName: string,
  author: string,
  authorEmail: string,
  context: ToolContext,
  semanticLayerService: SemanticLayerService,
  skipIndex: boolean,
): Promise<ToolOutput<SemanticLayerStructured>> {
  // An overlay declares neither a physical table nor embedded sql.
  const isOverlay = !('table' in source && source.table) && !('sql' in source && source.sql);

  // Existing file content (if any) decides between "create" and "update/rewrite" wording.
  const existing = await this.readSourceYamlFromService(semanticLayerService, connectionId, sourceName);
  const commitMessage = existing
    ? `${isOverlay ? 'Update overlay' : 'Rewrite source'}: ${sourceName}`
    : `${isOverlay ? 'Create overlay' : 'Create source'}: ${sourceName}`;

  const yamlContent = YAML.stringify(source);

  const orphanError = await this.rejectOrphanOverlay(semanticLayerService, connectionId, sourceName, yamlContent);
  if (orphanError) {
    return this.buildOutput(false, [orphanError], sourceName, { yaml: yamlContent });
  }
  const shadowError = await this.rejectStandaloneShadow(semanticLayerService, connectionId, sourceName, yamlContent);
  if (shadowError) {
    return this.buildOutput(false, [shadowError], sourceName, { yaml: yamlContent });
  }

  const validatedSource = source as SemanticLayerSource;
  const validationResult = await semanticLayerService.validateWithProposedSource(connectionId, validatedSource);
  const validationErrors = validationResult.errors;
  const validationWarnings = [...validationResult.warnings];
  const actionRequiredWarnings = validationResult.perSourceWarnings?.[sourceName] ?? [];
  if (validationErrors.length > 0) {
    return this.buildOutput(false, ['Validation failed — source was NOT saved:', ...validationErrors], sourceName, {
      yaml: yamlContent,
      validationErrors,
      validationWarnings,
      actionRequiredWarnings,
    });
  }

  try {
    const result = await semanticLayerService.writeSource(
      connectionId,
      validatedSource,
      author,
      authorEmail,
      commitMessage,
    );

    // Record the change as soon as it is committed so the session always knows about it.
    if (context.session) {
      addTouchedSlSource(context.session.touchedSlSources, connectionId, sourceName);
      context.session.actions.push({
        target: 'sl',
        type: existing ? 'updated' : 'created',
        key: sourceName,
        detail: existing ? `Rewrote source` : `Created source`,
        targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId),
      });
    }

    if (!skipIndex) {
      try {
        const allSources = await semanticLayerService.loadAllSources(connectionId);
        await this.slSearchService.indexSources(connectionId, allSources);
      } catch {
        // Best-effort: the write already succeeded; a stale search index is not a failure.
      }
    }

    return this.buildOutput(true, [], sourceName, {
      yaml: yamlContent,
      commitHash: result.commitHash ?? undefined,
      validationErrors,
      validationWarnings,
      actionRequiredWarnings,
    });
  } catch (error) {
    return this.buildOutput(false, [error instanceof Error ? error.message : String(error)], sourceName);
  }
}
|
||||
|
||||
/**
 * Reads the raw YAML of a source through `service`.
 * Resolves to null when the read fails for any reason.
 */
private async readSourceYamlFromService(
  service: SemanticLayerService,
  connectionId: string,
  sourceName: string,
): Promise<string | null> {
  try {
    const file = await service.readSourceFile(connectionId, sourceName);
    return file.content;
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Refuses an overlay write when no manifest entry carries the same name.
 *
 * The YAML is treated as an overlay when it has neither a truthy `table`
 * nor a truthy `sql` top-level key. Content that cannot be parsed, or that
 * does not parse to an object, is not judged here and yields `null`.
 *
 * @param semanticLayerService - Used to list manifest source names.
 * @param connectionId - Connection whose manifest is consulted.
 * @param sourceName - Name the caller wants to write.
 * @param content - Raw YAML text of the proposed source.
 * @returns A multi-line error message (including up to three similar
 *   manifest names) when the overlay has no matching manifest entry,
 *   otherwise `null`.
 */
private async rejectOrphanOverlay(
  semanticLayerService: SemanticLayerService,
  connectionId: string,
  sourceName: string,
  content: string,
): Promise<string | null> {
  let doc: Record<string, unknown>;
  try {
    doc = YAML.parse(content) as Record<string, unknown>;
  } catch {
    // Unparseable YAML is not this check's concern.
    return null;
  }
  if (!doc || typeof doc !== 'object') {
    return null;
  }

  // A truthy `table` or `sql` key means this is not an overlay.
  const declaresOwnBase = ('table' in doc && doc.table) || ('sql' in doc && doc.sql);
  if (declaresOwnBase) {
    return null;
  }

  const manifestNames = await semanticLayerService.listManifestSourceNames(connectionId);
  if (manifestNames.includes(sourceName)) {
    return null;
  }

  const suggestions = this.nearestMatches(sourceName, manifestNames, 3);
  const suggestionLine =
    suggestions.length > 0
      ? ` Nearest manifest matches: ${suggestions.join(', ')}.`
      : ` No manifest entries resemble "${sourceName}".`;

  const lines = [
    `Error: cannot write "${sourceName}" as an overlay — no manifest entry with that name exists.`,
    suggestionLine,
    `To customize an existing base table, retarget the overlay at one of the nearest matches.`,
    `For a LookML derived_table or any source backed by inline SQL, rewrite as a standalone`,
    `curated source with a top-level "sql:" block plus explicit "grain:" and "columns:".`,
  ];
  return lines.join('\n');
}
|
||||
|
||||
/**
 * Refuses a standalone write whose name is already manifest-backed.
 *
 * The YAML counts as standalone when it has a truthy `table` or a truthy
 * `sql` top-level key. Content that cannot be parsed, or that does not
 * parse to an object, is not judged here and yields `null`.
 *
 * @param semanticLayerService - Queried through `isManifestBacked`.
 * @param connectionId - Connection whose manifest is consulted.
 * @param sourceName - Name the caller wants to write.
 * @param content - Raw YAML text of the proposed source.
 * @returns A multi-line error message explaining how to rewrite the YAML
 *   as an overlay when `isManifestBacked` resolves truthy, otherwise `null`.
 */
private async rejectStandaloneShadow(
  semanticLayerService: SemanticLayerService,
  connectionId: string,
  sourceName: string,
  content: string,
): Promise<string | null> {
  let doc: Record<string, unknown>;
  try {
    doc = YAML.parse(content) as Record<string, unknown>;
  } catch {
    // Unparseable YAML is not this check's concern.
    return null;
  }
  if (!doc || typeof doc !== 'object') {
    return null;
  }

  // Only sources with a truthy `table` or `sql` key are standalone.
  const declaresOwnBase = ('table' in doc && doc.table) || ('sql' in doc && doc.sql);
  if (!declaresOwnBase) {
    return null;
  }

  const shadowsManifest = await semanticLayerService.isManifestBacked(connectionId, sourceName);
  if (!shadowsManifest) {
    return null;
  }

  const lines = [
    `Error: cannot write "${sourceName}" as a standalone source — a manifest entry with that name already exists.`,
    ` Writing standalone would drop the manifest's columns and joins, leaving only what you list here.`,
    `To add measures/segments on top of the manifest, rewrite this YAML as an overlay:`,
    ` - Remove "sql:", "table:", "grain:", "columns:", and "joins:".`,
    ` - Keep only "name:", plus "measures:", "segments:", and/or "description:".`,
    ` - The manifest's schema is inherited automatically.`,
    `If you really need a different base table, use a different source name.`,
  ];
  return lines.join('\n');
}
|
||||
|
||||
/**
 * Picks the candidates most similar to `needle`, best first.
 *
 * Comparison is case-insensitive. Each candidate's score is its
 * `jaroWinkler` similarity, plus 0.2 when either string starts with the
 * other, plus 0.1 when either string contains the other. Candidates that
 * do not score above 0.4 are dropped.
 *
 * @param needle - Name to find look-alikes for.
 * @param haystack - Candidate names; returned entries keep their original casing.
 * @param limit - Maximum number of names to return.
 * @returns Up to `limit` candidates ordered by descending score.
 */
private nearestMatches(needle: string, haystack: string[], limit: number): string[] {
  if (haystack.length === 0) {
    return [];
  }

  const target = needle.toLowerCase();
  const ranked: { candidate: string; score: number }[] = [];

  for (const candidate of haystack) {
    const normalized = candidate.toLowerCase();
    let score = jaroWinkler(target, normalized);
    // Prefix relationship in either direction.
    score += normalized.startsWith(target) || target.startsWith(normalized) ? 0.2 : 0;
    // Substring relationship in either direction.
    score += normalized.includes(target) || target.includes(normalized) ? 0.1 : 0;
    ranked.push({ candidate, score });
  }

  ranked.sort((left, right) => right.score - left.score);

  const picked: string[] = [];
  for (const entry of ranked) {
    if (picked.length >= limit) {
      break;
    }
    if (entry.score > 0.4) {
      picked.push(entry.candidate);
    }
  }
  return picked;
}
|
||||
}
|
||||
|
||||
/**
 * Jaro-Winkler similarity of two strings, compared by UTF-16 code unit.
 *
 * Characters match when they are equal and lie within
 * `floor(max(len) / 2) - 1` positions of each other; each character is
 * matched at most once. The Jaro score is boosted by 0.1 per shared
 * leading character, counting at most four.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns 1 for identical strings, 0 when no characters match, otherwise
 *   a value between the two.
 */
function jaroWinkler(a: string, b: string): number {
  if (a === b) {
    return 1;
  }

  const lenA = a.length;
  const lenB = b.length;
  const reach = Math.max(0, Math.floor(Math.max(lenA, lenB) / 2) - 1);

  const takenA: boolean[] = Array.from({ length: lenA }, () => false);
  const takenB: boolean[] = Array.from({ length: lenB }, () => false);

  // Greedy pairing: each character of `a` claims the first free equal
  // character of `b` inside its window.
  let matches = 0;
  for (let i = 0; i < lenA; i++) {
    const from = Math.max(0, i - reach);
    const to = Math.min(i + reach + 1, lenB);
    for (let j = from; j < to; j++) {
      if (!takenB[j] && a.charAt(i) === b.charAt(j)) {
        takenA[i] = true;
        takenB[j] = true;
        matches++;
        break;
      }
    }
  }
  if (matches === 0) {
    return 0;
  }

  // Matched characters of each string, in their own left-to-right order.
  const orderedA: string[] = [];
  for (let i = 0; i < lenA; i++) {
    if (takenA[i]) {
      orderedA.push(a.charAt(i));
    }
  }
  const orderedB: string[] = [];
  for (let j = 0; j < lenB; j++) {
    if (takenB[j]) {
      orderedB.push(b.charAt(j));
    }
  }

  // Positions where the two ordered sequences disagree; halved below.
  let transpositions = 0;
  orderedA.forEach((ch, idx) => {
    if (ch !== orderedB[idx]) {
      transpositions++;
    }
  });

  const jaro = (matches / lenA + matches / lenB + (matches - transpositions / 2) / matches) / 3;

  // Winkler adjustment: reward a shared prefix of up to four characters.
  const prefixCap = Math.min(4, lenA, lenB);
  let sharedPrefix = 0;
  while (sharedPrefix < prefixCap && a.charAt(sharedPrefix) === b.charAt(sharedPrefix)) {
    sharedPrefix++;
  }

  return jaro + sharedPrefix * 0.1 * (1 - jaro);
}
|
||||
88
packages/context/src/sl/types.ts
Normal file
88
packages/context/src/sl/types.ts
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
/**
 * Shape of one semantic-layer source definition.
 *
 * `name`, `grain`, `columns`, `joins` and `measures` are required; every
 * other member is optional. Several optional members are objects with a
 * `dbt` key — presumably metadata imported from dbt (TODO confirm).
 */
export interface SemanticLayerSource {
  name: string;
  /** Description texts keyed by string — presumably by originating source; confirm against the description resolver. */
  descriptions?: Record<string, string>;
  table?: string;
  sql?: string;
  inherits_columns_from?: string;
  grain: string[];
  columns: {
    name: string;
    type: string;
    role?: string;
    visibility?: string;
    descriptions?: Record<string, string>;
    expr?: string;
    natural_granularity?: string;
    constraints?: {
      dbt?: {
        not_null?: boolean;
        unique?: boolean;
      };
    };
    enum_values?: {
      dbt?: string[];
    };
    tests?: {
      dbt?: {
        name: string;
        package: string;
        kwargs?: Record<string, unknown>;
      }[];
      dbt_by_package?: Record<string, string[]>;
    };
  }[];
  joins: {
    to: string;
    on: string;
    relationship: string;
    alias?: string;
    source?: string;
  }[];
  measures: {
    name: string;
    expr: string;
    filter?: string;
    segments?: string[];
    description?: string;
  }[];
  segments?: {
    name: string;
    expr: string;
    description?: string;
  }[];
  default_time_dimension?: {
    dbt?: string;
  };
  tags?: {
    dbt?: string[];
  };
  freshness?: {
    dbt?: {
      raw?: unknown;
      loaded_at_field?: string | null;
    };
  };
}
|
||||
|
||||
/**
 * Request shape for a semantic-layer query.
 *
 * `measures` and `dimensions` are required; entries may be plain strings
 * or the object forms declared below. All other members are optional.
 */
export interface SemanticLayerQueryInput {
  measures: (string | { expr: string; name: string })[];
  dimensions: (string | { field: string; granularity?: string })[];
  filters?: string[];
  segments?: string[];
  order_by?: (string | { field: string; direction?: string })[];
  limit?: number;
  include_empty?: boolean;
}
|
||||
|
||||
/**
 * Result shape of an executed semantic-layer query: the SQL text, a list
 * of headers, rows as arrays of untyped cells, a row total and a
 * free-form plan object.
 */
export interface SemanticLayerQueryExecutionResult {
  sql: string;
  headers: string[];
  /** One inner array per row; cell types are not constrained. */
  rows: unknown[][];
  totalRows: number;
  plan: Record<string, unknown>;
}
|
||||
|
||||
/**
 * Why a search result matched. The four literals are the named reasons;
 * the trailing `string & {}` member lets any other string type-check too.
 */
export type SlSearchMatchReason =
  | 'lexical'
  | 'semantic'
  | 'dictionary'
  | 'token'
  | (string & {});
|
||||
|
||||
/**
 * A dictionary hit: a column name together with a list of values.
 */
export interface SlDictionaryMatch {
  column: string;
  values: string[];
  /** Presumably the number of further values left out of `values` — TODO confirm. */
  overflowCount?: number;
}
|
||||
|
||||
/**
 * Per-lane summary attached to search results: the lane's name, its
 * status, requested and effective candidate-pool limits, the number of
 * candidates returned, a weight and an optional free-text reason.
 */
export interface SlSearchLaneSummary {
  lane: string;
  status:
    | 'available'
    | 'skipped'
    | 'failed';
  requestedCandidatePoolLimit: number;
  effectiveCandidatePoolLimit: number;
  returnedCandidateCount: number;
  weight: number;
  reason?: string;
}
|
||||
|
||||
/**
 * Search metadata for a single result: a numeric score, the reasons it
 * matched, and optionally dictionary matches and per-lane summaries.
 */
export interface SlSearchMetadata {
  score: number;
  matchReasons: SlSearchMatchReason[];
  dictionaryMatches?: SlDictionaryMatch[];
  lanes?: SlSearchLaneSummary[];
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue