refactor(workspace): fold internal packages into cli

This commit is contained in:
Andrey Avtomonov 2026-05-21 03:27:33 +02:00
parent 8c2333cc15
commit ac3885b652
945 changed files with 517 additions and 2686 deletions

View file

@ -0,0 +1,9 @@
// Re-exports the agent runner port types and the RuntimeAgentRunner value from
// ../llm/runtime-port.js, plus the AgentTelemetryPort type from ../llm/ai-sdk-runtime.js.
export type {
AgentRunnerPort,
RunLoopParams,
RunLoopResult,
RunLoopStepInfo,
RunLoopStopReason,
} from '../llm/runtime-port.js';
export { RuntimeAgentRunner } from '../llm/runtime-port.js';
export type { AgentTelemetryPort } from '../llm/ai-sdk-runtime.js';

View file

@ -0,0 +1,27 @@
import { z } from 'zod';
/** Zod enum listing every connection type label; all labels are upper-case strings. */
export const connectionTypeSchema = z.enum([
'POSTGRESQL',
'SQLITE',
'SQLSERVER',
'BIGQUERY',
'SNOWFLAKE',
'CENTRALREACH',
'EPIC',
'CERNER',
'ATHENA',
'QUICKBOOKS',
'WORKDAY',
'REST',
'S3',
'SLACK',
'METABASE',
'LOOKER',
'NOTION',
'MYSQL',
'CLICKHOUSE',
'PLAIN',
'BETTERSTACK',
]);
/** Union of the string literals accepted by connectionTypeSchema. */
export type ConnectionType = z.infer<typeof connectionTypeSchema>;

View file

@ -0,0 +1,30 @@
import { describe, expect, it } from 'vitest';
import { getDialectForDriver } from './dialects.js';
// Verifies table-name formatting per driver and the error raised for unknown drivers.
describe('getDialectForDriver', () => {
// Each row is [driver, expected formatted table name].
it.each([
['postgres', '"public"."orders"'],
['postgresql', '"public"."orders"'],
['mysql', '`public`.`orders`'],
['clickhouse', '`public`.`orders`'],
['sqlite', '"orders"'],
['snowflake', '"analytics"."public"."orders"'],
['bigquery', '`analytics`.`public`.`orders`'],
['sqlserver', '[analytics].[public].[orders]'],
] as const)('formats table names for %s', (driver, expected) => {
const dialect = getDialectForDriver(driver);
expect(
dialect.formatTableName({
// A catalog is supplied only for snowflake, bigquery and sqlserver; sqlite gets no db part.
catalog: driver === 'snowflake' || driver === 'bigquery' || driver === 'sqlserver' ? 'analytics' : null,
db: driver === 'sqlite' ? null : 'public',
name: 'orders',
}),
).toBe(expected);
});
it('throws with a supported-driver list for unknown drivers', () => {
expect(() => getDialectForDriver('oracle')).toThrow(
'Unsupported warehouse driver "oracle". Supported drivers: bigquery, clickhouse, mysql, postgres, postgresql, sqlite, sqlite3, snowflake, sqlserver',
);
});
});

View file

@ -0,0 +1,102 @@
import type { KtxSchemaDimensionType, KtxTableRef } from '../scan/types.js';
/**
 * Driver names that have an entry in the `dialects` table.
 * Both spellings are listed for Postgres (`postgres`/`postgresql`) and SQLite (`sqlite`/`sqlite3`).
 */
export type SupportedDriver =
| 'postgres'
| 'postgresql'
| 'mysql'
| 'sqlserver'
| 'snowflake'
| 'bigquery'
| 'clickhouse'
| 'sqlite'
| 'sqlite3';
/** Per-driver SQL helpers: identifier quoting, table-name formatting and native-type classification. */
export interface KtxDialect {
/** Driver name this dialect was created for. */
readonly type: SupportedDriver;
/** Returns the identifier wrapped in this dialect's quote characters. */
quoteIdentifier(identifier: string): string;
/** Returns the quoted, dot-joined name for a table reference. */
formatTableName(table: KtxTableRef): string;
/** Maps a native column type name to a schema dimension type. */
mapToDimensionType(nativeType: string): KtxSchemaDimensionType;
}
// Driver names listed, in this order, in the getDialectForDriver error message.
const supportedDrivers: SupportedDriver[] = [
'bigquery',
'clickhouse',
'mysql',
'postgres',
'postgresql',
'sqlite',
'sqlite3',
'snowflake',
'sqlserver',
];
/** Wraps an identifier in double quotes, doubling every embedded double quote. */
function doubleQuoted(identifier: string): string {
  const escaped = identifier.split('"').join('""');
  return '"' + escaped + '"';
}
/** Wraps an identifier in backticks, doubling every embedded backtick. */
function backtickQuoted(identifier: string): string {
  const escaped = identifier.split('`').join('``');
  return '`' + escaped + '`';
}
/**
 * Wraps an identifier in backticks using backslash escapes.
 *
 * Both backticks and backslashes are escaped. Leaving a backslash unescaped would let a
 * trailing `\` escape the closing backtick and break out of the quoted identifier.
 */
function bigQueryQuoted(identifier: string): string {
  const escaped = identifier.replace(/[\\`]/g, (ch) => `\\${ch}`);
  return `\`${escaped}\``;
}
/** Wraps an identifier in square brackets, doubling every embedded closing bracket. */
function bracketQuoted(identifier: string): string {
  const escaped = identifier.split(']').join(']]');
  return '[' + escaped + ']';
}
/**
 * Classifies a native column type name as 'time', 'number', 'boolean' or 'string'
 * using case-insensitive substring matching.
 *
 * Three families are excluded up front because the substring markers misfire on them:
 * - enum types contain "num", and their value lists may contain "date" or "time";
 * - interval and point types contain "int".
 *
 * @param nativeType Type name as reported by the database.
 * @returns The inferred dimension type; 'string' when nothing matches.
 */
function inferDimensionType(nativeType: string): KtxSchemaDimensionType {
  const normalized = nativeType.toLowerCase().trim();
  // Must run before the date/time check: enum('date','time') is still a string column.
  if (normalized.includes('enum')) {
    return 'string';
  }
  if (normalized.includes('date') || normalized.includes('time')) {
    return 'time';
  }
  // "interval" and "point" contain "int" but are not numeric columns.
  if (normalized.includes('interval') || normalized.includes('point')) {
    return 'string';
  }
  const numericMarkers = ['int', 'num', 'dec', 'float', 'double', 'real'];
  if (numericMarkers.some((marker) => normalized.includes(marker))) {
    return 'number';
  }
  if (normalized.includes('bool') || normalized === 'bit') {
    return 'boolean';
  }
  return 'string';
}
/**
 * Builds a dot-joined, quoted table name.
 *
 * With `sqlite` set, only the table name is used. Otherwise catalog, db and name are
 * used in that order, skipping any part that is empty, null or undefined.
 */
function formatWithParts(table: KtxTableRef, quote: (identifier: string) => string, sqlite = false): string {
  const segments: string[] = [];
  if (sqlite) {
    segments.push(table.name);
  } else {
    for (const candidate of [table.catalog, table.db, table.name]) {
      if (candidate) {
        segments.push(candidate);
      }
    }
  }
  return segments.map(quote).join('.');
}
/**
 * Assembles a KtxDialect for a driver from its quoting function.
 *
 * @param type Driver name stored on the dialect.
 * @param quote Identifier quoting function, also used for every table-name part.
 * @param sqlite When true, formatted table names contain only the table name.
 */
function createDialect(type: SupportedDriver, quote: (identifier: string) => string, sqlite = false): KtxDialect {
  const dialect: KtxDialect = {
    type,
    quoteIdentifier: quote,
    formatTableName(table) {
      return formatWithParts(table, quote, sqlite);
    },
    mapToDimensionType: inferDimensionType,
  };
  return dialect;
}
// One dialect per supported driver name. Quoting styles used below: double quotes
// (postgres, postgresql, sqlite, sqlite3, snowflake), doubled backticks (mysql, clickhouse),
// backslash-escaped backticks (bigquery) and square brackets (sqlserver).
// The sqlite entries pass `true` so formatted table names contain only the table name.
const dialects: Record<SupportedDriver, KtxDialect> = {
postgres: createDialect('postgres', doubleQuoted),
postgresql: createDialect('postgresql', doubleQuoted),
mysql: createDialect('mysql', backtickQuoted),
clickhouse: createDialect('clickhouse', backtickQuoted),
sqlite: createDialect('sqlite', doubleQuoted, true),
sqlite3: createDialect('sqlite3', doubleQuoted, true),
snowflake: createDialect('snowflake', doubleQuoted),
bigquery: createDialect('bigquery', bigQueryQuoted),
sqlserver: createDialect('sqlserver', bracketQuoted),
};
/**
 * Looks up the dialect for a driver name, ignoring case and surrounding whitespace.
 *
 * @param driver Driver name from connection configuration.
 * @returns The matching dialect.
 * @throws Error listing the supported drivers when the name is not recognised.
 */
export function getDialectForDriver(driver: string): KtxDialect {
  const normalized = driver.toLowerCase().trim();
  // Own-property check: `in` would also accept inherited keys such as "constructor"
  // or "tostring" and return a non-dialect value instead of throwing.
  if (Object.prototype.hasOwnProperty.call(dialects, normalized)) {
    return dialects[normalized as SupportedDriver];
  }
  throw new Error(`Unsupported warehouse driver "${driver}". Supported drivers: ${supportedDrivers.join(', ')}`);
}

View file

@ -0,0 +1,30 @@
// Barrel file: re-exports the query executor ports and implementations, dialect lookup,
// read-only SQL helpers, connection type schema, local warehouse descriptor helpers and
// Notion connection config helpers from their sibling modules.
export type {
KtxSqlQueryExecutionInput,
KtxSqlQueryExecutionResult,
KtxSqlQueryExecutorPort,
} from './query-executor.js';
export type { KtxDialect, SupportedDriver } from './dialects.js';
export { createDefaultLocalQueryExecutor, type DefaultLocalQueryExecutorOptions } from './local-query-executor.js';
export { getDialectForDriver } from './dialects.js';
export { normalizeQueryRows } from './query-executor.js';
export { createPostgresQueryExecutor } from './postgres-query-executor.js';
export { assertReadOnlySql, limitSqlForExecution } from './read-only-sql.js';
export { createSqliteQueryExecutor, sqliteDatabasePathFromConnection } from './sqlite-query-executor.js';
export { connectionTypeSchema, type ConnectionType } from './connection-type.js';
export {
localConnectionInfoFromConfig,
localConnectionToWarehouseDescriptor,
localConnectionTypeForConfig,
type LocalConnectionInfo,
type LocalWarehouseDescriptor,
} from './local-warehouse-descriptor.js';
export {
KTX_NOTION_ORG_KNOWLEDGE_WARNING,
notionConnectionToPullConfig,
parseNotionConnectionConfig,
redactNotionConnectionConfig,
resolveNotionConnectionAuthToken,
resolveNotionAuthToken,
type KtxNotionConnectionConfig,
type RedactedKtxNotionConnectionConfig,
} from './notion-config.js';

View file

@ -0,0 +1,59 @@
import { describe, expect, it, vi } from 'vitest';
import { createDefaultLocalQueryExecutor } from './local-query-executor.js';
// Verifies that the default local executor routes by connection driver and rejects
// drivers it has no executor for.
describe('createDefaultLocalQueryExecutor', () => {
it('dispatches postgres and sqlite drivers to their executors', async () => {
// Each stub returns a distinct header so the test can tell which executor ran.
const postgres = {
execute: vi.fn(async () => ({
headers: ['pg'],
rows: [[1]],
totalRows: 1,
command: 'SELECT',
rowCount: 1,
})),
};
const sqlite = {
execute: vi.fn(async () => ({
headers: ['sqlite'],
rows: [[2]],
totalRows: 1,
command: 'SELECT',
rowCount: 1,
})),
};
const executor = createDefaultLocalQueryExecutor({ postgres, sqlite });
await expect(
executor.execute({
connectionId: 'pg',
connection: { driver: 'postgres' },
sql: 'select 1',
}),
).resolves.toMatchObject({ headers: ['pg'] });
await expect(
executor.execute({
connectionId: 'local',
connection: { driver: 'sqlite' },
sql: 'select 1',
}),
).resolves.toMatchObject({ headers: ['sqlite'] });
expect(postgres.execute).toHaveBeenCalledTimes(1);
expect(sqlite.execute).toHaveBeenCalledTimes(1);
});
it('rejects unsupported local execution drivers', async () => {
const executor = createDefaultLocalQueryExecutor({
postgres: { execute: vi.fn() },
sqlite: { execute: vi.fn() },
});
await expect(
executor.execute({
connectionId: 'warehouse',
connection: { driver: 'snowflake' },
sql: 'select 1',
}),
).rejects.toThrow('No local query executor is configured for driver "snowflake".');
});
});

View file

@ -0,0 +1,34 @@
import { createPostgresQueryExecutor } from './postgres-query-executor.js';
import type {
KtxSqlQueryExecutionInput,
KtxSqlQueryExecutionResult,
KtxSqlQueryExecutorPort,
} from './query-executor.js';
import { createSqliteQueryExecutor } from './sqlite-query-executor.js';
/** Optional executor overrides; createDefaultLocalQueryExecutor builds a default for any that is omitted. */
export interface DefaultLocalQueryExecutorOptions {
/** Executor used for the `postgres` and `postgresql` drivers. */
postgres?: KtxSqlQueryExecutorPort;
/** Executor used for the `sqlite` and `sqlite3` drivers. */
sqlite?: KtxSqlQueryExecutorPort;
}
/** Returns the connection's driver as a lower-cased string, or '' when it is missing. */
function driverFor(input: KtxSqlQueryExecutionInput): string {
  const rawDriver = input.connection?.driver ?? '';
  return String(rawDriver).toLowerCase();
}
/**
 * Creates an executor that routes each query to a Postgres or SQLite executor
 * based on the connection's driver.
 *
 * @param options Optional executor overrides; defaults are created for missing ones.
 * @returns An executor whose `execute` rejects for any driver other than
 *   postgres, postgresql, sqlite or sqlite3.
 */
export function createDefaultLocalQueryExecutor(options: DefaultLocalQueryExecutorOptions = {}): KtxSqlQueryExecutorPort {
  const pgExecutor = options.postgres ?? createPostgresQueryExecutor();
  const sqliteExecutor = options.sqlite ?? createSqliteQueryExecutor();
  const executorsByDriver = new Map<string, KtxSqlQueryExecutorPort>([
    ['postgres', pgExecutor],
    ['postgresql', pgExecutor],
    ['sqlite', sqliteExecutor],
    ['sqlite3', sqliteExecutor],
  ]);
  const execute = async (input: KtxSqlQueryExecutionInput): Promise<KtxSqlQueryExecutionResult> => {
    const target = executorsByDriver.get(driverFor(input));
    if (target === undefined) {
      throw new Error(`No local query executor is configured for driver "${input.connection?.driver ?? 'unknown'}".`);
    }
    return target.execute(input);
  };
  return { execute };
}

View file

@ -0,0 +1,71 @@
import { describe, expect, it } from 'vitest';
import {
localConnectionInfoFromConfig,
localConnectionToWarehouseDescriptor,
localConnectionTypeForConfig,
} from './local-warehouse-descriptor.js';
// Verifies descriptor mapping from local connection config: URL parsing for Postgres,
// explicit fields for BigQuery, and null for drivers that are not warehouses.
describe('localConnectionToWarehouseDescriptor', () => {
it('maps local Postgres URLs to canonical warehouse descriptors', () => {
expect(
localConnectionToWarehouseDescriptor('warehouse', {
driver: 'postgres',
url: 'postgresql://readonly@db.example.test/analytics',
}),
).toMatchObject({
id: 'warehouse',
connection_type: 'POSTGRESQL',
host: 'db.example.test',
database: 'analytics',
});
});
it('maps BigQuery project and dataset from explicit fields', () => {
expect(
localConnectionToWarehouseDescriptor('bq', {
driver: 'bigquery',
project_id: 'acme',
dataset_id: 'warehouse',
}),
).toMatchObject({
id: 'bq',
connection_type: 'BIGQUERY',
project_id: 'acme',
dataset_id: 'warehouse',
});
});
it('returns null for non-warehouse adapters', () => {
expect(
localConnectionToWarehouseDescriptor('looker', {
driver: 'looker',
base_url: 'https://looker.example.com',
client_id: 'client',
}),
).toBeNull();
});
});
// Verifies the connection-type label and connection-info helpers, including the
// fallbacks for non-warehouse drivers, a missing driver and a missing connection.
describe('local connection info helpers', () => {
it('returns canonical warehouse connection types for local catalogs', () => {
expect(localConnectionTypeForConfig('warehouse', { driver: 'postgres' })).toBe('POSTGRESQL');
expect(localConnectionTypeForConfig('bq', { driver: 'bigquery', project_id: 'acme' })).toBe('BIGQUERY');
expect(localConnectionTypeForConfig('snowflake', { driver: 'snowflake' })).toBe('SNOWFLAKE');
});
it('keeps non-warehouse adapter labels for display-only local connection surfaces', () => {
expect(localConnectionTypeForConfig('prod-metabase', { driver: 'metabase', api_url: 'https://metabase.example.com' })).toBe(
'metabase',
);
// A config without a driver falls back to the literal 'unknown'.
expect(localConnectionTypeForConfig('missing-driver', {} as never)).toBe('unknown');
});
it('builds nullable local connection info records', () => {
expect(localConnectionInfoFromConfig('warehouse', { driver: 'postgres' })).toEqual({
id: 'warehouse',
name: 'warehouse',
connectionType: 'POSTGRESQL',
});
expect(localConnectionInfoFromConfig('missing', undefined)).toBeNull();
});
});

View file

@ -0,0 +1,102 @@
import type { KtxProjectConnectionConfig } from '../project/config.js';
import type { ConnectionType } from './connection-type.js';
/** Warehouse description derived from a local connection config by localConnectionToWarehouseDescriptor. */
export interface LocalWarehouseDescriptor {
/** Connection id passed by the caller. */
id: string;
/** Connection type mapped from the config's driver. */
connection_type: ConnectionType;
/** Explicit `host` field, else the hostname parsed from `url`. */
host?: string | null;
/** Explicit `database` field, else the first path segment of `url`. */
database?: string | null;
/** Explicit `account` field. */
account?: string | null;
/** Explicit `project_id` field, else a value derived from `url` for BigQuery. */
project_id?: string | null;
/** Explicit `dataset_id` field, else a value derived from `url` for BigQuery. */
dataset_id?: string | null;
/** Shallow copy of the whole connection config. */
connection_params: Record<string, unknown>;
}
/** Summary record built by localConnectionInfoFromConfig. */
export interface LocalConnectionInfo {
/** Connection id. */
id: string;
/** Display name; localConnectionInfoFromConfig sets it to the id. */
name: string;
/** Connection type label, the trimmed driver name, or 'unknown'. */
connectionType: string;
}
// Maps lower-cased driver names to connection types. Aliases share a target:
// postgres/postgresql, sqlite/sqlite3 and sqlserver/mssql. `sqlite3` is included
// because the dialect table and the local query executor both accept that spelling.
const DRIVER_TO_CONNECTION_TYPE: Record<string, ConnectionType> = {
  postgres: 'POSTGRESQL',
  postgresql: 'POSTGRESQL',
  sqlite: 'SQLITE',
  sqlite3: 'SQLITE',
  sqlserver: 'SQLSERVER',
  mssql: 'SQLSERVER',
  mysql: 'MYSQL',
  clickhouse: 'CLICKHOUSE',
  snowflake: 'SNOWFLAKE',
  bigquery: 'BIGQUERY',
};
/**
 * Builds a warehouse descriptor from a local connection config.
 *
 * Host, database and BigQuery project/dataset are first derived from `url` (unless it
 * is an `env:` or `file:` reference or cannot be parsed), then overridden by the
 * explicit `host`, `database`, `project_id` and `dataset_id` fields when those are
 * non-empty strings.
 *
 * @param id Connection id copied onto the descriptor.
 * @param connection Connection config, or undefined when the connection is missing.
 * @returns The descriptor, or null when there is no config or the driver has no
 *   mapped connection type.
 */
export function localConnectionToWarehouseDescriptor(
  id: string,
  connection: KtxProjectConnectionConfig | undefined,
): LocalWarehouseDescriptor | null {
  if (!connection) {
    return null;
  }
  const driverKey = String(connection.driver ?? '').toLowerCase();
  // Own-property lookup so inherited keys such as "constructor" are not treated as drivers.
  const connectionType = Object.prototype.hasOwnProperty.call(DRIVER_TO_CONNECTION_TYPE, driverKey)
    ? DRIVER_TO_CONNECTION_TYPE[driverKey]
    : undefined;
  if (!connectionType) {
    return null;
  }
  const info: LocalWarehouseDescriptor = {
    id,
    connection_type: connectionType,
    connection_params: { ...connection },
  };
  const url = typeof connection.url === 'string' ? connection.url : null;
  if (url && !url.startsWith('env:') && !url.startsWith('file:')) {
    try {
      const parsed = new URL(url);
      info.host = parsed.hostname || null;
      const [first, second] = parsed.pathname.slice(1).split('/');
      if (connectionType === 'BIGQUERY') {
        // URL.hostname is always a string (possibly empty), so empty parts are dropped
        // instead of relying on `??`. The project is the first non-empty part of
        // host/path and the dataset is the part that follows it.
        const [projectId, datasetId] = [parsed.hostname, first, second].filter((part): part is string => !!part);
        info.project_id = projectId ?? null;
        info.dataset_id = datasetId ?? null;
      } else if (parsed.pathname.length > 1) {
        info.database = first ?? null;
      }
    } catch {
      // Unparseable URL: nothing is derived from it; the explicit fields below still apply.
    }
  }
  // Explicit config fields always win over values derived from the URL.
  info.host = stringField(connection.host) ?? info.host ?? null;
  info.database = stringField(connection.database) ?? info.database ?? null;
  info.account = stringField(connection.account) ?? null;
  info.project_id = stringField(connection.project_id) ?? info.project_id ?? null;
  info.dataset_id = stringField(connection.dataset_id) ?? info.dataset_id ?? null;
  return info;
}
/**
 * Returns a connection type label for a local connection.
 *
 * Uses the warehouse connection type when the driver maps to one; otherwise the trimmed
 * driver string, or 'unknown' when the driver is missing, blank or not a string.
 */
export function localConnectionTypeForConfig(id: string, connection: KtxProjectConnectionConfig | undefined): string {
  const warehouse = localConnectionToWarehouseDescriptor(id, connection);
  if (warehouse !== null) {
    return warehouse.connection_type;
  }
  const rawDriver = connection?.driver;
  if (typeof rawDriver !== 'string') {
    return 'unknown';
  }
  const label = rawDriver.trim();
  return label === '' ? 'unknown' : label;
}
/**
 * Builds a LocalConnectionInfo for a connection, using the id as its name.
 *
 * @returns The info record, or null when no connection config is given.
 */
export function localConnectionInfoFromConfig(
  id: string,
  connection: KtxProjectConnectionConfig | undefined,
): LocalConnectionInfo | null {
  return connection
    ? { id, name: id, connectionType: localConnectionTypeForConfig(id, connection) }
    : null;
}
/** Returns the trimmed string, or null when the value is not a string or is blank. */
function stringField(value: unknown): string | null {
  if (typeof value !== 'string') {
    return null;
  }
  const trimmed = value.trim();
  return trimmed.length === 0 ? null : trimmed;
}

View file

@ -0,0 +1,157 @@
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import {
notionConnectionToPullConfig,
parseNotionConnectionConfig,
redactNotionConnectionConfig,
resolveNotionAuthToken,
} from './notion-config.js';
// Covers parsing, redaction, token resolution and pull-config conversion for
// standalone Notion connection configs.
describe('standalone Notion connection config', () => {
// Fresh temp directory per test; used by the file-based token reference test.
let tempDir: string;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-notion-config-'));
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('parses selected-root Notion config with safe defaults', () => {
const parsed = parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'env:NOTION_TOKEN',
crawl_mode: 'selected_roots',
root_page_ids: ['page-1'],
});
expect(parsed).toEqual({
driver: 'notion',
auth_token: null,
auth_token_ref: 'env:NOTION_TOKEN',
crawl_mode: 'selected_roots',
root_page_ids: ['page-1'],
root_database_ids: [],
root_data_source_ids: [],
max_pages_per_run: 1000,
max_knowledge_creates_per_run: 25,
max_knowledge_updates_per_run: 20,
});
expect(parsed).not.toHaveProperty('last_successful_cursor');
});
it('parses inline Notion auth tokens without requiring auth_token_ref', () => {
const parsed = parseNotionConnectionConfig({
driver: 'notion',
// Surrounding whitespace is expected to be trimmed by the parser.
auth_token: ' ntn_inline_token ',
crawl_mode: 'selected_roots',
root_page_ids: ['page-1'],
});
expect(parsed).toMatchObject({
driver: 'notion',
auth_token: 'ntn_inline_token',
auth_token_ref: null,
crawl_mode: 'selected_roots',
root_page_ids: ['page-1'],
});
});
it('redacts token references from display output', () => {
expect(
redactNotionConnectionConfig(
parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'file:/Users/example/.config/notion-token',
crawl_mode: 'all_accessible',
max_pages_per_run: 80,
}),
),
).toEqual({
driver: 'notion',
hasAuthToken: true,
crawlMode: 'all_accessible',
rootPageIds: [],
rootDatabaseIds: [],
rootDataSourceIds: [],
maxPagesPerRun: 80,
maxKnowledgeCreatesPerRun: 25,
maxKnowledgeUpdatesPerRun: 20,
warning: 'Anything accessible to this Notion integration can become organization knowledge.',
});
});
it('requires at least one selected root in selected_roots mode', () => {
expect(() =>
parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'env:NOTION_TOKEN',
crawl_mode: 'selected_roots',
}),
).toThrow('selected_roots requires at least one root page, database, or data source id');
});
it('resolves env and file token references without exposing the reference in errors', async () => {
const tokenPath = join(tempDir, 'notion-token.txt');
// The trailing newline checks that file contents are trimmed.
await writeFile(tokenPath, 'ntn_file_token\n', 'utf-8');
await expect(
resolveNotionAuthToken('env:NOTION_TOKEN', {
env: { NOTION_TOKEN: 'ntn_env_token' },
}),
).resolves.toBe('ntn_env_token');
await expect(resolveNotionAuthToken(`file:${tokenPath}`)).resolves.toBe('ntn_file_token');
await expect(resolveNotionAuthToken('env:MISSING_NOTION_TOKEN', { env: {} })).rejects.toThrow(
'Notion token environment variable MISSING_NOTION_TOKEN is not set',
);
});
it('converts standalone config into adapter pull config', async () => {
const pullConfig = await notionConnectionToPullConfig(
parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'env:NOTION_TOKEN',
crawl_mode: 'all_accessible',
max_pages_per_run: 12,
max_knowledge_creates_per_run: 2,
max_knowledge_updates_per_run: 7,
// A cursor in the raw config is expected to be ignored (see lastSuccessfulCursor below).
last_successful_cursor: '{"phase":"all_accessible_pages","cursor":"cursor-1"}',
}),
{ env: { NOTION_TOKEN: 'ntn_env_token' } },
);
expect(pullConfig).toEqual({
authToken: 'ntn_env_token',
crawlMode: 'all_accessible',
rootPageIds: [],
rootDatabaseIds: [],
rootDataSourceIds: [],
maxPagesPerRun: 12,
maxKnowledgeCreatesPerRun: 2,
maxKnowledgeUpdatesPerRun: 7,
lastSuccessfulCursor: null,
});
});
it('uses inline Notion auth_token when building adapter pull config', async () => {
const pullConfig = await notionConnectionToPullConfig(
parseNotionConnectionConfig({
driver: 'notion',
auth_token: 'ntn_inline_token',
auth_token_ref: 'env:STALE_NOTION_TOKEN',
crawl_mode: 'all_accessible',
}),
{
// Empty env and a throwing reader prove the reference is never resolved.
env: {},
readTextFile: async () => {
throw new Error('readTextFile should not be called for inline auth_token');
},
},
);
expect(pullConfig.authToken).toBe('ntn_inline_token');
});
});

View file

@ -0,0 +1,221 @@
import { readFile } from 'node:fs/promises';
import { homedir } from 'node:os';
import { resolve } from 'node:path';
import {
NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN,
type NotionPullConfig,
notionPullConfigSchema,
} from '../ingest/adapters/notion/types.js';
import type { KtxProjectConnectionConfig } from '../project/config.js';
/** Warning text attached to every redacted Notion connection config. */
export const KTX_NOTION_ORG_KNOWLEDGE_WARNING =
'Anything accessible to this Notion integration can become organization knowledge.';
// The two crawl modes accepted by parseNotionConnectionConfig.
type KtxNotionCrawlMode = 'all_accessible' | 'selected_roots';
// The member of the project connection config union whose driver is 'notion'.
type RawKtxNotionConnectionConfig = Extract<KtxProjectConnectionConfig, { driver: 'notion' }>;
/**
 * Notion connection config as returned by parseNotionConnectionConfig.
 *
 * The fields omitted from the raw config type are redeclared below as required:
 * tokens are `string | null`, root id lists are arrays, and limits are numbers.
 */
export type KtxNotionConnectionConfig = Omit<
RawKtxNotionConnectionConfig,
| 'auth_token'
| 'auth_token_ref'
| 'crawl_mode'
| 'root_page_ids'
| 'root_database_ids'
| 'root_data_source_ids'
| 'max_pages_per_run'
| 'max_knowledge_creates_per_run'
| 'max_knowledge_updates_per_run'
> & {
driver: 'notion';
auth_token: string | null;
auth_token_ref: string | null;
crawl_mode: KtxNotionCrawlMode;
root_page_ids: string[];
root_database_ids: string[];
root_data_source_ids: string[];
max_pages_per_run: number;
max_knowledge_creates_per_run: number;
max_knowledge_updates_per_run: number;
};
/**
 * Camel-cased view of a Notion connection config produced by redactNotionConnectionConfig.
 * It carries only a `hasAuthToken` flag, not the token or its reference.
 */
export interface RedactedKtxNotionConnectionConfig {
driver: 'notion';
/** True when the config has an inline token or a token reference. */
hasAuthToken: boolean;
crawlMode: KtxNotionCrawlMode;
rootPageIds: string[];
rootDatabaseIds: string[];
rootDataSourceIds: string[];
maxPagesPerRun: number;
maxKnowledgeCreatesPerRun: number;
maxKnowledgeUpdatesPerRun: number;
/** Always the KTX_NOTION_ORG_KNOWLEDGE_WARNING text. */
warning: typeof KTX_NOTION_ORG_KNOWLEDGE_WARNING;
}
// Overrides for token resolution; the resolver falls back to process.env and
// node's readFile (utf-8) when a field is omitted.
interface ResolveNotionTokenOptions {
// Environment lookup table used for `env:` references.
env?: Record<string, string | undefined>;
// File reader used for `file:` references.
readTextFile?: (path: string) => Promise<string>;
}
/** Type guard: true for non-null objects that are not arrays. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== 'object') {
    return false;
  }
  return !Array.isArray(value);
}
/**
 * Returns the value typed as a record.
 *
 * @throws Error when the value is not a non-array object.
 */
function record(value: unknown): Record<string, unknown> {
  if (isRecord(value)) {
    return value;
  }
  throw new Error('Notion connection config must be an object');
}
/** Returns the trimmed string when it is non-blank; otherwise the fallback. */
function stringValue(value: unknown, fallback: string): string {
  if (typeof value === 'string') {
    const trimmed = value.trim();
    if (trimmed.length > 0) {
      return trimmed;
    }
  }
  return fallback;
}
/** Returns the trimmed string when it is non-blank; otherwise null. */
function optionalString(value: unknown): string | null {
  const trimmed = typeof value === 'string' ? value.trim() : '';
  if (trimmed.length > 0) {
    return trimmed;
  }
  return null;
}
/**
 * Extracts the non-blank string entries of an array, trimmed.
 *
 * Entries are trimmed so that ids carry no surrounding whitespace, matching how the
 * other string fields of the config are normalised.
 *
 * @returns The trimmed entries, or [] when the value is not an array.
 */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const items: string[] = [];
  for (const item of value) {
    if (typeof item !== 'string') {
      continue;
    }
    const trimmed = item.trim();
    if (trimmed.length > 0) {
      items.push(trimmed);
    }
  }
  return items;
}
/**
 * Returns the fallback for undefined or null, otherwise the value when it is an integer.
 *
 * @throws Error naming the field when the value is present but not an integer.
 */
function integerWithFallback(value: unknown, fallback: number, name: string): number {
  if (value == null) {
    return fallback;
  }
  if (typeof value === 'number' && Number.isInteger(value)) {
    return value;
  }
  throw new Error(`${name} must be an integer`);
}
/**
 * Resolves an integer via integerWithFallback and checks it against an inclusive range.
 *
 * @throws Error naming the field when the resolved value is below `min` or above `max`.
 */
function boundedInteger(value: unknown, fallback: number, name: string, min: number, max: number): number {
  const resolved = integerWithFallback(value, fallback, name);
  const belowMin = resolved < min;
  const aboveMax = resolved > max;
  if (belowMin || aboveMax) {
    throw new Error(`${name} must be between ${min} and ${max}`);
  }
  return resolved;
}
/**
 * Validates a raw Notion connection config and fills in defaults.
 *
 * Checks run in this order: object shape, `driver`, presence of a token or token
 * reference, reference scheme (`env:` or `file:`), crawl mode, at least one root id
 * in `selected_roots` mode, then the three per-run limits.
 *
 * @param raw Untyped connection config.
 * @returns The normalised config; `crawl_mode` defaults to 'selected_roots'.
 * @throws Error describing the first check that fails.
 */
export function parseNotionConnectionConfig(raw: unknown): KtxNotionConnectionConfig {
  const source = record(raw);
  if (source.driver !== 'notion') {
    throw new Error('Notion connection config requires driver: notion');
  }

  const inlineToken = optionalString(source.auth_token);
  const tokenRef = optionalString(source.auth_token_ref);
  if (!(inlineToken || tokenRef)) {
    throw new Error('Notion connection config requires auth_token or auth_token_ref');
  }
  const hasKnownScheme = (ref: string): boolean => ref.startsWith('env:') || ref.startsWith('file:');
  if (tokenRef && !hasKnownScheme(tokenRef)) {
    throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
  }

  const crawlMode = stringValue(source.crawl_mode, 'selected_roots');
  if (crawlMode !== 'selected_roots' && crawlMode !== 'all_accessible') {
    throw new Error(`Unsupported Notion crawl_mode: ${crawlMode}`);
  }

  const pageRoots = stringArray(source.root_page_ids);
  const databaseRoots = stringArray(source.root_database_ids);
  const dataSourceRoots = stringArray(source.root_data_source_ids);
  const rootCount = pageRoots.length + databaseRoots.length + dataSourceRoots.length;
  if (crawlMode === 'selected_roots' && rootCount === 0) {
    throw new Error('selected_roots requires at least one root page, database, or data source id');
  }

  // Limits are validated in the same order as they appear in the result.
  const maxPages = boundedInteger(source.max_pages_per_run, 1000, 'max_pages_per_run', 1, 10_000);
  const maxCreates = boundedInteger(
    source.max_knowledge_creates_per_run,
    NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN,
    'max_knowledge_creates_per_run',
    0,
    25,
  );
  const maxUpdates = boundedInteger(source.max_knowledge_updates_per_run, 20, 'max_knowledge_updates_per_run', 0, 100);

  return {
    driver: 'notion',
    auth_token: inlineToken,
    auth_token_ref: tokenRef,
    crawl_mode: crawlMode,
    root_page_ids: pageRoots,
    root_database_ids: databaseRoots,
    root_data_source_ids: dataSourceRoots,
    max_pages_per_run: maxPages,
    max_knowledge_creates_per_run: maxCreates,
    max_knowledge_updates_per_run: maxUpdates,
  };
}
/**
 * Converts a parsed Notion config into its camel-cased display form.
 *
 * The token and token reference are replaced by a `hasAuthToken` flag, and the
 * organization-knowledge warning is attached.
 */
export function redactNotionConnectionConfig(config: KtxNotionConnectionConfig): RedactedKtxNotionConnectionConfig {
  const credential = config.auth_token ?? config.auth_token_ref;
  const redacted: RedactedKtxNotionConnectionConfig = {
    driver: 'notion',
    hasAuthToken: !!credential,
    crawlMode: config.crawl_mode,
    rootPageIds: config.root_page_ids,
    rootDatabaseIds: config.root_database_ids,
    rootDataSourceIds: config.root_data_source_ids,
    maxPagesPerRun: config.max_pages_per_run,
    maxKnowledgeCreatesPerRun: config.max_knowledge_creates_per_run,
    maxKnowledgeUpdatesPerRun: config.max_knowledge_updates_per_run,
    warning: KTX_NOTION_ORG_KNOWLEDGE_WARNING,
  };
  return redacted;
}
/** Resolves `~` or a leading `~/` against the user's home directory; other paths are returned unchanged. */
function expandHome(path: string): string {
  const refersToHome = path === '~' || path.startsWith('~/');
  if (!refersToHome) {
    return path;
  }
  return resolve(homedir(), path.slice(2));
}
/**
 * Resolves a Notion token reference to the token value.
 *
 * Supported forms are `env:NAME` (read from `options.env` or `process.env`) and
 * `file:/path` (read via `options.readTextFile` or node's `readFile`, with `~`
 * expanded). The value is trimmed before the emptiness check in both branches, so a
 * whitespace-only environment variable is rejected instead of yielding an empty token.
 *
 * @param authTokenRef Token reference string.
 * @param options Optional environment and file-reader overrides.
 * @returns The trimmed, non-empty token.
 * @throws Error when the variable is unset or blank, the file is empty, or the
 *   reference uses neither scheme.
 */
export async function resolveNotionAuthToken(
  authTokenRef: string,
  options: ResolveNotionTokenOptions = {},
): Promise<string> {
  if (authTokenRef.startsWith('env:')) {
    const envName = authTokenRef.slice('env:'.length);
    const value = ((options.env ?? process.env)[envName] ?? '').trim();
    if (!value) {
      throw new Error(`Notion token environment variable ${envName} is not set`);
    }
    return value;
  }
  if (authTokenRef.startsWith('file:')) {
    const path = expandHome(authTokenRef.slice('file:'.length));
    const readTextFile = options.readTextFile ?? ((filePath: string) => readFile(filePath, 'utf-8'));
    const value = (await readTextFile(path)).trim();
    if (!value) {
      throw new Error(`Notion token file is empty: ${path}`);
    }
    return value;
  }
  throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
}
/**
 * Returns the config's inline token when it is set; otherwise resolves its token reference.
 *
 * A missing reference is passed on as '', which resolveNotionAuthToken rejects.
 */
export async function resolveNotionConnectionAuthToken(
  config: Pick<KtxNotionConnectionConfig, 'auth_token' | 'auth_token_ref'>,
  options: ResolveNotionTokenOptions = {},
): Promise<string> {
  const inlineToken = config.auth_token;
  if (inlineToken !== null && inlineToken !== undefined) {
    return inlineToken;
  }
  const reference = config.auth_token_ref ?? '';
  return resolveNotionAuthToken(reference, options);
}
/**
 * Converts a parsed Notion connection config into a pull config.
 *
 * The auth token is resolved first, the remaining fields are copied under camel-cased
 * names, and `lastSuccessfulCursor` is always null. The result is passed through
 * notionPullConfigSchema.parse.
 */
export async function notionConnectionToPullConfig(
  config: KtxNotionConnectionConfig,
  options: ResolveNotionTokenOptions = {},
): Promise<NotionPullConfig> {
  const candidate = {
    authToken: await resolveNotionConnectionAuthToken(config, options),
    crawlMode: config.crawl_mode,
    rootPageIds: config.root_page_ids,
    rootDatabaseIds: config.root_database_ids,
    rootDataSourceIds: config.root_data_source_ids,
    maxPagesPerRun: config.max_pages_per_run,
    maxKnowledgeCreatesPerRun: config.max_knowledge_creates_per_run,
    maxKnowledgeUpdatesPerRun: config.max_knowledge_updates_per_run,
    lastSuccessfulCursor: null,
  };
  return notionPullConfigSchema.parse(candidate);
}

View file

@ -0,0 +1,103 @@
import { describe, expect, it, vi } from 'vitest';
import { createPostgresQueryExecutor } from './postgres-query-executor.js';
// Builds a stub Postgres client plus the list of inputs passed to `query`.
// BEGIN READ ONLY and COMMIT get empty results; any other query returns two
// array-mode rows with `status` and `order_count` fields.
function makeClient() {
const calls: unknown[] = [];
const client = {
connect: vi.fn(async () => undefined),
query: vi.fn(async (input: unknown) => {
calls.push(input);
if (input === 'BEGIN READ ONLY') {
return { rows: [], fields: [], rowCount: null, command: 'BEGIN' };
}
if (input === 'COMMIT') {
return { rows: [], fields: [], rowCount: null, command: 'COMMIT' };
}
return {
rows: [
['paid', 2],
['open', 1],
],
fields: [{ name: 'status' }, { name: 'order_count' }],
rowCount: 2,
command: 'SELECT',
};
}),
end: vi.fn(async () => undefined),
};
return { client, calls };
}
// Verifies the Postgres executor's transaction sequence, failure cleanup and
// url validation, using stub clients injected through `clientFactory`.
describe('createPostgresQueryExecutor', () => {
it('runs a read-only transaction in array row mode and closes the client', async () => {
const { client, calls } = makeClient();
const executor = createPostgresQueryExecutor({
clientFactory: vi.fn(() => client),
});
const result = await executor.execute({
connectionId: 'warehouse',
connection: { driver: 'postgres', url: 'postgres://example/db' },
sql: 'select status, count(*) as order_count from public.orders group by status',
maxRows: 50,
});
expect(client.connect).toHaveBeenCalledTimes(1);
// Expected order: BEGIN READ ONLY, the limited query, COMMIT.
expect(calls[0]).toBe('BEGIN READ ONLY');
expect(calls[1]).toEqual({
text: 'select * from (select status, count(*) as order_count from public.orders group by status) as ktx_query_result limit 50',
rowMode: 'array',
});
expect(calls[2]).toBe('COMMIT');
expect(client.end).toHaveBeenCalledTimes(1);
expect(result).toEqual({
headers: ['status', 'order_count'],
rows: [
['paid', 2],
['open', 1],
],
totalRows: 2,
command: 'SELECT',
rowCount: 2,
});
});
it('rolls back and closes the client when query execution fails', async () => {
// BEGIN and ROLLBACK succeed; every other query throws.
const client = {
connect: vi.fn(async () => undefined),
query: vi.fn(async (input: unknown) => {
if (input === 'BEGIN READ ONLY' || input === 'ROLLBACK') {
return { rows: [], fields: [], rowCount: null, command: String(input) };
}
throw new Error('syntax error');
}),
end: vi.fn(async () => undefined),
};
const executor = createPostgresQueryExecutor({
clientFactory: vi.fn(() => client),
});
await expect(
executor.execute({
connectionId: 'warehouse',
connection: { driver: 'postgres', url: 'postgres://example/db' },
sql: 'select * from broken',
maxRows: 10,
}),
).rejects.toThrow('syntax error');
expect(client.query).toHaveBeenCalledWith('ROLLBACK');
expect(client.end).toHaveBeenCalledTimes(1);
});
it('requires a Postgres url', async () => {
const executor = createPostgresQueryExecutor({ clientFactory: vi.fn() });
await expect(
executor.execute({
connectionId: 'warehouse',
connection: { driver: 'postgres' },
sql: 'select 1',
}),
).rejects.toThrow('Local Postgres execution requires connections.warehouse.url');
});
});

View file

@ -0,0 +1,78 @@
import { Client, type ClientConfig } from 'pg';
import type {
KtxSqlQueryExecutionInput,
KtxSqlQueryExecutionResult,
KtxSqlQueryExecutorPort,
} from './query-executor.js';
import { limitSqlForExecution } from './read-only-sql.js';
// The subset of a Postgres client that the executor calls; lets tests inject a stub.
interface PgClientLike {
connect(): Promise<unknown>;
// Accepts a plain SQL string or an array-row-mode query config.
query(input: string | { text: string; rowMode: 'array' }): Promise<{
fields: Array<{ name: string }>;
rows: unknown[][];
command: string;
rowCount: number | null;
}>;
end(): Promise<void>;
}
// Options for createPostgresQueryExecutor.
interface PostgresQueryExecutorOptions {
// Passed to the client as statement_timeout; defaults to 30_000.
statementTimeoutMs?: number;
// Passed to the client as query_timeout; defaults to 35_000.
queryTimeoutMs?: number;
// Passed to the client as connectionTimeoutMillis; defaults to 5_000.
connectionTimeoutMs?: number;
// Client constructor override; defaults to creating a `pg` Client.
clientFactory?: (config: ClientConfig) => PgClientLike;
}
/** Returns the connection's driver as a lower-cased string, or '' when it is missing. */
function connectionDriver(input: KtxSqlQueryExecutionInput): string {
  const configured = input.connection?.driver;
  return String(configured ?? '').toLowerCase();
}
// Default client factory: constructs a `pg` Client from the given config.
function createDefaultClient(config: ClientConfig): PgClientLike {
return new Client(config);
}
/**
 * Creates a query executor that runs one statement against Postgres inside a
 * `BEGIN READ ONLY` transaction, in array row mode.
 *
 * `execute` checks the driver, the connection url and the SQL before any client is
 * created, so a rejected statement never opens a database connection. On failure
 * after connecting it attempts a ROLLBACK and rethrows; the client is always ended.
 *
 * @param options Timeouts and an optional client factory (used by tests).
 * @returns An executor whose `execute` resolves to headers, rows and command metadata.
 */
export function createPostgresQueryExecutor(options: PostgresQueryExecutorOptions = {}): KtxSqlQueryExecutorPort {
  const clientFactory = options.clientFactory ?? createDefaultClient;
  return {
    async execute(input: KtxSqlQueryExecutionInput): Promise<KtxSqlQueryExecutionResult> {
      const driver = connectionDriver(input);
      const connection = input.connection;
      if (driver !== 'postgres' && driver !== 'postgresql') {
        throw new Error(`Local Postgres execution cannot run driver "${connection?.driver ?? 'unknown'}".`);
      }
      if (typeof connection?.url !== 'string' || connection.url.trim().length === 0) {
        throw new Error(`Local Postgres execution requires connections.${input.connectionId}.url.`);
      }
      // Validate and limit the SQL up front: invalid statements fail without connecting.
      const text = limitSqlForExecution(input.sql, input.maxRows);
      const client = clientFactory({
        connectionString: connection.url,
        statement_timeout: options.statementTimeoutMs ?? 30_000,
        query_timeout: options.queryTimeoutMs ?? 35_000,
        connectionTimeoutMillis: options.connectionTimeoutMs ?? 5_000,
        application_name: 'ktx-local-query',
      });
      await client.connect();
      try {
        await client.query('BEGIN READ ONLY');
        const result = await client.query({ text, rowMode: 'array' });
        await client.query('COMMIT');
        return {
          headers: result.fields.map((field) => field.name),
          rows: result.rows,
          totalRows: result.rows.length,
          command: result.command,
          rowCount: result.rowCount,
        };
      } catch (error) {
        // Best-effort rollback: the original failure is what the caller needs to see.
        await client.query('ROLLBACK').catch(() => undefined);
        throw error;
      } finally {
        await client.end();
      }
    },
  };
}

View file

@ -0,0 +1,25 @@
import type { KtxProjectConnectionConfig } from '../project/index.js';
/** Input for a single SQL execution through a KtxSqlQueryExecutorPort. */
export interface KtxSqlQueryExecutionInput {
/** Id of the connection; used in error messages. */
connectionId: string;
projectDir?: string;
/** Connection config; executors read its `driver` and driver-specific fields. */
connection: KtxProjectConnectionConfig | undefined;
/** SQL text to execute. */
sql: string;
/** Optional row limit. */
maxRows?: number;
}
/** Tabular result of a SQL execution. */
export interface KtxSqlQueryExecutionResult {
/** Column names. */
headers: string[];
/** Rows as arrays of cell values. */
rows: unknown[][];
totalRows: number;
command: string;
rowCount: number | null;
}
/** Port implemented by SQL query executors. */
export interface KtxSqlQueryExecutorPort {
/** Runs the query described by `input` and resolves with its result. */
execute(input: KtxSqlQueryExecutionInput): Promise<KtxSqlQueryExecutionResult>;
}
/**
 * Converts each row to an array of cell values.
 *
 * Array rows are returned as-is; any other row is replaced by `Object.values` of it.
 */
export function normalizeQueryRows(rows: unknown[]): unknown[][] {
  const toCells = (row: unknown): unknown[] => {
    if (Array.isArray(row)) {
      return row;
    }
    return Object.values(row as Record<string, unknown>);
  };
  return rows.map(toCells);
}

View file

@ -0,0 +1,30 @@
import { describe, expect, it } from 'vitest';
import { assertReadOnlySql, limitSqlForExecution } from './read-only-sql.js';
// assertReadOnlySql: accepts statements that start with SELECT/WITH and rejects
// everything else with a fixed error message.
describe('assertReadOnlySql', () => {
it('allows select and with queries', () => {
expect(assertReadOnlySql('select * from orders')).toBe('select * from orders');
expect(assertReadOnlySql('with paid as (select * from orders) select * from paid')).toContain('with paid');
});
it('rejects mutating statements before opening a database connection', () => {
// Both DML (delete) and DDL (create) must be refused.
expect(() => assertReadOnlySql('delete from orders')).toThrow(
'Only read-only SELECT/WITH queries can be executed locally',
);
expect(() => assertReadOnlySql('create table x(id int)')).toThrow(
'Only read-only SELECT/WITH queries can be executed locally',
);
});
});
// limitSqlForExecution: strips trailing semicolons and, when a row cap is given,
// wraps the statement in an outer SELECT with a LIMIT clause.
describe('limitSqlForExecution', () => {
it('wraps compiled SQL and strips trailing semicolons', () => {
expect(limitSqlForExecution('select * from public.orders; ', 25)).toBe(
'select * from (select * from public.orders) as ktx_query_result limit 25',
);
});
it('returns the trimmed SQL when no maxRows value is provided', () => {
expect(limitSqlForExecution('select * from orders; ', undefined)).toBe('select * from orders');
});
});

View file

@ -0,0 +1,22 @@
/**
 * Leading keywords of statements that modify data, schema or privileges.
 * Anchored at the start of the statement, like READ_SQL.
 */
const MUTATING_SQL =
  /^\s*(insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh)\b/i;

/** A statement is considered readable only when it starts with SELECT or WITH. */
const READ_SQL = /^\s*(select|with)\b/i;

/**
 * Verifies that a statement looks like a read-only query and returns it trimmed.
 *
 * This is a leading-keyword check, not a SQL parser: it does not inspect anything
 * after the first keyword (for example additional statements or data-modifying
 * CTEs). Executors should still open their connection in a read-only mode.
 *
 * @param sql SQL text to validate.
 * @returns The SQL with surrounding whitespace removed.
 * @throws Error when the statement does not start with SELECT or WITH.
 */
export function assertReadOnlySql(sql: string): string {
  const trimmed = sql.trim();
  if (!READ_SQL.test(trimmed) || MUTATING_SQL.test(trimmed)) {
    throw new Error('Only read-only SELECT/WITH queries can be executed locally.');
  }
  return trimmed;
}

/**
 * Prepares a validated read-only statement for execution, optionally capping rows.
 *
 * Trailing semicolons are removed. When `maxRows` is provided the statement is
 * wrapped as a sub-select with an outer `limit`.
 *
 * @param sql SQL text; must pass {@link assertReadOnlySql}.
 * @param maxRows Row cap. Omit (undefined) for no cap.
 * @returns The executable SQL string.
 * @throws Error when the SQL is not read-only, or when `maxRows` is provided but is
 *   not a positive integer (including 0 and NaN).
 */
export function limitSqlForExecution(sql: string, maxRows: number | undefined): string {
  const trimmed = assertReadOnlySql(sql).replace(/;+\s*$/, '');
  // Only an absent value means "no limit". A falsy check would let 0 and NaN skip
  // the validation below and silently run the query uncapped.
  if (maxRows == null) {
    return trimmed;
  }
  if (!Number.isInteger(maxRows) || maxRows <= 0) {
    throw new Error('maxRows must be a positive integer.');
  }
  // A trailing `--` line comment would otherwise comment out the closing parenthesis
  // and the limit clause, so close the sub-select on a new line in that case.
  const closeSubquery = trimmed.includes('--') ? '\n)' : ')';
  return `select * from (${trimmed}${closeSubquery} as ktx_query_result limit ${maxRows}`;
}

View file

@ -0,0 +1,139 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import Database from 'better-sqlite3';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { createSqliteQueryExecutor, sqliteDatabasePathFromConnection } from './sqlite-query-executor.js';
// Exercises the SQLite executor and its path resolution against a real database file
// created in a fresh temp directory for every test.
describe('createSqliteQueryExecutor', () => {
let tempDir: string;
let dbPath: string;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-sqlite-query-'));
dbPath = join(tempDir, 'warehouse.db');
// Seed three orders: two 'paid' and one 'open'.
const db = new Database(dbPath);
db.exec(`
CREATE TABLE orders (
id INTEGER PRIMARY KEY,
status TEXT NOT NULL,
amount INTEGER NOT NULL
);
INSERT INTO orders (status, amount) VALUES
('paid', 20),
('paid', 30),
('open', 10);
`);
db.close();
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('executes read-only SELECT SQL against a relative SQLite path', async () => {
const executor = createSqliteQueryExecutor();
// 'warehouse.db' is relative and must be resolved against projectDir.
const result = await executor.execute({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', path: 'warehouse.db' },
sql: 'select status, count(*) as order_count from orders group by status order by status',
maxRows: 10,
});
expect(result).toEqual({
headers: ['status', 'order_count'],
rows: [
['open', 1],
['paid', 2],
],
totalRows: 2,
command: 'SELECT',
rowCount: 2,
});
});
it('supports file urls for SQLite database paths', async () => {
// In the `url` field a `file:` prefix is a file URL, not a pointer file.
expect(
sqliteDatabasePathFromConnection({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', url: `file://${dbPath}` },
sql: 'select 1',
}),
).toBe(dbPath);
});
it('resolves file references for SQLite path fields', async () => {
// In the `path` field a `file:` prefix points at a file whose content is the path.
const pointerPath = join(tempDir, 'sqlite-path.txt');
writeFileSync(pointerPath, dbPath, 'utf-8');
expect(
sqliteDatabasePathFromConnection({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', path: `file:${pointerPath}` },
sql: 'select 1',
}),
).toBe(dbPath);
});
it('resolves env references for SQLite database urls', async () => {
// Save and restore the variable so the test does not leak into other suites.
const originalDatabaseUrl = process.env.KTX_SQLITE_TEST_URL;
process.env.KTX_SQLITE_TEST_URL = `sqlite:${dbPath}`;
try {
expect(
sqliteDatabasePathFromConnection({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', url: 'env:KTX_SQLITE_TEST_URL' },
sql: 'select 1',
}),
).toBe(dbPath);
} finally {
if (originalDatabaseUrl === undefined) {
delete process.env.KTX_SQLITE_TEST_URL;
} else {
process.env.KTX_SQLITE_TEST_URL = originalDatabaseUrl;
}
}
});
it('rejects mutating SQL before opening the database', async () => {
const executor = createSqliteQueryExecutor();
await expect(
executor.execute({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', path: 'warehouse.db' },
sql: 'delete from orders',
}),
).rejects.toThrow('Only read-only SELECT/WITH queries can be executed locally');
});
it('requires a SQLite driver and a database path', async () => {
const executor = createSqliteQueryExecutor();
// Wrong driver.
await expect(
executor.execute({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'postgres', path: 'warehouse.db' },
sql: 'select 1',
}),
).rejects.toThrow('Local SQLite execution cannot run driver "postgres"');
// Right driver but neither path nor url.
await expect(
executor.execute({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite' },
sql: 'select 1',
}),
).rejects.toThrow('Local SQLite execution requires connections.warehouse.path or connections.warehouse.url');
});
});

View file

@ -0,0 +1,91 @@
import { isAbsolute, resolve } from 'node:path';
import { fileURLToPath } from 'node:url';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { homedir } from 'node:os';
import type {
KtxSqlQueryExecutionInput,
KtxSqlQueryExecutionResult,
KtxSqlQueryExecutorPort,
} from './query-executor.js';
import { normalizeQueryRows } from './query-executor.js';
import { limitSqlForExecution } from './read-only-sql.js';
// Loosely-typed view of a connection entry; values are validated at the point of use.
type SqliteConnectionConfig = Record<string, unknown> | undefined;
/**
 * Returns the connection's driver name in lower case, or an empty string when the
 * connection or its driver is missing.
 */
function connectionDriver(input: KtxSqlQueryExecutionInput): string {
  const declared = input.connection?.driver ?? '';
  return String(declared).toLowerCase();
}
/**
 * Reads a string setting from the connection and expands `env:` / `file:` references.
 *
 * @returns undefined when the setting is absent, not a string, or blank. Otherwise the
 *   resolved value, which can be an empty string if the reference resolves to nothing.
 */
function stringConfigValue(connection: SqliteConnectionConfig, key: string): string | undefined {
  const raw = connection?.[key];
  if (typeof raw !== 'string') {
    return undefined;
  }
  const literal = raw.trim();
  if (literal.length === 0) {
    return undefined;
  }
  return resolveStringReference(key, literal);
}
/**
 * Expands an indirect configuration value.
 *
 * - `env:NAME` yields the value of the environment variable, or '' when it is unset.
 * - `file:PATH` (for every key except `url`, where `file:` denotes a file URL) yields
 *   the trimmed content of that file. A leading `~` in PATH is replaced by the current
 *   user's home directory.
 * - Anything else is returned unchanged.
 *
 * @param key Name of the setting being resolved; only `url` is treated specially.
 * @param value Raw setting value.
 * @returns The resolved string.
 * @throws Propagates the `readFileSync` error when a referenced file cannot be read.
 */
function resolveStringReference(key: string, value: string): string {
  if (value.startsWith('env:')) {
    return process.env[value.slice('env:'.length)] ?? '';
  }
  if (key !== 'url' && value.startsWith('file:')) {
    const rawPath = value.slice('file:'.length);
    // Drop the separator(s) after the tilde before joining: `resolve(home, '/x')`
    // returns '/x' because an absolute segment discards everything before it, which
    // made `file:~/x` read from the filesystem root instead of the home directory.
    const path = rawPath.startsWith('~') ? resolve(homedir(), rawPath.slice(1).replace(/^[\\/]+/, '')) : rawPath;
    return readFileSync(path, 'utf-8').trim();
  }
  return value;
}
/**
 * Extracts a filesystem path from a SQLite connection URL.
 *
 * `file:` URLs are converted with `fileURLToPath`. `sqlite:` URLs yield their
 * percent-decoded pathname when it is non-empty. Every other input, including a
 * `sqlite:` URL with an empty pathname, is returned as given.
 */
function sqlitePathFromUrl(url: string): string {
  if (url.startsWith('file:')) {
    return fileURLToPath(url);
  }
  if (!url.startsWith('sqlite:')) {
    return url;
  }
  const { pathname } = new URL(url);
  return pathname.length > 0 ? decodeURIComponent(pathname) : url;
}
/**
 * Determines the absolute path of the SQLite database file for a connection.
 *
 * The `path` setting wins over `url`; relative results are resolved against
 * `input.projectDir`, falling back to the current working directory.
 *
 * @param input Execution input carrying the connection settings.
 * @returns Absolute path to the database file.
 * @throws Error when the driver is not `sqlite`/`sqlite3`, or when neither `path` nor
 *   `url` resolves to a non-empty value.
 */
export function sqliteDatabasePathFromConnection(input: KtxSqlQueryExecutionInput): string {
  const driver = connectionDriver(input);
  if (driver !== 'sqlite' && driver !== 'sqlite3') {
    throw new Error(`Local SQLite execution cannot run driver "${input.connection?.driver ?? 'unknown'}".`);
  }
  const pathValue = stringConfigValue(input.connection, 'path');
  const urlValue = stringConfigValue(input.connection, 'url');
  // A setting can resolve to an empty string (e.g. `env:` pointing at an unset
  // variable). Truthiness checks let an empty `path` fall through to `url`; with `??`
  // the empty string was kept and the project directory itself was opened.
  let candidate: string;
  if (pathValue) {
    candidate = pathValue;
  } else if (urlValue) {
    candidate = sqlitePathFromUrl(urlValue);
  } else {
    throw new Error(
      `Local SQLite execution requires connections.${input.connectionId}.path or connections.${input.connectionId}.url.`,
    );
  }
  return isAbsolute(candidate) ? candidate : resolve(input.projectDir ?? process.cwd(), candidate);
}
/**
 * Builds an executor that runs read-only queries against a local SQLite file.
 *
 * The SQL is validated (and optionally row-capped) before the database is opened, so
 * rejected statements never touch the file. The database is opened read-only, must
 * already exist, and is closed again after every call.
 */
export function createSqliteQueryExecutor(): KtxSqlQueryExecutorPort {
  const execute = async (input: KtxSqlQueryExecutionInput): Promise<KtxSqlQueryExecutionResult> => {
    const sql = limitSqlForExecution(input.sql, input.maxRows);
    const db = new Database(sqliteDatabasePathFromConnection(input), { readonly: true, fileMustExist: true });
    try {
      const statement = db.prepare(sql);
      const rows = statement.all() as unknown[];
      const headers = statement.columns().map(({ name }) => name);
      // NOTE(review): rows come back keyed by column name, so duplicate column names
      // in an unwrapped query may yield fewer cells than headers — confirm.
      return {
        headers,
        rows: normalizeQueryRows(rows),
        totalRows: rows.length,
        command: 'SELECT',
        rowCount: rows.length,
      };
    } finally {
      db.close();
    }
  };
  return { execute };
}

View file

@ -0,0 +1,34 @@
import { mkdir, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { resolveKtxConfigReference, resolveKtxHomePath } from './config-reference.js';
// Covers the three value forms accepted by resolveKtxConfigReference (env:, file:,
// literal) and home-directory expansion.
describe('KTX config references', () => {
it('resolves env references without returning empty values', () => {
expect(resolveKtxConfigReference('env:AI_GATEWAY_API_KEY', { AI_GATEWAY_API_KEY: ' gateway-key ' })).toBe(
'gateway-key',
);
expect(resolveKtxConfigReference('env:AI_GATEWAY_API_KEY', { AI_GATEWAY_API_KEY: ' ' })).toBeUndefined();
expect(resolveKtxConfigReference('env:AI_GATEWAY_API_KEY', {})).toBeUndefined();
});
it('resolves file references and trims file content', async () => {
// NOTE(review): this directory is keyed by pid and is not removed afterwards.
const dir = join(tmpdir(), `ktx-config-reference-${process.pid}`);
await mkdir(dir, { recursive: true });
const keyPath = join(dir, 'gateway-key.txt');
await writeFile(keyPath, 'file-gateway-key\n', 'utf8');
expect(resolveKtxConfigReference(`file:${keyPath}`, {})).toBe('file-gateway-key');
});
it('returns literal values unchanged after trimming blank-only values', () => {
expect(resolveKtxConfigReference('provider/model', {})).toBe('provider/model');
expect(resolveKtxConfigReference(' ', {})).toBeUndefined();
expect(resolveKtxConfigReference(undefined, {})).toBeUndefined();
});
it('resolves home-prefixed paths', () => {
expect(resolveKtxHomePath('~/ktx/key.txt')).toContain('/ktx/key.txt');
});
});

View file

@ -0,0 +1,36 @@
import { readFileSync } from 'node:fs';
import { homedir } from 'node:os';
import { resolve } from 'node:path';
/**
 * Expands a leading `~` to the current user's home directory and returns an absolute
 * path.
 *
 * @param path `~`, a `~/…` path, or any other path (resolved against the working
 *   directory).
 */
export function resolveKtxHomePath(path: string): string {
  if (path === '~') {
    return homedir();
  }
  if (path.startsWith('~/')) {
    return resolve(homedir(), path.slice(2));
  }
  return resolve(path);
}

/**
 * Resolves a configuration value that may be a literal, an `env:NAME` reference or a
 * `file:PATH` reference.
 *
 * Surrounding whitespace is ignored everywhere, including in front of the `env:` /
 * `file:` prefix, and blank results are reported as undefined.
 *
 * @param value Raw configuration value.
 * @param env Environment used to look up `env:` references.
 * @returns The resolved, trimmed value, or undefined when nothing usable is found.
 * @throws Propagates the `readFileSync` error when a referenced file cannot be read.
 */
export function resolveKtxConfigReference(value: string | undefined, env: NodeJS.ProcessEnv): string | undefined {
  // Trim before looking at the prefix so that ' env:FOO' is resolved rather than
  // handed back to the caller as a literal string.
  const trimmed = value?.trim();
  if (!trimmed) {
    return undefined;
  }
  if (trimmed.startsWith('env:')) {
    const envName = trimmed.slice('env:'.length).trim();
    const envValue = env[envName]?.trim();
    return envValue ? envValue : undefined;
  }
  if (trimmed.startsWith('file:')) {
    const filePath = resolveKtxHomePath(trimmed.slice('file:'.length).trim());
    const fileValue = readFileSync(filePath, 'utf8').trim();
    return fileValue.length > 0 ? fileValue : undefined;
  }
  return trimmed;
}

View file

@ -0,0 +1,42 @@
/** Optional filesystem locations; unset fields are derived by the resolve helpers below. */
export interface KtxStorageConfig {
  /** Explicit config directory. */
  configDir?: string;
  /** Base directory used to derive the defaults. */
  homeDir?: string;
  /** Explicit worktrees directory. */
  worktreesDir?: string;
}

/** Git identity settings plus optional overrides for the bootstrap commit. */
export interface KtxGitConfig {
  userName: string;
  userEmail: string;
  bootstrapMessage?: string;
  bootstrapAuthor?: string;
  bootstrapAuthorEmail?: string;
}

/** Top-level core configuration: storage locations and git settings. */
export interface KtxCoreConfig {
  storage: KtxStorageConfig;
  git: KtxGitConfig;
}

/** Minimal logging contract. */
export interface KtxLogger {
  debug(message: string): void;
  log(message: string): void;
  warn(message: string): void;
  error(message: string, error?: unknown): void;
}

/** Logger that discards every message. */
export const noopLogger: KtxLogger = {
  debug: (): void => {},
  log: (): void => {},
  warn: (): void => {},
  error: (): void => {},
};

/**
 * Returns the configured config directory, defaulting to `<homeDir>/ktx/config`
 * (with `homeDir` itself defaulting to `/tmp`).
 */
export function resolveConfigDir(config: KtxCoreConfig): string {
  const { storage } = config;
  if (storage.configDir != null) {
    return storage.configDir;
  }
  const base = storage.homeDir ?? '/tmp';
  return `${base}/ktx/config`;
}

/**
 * Returns the configured worktrees directory, defaulting to `<homeDir>/.worktrees`
 * (with `homeDir` itself defaulting to `/tmp`).
 */
export function resolveWorktreesDir(config: KtxCoreConfig): string {
  const { storage } = config;
  if (storage.worktreesDir != null) {
    return storage.worktreesDir;
  }
  const base = storage.homeDir ?? '/tmp';
  return `${base}/.worktrees`;
}

View file

@ -0,0 +1,5 @@
/** Port for computing text embeddings. */
export interface KtxEmbeddingPort {
/** Presumably the largest number of texts accepted per bulk call — confirm with implementations. */
maxBatchSize: number;
/** Computes the embedding vector for one text. */
computeEmbedding(text: string): Promise<number[]>;
/** Computes one embedding vector per input text. */
computeEmbeddingsBulk(texts: string[]): Promise<number[][]>;
}

View file

@ -0,0 +1,43 @@
/** Result of a write or delete; implementations may attach extra fields. */
export interface KtxFileWriteResult {
/** Hash of the commit recording the change, when one is reported. */
commitHash?: string | null;
[key: string]: unknown;
}
/** Result of a read; implementations may attach extra fields. */
export interface KtxFileReadResult {
content: string;
[key: string]: unknown;
}
/** Result of a directory listing. */
export interface KtxFileListResult {
files: string[];
}
/** One history entry for a file; every field is optional and extra fields are allowed. */
export interface KtxFileHistoryEntry {
sha?: string;
message?: string;
author?: string;
date?: string | Date;
[key: string]: unknown;
}
/**
 * Port for a file store whose mutations carry author and commit-message metadata.
 * `TSelf` is the concrete store type returned by `forWorktree`.
 */
export interface KtxFileStorePort<TSelf = unknown> {
/** Writes `content` to `path`, attributed to the given author and commit message. */
writeFile(
path: string,
content: string,
author: string,
authorEmail: string,
commitMessage: string,
options?: { skipLock?: boolean },
): Promise<KtxFileWriteResult>;
/** Reads the file at `path`. */
readFile(path: string): Promise<KtxFileReadResult>;
/** Deletes the file at `path`; the result may be null. */
deleteFile(
path: string,
author: string,
authorEmail: string,
commitMessage: string,
options?: { skipLock?: boolean },
): Promise<KtxFileWriteResult | null>;
/** Lists files under `path`, optionally recursively. */
listFiles(path: string, recursive?: boolean): Promise<KtxFileListResult>;
// NOTE(review): a union with `unknown` collapses to `unknown`, so callers get no
// typing from this signature and must narrow the result themselves.
getFileHistory(path: string): Promise<KtxFileHistoryEntry[] | unknown>;
/** Returns a store scoped to the given working directory. */
forWorktree(workdir: string): TSelf;
}

View file

@ -0,0 +1,29 @@
import { simpleGit, type SimpleGit } from 'simple-git';
/** Environment variables that must not leak from the parent process into spawned git commands. */
const GIT_HOOK_ENV_KEYS = [
  'GIT_ALTERNATE_OBJECT_DIRECTORIES',
  'GIT_DIR',
  'GIT_INDEX_FILE',
  'GIT_OBJECT_DIRECTORY',
  'GIT_PREFIX',
  'GIT_QUARANTINE_PATH',
  'GIT_WORK_TREE',
  'GIT_EDITOR',
  'GIT_EXEC_PATH',
  'GIT_PAGER',
  'PAGER',
  'VISUAL',
  'EDITOR',
] as const;

/**
 * Returns a copy of `env` without the variables listed in GIT_HOOK_ENV_KEYS.
 * The input object is left untouched.
 *
 * @param env Environment to copy; defaults to `process.env`.
 */
function sanitizedGitEnv(env: NodeJS.ProcessEnv = process.env): NodeJS.ProcessEnv {
  const blocked = new Set<string>(GIT_HOOK_ENV_KEYS);
  const kept: NodeJS.ProcessEnv = {};
  for (const [name, value] of Object.entries(env)) {
    if (!blocked.has(name)) {
      kept[name] = value;
    }
  }
  return kept;
}
/**
 * Creates a simple-git client rooted at `baseDir` that runs with a sanitized copy of
 * the current process environment. The `allowUnsafeAskPass` option is enabled.
 */
export function createSimpleGit(baseDir: string): SimpleGit {
  const client = simpleGit({ baseDir, unsafe: { allowUnsafeAskPass: true } });
  return client.env(sanitizedGitEnv());
}

View file

@ -0,0 +1,75 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import type { KtxCoreConfig } from './config.js';
import { createSimpleGit } from './git-env.js';
import { GitService } from './git.service.js';
// Verifies that assertWorktreeClean rejects repositories with an operation in progress
// (merge, cherry-pick, revert, sequencer) or with unmerged index entries.
// Runs against a real git repository created in a temp directory.
describe('GitService.assertWorktreeClean', () => {
let workdir: string;
let git: SimpleGit;
let gitService: GitService;
beforeEach(async () => {
workdir = await mkdtemp(join(tmpdir(), 'gitsvc-clean-'));
git = createSimpleGit(workdir);
await git.init();
await git.addConfig('user.email', 't@test');
await git.addConfig('user.name', 'Test');
await writeFile(join(workdir, 'init'), 'init');
await git.add('.');
await git.commit('init');
const coreConfig: KtxCoreConfig = {
storage: { configDir: workdir, homeDir: workdir },
git: { userName: 'Test', userEmail: 't@test' },
};
gitService = new GitService(coreConfig);
// Point the service at the repository prepared above by overwriting its internals
// instead of calling onModuleInit().
(gitService as any).git = git;
(gitService as any).configDir = workdir;
});
afterEach(async () => rm(workdir, { recursive: true, force: true }));
it('does not throw on a clean worktree', async () => {
await expect(gitService.assertWorktreeClean()).resolves.toBeUndefined();
});
it('throws when MERGE_HEAD exists', async () => {
await writeFile(join(workdir, '.git', 'MERGE_HEAD'), 'deadbeef\n');
await expect(gitService.assertWorktreeClean()).rejects.toThrow(/MERGE_HEAD/);
});
it('throws when CHERRY_PICK_HEAD exists', async () => {
await writeFile(join(workdir, '.git', 'CHERRY_PICK_HEAD'), 'deadbeef\n');
await expect(gitService.assertWorktreeClean()).rejects.toThrow(/CHERRY_PICK_HEAD/);
});
it('throws when REVERT_HEAD exists', async () => {
await writeFile(join(workdir, '.git', 'REVERT_HEAD'), 'deadbeef\n');
await expect(gitService.assertWorktreeClean()).rejects.toThrow(/REVERT_HEAD/);
});
it('throws when sequencer/todo exists (interrupted multi-commit revert/cherry-pick)', async () => {
await mkdir(join(workdir, '.git', 'sequencer'), { recursive: true });
await writeFile(join(workdir, '.git', 'sequencer', 'todo'), 'pick deadbeef foo\n');
await expect(gitService.assertWorktreeClean()).rejects.toThrow(/sequencer/);
});
it('throws when the index has unmerged paths', async () => {
// Create two branches that both add `shared` with different content, then merge
// them to provoke a conflict.
await git.checkoutLocalBranch('a');
await writeFile(join(workdir, 'shared'), 'A version');
await git.add('.');
await git.commit('a');
// The default branch name depends on the local git configuration.
await git.checkout('master').catch(() => git.checkout('main'));
await git.checkoutLocalBranch('b');
await writeFile(join(workdir, 'shared'), 'B version');
await git.add('.');
await git.commit('b');
// The merge is expected to fail with a conflict; the rejection is ignored on purpose.
await git.raw(['merge', 'a']).catch(() => undefined);
await expect(gitService.assertWorktreeClean()).rejects.toThrow();
});
});

View file

@ -0,0 +1,78 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { mkdir, mkdtemp, readdir, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import type { KtxCoreConfig } from './config.js';
import { createSimpleGit } from './git-env.js';
import { GitService } from './git.service.js';
// Verifies GitService.deleteDirectories against a real git repository in a temp
// directory: batching into one commit, the empty-input no-op, and tolerance of
// paths that no longer exist.
describe('GitService.deleteDirectories', () => {
  let workdir: string;
  let git: SimpleGit;
  let gitService: GitService;

  beforeEach(async () => {
    workdir = await mkdtemp(join(tmpdir(), 'gitsvc-dd-'));
    git = createSimpleGit(workdir);
    await git.init();
    await git.addConfig('user.email', 't@test');
    await git.addConfig('user.name', 'Test');
    await writeFile(join(workdir, 'keep'), 'k');
    await git.add('.');
    await git.commit('init');
    const coreConfig: KtxCoreConfig = {
      storage: { configDir: workdir, homeDir: workdir },
      git: { userName: 'Test', userEmail: 't@test' },
    };
    gitService = new GitService(coreConfig);
    // Point the service at the repository prepared above by overwriting its internals
    // instead of calling onModuleInit().
    (gitService as any).git = git;
    (gitService as any).configDir = workdir;
  });

  afterEach(async () => rm(workdir, { recursive: true, force: true }));

  it('removes multiple directories in a single commit', async () => {
    for (const name of ['a', 'b', 'c']) {
      await mkdir(join(workdir, name), { recursive: true });
      await writeFile(join(workdir, name, 'f.txt'), name);
    }
    await git.add('.');
    await git.commit('seed 3 dirs');
    const beforeCommits = (await git.log()).total;
    const result = await gitService.deleteDirectories(['a', 'b'], 'gc: drop a+b', 'System User', 'system@example.com');
    expect(result.commitHash).toBeTruthy();
    const entries = await readdir(workdir);
    expect(entries).not.toContain('a');
    expect(entries).not.toContain('b');
    expect(entries).toContain('c');
    // Exactly one new commit must cover both deletions.
    const afterCommits = (await git.log()).total;
    expect(afterCommits).toBe(beforeCommits + 1);
  });

  // The title now matches the assertion: the hash is the empty string, not null.
  it('no-ops and returns an empty commit hash when the input list is empty', async () => {
    const result = await gitService.deleteDirectories([], 'empty', 'X', 'x@example.com');
    expect(result.commitHash).toBe('');
    expect(result.created).toBe(false);
  });

  it('ignores paths that have already been deleted — commits only the remaining ones', async () => {
    await mkdir(join(workdir, 'stale'), { recursive: true });
    await writeFile(join(workdir, 'stale', 'x'), 'x');
    await git.add('.');
    await git.commit('seed stale');
    // 'missing' never existed; only 'stale' can actually be removed.
    const result = await gitService.deleteDirectories(
      ['stale', 'missing'],
      'gc: drop stale + missing',
      'System User',
      'system@example.com',
    );
    expect(result.commitHash).toBeTruthy();
    const entries = await readdir(workdir);
    expect(entries).not.toContain('stale');
  });
});

View file

@ -0,0 +1,45 @@
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { GitService } from './git.service.js';
// Creates a GitService backed by a fresh temp home directory (config repo under
// `<home>/config`) and initializes it via onModuleInit().
// NOTE(review): the temp directory is not removed by this helper or its callers.
async function makeGit() {
const homeDir = await mkdtemp(join(tmpdir(), 'ktx-git-patch-'));
const configDir = join(homeDir, 'config');
const git = new GitService({
storage: { configDir, homeDir },
git: {
userName: 'System User',
userEmail: 'system@example.com',
bootstrapMessage: 'init',
bootstrapAuthor: 'system',
bootstrapAuthorEmail: 'system@example.com',
},
});
await git.onModuleInit();
return { homeDir, configDir, git };
}
// Round-trips a change through a patch file: write the patch between two commits,
// apply it in a worktree checked out at the base commit, and check the file content.
describe('GitService patch helpers', () => {
it('collects binary-safe no-rename patches and applies them with --3way --index', async () => {
const { homeDir, configDir, git } = await makeGit();
await mkdir(join(configDir, 'wiki/global'), { recursive: true });
await writeFile(join(configDir, 'wiki/global/page.md'), 'old\n');
await git.commitFiles(['wiki/global/page.md'], 'add page', 'System User', 'system@example.com');
// `base` holds the 'old' version; HEAD afterwards holds the 'new' version.
const base = await git.revParseHead();
await writeFile(join(configDir, 'wiki/global/page.md'), 'new\n');
await git.commitFiles(['wiki/global/page.md'], 'edit page', 'System User', 'system@example.com');
const patchPath = join(homeDir, 'proposal.patch');
await git.writeBinaryNoRenamePatch(base, 'HEAD', patchPath);
// Apply the patch in a separate worktree that starts at `base`.
const targetDir = join(homeDir, 'target');
await git.addWorktree(targetDir, 'target', base);
const targetGit = git.forWorktree(targetDir);
await targetGit.applyPatchFile3WayIndex(patchPath);
await targetGit.commitStaged('apply proposal', 'System User', 'system@example.com');
await expect(readFile(join(targetDir, 'wiki/global/page.md'), 'utf-8')).resolves.toBe('new\n');
});
});

View file

@ -0,0 +1,56 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import type { KtxCoreConfig } from './config.js';
import { createSimpleGit } from './git-env.js';
import { GitService } from './git.service.js';
// Verifies GitService.resetHardTo against a real git repository in a temp directory.
describe('GitService.resetHardTo', () => {
let workdir: string;
let git: SimpleGit;
let gitService: GitService;
beforeEach(async () => {
workdir = await mkdtemp(join(tmpdir(), 'gitsvc-reset-'));
git = createSimpleGit(workdir);
await git.init();
await git.addConfig('user.email', 't@test');
await git.addConfig('user.name', 'Test');
await writeFile(join(workdir, 'init'), 'init');
await git.add('.');
await git.commit('init');
const coreConfig: KtxCoreConfig = {
storage: { configDir: workdir, homeDir: workdir },
git: { userName: 'Test', userEmail: 't@test' },
};
gitService = new GitService(coreConfig);
// Point the service at the repository prepared above by overwriting its internals
// instead of calling onModuleInit().
(gitService as any).git = git;
(gitService as any).configDir = workdir;
});
afterEach(async () => rm(workdir, { recursive: true, force: true }));
it('rewinds HEAD to the target SHA, removing later commits and their files', async () => {
const baseSha = (await git.revparse(['HEAD'])).trim();
await writeFile(join(workdir, 'a'), 'a1');
await git.add('.');
await git.commit('a');
await writeFile(join(workdir, 'b'), 'b1');
await git.add('.');
await git.commit('b');
await gitService.resetHardTo(baseSha);
expect((await git.revparse(['HEAD'])).trim()).toBe(baseSha);
// Files introduced by the discarded commits must be gone from disk; a failed read
// is mapped to null.
expect(await readFile(join(workdir, 'a'), 'utf-8').catch(() => null)).toBeNull();
expect(await readFile(join(workdir, 'b'), 'utf-8').catch(() => null)).toBeNull();
});
it('is a no-op when target SHA equals current HEAD', async () => {
const sha = (await git.revparse(['HEAD'])).trim();
await gitService.resetHardTo(sha);
expect((await git.revparse(['HEAD'])).trim()).toBe(sha);
});
});

View file

@ -0,0 +1,450 @@
import { mkdtemp, readFile, realpath, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import type { KtxCoreConfig } from './config.js';
import { GitService } from './git.service.js';
// These tests drive a real git repo inside a temp directory — simple-git shells out to the
// system `git` binary. They are fast enough to run as unit tests and catch real issues that
// would be invisible with mocked git.
describe('GitService', () => {
let service: GitService;
let tempDir: string;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'git-service-spec-'));
const coreConfig: KtxCoreConfig = {
storage: { configDir: tempDir, homeDir: tempDir },
git: {
userName: 'Test User',
userEmail: 'test@example.com',
bootstrapMessage: 'Initialize test config repo',
bootstrapAuthor: 'test-system',
bootstrapAuthorEmail: 'system@example.com',
},
};
service = new GitService(coreConfig);
await service.onModuleInit();
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
const writeAndCommit = async (filePath: string, content: string, message = 'msg') => {
await writeFile(join(tempDir, filePath), content, 'utf-8');
return service.commitFile(filePath, message, 'Test', 'test@example.com');
};
describe('cold-start bootstrap commit', () => {
it('writes an empty commit on init so HEAD always resolves', async () => {
// beforeEach already ran onModuleInit() against an empty temp dir.
const head = await service.revParseHead();
expect(head).toMatch(/^[0-9a-f]{40}$/);
});
it('does not double-commit when re-initialized', async () => {
const before = await service.revParseHead();
await service.onModuleInit();
const after = await service.revParseHead();
expect(after).toBe(before);
});
it('keeps git auto-maintenance attached for deterministic cleanup', async () => {
const config = await readFile(join(tempDir, '.git', 'config'), 'utf-8');
expect(config).toMatch(/\[gc]\n\s+autoDetach = false/);
expect(config).toMatch(/\[maintenance]\n\s+autoDetach = false/);
});
it('initializes when release automation sets GIT_ASKPASS', async () => {
const releaseEnvDir = await mkdtemp(join(tmpdir(), 'git-service-release-env-'));
const previousAskPass = process.env.GIT_ASKPASS;
process.env.GIT_ASKPASS = 'echo';
try {
const releaseEnvService = new GitService({
storage: { configDir: releaseEnvDir, homeDir: releaseEnvDir },
git: {
userName: 'Test User',
userEmail: 'test@example.com',
bootstrapMessage: 'Initialize test config repo',
bootstrapAuthor: 'test-system',
bootstrapAuthorEmail: 'system@example.com',
},
});
await expect(releaseEnvService.onModuleInit()).resolves.toBeUndefined();
} finally {
if (previousAskPass === undefined) {
delete process.env.GIT_ASKPASS;
} else {
process.env.GIT_ASKPASS = previousAskPass;
}
await rm(releaseEnvDir, { recursive: true, force: true });
}
});
});
describe('commitFile `created` flag', () => {
it('is true for a real commit', async () => {
const info = await writeAndCommit('a.md', '# Hello');
expect(info.created).toBe(true);
});
it('is false on a no-op write (content unchanged)', async () => {
await writeAndCommit('a.md', '# Hello');
const second = await writeAndCommit('a.md', '# Hello', 'unused');
expect(second.created).toBe(false);
});
});
describe('addNote / getNote', () => {
it('attaches a note and reads it back', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, 'Rich message from LLM');
expect(await service.getNote(info.commitHash)).toBe('Rich message from LLM');
});
it('returns undefined when no note exists', async () => {
const info = await writeAndCommit('a.md', '# Hello');
expect(await service.getNote(info.commitHash)).toBeUndefined();
});
it('overwrites an existing note (idempotent retries)', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, 'First');
await service.addNote(info.commitHash, 'Second');
expect(await service.getNote(info.commitHash)).toBe('Second');
});
it('skips empty/whitespace messages silently', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, ' ');
expect(await service.getNote(info.commitHash)).toBeUndefined();
});
});
describe('getFileHistory', () => {
it('surfaces enhancedMessage when a note is present', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, 'Note body');
const history = await service.getFileHistory('a.md');
expect(history[0]?.enhancedMessage).toBe('Note body');
});
it('leaves enhancedMessage undefined when no note is attached', async () => {
await writeAndCommit('a.md', '# Hello');
const history = await service.getFileHistory('a.md');
expect(history[0]?.enhancedMessage).toBeUndefined();
});
});
describe('getCommitDiff', () => {
it('returns the patch scoped to the requested path', async () => {
const info = await writeAndCommit('a.md', '# Hello');
const diff = await service.getCommitDiff(info.commitHash, 'a.md');
expect(diff).toContain('diff --git');
expect(diff).toContain('Hello');
});
it('handles the repository initial commit without throwing', async () => {
const info = await writeAndCommit('first.md', 'first');
await expect(service.getCommitDiff(info.commitHash, 'first.md')).resolves.toBeDefined();
});
});
describe('squashTo', () => {
const writeAsSystem = async (filePath: string, content: string, message = 'msg') => {
await writeFile(join(tempDir, filePath), content, 'utf-8');
return service.commitFile(filePath, message, 'System User', 'system@example.com');
};
it('collapses 3 commits after preHead into a single commit', async () => {
const pre = await writeAsSystem('a.md', 'v1');
const preHead = pre.commitHash;
await writeAsSystem('b.md', 'b', 'add b');
await writeAsSystem('c.md', 'c', 'add c');
await writeAsSystem('a.md', 'v2', 'update a');
const result = await service.squashTo(preHead, {
message: 'Ingest: bundle 3 writes',
author: 'System User',
authorEmail: 'system@example.com',
});
expect(result.squashed).toBe(true);
expect(result.squashedCount).toBe(3);
expect(result.commitHash).toBeTruthy();
expect(result.commitHash).not.toBe(preHead);
const commitHash = result.commitHash;
if (!commitHash) {
throw new Error('Expected squash commit hash');
}
// The squashed commit should preserve the final tree state.
const fileAtSquash = await service.getFileAtCommit('a.md', commitHash);
expect(fileAtSquash).toBe('v2');
const bAtSquash = await service.getFileAtCommit('b.md', commitHash);
expect(bAtSquash).toBe('b');
});
it('is a no-op when preHead equals HEAD', async () => {
const pre = await writeAsSystem('a.md', 'v1');
const result = await service.squashTo(pre.commitHash, {
message: 'nothing to squash',
author: 'System User',
authorEmail: 'system@example.com',
});
expect(result.squashed).toBe(false);
expect(result.commitHash).toBe(pre.commitHash);
});
it('skips squash when a foreign-author commit sits between preHead and HEAD', async () => {
const pre = await writeAsSystem('a.md', 'v1');
const preHead = pre.commitHash;
await writeAsSystem('b.md', 'from us', 'ours');
// Foreign commit
await writeAndCommit('c.md', 'from someone else', 'foreign');
await writeAsSystem('d.md', 'ours again', 'ours 2');
const result = await service.squashTo(preHead, {
message: 'should be skipped',
author: 'System User',
authorEmail: 'system@example.com',
});
expect(result.squashed).toBe(false);
expect(result.reason).toContain('foreign');
expect(result.squashedCount).toBe(3);
});
it('returns cleanly when preHead is empty (no starting commit)', async () => {
const result = await service.squashTo('', {
message: 'would have squashed',
author: 'System User',
authorEmail: 'system@example.com',
});
expect(result.squashed).toBe(false);
expect(result.commitHash).toBeNull();
});
});
describe('worktree lifecycle', () => {
  // macOS canonicalizes tmp paths (/var/folders → /private/var/folders) when git
  // returns them from `worktree list`. Resolve through realpath() before comparing.
  const canonicalSiblingPath = async (suffix: string): Promise<string> => {
    const parent = await realpath(join(tempDir, '..'));
    return join(parent, `wt-${Date.now()}-${suffix}`);
  };

  // Best-effort teardown. The worktree directory is a sibling of tempDir, so it is
  // removed explicitly here; both steps tolerate an already-removed worktree.
  const disposeWorktree = async (wtDir: string): Promise<void> => {
    await service.removeWorktree(wtDir).catch(() => undefined);
    await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
  };

  it('addWorktree creates a branch + directory at the given startSha', async () => {
    const { commitHash } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await canonicalSiblingPath('add');
    try {
      await service.addWorktree(wtDir, 'session/alpha', commitHash);
      const list = await service.listWorktrees();
      expect(list.find((e) => e.path === wtDir && e.branch === 'refs/heads/session/alpha')).toBeTruthy();
    } finally {
      // Runs even when the assertion fails so the worktree directory is not leaked.
      await disposeWorktree(wtDir);
    }
  });

  it('removeWorktree detaches the worktree entry', async () => {
    const { commitHash } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await canonicalSiblingPath('rm');
    try {
      await service.addWorktree(wtDir, 'session/beta', commitHash);
      await service.removeWorktree(wtDir);
      const list = await service.listWorktrees();
      expect(list.find((e) => e.path === wtDir)).toBeFalsy();
    } finally {
      await disposeWorktree(wtDir);
    }
  });

  it('deleteBranch removes a branch ref', async () => {
    const { commitHash } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await canonicalSiblingPath('br');
    try {
      await service.addWorktree(wtDir, 'session/gamma', commitHash);
      await service.removeWorktree(wtDir);
      await service.deleteBranch('session/gamma', true);
      // Reach into the private simple-git handle to list local branches directly.
      const branches = await (service as unknown as { git: import('simple-git').SimpleGit }).git.branchLocal();
      expect(branches.all).not.toContain('session/gamma');
    } finally {
      await disposeWorktree(wtDir);
    }
  });
});
describe('forWorktree', () => {
  // Worktrees live next to tempDir; realpath() keeps the path comparable with what git reports.
  const siblingWorktreePath = async (suffix: string): Promise<string> => {
    const parent = await realpath(join(tempDir, '..'));
    return join(parent, `wt-${Date.now()}-${suffix}`);
  };

  // Best-effort teardown that tolerates an already-removed worktree.
  const disposeWorktree = async (wtDir: string): Promise<void> => {
    await service.removeWorktree(wtDir).catch(() => undefined);
    await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
  };

  it('returns a GitService whose operations run inside the given worktree', async () => {
    const { commitHash } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await siblingWorktreePath('fw');
    try {
      await service.addWorktree(wtDir, 'session/delta', commitHash);
      const scoped = service.forWorktree(wtDir);
      expect(await scoped.revParseHead()).toBe(commitHash);
    } finally {
      // Runs even when the assertion fails so the worktree directory is not leaked.
      await disposeWorktree(wtDir);
    }
  });

  it('serializes concurrent commits from scoped services targeting the same worktree', async () => {
    const { commitHash } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await siblingWorktreePath('fw-concurrent');
    try {
      await service.addWorktree(wtDir, 'session/concurrent', commitHash);
      // Two independent scoped services pointing at the same worktree directory.
      const first = service.forWorktree(wtDir);
      const second = service.forWorktree(wtDir);
      await writeFile(join(wtDir, 'a.md'), 'a\n', 'utf-8');
      await writeFile(join(wtDir, 'b.md'), 'b\n', 'utf-8');
      // Fire both commits at once; each must still produce a full 40-hex commit hash.
      const [a, b] = await Promise.all([
        first.commitFile('a.md', 'add a', 'System User', 'system@example.com'),
        second.commitFile('b.md', 'add b', 'System User', 'system@example.com'),
      ]);
      expect(a.commitHash).toMatch(/^[0-9a-f]{40}$/);
      expect(b.commitHash).toMatch(/^[0-9a-f]{40}$/);
      await expect(first.getFileAtCommit('a.md', a.commitHash)).resolves.toBe('a\n');
      await expect(second.getFileAtCommit('b.md', b.commitHash)).resolves.toBe('b\n');
    } finally {
      await disposeWorktree(wtDir);
    }
  });
});
describe('squashMergeIntoMain', () => {
  // Worktrees live next to tempDir; realpath() keeps the path comparable with what git reports.
  const siblingWorktreePath = async (suffix: string): Promise<string> => {
    const parent = await realpath(join(tempDir, '..'));
    return join(parent, `wt-${Date.now()}-${suffix}`);
  };

  // Best-effort teardown that tolerates an already-removed worktree.
  const disposeWorktree = async (wtDir: string): Promise<void> => {
    await service.removeWorktree(wtDir).catch(() => undefined);
    await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
  };

  it('merges a session branch as one commit on main, returning the new SHA + touched paths', async () => {
    const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await siblingWorktreePath('sm');
    try {
      await service.addWorktree(wtDir, 'session/happy', baseSha);
      const scoped = service.forWorktree(wtDir);
      // Two separate commits on the session branch, expected to collapse into one on main.
      await writeFile(join(wtDir, 'a.yaml'), 'one: 1\n', 'utf-8');
      await scoped.commitFile('a.yaml', 'wip a', 'System User', 'system@example.com');
      await writeFile(join(wtDir, 'b.yaml'), 'two: 2\n', 'utf-8');
      await scoped.commitFile('b.yaml', 'wip b', 'System User', 'system@example.com');
      const result = await service.squashMergeIntoMain(
        'session/happy',
        'System User',
        'system@example.com',
        'Memory capture: 2 files [chat=abcd1234]',
      );
      expect(result.ok).toBe(true);
      if (!result.ok) {
        throw new Error('unreachable');
      }
      expect(result.squashSha).toMatch(/^[0-9a-f]{40}$/);
      // Sort a copy so the assertion does not mutate the value under test.
      expect([...result.touchedPaths].sort()).toEqual(['a.yaml', 'b.yaml']);
      const mainHead = await service.revParseHead();
      expect(mainHead).toBe(result.squashSha);
      expect(mainHead).not.toBe(baseSha);
    } finally {
      // Runs even when an assertion fails so the worktree directory is not leaked.
      await disposeWorktree(wtDir);
    }
  });

  it('returns ok with empty touchedPaths when the session branch has no diff vs main', async () => {
    const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await siblingWorktreePath('sm-empty');
    try {
      await service.addWorktree(wtDir, 'session/empty', baseSha);
      const result = await service.squashMergeIntoMain(
        'session/empty',
        'System User',
        'system@example.com',
        'should be a no-op',
      );
      expect(result.ok).toBe(true);
      if (!result.ok) {
        throw new Error('unreachable');
      }
      expect(result.touchedPaths).toEqual([]);
      // No diff means main must still point at the base commit.
      expect(result.squashSha).toBe(baseSha);
    } finally {
      await disposeWorktree(wtDir);
    }
  });

  it('returns conflict=true and leaves main clean when session+main touched same file differently', async () => {
    await writeAndCommit('shared.yaml', 'base\n');
    const base = await service.revParseHead();
    if (!base) {
      throw new Error('no base head');
    }
    const wtDir = await siblingWorktreePath('conf');
    try {
      await service.addWorktree(wtDir, 'session/conf', base);
      const scoped = service.forWorktree(wtDir);
      await writeFile(join(wtDir, 'shared.yaml'), 'session-edit\n', 'utf-8');
      await scoped.commitFile('shared.yaml', 'session edit', 'System User', 'system@example.com');
      // Main edits the same file a different way, after the session branched.
      await writeAndCommit('shared.yaml', 'main-edit\n');
      const result = await service.squashMergeIntoMain(
        'session/conf',
        'System User',
        'system@example.com',
        'Memory capture: 1 file [chat=dead1234]',
      );
      expect(result.ok).toBe(false);
      if (result.ok) {
        throw new Error('unreachable');
      }
      expect(result.conflict).toBe(true);
      expect(result.conflictPaths).toContain('shared.yaml');
      // The failed merge must not leave staged or modified files behind on main.
      const status = await (service as unknown as { git: import('simple-git').SimpleGit }).git.status();
      expect(status.isClean()).toBe(true);
    } finally {
      await disposeWorktree(wtDir);
    }
  });

  it('reports untracked files that would be overwritten by the squash merge', async () => {
    const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await siblingWorktreePath('untracked');
    try {
      await service.addWorktree(wtDir, 'session/untracked', baseSha);
      const scoped = service.forWorktree(wtDir);
      await writeFile(join(wtDir, 'knowledge.md'), 'session version\n', 'utf-8');
      await scoped.commitFile('knowledge.md', 'session write', 'System User', 'system@example.com');
      // Same path exists in the main checkout, but only as an untracked file.
      await writeFile(join(tempDir, 'knowledge.md'), 'untracked local version\n', 'utf-8');
      const result = await service.squashMergeIntoMain(
        'session/untracked',
        'System User',
        'system@example.com',
        'Memory capture: 1 file [chat=untracked]',
      );
      expect(result.ok).toBe(false);
      if (result.ok) {
        throw new Error('unreachable');
      }
      expect(result.conflict).toBe(true);
      expect(result.conflictPaths).toEqual(['knowledge.md']);
      // The untracked file must still be present and still untracked afterwards.
      const status = await (service as unknown as { git: import('simple-git').SimpleGit }).git.status();
      expect(status.not_added).toContain('knowledge.md');
    } finally {
      await disposeWorktree(wtDir);
    }
  });
});
});

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,27 @@
export type { KtxCoreConfig, KtxGitConfig, KtxLogger, KtxStorageConfig } from './config.js';
export { noopLogger, resolveConfigDir, resolveWorktreesDir } from './config.js';
export { resolveKtxConfigReference, resolveKtxHomePath } from './config-reference.js';
export type { KtxEmbeddingPort } from './embedding.js';
export {
REDACTED_KTX_CREDENTIAL_VALUE,
redactKtxSensitiveMetadata,
redactKtxSensitiveText,
redactKtxSensitiveValue,
} from './redaction.js';
export type {
KtxFileHistoryEntry,
KtxFileListResult,
KtxFileReadResult,
KtxFileStorePort,
KtxFileWriteResult,
} from './file-store.js';
export type { GitCommitInfo, SquashMergeResult, WorktreeEntry } from './git.service.js';
export { GitService } from './git.service.js';
export type {
SentinelPayload,
SessionOutcome,
SessionWorktree,
SessionWorktreeServiceDeps,
WorktreeConfigPort,
} from './session-worktree.service.js';
export { SessionWorktreeService } from './session-worktree.service.js';

View file

@ -0,0 +1,47 @@
/** Placeholder written in place of every redacted value. */
export const REDACTED_KTX_CREDENTIAL_VALUE = '<redacted>';

/** Case-insensitive substring match for key names whose values are treated as secrets. */
const SENSITIVE_FIELD_NAME = /(password|secret|token|api[_-]?key|private[_-]?key|passphrase|credential|authorization|url)/i;

/** Matches `scheme://user:password@`; capture group 2 is the password segment. */
const URL_CREDENTIAL_PATTERN = /([a-z][a-z0-9+.-]*:\/\/[^:\s/@]+:)([^@\s/]+)(@)/gi;

/** True for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** True when the key name contains one of the sensitive fragments. */
function isSensitiveField(key: string): boolean {
  return SENSITIVE_FIELD_NAME.test(key);
}

/**
 * Redacts a single value according to the key it is stored under.
 *
 * @param key - Field name the value belongs to; decides whether the whole value is secret.
 * @param value - Value to inspect. Arrays and records are walked recursively.
 * @returns The placeholder when `key` is sensitive; otherwise a copy of the value with
 * nested sensitive fields replaced. Primitives under non-sensitive keys pass through.
 */
export function redactKtxSensitiveValue(key: string, value: unknown): unknown {
  if (isSensitiveField(key)) {
    return REDACTED_KTX_CREDENTIAL_VALUE;
  }
  if (Array.isArray(value)) {
    return value.map((item) => redactKtxSensitiveValue(key, item));
  }
  if (isRecord(value)) {
    return redactKtxSensitiveMetadata(value);
  }
  return value;
}

/**
 * Returns a redacted copy of a metadata record; the input is not mutated.
 *
 * Top-level arrays keep their array shape, with every item redacted under the owning key.
 * Record items in an array stored under a sensitive key are replaced by the placeholder,
 * exactly like scalar items, so fields such as `{ tokens: [{ value: '…' }] }` cannot leak
 * through non-sensitive inner key names.
 *
 * @param metadata - Arbitrary key/value metadata.
 * @returns A new record with sensitive values replaced by the placeholder.
 */
export function redactKtxSensitiveMetadata(metadata: Record<string, unknown>): Record<string, unknown> {
  const redacted: Record<string, unknown> = {};
  for (const [key, value] of Object.entries(metadata)) {
    redacted[key] = Array.isArray(value)
      ? value.map((item) => redactKtxSensitiveValue(key, item))
      : redactKtxSensitiveValue(key, value);
  }
  return redacted;
}

/**
 * Masks the password portion of any `scheme://user:password@host` occurrence in free text.
 *
 * @param value - Text that may embed credential-bearing URLs.
 * @returns The text with each matched password replaced by the placeholder.
 */
export function redactKtxSensitiveText(value: string): string {
  return value.replace(URL_CREDENTIAL_PATTERN, `$1${REDACTED_KTX_CREDENTIAL_VALUE}$3`);
}

View file

@ -0,0 +1,124 @@
import { mkdtemp, realpath, rm, stat } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import type { KtxCoreConfig } from './config.js';
import { GitService } from './git.service.js';
import { SessionWorktreeService, type WorktreeConfigPort } from './session-worktree.service.js';
/**
 * Minimal config adapter used by this spec: it satisfies WorktreeConfigPort and
 * records the workdir it was scoped to so tests can inspect it.
 */
interface TestWorktreeConfig extends WorktreeConfigPort<TestWorktreeConfig> {
workdir?: string;
}
// SessionWorktreeService glues a real GitService to a scoped config adapter.
// Each test runs against a fresh temp directory that serves as both configDir and homeDir.
describe('SessionWorktreeService', () => {
let sessionService: SessionWorktreeService<TestWorktreeConfig>;
let gitService: GitService;
let homeDir: string;
beforeEach(async () => {
homeDir = await mkdtemp(join(tmpdir(), 'sws-spec-'));
// Canonicalize so paths compare equal to the ones asserted against listWorktrees() below.
homeDir = await realpath(homeDir);
const coreConfig: KtxCoreConfig = {
storage: { configDir: homeDir, homeDir },
git: {
userName: 'System User',
userEmail: 'system@example.com',
bootstrapMessage: 'Initialize test config repo',
bootstrapAuthor: 'test-system',
bootstrapAuthorEmail: 'system@example.com',
},
};
gitService = new GitService(coreConfig);
await gitService.onModuleInit();
// Stub adapter: forWorktree returns a new config carrying the workdir and the same mock.
const configService: TestWorktreeConfig = {
forWorktree: vi.fn(
(workdir: string): TestWorktreeConfig => ({ workdir, forWorktree: configService.forWorktree }),
),
};
sessionService = new SessionWorktreeService({
coreConfig,
gitService,
configService,
});
});
afterEach(async () => {
await rm(homeDir, { recursive: true, force: true });
});
describe('create', () => {
it('creates a worktree + branch and returns scoped services', async () => {
const baseSha = await gitService.revParseHead();
if (!baseSha) {
throw new Error('no base sha');
}
const session = await sessionService.create('chat-abc', baseSha);
expect(session.workdir).toBe(join(homeDir, '.worktrees', 'session-chat-abc'));
expect(session.branch).toBe('session/chat-abc');
expect(session.baseSha).toBe(baseSha);
const stats = await stat(session.workdir);
expect(stats.isDirectory()).toBe(true);
// Scoped git instance reports the worktree's HEAD (= baseSha at creation time).
expect(await session.git.revParseHead()).toBe(baseSha);
const list = await gitService.listWorktrees();
expect(list.find((e) => e.path === session.workdir)).toBeTruthy();
});
it('appends a timestamp suffix when the primary dir already exists', async () => {
const baseSha = await gitService.revParseHead();
if (!baseSha) {
throw new Error('no base sha');
}
// Second create() with the same key collides with the first worktree directory.
const first = await sessionService.create('chat-dup', baseSha);
const second = await sessionService.create('chat-dup', baseSha);
expect(first.workdir).not.toBe(second.workdir);
expect(second.branch).toMatch(/^session\/chat-dup-\d+$/);
});
});
describe('cleanup', () => {
it('success removes the worktree dir and deletes the branch', async () => {
const baseSha = await gitService.revParseHead();
if (!baseSha) {
throw new Error('no base sha');
}
const session = await sessionService.create('chat-cleanup-ok', baseSha);
await sessionService.cleanup(session, 'success');
const list = await gitService.listWorktrees();
expect(list.find((e) => e.path === session.workdir)).toBeFalsy();
// stat() rejecting shows the directory is gone from disk.
await expect(stat(session.workdir)).rejects.toThrow();
});
it('conflict keeps the worktree and writes a sentinel file', async () => {
const baseSha = await gitService.revParseHead();
if (!baseSha) {
throw new Error('no base sha');
}
const session = await sessionService.create('chat-cleanup-conflict', baseSha);
await sessionService.cleanup(session, 'conflict', { conflictPaths: ['shared.yaml'] });
// Dir still exists.
await expect(stat(session.workdir)).resolves.toBeTruthy();
const { readFile } = await import('node:fs/promises');
// The sentinel is a JSON file named .ktx-outcome at the worktree root.
const raw = await readFile(join(session.workdir, '.ktx-outcome'), 'utf-8');
const parsed = JSON.parse(raw);
expect(parsed.outcome).toBe('conflict');
expect(parsed.chatId).toBe('chat-cleanup-conflict');
expect(parsed.conflictPaths).toEqual(['shared.yaml']);
expect(typeof parsed.at).toBe('string');
});
});
});

View file

@ -0,0 +1,113 @@
import { mkdir, stat, writeFile } from 'node:fs/promises';
import { join } from 'node:path';
import { noopLogger, resolveWorktreesDir, type KtxCoreConfig, type KtxLogger } from './config.js';
import { GitService } from './git.service.js';
/**
 * How a session ended. cleanup() removes the worktree for 'success' and 'empty';
 * any other outcome keeps it and writes a sentinel file instead.
 */
export type SessionOutcome = 'success' | 'empty' | 'conflict' | 'crash';
/** JSON document written to `.ktx-outcome` inside a worktree that is kept after cleanup. */
export interface SentinelPayload {
outcome: SessionOutcome;
/** ISO-8601 timestamp taken when the sentinel was written. */
at: string;
chatId: string;
baseSha: string;
/** Present only when the caller supplied conflict paths to cleanup(). */
conflictPaths?: string[];
}
/** A config service that can produce a copy of itself scoped to a worktree directory. */
export interface WorktreeConfigPort<TConfig> {
forWorktree(workdir: string): TConfig;
}
/** Handle returned by SessionWorktreeService.create(). */
export interface SessionWorktree<TConfig> {
/** The session key that was passed to create(). */
chatId: string;
/** Absolute worktree directory under the configured worktrees root. */
workdir: string;
branch: string;
baseSha: string;
createdAt: Date;
/** GitService scoped to `workdir`. */
git: GitService;
/** Config service scoped to `workdir`. */
config: TConfig;
}
/** Constructor dependencies of SessionWorktreeService. */
export interface SessionWorktreeServiceDeps<TConfig extends WorktreeConfigPort<TConfig>> {
coreConfig: KtxCoreConfig;
gitService: GitService;
configService: TConfig;
/** Falls back to noopLogger when omitted. */
logger?: KtxLogger;
}
/**
 * Manages one git worktree (plus branch) per session underneath the configured
 * worktrees root, and disposes of or preserves it once the session has an outcome.
 */
export class SessionWorktreeService<TConfig extends WorktreeConfigPort<TConfig> = WorktreeConfigPort<never>> {
  private readonly logger: KtxLogger;
  private readonly worktreesRoot: string;

  constructor(private readonly deps: SessionWorktreeServiceDeps<TConfig>) {
    this.logger = deps.logger ?? noopLogger;
    this.worktreesRoot = resolveWorktreesDir(deps.coreConfig);
  }

  /**
   * Creates the worktree and its branch at `baseSha`.
   *
   * The directory is `session-<key>` and the branch `session/<key>`. When that
   * directory already exists, a millisecond timestamp is appended to both names.
   *
   * @param sessionKey - Identifier used to derive the directory and branch names.
   * @param baseSha - Commit the new branch starts from.
   * @returns The session handle with git and config services scoped to the worktree.
   */
  async create(sessionKey: string, baseSha: string): Promise<SessionWorktree<TConfig>> {
    const { gitService, configService } = this.deps;
    await mkdir(this.worktreesRoot, { recursive: true });

    // `label` is the shared tail of the directory and branch names.
    let label = sessionKey;
    try {
      // stat() resolving means the primary directory is taken.
      await stat(join(this.worktreesRoot, `session-${label}`));
      const suffix = Date.now().toString();
      label = `${sessionKey}-${suffix}`;
      this.logger.warn(`session worktree collision for key=${sessionKey}; using suffix ${suffix}`);
    } catch {
      // no collision: primary name is free
    }

    const workdir = join(this.worktreesRoot, `session-${label}`);
    const branch = `session/${label}`;
    await gitService.addWorktree(workdir, branch, baseSha);

    return {
      chatId: sessionKey,
      workdir,
      branch,
      baseSha,
      createdAt: new Date(),
      git: gitService.forWorktree(workdir),
      config: configService.forWorktree(workdir),
    };
  }

  /**
   * Finishes a session. For 'success' and 'empty' the worktree and branch are removed;
   * for every other outcome the worktree is kept and a `.ktx-outcome` JSON sentinel is
   * written into it. Failures are logged as warnings and never thrown.
   *
   * @param session - Handle returned by create().
   * @param outcome - How the session ended.
   * @param extra - Optional conflict paths to record in the sentinel.
   */
  async cleanup(
    session: SessionWorktree<TConfig>,
    outcome: SessionOutcome,
    extra?: { conflictPaths?: string[] },
  ): Promise<void> {
    const reasonOf = (error: unknown): string => (error instanceof Error ? error.message : String(error));
    const keepWorktree = outcome !== 'success' && outcome !== 'empty';

    if (!keepWorktree) {
      try {
        await this.deps.gitService.removeWorktree(session.workdir);
        await this.deps.gitService.deleteBranch(session.branch, true);
      } catch (error) {
        this.logger.warn(`cleanup(${outcome}) failed for ${session.chatId}: ${reasonOf(error)}`);
      }
      return;
    }

    const payload: SentinelPayload = {
      outcome,
      at: new Date().toISOString(),
      chatId: session.chatId,
      baseSha: session.baseSha,
    };
    if (extra?.conflictPaths) {
      payload.conflictPaths = extra.conflictPaths;
    }

    const sentinelPath = join(session.workdir, '.ktx-outcome');
    try {
      await writeFile(sentinelPath, JSON.stringify(payload, null, 2), 'utf-8');
    } catch (error) {
      this.logger.warn(`cleanup(${outcome}) failed to write sentinel for ${session.chatId}: ${reasonOf(error)}`);
    }
  }
}

View file

@ -0,0 +1 @@
export * from './semantic-layer-compute.js';

View file

@ -0,0 +1,339 @@
import { once } from 'node:events';
import { createServer } from 'node:http';
import { describe, expect, it, vi } from 'vitest';
import { createHttpSemanticLayerComputePort, createPythonSemanticLayerComputePort } from './semantic-layer-compute.js';
// Minimal resolved source passed to query() and validateSources() in the tests below.
const source = {
name: 'orders',
table: 'public.orders',
grain: ['id'],
columns: [{ name: 'id', type: 'number' }],
joins: [],
measures: [{ name: 'order_count', expr: 'count(*)' }],
};
// Caller-facing (camelCase) input for generateSources().
const sourceGenerationInput = {
tables: [
{
name: 'orders',
db: 'public',
comment: 'Orders table',
columns: [
{ name: 'id', type: 'integer', primaryKey: true, nullable: false, comment: 'Order ID' },
{ name: 'customer_id', type: 'integer' },
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
],
},
{
name: 'customers',
db: 'public',
columns: [
{ name: 'id', type: 'integer', primaryKey: true },
{ name: 'email', type: 'varchar' },
],
},
],
links: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
relationshipType: 'MANY_TO_ONE',
},
],
dialect: 'postgres',
};
// The same input in the snake_case shape the tests expect to be sent to the daemon.
const sourceGenerationDaemonPayload = {
tables: [
{
name: 'orders',
db: 'public',
comment: 'Orders table',
columns: [
{ name: 'id', type: 'integer', primary_key: true, nullable: false, comment: 'Order ID' },
{ name: 'customer_id', type: 'integer' },
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
],
},
{
name: 'customers',
db: 'public',
columns: [
{ name: 'id', type: 'integer', primary_key: true },
{ name: 'email', type: 'varchar' },
],
},
],
links: [
{
from_table: 'orders',
from_column: 'customer_id',
to_table: 'customers',
to_column: 'id',
relationship_type: 'MANY_TO_ONE',
},
],
dialect: 'postgres',
};
// Canned daemon reply. source_count (2) deliberately differs from sources.length (1),
// so the tests show the reported count is used rather than the array length.
const sourceGenerationDaemonResponse = {
source_count: 2,
sources: [
{
name: 'orders',
table: 'public.orders',
grain: ['id'],
columns: [{ name: 'id', type: 'number' }],
joins: [
{
to: 'customers',
on: 'customer_id = customers.id',
relationship: 'many_to_one',
},
],
measures: [{ name: 'record_count', expr: 'count(id)' }],
},
],
};
// Every test injects a mocked runJson, so no child process is spawned; the assertions
// pin the subcommand name and the payload handed to the runner.
describe('createPythonSemanticLayerComputePort', () => {
it('calls the semantic-query stdio command', async () => {
const runJson = vi.fn(async () => ({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
}));
const port = createPythonSemanticLayerComputePort({ runJson });
await expect(
port.query({
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
}),
).resolves.toEqual({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
});
expect(runJson).toHaveBeenCalledWith('semantic-query', {
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
});
});
it('calls the semantic-validate stdio command', async () => {
const runJson = vi.fn(async () => ({
valid: true,
errors: [],
warnings: [],
per_source_warnings: {},
}));
const port = createPythonSemanticLayerComputePort({ runJson });
// per_source_warnings in the reply is expected back as perSourceWarnings.
await expect(
port.validateSources({
sources: [source],
dialect: 'postgres',
recentlyTouched: ['orders'],
}),
).resolves.toEqual({
valid: true,
errors: [],
warnings: [],
perSourceWarnings: {},
});
// recentlyTouched is expected on the wire as recently_touched.
expect(runJson).toHaveBeenCalledWith('semantic-validate', {
sources: [source],
dialect: 'postgres',
recently_touched: ['orders'],
});
});
it('calls the semantic-generate-sources stdio command', async () => {
const runJson = vi.fn(async () => sourceGenerationDaemonResponse);
const port = createPythonSemanticLayerComputePort({ runJson });
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
sourceCount: 2,
sources: sourceGenerationDaemonResponse.sources,
});
expect(runJson).toHaveBeenCalledWith('semantic-generate-sources', sourceGenerationDaemonPayload);
});
});
// The first two tests inject a mocked requestJson; the last two omit it and talk to a
// real local HTTP server, exercising the port's built-in transport.
describe('createHttpSemanticLayerComputePort', () => {
it('calls semantic query and validate HTTP endpoints through an injected runner', async () => {
// Reply with a query result for the query path and a validation result for anything else.
const requestJson = vi.fn(async (path: string) => {
if (path === '/semantic-layer/query') {
return {
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
};
}
return {
valid: true,
errors: [],
warnings: [],
per_source_warnings: {},
};
});
const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });
await expect(
port.query({
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
}),
).resolves.toEqual({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
});
await expect(
port.validateSources({
sources: [source],
dialect: 'postgres',
recentlyTouched: ['orders'],
}),
).resolves.toEqual({
valid: true,
errors: [],
warnings: [],
perSourceWarnings: {},
});
expect(requestJson).toHaveBeenNthCalledWith(1, '/semantic-layer/query', {
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
});
expect(requestJson).toHaveBeenNthCalledWith(2, '/semantic-layer/validate', {
sources: [source],
dialect: 'postgres',
recently_touched: ['orders'],
});
});
it('calls the semantic source-generation HTTP endpoint through an injected runner', async () => {
const requestJson = vi.fn(async () => sourceGenerationDaemonResponse);
const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
sourceCount: 2,
sources: sourceGenerationDaemonResponse.sources,
});
expect(requestJson).toHaveBeenCalledWith('/semantic-layer/generate-sources', sourceGenerationDaemonPayload);
});
it('posts JSON to a running HTTP daemon endpoint', async () => {
// Records every request the local server receives, for the assertion at the end.
const requests: Array<{ url: string | undefined; body: unknown }> = [];
const server = createServer((request, response) => {
const chunks: Buffer[] = [];
request.on('data', (chunk: Buffer) => chunks.push(chunk));
request.on('end', () => {
requests.push({
url: request.url,
body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
});
response.writeHead(200, { 'content-type': 'application/json' });
response.end(
JSON.stringify({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
}),
);
});
});
// Port 0 lets the OS pick a free port; it is read back from server.address() below.
server.listen(0, '127.0.0.1');
await once(server, 'listening');
try {
const address = server.address();
if (!address || typeof address === 'string') {
throw new Error('expected TCP server address');
}
const port = createHttpSemanticLayerComputePort({ baseUrl: `http://127.0.0.1:${address.port}` });
await expect(
port.query({
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
}),
).resolves.toMatchObject({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
});
expect(requests).toEqual([
{
url: '/semantic-layer/query',
body: {
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
},
},
]);
} finally {
server.close();
}
});
it('posts source-generation JSON to a running HTTP daemon endpoint', async () => {
// Records every request the local server receives, for the assertion at the end.
const requests: Array<{ url: string | undefined; body: unknown }> = [];
const server = createServer((request, response) => {
const chunks: Buffer[] = [];
request.on('data', (chunk: Buffer) => chunks.push(chunk));
request.on('end', () => {
requests.push({
url: request.url,
body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
});
response.writeHead(200, { 'content-type': 'application/json' });
response.end(JSON.stringify(sourceGenerationDaemonResponse));
});
});
// Port 0 lets the OS pick a free port; it is read back from server.address() below.
server.listen(0, '127.0.0.1');
await once(server, 'listening');
try {
const address = server.address();
if (!address || typeof address === 'string') {
throw new Error('expected TCP server address');
}
const port = createHttpSemanticLayerComputePort({ baseUrl: `http://127.0.0.1:${address.port}` });
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
sourceCount: 2,
sources: sourceGenerationDaemonResponse.sources,
});
expect(requests).toEqual([
{
url: '/semantic-layer/generate-sources',
body: sourceGenerationDaemonPayload,
},
]);
} finally {
server.close();
}
});
});

View file

@ -0,0 +1,312 @@
import { request as httpRequest } from 'node:http';
import { request as httpsRequest } from 'node:https';
import { URL } from 'node:url';
import { spawn } from 'node:child_process';
import type { ResolvedSemanticLayerSource, SemanticLayerQueryInput } from '../sl/types.js';
/** Result of KtxSemanticLayerComputePort.query(). */
export interface KtxSemanticLayerComputeQueryResult {
sql: string;
dialect: string;
columns: Array<Record<string, unknown>>;
plan: Record<string, unknown>;
}
/** Result of KtxSemanticLayerComputePort.validateSources(). */
export interface KtxSemanticLayerComputeValidationResult {
valid: boolean;
errors: string[];
warnings: string[];
perSourceWarnings: Record<string, string[]>;
}
/** One column of a table offered to source generation. */
export interface KtxSemanticLayerSourceGenerationColumnInput {
name: string;
type: string;
/** Sent to the daemon as `primary_key`. */
primaryKey?: boolean;
nullable?: boolean;
comment?: string | null;
}
/** One table offered to source generation. Optional fields are sent only when not undefined. */
export interface KtxSemanticLayerSourceGenerationTableInput {
name: string;
catalog?: string | null;
db?: string | null;
comment?: string | null;
columns: KtxSemanticLayerSourceGenerationColumnInput[];
}
/** A relationship between two table columns; sent to the daemon with snake_case keys. */
export interface KtxSemanticLayerSourceGenerationLinkInput {
fromTable: string;
fromColumn: string;
toTable: string;
toColumn: string;
relationshipType: string;
}
/** Input of KtxSemanticLayerComputePort.generateSources(). */
export interface KtxSemanticLayerSourceGenerationInput {
tables: KtxSemanticLayerSourceGenerationTableInput[];
links: KtxSemanticLayerSourceGenerationLinkInput[];
/** Defaults to 'postgres' when omitted. */
dialect?: string;
}
/** Result of KtxSemanticLayerComputePort.generateSources(). */
export interface KtxSemanticLayerSourceGenerationResult {
sources: Array<Record<string, unknown>>;
/** The daemon's `source_count` when it is a number; otherwise the length of `sources`. */
sourceCount: number;
}
/** Operations the semantic layer delegates to the ktx daemon. */
export interface KtxSemanticLayerComputePort {
/**
* Callers must pass sources sanitized through toResolvedWire. The Python
* daemon rejects authoring-only fields such as usage and inherits_columns_from.
*/
query(input: {
sources: ResolvedSemanticLayerSource[];
query: SemanticLayerQueryInput;
dialect: string;
}): Promise<KtxSemanticLayerComputeQueryResult>;
/**
* Callers must pass sources sanitized through toResolvedWire. The Python
* daemon rejects authoring-only fields such as usage and inherits_columns_from.
*/
validateSources(input: {
sources: ResolvedSemanticLayerSource[];
dialect: string;
recentlyTouched?: string[];
}): Promise<KtxSemanticLayerComputeValidationResult>;
/** Derives semantic-layer sources from table and link descriptions. */
generateSources(input: KtxSemanticLayerSourceGenerationInput): Promise<KtxSemanticLayerSourceGenerationResult>;
}
/** Subcommand names understood by the daemon's stdio entry point. */
export type KtxDaemonCommand = 'semantic-query' | 'semantic-validate' | 'semantic-generate-sources';
/** Sends one JSON payload to a daemon subcommand and resolves with its JSON object reply. */
export type KtxDaemonJsonRunner = (
subcommand: KtxDaemonCommand,
payload: Record<string, unknown>,
) => Promise<Record<string, unknown>>;
/** Sends one JSON payload to a daemon HTTP path and resolves with its JSON object reply. */
export type KtxDaemonHttpJsonRunner = (path: string, payload: Record<string, unknown>) => Promise<Record<string, unknown>>;
/** Options for createPythonSemanticLayerComputePort. */
export interface PythonSemanticLayerComputeOptions {
/** Executable to spawn; defaults to 'python'. */
command?: string;
/** Arguments placed before the subcommand; defaults to ['-m', 'ktx_daemon']. */
args?: string[];
cwd?: string;
/** Extra environment variables, merged over process.env for the child process. */
env?: NodeJS.ProcessEnv;
/** Replaces process spawning entirely when provided (used by tests). */
runJson?: KtxDaemonJsonRunner;
}
/** Options for createHttpSemanticLayerComputePort. */
export interface HttpSemanticLayerComputeOptions {
baseUrl: string;
/** Custom transport; NOTE(review): presumably replaces the built-in HTTP POST — confirm in the factory. */
requestJson?: KtxDaemonHttpJsonRunner;
}
/**
 * Parses daemon output and insists on a JSON object at the top level.
 *
 * @param raw - Raw JSON text.
 * @param subcommand - Label included in the error message (a subcommand or HTTP path).
 * @returns The parsed object.
 * @throws SyntaxError when `raw` is not valid JSON.
 * @throws Error when the parsed value is null, a primitive, or an array.
 */
function parseJsonObject(raw: string, subcommand: string): Record<string, unknown> {
  const decoded: unknown = JSON.parse(raw);
  const isPlainObject = typeof decoded === 'object' && decoded !== null && !Array.isArray(decoded);
  if (isPlainObject) {
    return decoded as Record<string, unknown>;
  }
  throw new Error(`ktx-daemon ${subcommand} returned non-object JSON`);
}
/**
 * Builds a runner that spawns `command [...args, subcommand]`, writes the payload as one
 * JSON line to the child's stdin, and parses the child's stdout as a JSON object.
 *
 * @param options - Executable, leading arguments, and optional cwd / extra environment.
 * @returns A runner that rejects when the process cannot be spawned, exits with a
 * non-zero code, is killed by a signal, or prints something that is not a JSON object.
 */
function runProcessJson(
  options: Required<Pick<PythonSemanticLayerComputeOptions, 'command' | 'args'>> &
    Pick<PythonSemanticLayerComputeOptions, 'cwd' | 'env'>,
): KtxDaemonJsonRunner {
  return async (subcommand: KtxDaemonCommand, payload: Record<string, unknown>): Promise<Record<string, unknown>> =>
    new Promise((resolve, reject) => {
      const child = spawn(options.command, [...options.args, subcommand], {
        cwd: options.cwd,
        env: { ...process.env, ...options.env },
        stdio: ['pipe', 'pipe', 'pipe'],
      });
      const stdout: Buffer[] = [];
      const stderr: Buffer[] = [];
      child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk));
      child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk));
      child.on('error', reject);
      // A child that exits (or never starts) before reading stdin makes the write fail
      // with EPIPE. Without a listener that 'error' event would be thrown as an uncaught
      // exception; the real outcome is reported by the 'error' / 'close' handlers instead.
      child.stdin.on('error', () => undefined);
      child.on('close', (code, signal) => {
        const stdoutText = Buffer.concat(stdout).toString('utf8').trim();
        const stderrText = Buffer.concat(stderr).toString('utf8').trim();
        if (code !== 0) {
          // code is null when the child was terminated by a signal.
          const exitSummary = code === null && signal ? `terminated by signal ${signal}` : `exit code ${code}`;
          reject(new Error(`ktx-daemon ${subcommand} failed: ${stderrText || exitSummary}`));
          return;
        }
        try {
          resolve(parseJsonObject(stdoutText, subcommand));
        } catch (error) {
          reject(error);
        }
      });
      child.stdin.end(`${JSON.stringify(payload)}\n`);
    });
}
/** Ensures the base URL ends with exactly the trailing slash needed for relative resolution. */
function normalizedBaseUrl(baseUrl: string): string {
  if (baseUrl.endsWith('/')) {
    return baseUrl;
  }
  return `${baseUrl}/`;
}
/**
 * Builds a runner that POSTs a JSON payload to `path`, resolved relative to `baseUrl`,
 * and parses the response body as a JSON object.
 *
 * @param baseUrl - Daemon base URL; http and https are both supported.
 * @returns A runner that rejects on request or response stream errors, on any
 * non-2xx status (the body text is included in the message), and on a body that is
 * not a JSON object.
 */
function postJson(baseUrl: string): KtxDaemonHttpJsonRunner {
  return async (path, payload) =>
    new Promise((resolve, reject) => {
      // Strip the leading slash so the path resolves under the base URL's own path.
      const target = new URL(path.replace(/^\//, ''), normalizedBaseUrl(baseUrl));
      const body = JSON.stringify(payload);
      const client = target.protocol === 'https:' ? httpsRequest : httpRequest;
      const request = client(
        target,
        {
          method: 'POST',
          headers: {
            accept: 'application/json',
            'content-type': 'application/json',
            'content-length': Buffer.byteLength(body),
          },
        },
        (response) => {
          const chunks: Buffer[] = [];
          response.on('data', (chunk: Buffer) => chunks.push(chunk));
          // A connection dropped mid-body never emits 'end'; reject instead of hanging.
          response.on('error', reject);
          response.on('end', () => {
            const text = Buffer.concat(chunks).toString('utf8');
            const statusCode = response.statusCode ?? 0;
            if (statusCode < 200 || statusCode >= 300) {
              reject(new Error(`ktx-daemon HTTP ${path} failed with ${statusCode}: ${text}`));
              return;
            }
            try {
              resolve(parseJsonObject(text, path));
            } catch (error) {
              reject(error);
            }
          });
        },
      );
      request.on('error', reject);
      request.end(body);
    });
}
/** Returns the string members of `value` in order, or an empty array when it is not an array. */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const strings: string[] = [];
  for (const item of value) {
    if (typeof item === 'string') {
      strings.push(item);
    }
  }
  return strings;
}
/** Returns `value` when it is a non-null, non-array object; otherwise a fresh empty object. */
function recordValue(value: unknown): Record<string, unknown> {
  if (typeof value !== 'object' || value === null || Array.isArray(value)) {
    return {};
  }
  return value as Record<string, unknown>;
}
/**
 * Returns the members of `value` that are non-null, non-array objects, in order,
 * or an empty array when `value` is not an array.
 */
function recordArray(value: unknown): Array<Record<string, unknown>> {
  if (!Array.isArray(value)) {
    return [];
  }
  const records: Array<Record<string, unknown>> = [];
  for (const item of value) {
    if (item !== null && typeof item === 'object' && !Array.isArray(item)) {
      records.push(item as Record<string, unknown>);
    }
  }
  return records;
}
/**
 * Converts a source-generation input into the snake_case wire payload.
 *
 * Optional table fields (`catalog`, `db`, `comment`) and optional column
 * fields (`primaryKey`, `nullable`, `comment`) are emitted only when they are
 * not `undefined`; explicit `null` values are passed through. The dialect
 * defaults to `'postgres'` when the input does not provide one.
 */
function sourceGenerationPayload(input: KtxSemanticLayerSourceGenerationInput): Record<string, unknown> {
  const tables = input.tables.map((table) => {
    const tablePayload: Record<string, unknown> = { name: table.name };
    if (table.catalog !== undefined) {
      tablePayload.catalog = table.catalog;
    }
    if (table.db !== undefined) {
      tablePayload.db = table.db;
    }
    if (table.comment !== undefined) {
      tablePayload.comment = table.comment;
    }
    tablePayload.columns = table.columns.map((column) => {
      const columnPayload: Record<string, unknown> = { name: column.name, type: column.type };
      if (column.primaryKey !== undefined) {
        columnPayload.primary_key = column.primaryKey;
      }
      if (column.nullable !== undefined) {
        columnPayload.nullable = column.nullable;
      }
      if (column.comment !== undefined) {
        columnPayload.comment = column.comment;
      }
      return columnPayload;
    });
    return tablePayload;
  });

  const links = input.links.map(({ fromTable, fromColumn, toTable, toColumn, relationshipType }) => ({
    from_table: fromTable,
    from_column: fromColumn,
    to_table: toTable,
    to_column: toColumn,
    relationship_type: relationshipType,
  }));

  return { tables, links, dialect: input.dialect ?? 'postgres' };
}
/**
 * Normalizes a raw source-generation response.
 *
 * `sources` is reduced to its object entries; `sourceCount` uses the numeric
 * `source_count` field when present and otherwise falls back to the number of
 * normalized sources.
 */
function sourceGenerationResult(raw: Record<string, unknown>): KtxSemanticLayerSourceGenerationResult {
  const sources = recordArray(raw.sources);
  const reportedCount = raw.source_count;
  const sourceCount = typeof reportedCount === 'number' ? reportedCount : sources.length;
  return { sources, sourceCount };
}
/**
 * Creates a semantic-layer compute port that delegates to `ktx-daemon`
 * subcommands through a JSON runner.
 *
 * Unless `options.runJson` is supplied, a process-based runner is created from
 * `options.command` (default `'python'`) and `options.args` (default
 * `['-m', 'ktx_daemon']`), together with `options.cwd` and `options.env`.
 *
 * Every response is treated as untrusted: fields with an unexpected runtime
 * type are replaced by neutral defaults instead of being passed through.
 *
 * @param options Runner overrides and process settings.
 * @returns A port exposing `query`, `validateSources` and `generateSources`.
 */
export function createPythonSemanticLayerComputePort(
  options: PythonSemanticLayerComputeOptions = {},
): KtxSemanticLayerComputePort {
  const command = options.command ?? 'python';
  const args = options.args ?? ['-m', 'ktx_daemon'];
  const runJson = options.runJson ?? runProcessJson({ command, args, cwd: options.cwd, env: options.env });
  return {
    async query(input) {
      const raw = await runJson('semantic-query', {
        sources: input.sources,
        dialect: input.dialect,
        query: input.query,
      });
      return {
        sql: typeof raw.sql === 'string' ? raw.sql : '',
        // Fall back to the requested dialect when the daemon does not echo one.
        dialect: typeof raw.dialect === 'string' ? raw.dialect : input.dialect,
        columns: recordArray(raw.columns),
        plan: recordValue(raw.plan),
      };
    },
    async validateSources(input) {
      const raw = await runJson('semantic-validate', {
        sources: input.sources,
        dialect: input.dialect,
        recently_touched: input.recentlyTouched,
      });
      return {
        // Only a literal `true` counts as valid; anything else is a failure.
        valid: raw.valid === true,
        errors: stringArray(raw.errors),
        warnings: stringArray(raw.warnings),
        // Normalize each entry instead of asserting the shape: a malformed
        // value becomes an empty list rather than violating the declared type.
        perSourceWarnings: Object.fromEntries(
          Object.entries(recordValue(raw.per_source_warnings)).map(([sourceName, sourceWarnings]) => [
            sourceName,
            stringArray(sourceWarnings),
          ]),
        ),
      };
    },
    async generateSources(input) {
      const raw = await runJson('semantic-generate-sources', sourceGenerationPayload(input));
      return sourceGenerationResult(raw);
    },
  };
}
/**
 * Creates a semantic-layer compute port that talks to the daemon over HTTP.
 *
 * Unless `options.requestJson` is supplied, requests are sent with a JSON POST
 * runner bound to `options.baseUrl`. The endpoints used are
 * `/semantic-layer/query`, `/semantic-layer/validate` and
 * `/semantic-layer/generate-sources`.
 *
 * Every response is treated as untrusted: fields with an unexpected runtime
 * type are replaced by neutral defaults instead of being passed through.
 *
 * @param options Base URL and optional request-runner override.
 * @returns A port exposing `query`, `validateSources` and `generateSources`.
 */
export function createHttpSemanticLayerComputePort(
  options: HttpSemanticLayerComputeOptions,
): KtxSemanticLayerComputePort {
  const requestJson = options.requestJson ?? postJson(options.baseUrl);
  return {
    async query(input) {
      const raw = await requestJson('/semantic-layer/query', {
        sources: input.sources,
        dialect: input.dialect,
        query: input.query,
      });
      return {
        sql: typeof raw.sql === 'string' ? raw.sql : '',
        // Fall back to the requested dialect when the daemon does not echo one.
        dialect: typeof raw.dialect === 'string' ? raw.dialect : input.dialect,
        columns: recordArray(raw.columns),
        plan: recordValue(raw.plan),
      };
    },
    async validateSources(input) {
      const raw = await requestJson('/semantic-layer/validate', {
        sources: input.sources,
        dialect: input.dialect,
        recently_touched: input.recentlyTouched,
      });
      return {
        // Only a literal `true` counts as valid; anything else is a failure.
        valid: raw.valid === true,
        errors: stringArray(raw.errors),
        warnings: stringArray(raw.warnings),
        // Normalize each entry instead of asserting the shape: a malformed
        // value becomes an empty list rather than violating the declared type.
        perSourceWarnings: Object.fromEntries(
          Object.entries(recordValue(raw.per_source_warnings)).map(([sourceName, sourceWarnings]) => [
            sourceName,
            stringArray(sourceWarnings),
          ]),
        ),
      };
    },
    async generateSources(input) {
      const raw = await requestJson('/semantic-layer/generate-sources', sourceGenerationPayload(input));
      return sourceGenerationResult(raw);
    },
  };
}

View file

@ -0,0 +1,2 @@
export type { ReindexOptions, ReindexScopeResult, ReindexSummary, ReindexWorkResult } from './types.js';
export { discoverReindexScopes, reindexLocalIndexes } from './reindex.js';

View file

@ -0,0 +1,196 @@
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import type { KtxEmbeddingPort } from '../core/index.js';
import { initKtxProject, loadKtxProject, type KtxLocalProject } from '../project/index.js';
import { SqliteKnowledgeIndex } from '../wiki/sqlite-knowledge-index.js';
import { reindexLocalIndexes } from './reindex.js';
/**
 * Deterministic in-memory embedding port for tests: every text maps to the
 * two-component vector `[text.length, 1]`, so no provider is contacted.
 */
class FakeEmbeddingPort implements KtxEmbeddingPort {
  readonly maxBatchSize = 8;

  /** Produces the fake vector for a single text. */
  private static embed(text: string): number[] {
    return [text.length, 1];
  }

  async computeEmbedding(text: string): Promise<number[]> {
    return FakeEmbeddingPort.embed(text);
  }

  async computeEmbeddingsBulk(texts: string[]): Promise<number[][]> {
    const vectors: number[][] = [];
    for (const text of texts) {
      vectors.push(FakeEmbeddingPort.embed(text));
    }
    return vectors;
  }
}
/**
 * Initializes a KTX project inside `tempDir` (forcing initialization) and
 * returns the project loaded from that same directory.
 */
async function createProject(tempDir: string): Promise<KtxLocalProject> {
  const projectDir = tempDir;
  await initKtxProject({ projectDir, force: true });
  const project = await loadKtxProject({ projectDir });
  return project;
}
// Integration-style suite for reindexLocalIndexes. Every test runs against a
// freshly created temporary project directory that is removed afterwards.
describe('reindexLocalIndexes', () => {
let tempDir: string;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-reindex-'));
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
// With both content roots removed there is nothing to discover, so the
// summary must contain no scopes and all-zero totals.
it('returns an empty summary when no wiki or semantic-layer directories exist', async () => {
const project = await createProject(tempDir);
await rm(join(project.projectDir, 'wiki'), { recursive: true, force: true });
await rm(join(project.projectDir, 'semantic-layer'), { recursive: true, force: true });
await expect(reindexLocalIndexes(project, { force: false, embeddingService: null })).resolves.toMatchObject({
scopes: [],
totals: { scanned: 0, updated: 0, deleted: 0, embeddingsRecomputed: 0, embeddingsFailed: 0 },
force: false,
embeddingsAvailable: false,
});
});
// Directories without any files still count as scopes; the expected label
// order is the global wiki, then user wikis, then semantic-layer connections.
it('discovers empty directories as zero-row scopes', async () => {
const project = await createProject(tempDir);
await mkdir(join(project.projectDir, 'wiki/user/local'), { recursive: true });
await mkdir(join(project.projectDir, 'semantic-layer/warehouse'), { recursive: true });
const summary = await reindexLocalIndexes(project, { force: false, embeddingService: null });
expect(summary.scopes.map((scope) => scope.label)).toEqual(['global', 'user/local', 'warehouse']);
expect(summary.totals.scanned).toBe(0);
});
// One wiki page plus one semantic-layer source: both scopes are indexed and
// one embedding is recomputed per row using the fake embedding port.
it('indexes mixed wiki and SL sources and reports totals', async () => {
const project = await createProject(tempDir);
await writeFile(
join(project.projectDir, 'wiki/global/revenue.md'),
'---\nsummary: Revenue\nusage_mode: auto\n---\n\nPaid orders.\n',
'utf-8',
);
await mkdir(join(project.projectDir, 'semantic-layer/warehouse'), { recursive: true });
await writeFile(
join(project.projectDir, 'semantic-layer/warehouse/orders.yaml'),
'name: orders\ntable: public.orders\ngrain: [id]\ncolumns:\n - name: id\n type: number\njoins: []\nmeasures: []\n',
'utf-8',
);
const summary = await reindexLocalIndexes(project, {
force: false,
embeddingService: new FakeEmbeddingPort(),
});
expect(summary.scopes).toHaveLength(2);
expect(summary.totals).toMatchObject({ scanned: 2, updated: 2, deleted: 0, embeddingsRecomputed: 2 });
expect(summary.embeddingsAvailable).toBe(true);
});
// Without an embedding service, a second run over unchanged content must
// scan the same rows but report zero updates, both in totals and per scope.
it('does not report unchanged lexical-only rows as updated on repeated runs', async () => {
const project = await createProject(tempDir);
await writeFile(
join(project.projectDir, 'wiki/global/revenue.md'),
'---\nsummary: Revenue\nusage_mode: auto\n---\n\nPaid orders.\n',
'utf-8',
);
await mkdir(join(project.projectDir, 'semantic-layer/warehouse'), { recursive: true });
await writeFile(
join(project.projectDir, 'semantic-layer/warehouse/orders.yaml'),
'name: orders\ntable: public.orders\ngrain: [id]\ncolumns:\n - name: id\n type: number\njoins: []\nmeasures: []\n',
'utf-8',
);
const first = await reindexLocalIndexes(project, { force: false, embeddingService: null });
expect(first.totals).toMatchObject({
scanned: 2,
updated: 2,
deleted: 0,
embeddingsRecomputed: 0,
embeddingsFailed: 0,
});
const second = await reindexLocalIndexes(project, { force: false, embeddingService: null });
expect(second.totals).toMatchObject({
scanned: 2,
updated: 0,
deleted: 0,
embeddingsRecomputed: 0,
embeddingsFailed: 0,
});
expect(second.scopes.map((scope) => [scope.label, scope.updated])).toEqual([
['global', 0],
['warehouse', 0],
]);
});
// A stale row is seeded directly into the index; a forced reindex must drop
// it (it is no longer searchable) while still reporting `deleted: 0`.
it('force clears stale rows before rebuilding each discovered scope', async () => {
const project = await createProject(tempDir);
const wikiIndex = new SqliteKnowledgeIndex({ dbPath: join(project.projectDir, '.ktx/db.sqlite') });
wikiIndex.sync([
{
path: 'wiki/global/stale.md',
key: 'stale',
scope: 'GLOBAL',
scopeId: null,
summary: 'Stale',
content: 'Stale content',
tags: [],
embedding: [1, 0],
},
]);
await writeFile(
join(project.projectDir, 'wiki/global/revenue.md'),
'---\nsummary: Revenue\nusage_mode: auto\n---\n\nPaid orders.\n',
'utf-8',
);
const summary = await reindexLocalIndexes(project, {
force: true,
embeddingService: new FakeEmbeddingPort(),
});
expect(summary.force).toBe(true);
expect(summary.totals).toMatchObject({ scanned: 1, updated: 1, deleted: 0 });
expect(wikiIndex.search('Stale', 10)).toEqual([]);
});
// A malformed YAML source makes only its own scope fail; the wiki scope is
// still indexed without an error.
it('captures a per-scope error and continues other scopes', async () => {
const project = await createProject(tempDir);
await writeFile(
join(project.projectDir, 'wiki/global/revenue.md'),
'---\nsummary: Revenue\nusage_mode: auto\n---\n\nPaid orders.\n',
'utf-8',
);
await mkdir(join(project.projectDir, 'semantic-layer/warehouse'), { recursive: true });
await writeFile(join(project.projectDir, 'semantic-layer/warehouse/broken.yaml'), 'not: [valid', 'utf-8');
const summary = await reindexLocalIndexes(project, { force: false, embeddingService: null });
expect(summary.scopes.find((scope) => scope.label === 'global')?.error).toBeUndefined();
expect(summary.scopes.find((scope) => scope.label === 'warehouse')?.error).toContain('YAML');
});
// An embedding service that always throws must surface as a failed-embedding
// count and a human-readable error on the affected scope.
it('marks a scope errored when configured embeddings fail', async () => {
const project = await createProject(tempDir);
await writeFile(
join(project.projectDir, 'wiki/global/revenue.md'),
'---\nsummary: Revenue\nusage_mode: auto\n---\n\nPaid orders.\n',
'utf-8',
);
const embeddingService: KtxEmbeddingPort = {
maxBatchSize: 8,
async computeEmbedding() {
throw new Error('embedding provider unavailable');
},
async computeEmbeddingsBulk() {
throw new Error('embedding provider unavailable');
},
};
const summary = await reindexLocalIndexes(project, { force: false, embeddingService });
expect(summary.scopes[0]).toMatchObject({
label: 'global',
embeddingsFailed: 1,
error: '1 embedding recomputation failed',
});
});
});

View file

@ -0,0 +1,162 @@
import { readdir, stat } from 'node:fs/promises';
import { join, relative } from 'node:path';
import { ktxLocalStateDbPath, type KtxLocalProject } from '../project/index.js';
import { loadLocalSlSourceRecords, SlSearchService, SqliteSlSourcesIndex } from '../sl/index.js';
import { KnowledgeWikiService, SqliteKnowledgeIndex } from '../wiki/index.js';
import type { ReindexOptions, ReindexScopeResult, ReindexSummary, ReindexWorkResult } from './types.js';
/**
 * A unit of reindex work found on disk, discriminated by `kind`:
 * - the global wiki scope (`scope: 'GLOBAL'`, no scope id),
 * - a per-user wiki scope (`scope: 'USER'`, labelled `user/<id>`),
 * - a semantic-layer scope identified by its connection id.
 */
type DiscoveredScope =
| { kind: 'wiki'; scope: 'GLOBAL'; scopeId: null; label: 'global' }
| { kind: 'wiki'; scope: 'USER'; scopeId: string; label: `user/${string}` }
| { kind: 'sl'; connectionId: string; label: string };
/**
 * All-zero work counters. Used as the seed when summing per-scope results and
 * as the counters recorded for a scope whose indexing threw.
 */
const ZERO: ReindexWorkResult = {
scanned: 0,
updated: 0,
deleted: 0,
embeddingsRecomputed: 0,
embeddingsFailed: 0,
};
/**
 * Reports whether `path` exists and is a directory. Any failure while
 * inspecting the path (missing entry, permission error, ...) is treated as
 * "not a directory" rather than propagated.
 */
async function directoryExists(path: string): Promise<boolean> {
  let isDirectory = false;
  try {
    const stats = await stat(path);
    isDirectory = stats.isDirectory();
  } catch {
    // Deliberate best-effort: an unreadable path counts as absent.
  }
  return isDirectory;
}
/**
 * Lists the names of the immediate sub-directories of `path`, sorted with
 * `localeCompare`.
 *
 * A missing `path` (`ENOENT`) yields an empty list; every other failure is
 * rethrown to the caller.
 */
async function childDirectories(path: string): Promise<string[]> {
  try {
    const names: string[] = [];
    for (const entry of await readdir(path, { withFileTypes: true })) {
      if (entry.isDirectory()) {
        names.push(entry.name);
      }
    }
    return names.sort((a, b) => a.localeCompare(b));
  } catch (error) {
    const code = (error as NodeJS.ErrnoException).code;
    if (code !== 'ENOENT') {
      throw error;
    }
    return [];
  }
}
/**
 * Finds every scope under the project directory that can be reindexed.
 *
 * The result is ordered: the global wiki scope (only when `wiki/global` is a
 * directory), then one wiki scope per sub-directory of `wiki/user`, then one
 * semantic-layer scope per sub-directory of `semantic-layer`, skipping the
 * reserved `_schema` directory.
 */
export async function discoverReindexScopes(project: KtxLocalProject): Promise<DiscoveredScope[]> {
  const root = project.projectDir;

  const hasGlobalWiki = await directoryExists(join(root, 'wiki/global'));
  const userIds = await childDirectories(join(root, 'wiki/user'));
  const connectionIds = await childDirectories(join(root, 'semantic-layer'));

  const globalScopes: DiscoveredScope[] = hasGlobalWiki
    ? [{ kind: 'wiki', scope: 'GLOBAL', scopeId: null, label: 'global' }]
    : [];
  const userScopes = userIds.map(
    (userId): DiscoveredScope => ({ kind: 'wiki', scope: 'USER', scopeId: userId, label: `user/${userId}` }),
  );
  const slScopes = connectionIds
    .filter((connectionId) => connectionId !== '_schema')
    .map((connectionId): DiscoveredScope => ({ kind: 'sl', connectionId, label: connectionId }));

  return [...globalScopes, ...userScopes, ...slScopes];
}
/**
 * Renders a thrown value as a single-line message.
 *
 * `Error` instances yield their message, prefixed with `"<name>: "` when the
 * name is non-empty and not the generic `"Error"`; any other value is
 * stringified.
 */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    const hasSpecificName = Boolean(error.name) && error.name !== 'Error';
    return hasSpecificName ? `${error.name}: ${error.message}` : error.message;
  }
  return String(error);
}
/**
 * Adds two sets of work counters field by field and returns a new object;
 * neither argument is modified and extra properties on them are ignored.
 */
function addTotals(left: ReindexWorkResult, right: ReindexWorkResult): ReindexWorkResult {
  const sum = (counter: keyof ReindexWorkResult): number => left[counter] + right[counter];
  return {
    scanned: sum('scanned'),
    updated: sum('updated'),
    deleted: sum('deleted'),
    embeddingsRecomputed: sum('embeddingsRecomputed'),
    embeddingsFailed: sum('embeddingsFailed'),
  };
}
/**
 * Returns the whole milliseconds elapsed since `startedAt`, a reading taken
 * from `process.hrtime.bigint()`. Fractional milliseconds are truncated by the
 * bigint division.
 */
function durationSince(startedAt: bigint): number {
  const elapsedNanoseconds = process.hrtime.bigint() - startedAt;
  const elapsedMilliseconds = elapsedNanoseconds / 1_000_000n;
  return Number(elapsedMilliseconds);
}
/**
 * Builds the error text for a scope whose embedding recomputation failed, or
 * returns `undefined` when no embedding failed. The noun is pluralized for
 * any count other than one.
 */
function embeddingFailureError(work: ReindexWorkResult): string | undefined {
  const failed = work.embeddingsFailed;
  if (failed === 0) {
    return undefined;
  }
  const plural = failed === 1 ? '' : 's';
  return `${failed} embedding recomputation${plural} failed`;
}
/**
 * Rebuilds the local wiki and semantic-layer indexes for every scope found in
 * the project and returns a per-scope and aggregate summary.
 *
 * Scopes are processed one after another. A failure inside one scope is
 * recorded on that scope's result (with zeroed counters) and does not stop the
 * remaining scopes.
 *
 * @param project Local project whose directory, file store and git handle are used.
 * @param options `force` clears each scope's index before rebuilding it;
 *   `embeddingService` is handed to the wiki and semantic-layer services and
 *   may be `null`.
 * @returns Per-scope results, summed totals, the state DB path, the `force`
 *   flag, whether an embedding service was supplied, and the total duration
 *   in milliseconds.
 */
export async function reindexLocalIndexes(
project: KtxLocalProject,
options: ReindexOptions,
): Promise<ReindexSummary> {
const startedAt = process.hrtime.bigint();
const dbPath = ktxLocalStateDbPath(project);
const scopes = await discoverReindexScopes(project);
// Both indexes are opened against the same local state database file.
const wikiIndex = new SqliteKnowledgeIndex({ dbPath });
const slIndex = new SqliteSlSourcesIndex({ dbPath });
const wikiService = new KnowledgeWikiService(project.fileStore, options.embeddingService, wikiIndex, project.git);
const slService = new SlSearchService(options.embeddingService, slIndex);
const results: ReindexScopeResult[] = [];
for (const scope of scopes) {
const scopeStartedAt = process.hrtime.bigint();
try {
let work: ReindexWorkResult;
if (scope.kind === 'wiki') {
// Forced runs wipe the scope first so stale rows cannot survive the sync.
if (options.force) {
wikiIndex.clear(scope.scope, scope.scopeId);
}
work = await wikiService.syncIndex(scope.scope, scope.scopeId);
results.push({
kind: 'wiki',
label: scope.label,
scope: scope.scope === 'GLOBAL' ? 'global' : 'user',
scopeId: scope.scopeId,
...work,
// Spread order matters: these entries override fields coming from `work`.
// After a forced clear the reported `deleted` count is pinned to 0.
...(options.force ? { deleted: 0 } : {}),
// Embedding failures only count as a scope error when a service was configured.
...(options.embeddingService && work.embeddingsFailed > 0 ? { error: embeddingFailureError(work) } : {}),
durationMs: durationSince(scopeStartedAt),
});
continue;
}
// Remaining scopes are semantic-layer scopes keyed by connection id.
if (options.force) {
await slIndex.clear(scope.connectionId);
}
const records = await loadLocalSlSourceRecords(project, { connectionId: scope.connectionId });
work = await slService.indexSources(
scope.connectionId,
records.map((record) => record.source),
);
results.push({
kind: 'sl',
label: scope.label,
connectionId: scope.connectionId,
...work,
...(options.force ? { deleted: 0 } : {}),
...(options.embeddingService && work.embeddingsFailed > 0 ? { error: embeddingFailureError(work) } : {}),
durationMs: durationSince(scopeStartedAt),
});
} catch (error) {
// Record the failure with zeroed counters and carry on with the next scope.
results.push({
kind: scope.kind,
label: scope.label,
...(scope.kind === 'wiki'
? { scope: scope.scope === 'GLOBAL' ? 'global' : 'user', scopeId: scope.scopeId }
: { connectionId: scope.connectionId }),
...ZERO,
durationMs: durationSince(scopeStartedAt),
error: errorMessage(error),
});
}
}
return {
scopes: results,
totals: results.reduce(addTotals, ZERO),
// Prefer a project-relative path; fall back to the raw path if the relative form is empty.
dbPath: relative(project.projectDir, dbPath) || dbPath,
force: options.force,
embeddingsAvailable: options.embeddingService !== null,
durationMs: durationSince(startedAt),
};
}

View file

@ -0,0 +1,33 @@
import type { KtxEmbeddingPort } from '../core/index.js';
/** Options accepted by a local reindex run. */
export interface ReindexOptions {
/** When true, each scope's index is cleared before it is rebuilt. */
force: boolean;
/** Embedding provider used while indexing, or `null` to run without one. */
embeddingService: KtxEmbeddingPort | null;
}
/**
 * Numeric counters describing one indexing pass. The same shape is used for
 * the per-scope counters and for the summed totals of a run.
 */
export interface ReindexWorkResult {
/** Presumably the number of items inspected — confirm against the index services. */
scanned: number;
/** Presumably the number of index rows written or changed — confirm against the index services. */
updated: number;
/** Presumably the number of index rows removed — confirm against the index services. */
deleted: number;
/** Presumably the number of embeddings that were (re)computed — confirm against the index services. */
embeddingsRecomputed: number;
/** Number of embedding computations that failed; a non-zero value can mark a scope as errored. */
embeddingsFailed: number;
}
/** Outcome of reindexing a single scope: its identity, counters, timing and optional error. */
export interface ReindexScopeResult extends ReindexWorkResult {
/** Which index the scope belongs to: the wiki or the semantic layer (`sl`). */
kind: 'wiki' | 'sl';
/** Human-readable scope label, e.g. `global`, `user/<id>` or a connection id. */
label: string;
/** Wiki scopes only: whether this is the global or a per-user wiki. */
scope?: 'global' | 'user';
/** Wiki scopes only: the user id, or `null` for the global wiki. */
scopeId?: string | null;
/** Semantic-layer scopes only: the connection whose sources were indexed. */
connectionId?: string;
/** Time spent on this scope, in milliseconds. */
durationMs: number;
/** Present when the scope failed or had failing embeddings. */
error?: string;
}
/** Aggregate report for a whole reindex run. */
export interface ReindexSummary {
/** One entry per discovered scope, in processing order. */
scopes: ReindexScopeResult[];
/** Counters summed over all scope results. */
totals: ReindexWorkResult;
/** Path of the state database that was written to. */
dbPath: string;
/** Echo of the `force` option used for the run. */
force: boolean;
/** True when an embedding service was supplied for the run. */
embeddingsAvailable: boolean;
/** Total run time, in milliseconds. */
durationMs: number;
}

View file

@ -0,0 +1,128 @@
export * from './agent/index.js';
export * from './core/index.js';
export * from './daemon/index.js';
export * from './ingest/index.js';
export * from './index-sync/index.js';
export * from './llm/index.js';
export type {
CaptureSession,
CaptureSignals,
MemoryAgentInput,
MemoryAgentResult,
MemoryAgentServiceDeps,
MemoryAgentSettings,
MemoryAgentSourceType,
MemoryCommitMessagePort,
MemoryConnectionPort,
MemoryFileStorePort,
MemoryKnowledgeSlRefsPort,
MemoryLockPort,
MemorySlSourceReconcilerPort,
MemoryTelemetryPort,
MemoryToolSetLike,
MemoryToolsetFactoryPort,
} from './memory/index.js';
export * from './project/index.js';
export * from './prompts/index.js';
export * from './search/index.js';
export * from './sql-analysis/index.js';
export type {
KtxColumnAnalysisResult,
KtxColumnDescriptionPromptInput,
KtxColumnEmbeddingForeignKeys,
KtxColumnEmbeddingTextInput,
KtxColumnSampleInput,
KtxColumnSampleResult,
KtxColumnSampleUpdate,
KtxColumnStatsInput,
KtxColumnStatsResult,
KtxConnectionDriver,
KtxConnectorCapabilities,
KtxCredentialEnvelope,
KtxCredentialEnvReference,
KtxCredentialFileReference,
KtxDataDictionaryColumnState,
KtxDataDictionarySampleDecision,
KtxDataDictionarySettings,
KtxDataDictionarySkipReason,
KtxDataSourceDescriptionPromptInput,
KtxDescriptionCachePort,
KtxDescriptionColumn,
KtxDescriptionColumnTable,
KtxDescriptionGenerationSettings,
KtxDescriptionGeneratorOptions,
KtxDescriptionSource,
KtxDescriptionTableInput,
KtxDescriptionUpdate,
KtxEmbeddingPort as KtxScanEmbeddingPort,
KtxEmbeddingUpdate,
KtxEnrichedColumn,
KtxEnrichedRelationship,
KtxEnrichedSchema,
KtxEnrichedTable,
KtxGenerateColumnDescriptionsInput,
KtxGenerateDataSourceDescriptionInput,
KtxGenerateTableDescriptionInput,
KtxOptionalConnectorCapabilities,
KtxProgressPort,
KtxQueryResult as KtxScanQueryResult,
KtxReadOnlyQueryInput,
KtxRelationshipEndpoint,
KtxRelationshipSource,
KtxRelationshipType,
KtxRelationshipUpdate,
KtxResolvedCredentialEnvelope,
KtxScanArtifactPaths,
KtxScanConnector,
KtxScanContext,
KtxScanDiffSummary,
KtxScanEnrichmentSummary,
KtxScanInput,
KtxScanLoggerPort,
KtxScanMetadataStore,
KtxScanMode,
KtxScanRelationshipSummary,
KtxScanReport,
KtxScanTrigger,
KtxScanWarning,
KtxScanWarningCode,
KtxSchemaColumn,
KtxSchemaDimensionType,
KtxSchemaForeignKey,
KtxSchemaScope,
KtxSchemaSnapshot,
KtxSchemaTable,
KtxSchemaTableKind,
KtxSkippedRelationship,
KtxStructuralSyncPlan,
KtxStructuralSyncStats,
KtxTableDescriptionPromptInput,
KtxTableRef,
KtxTableSampleInput,
KtxTableSampleResult,
KtxColumnTypeMapping,
} from './scan/index.js';
export {
buildKtxColumnDescriptionPrompt,
buildKtxColumnEmbeddingText,
buildKtxDataSourceDescriptionPrompt,
buildKtxTableDescriptionPrompt,
createKtxConnectorCapabilities,
defaultKtxDataDictionarySettings,
inferKtxDimensionType,
isKtxDataDictionaryCandidate,
ktxColumnTypeMappingFromNative,
KtxDescriptionGenerator,
normalizeKtxNativeType,
REDACTED_KTX_CREDENTIAL_VALUE,
redactKtxCredentialEnvelope,
redactKtxCredentialValue,
redactKtxScanMetadata,
redactKtxScanReport,
redactKtxScanWarning,
shouldKtxSampleColumnForDictionary,
} from './scan/index.js';
export * from './skills/index.js';
export * from './sl/index.js';
export * from './tools/index.js';
export * from './wiki/index.js';

View file

@ -0,0 +1,42 @@
import { describe, expect, it } from 'vitest';
import { actionTargetConnectionId, memoryActionIdentity } from './action-identity.js';
// Pins how memory actions are keyed: semantic-layer actions follow their own
// target connection when one is set, while wiki actions always use the
// connection of the run that produced them.
describe('memory action target identity', () => {
it('keys SL actions by target connection and wiki actions by run connection', () => {
// SL action with an explicit target connection: the target wins over the run.
expect(
memoryActionIdentity(
{ target: 'sl', type: 'created', key: 'orders', detail: '', targetConnectionId: 'warehouse-b' },
'looker-run',
),
).toBe('sl:warehouse-b:orders');
// SL action without a target connection: falls back to the run connection.
expect(memoryActionIdentity({ target: 'sl', type: 'created', key: 'orders', detail: '' }, 'warehouse-a')).toBe(
'sl:warehouse-a:orders',
);
// Wiki action: any targetConnectionId on the action is ignored.
expect(
memoryActionIdentity(
{
target: 'wiki',
type: 'created',
key: 'wiki/global/orders.md',
detail: '',
targetConnectionId: 'ignored',
},
'looker-run',
),
).toBe('wiki:looker-run:wiki/global/orders.md');
});
it('resolves action target connection only for SL actions', () => {
expect(
actionTargetConnectionId(
{ target: 'sl', type: 'updated', key: 'orders', detail: '', targetConnectionId: 'warehouse-b' },
'looker-run',
),
).toBe('warehouse-b');
expect(actionTargetConnectionId({ target: 'wiki', type: 'updated', key: 'orders', detail: '' }, 'looker-run')).toBe(
'looker-run',
);
});
});

View file

@ -0,0 +1,9 @@
import type { MemoryAction } from '../memory/index.js';
/**
 * Resolves the connection an action applies to.
 *
 * Semantic-layer (`sl`) actions use their own `targetConnectionId` when it is
 * set and otherwise the run's connection; every other action always uses the
 * run's connection.
 */
export function actionTargetConnectionId(action: MemoryAction, runConnectionId: string): string {
  if (action.target !== 'sl') {
    return runConnectionId;
  }
  return action.targetConnectionId ?? runConnectionId;
}
/**
 * Builds the identity string for a memory action in the form
 * `<target>:<connection id>:<key>`, where the connection id is the one the
 * action resolves to for the given run.
 */
export function memoryActionIdentity(action: MemoryAction, runConnectionId: string): string {
  const connectionId = actionTargetConnectionId(action, runConnectionId);
  const { target, key } = action;
  return `${target}:${connectionId}:${key}`;
}

View file

@ -0,0 +1,75 @@
import { describe, expect, it } from 'vitest';
import type { DbtParsedTable } from './parse-schema.js';
import { findMatchingKtxTable, matchDbtTables, type DbtHostTableLite } from './match-tables.js';
// Shared fixture: `orders` exists in two schemas (analytics and staging) of the
// same catalog, so it is ambiguous by name alone; `customers` is unique and has
// neither catalog nor schema.
const hostTables: DbtHostTableLite[] = [
{ id: '1', name: 'orders', catalog: 'warehouse', db: 'analytics', columns: [{ id: 'c1', name: 'id' }] },
{ id: '2', name: 'orders', catalog: 'warehouse', db: 'staging', columns: [{ id: 'c2', name: 'id' }] },
{ id: '3', name: 'customers', catalog: null, db: null, columns: [{ id: 'c3', name: 'id' }] },
];
/**
 * Test factory for a parsed dbt table: starts from a bare `orders` model with
 * no description, database, schema or columns, then applies the overrides in
 * `input`.
 */
function table(input: Partial<DbtParsedTable>): DbtParsedTable {
  const defaults: DbtParsedTable = {
    name: 'orders',
    description: null,
    database: null,
    schema: null,
    columns: [],
    resourceType: 'model',
  };
  return { ...defaults, ...input };
}
// Covers the matching precedence of findMatchingKtxTable (schema + name first,
// name-only as a restricted fallback) and the summary built by matchDbtTables.
describe('dbt descriptions table matching', () => {
it('uses schema plus name first and checks catalog when dbt database is present', () => {
expect(
findMatchingKtxTable(table({ database: 'warehouse', schema: 'analytics' }), hostTables, null)?.id,
).toBe('1');
});
// Sources never use the name-only fallback, even though `orders` exists.
it('does not fall back to name-only for source tables', () => {
expect(findMatchingKtxTable(table({ resourceType: 'source' }), hostTables, null)).toBeUndefined();
});
it('uses targetSchema for models and name-only only when unique', () => {
// targetSchema stands in for the missing dbt schema.
expect(findMatchingKtxTable(table({ resourceType: 'model' }), hostTables, 'staging')?.id).toBe('2');
// `customers` is unique by name, so the fallback matches.
expect(findMatchingKtxTable(table({ name: 'customers', resourceType: 'model' }), hostTables, null)?.id).toBe(
'3',
);
// `orders` is ambiguous by name, so the fallback refuses to pick one.
expect(findMatchingKtxTable(table({ resourceType: 'model' }), hostTables, null)).toBeUndefined();
});
// Only the `id` column exists on the host table, so one of the two dbt
// columns matches and only its description is counted.
it('summarizes matched columns and descriptions', () => {
const matches = matchDbtTables(
[
table({
name: 'customers',
description: 'Customers',
columns: [
{ name: 'id', description: 'Primary key', dataType: null },
{ name: 'missing', description: 'Missing', dataType: null },
],
}),
],
hostTables,
null,
);
expect(matches).toEqual([
{
dbtTable: 'customers',
dbtSchema: null,
dbtDatabase: null,
hostTableId: '3',
hostTableName: 'customers',
matched: true,
tableDescriptionAction: 'import',
tableDescriptionFound: true,
columnsToImport: 1,
columnsMatched: 1,
columnsTotal: 2,
columnDescriptionsFound: 1,
},
]);
});
});

View file

@ -0,0 +1,127 @@
import type { DbtParsedTable } from './parse-schema.js';
/** Minimal view of a host-side table used when matching dbt tables against it. */
export interface DbtHostTableLite {
/** Host identifier of the table; reported back on a successful match. */
id: string;
/** Table name; compared case-insensitively with the dbt table name. */
name: string;
/** Compared case-insensitively with the dbt `database`, but only when both are set. */
catalog: string | null;
/** Compared case-insensitively with the dbt schema (or the target schema). */
db: string | null;
/** Host columns; names are compared case-insensitively with dbt column names. */
columns: Array<{ id: string; name: string }>;
}
/** Result of trying to match one dbt table against the host tables. */
export interface DbtTableMatch {
/** Name of the dbt table. */
dbtTable: string;
/** Schema declared on the dbt table, if any. */
dbtSchema: string | null;
/** Database declared on the dbt table, if any. */
dbtDatabase: string | null;
/** Id of the matched host table, or `null` when nothing matched. */
hostTableId: string | null;
/** Name of the matched host table, or `null` when nothing matched. */
hostTableName: string | null;
/** Whether a host table was found. */
matched: boolean;
/** `'import'` only when the table matched and dbt has a description for it. */
tableDescriptionAction: 'skip' | 'import';
/** Whether the dbt table carries a description, regardless of matching. */
tableDescriptionFound: boolean;
/** Number of matched columns whose dbt description would be imported; 0 when unmatched. */
columnsToImport: number;
/** Number of dbt columns that exist on the host table; 0 when unmatched. */
columnsMatched: number;
/** Total number of columns declared on the dbt table. */
columnsTotal: number;
/**
 * Number of dbt columns with a description. For a matched table only columns
 * found on the host table are counted; for an unmatched table all dbt columns
 * are considered.
 */
columnDescriptionsFound: number;
}
/**
 * Matches every dbt table against the host tables and summarizes what could
 * be imported from each.
 *
 * @param dbtTables Tables parsed from dbt schema files.
 * @param hostTables Candidate host tables.
 * @param targetSchema Schema used when a dbt table does not declare its own.
 * @returns One match summary per dbt table, in input order.
 */
export function matchDbtTables(
  dbtTables: DbtParsedTable[],
  hostTables: DbtHostTableLite[],
  targetSchema?: string | null,
): DbtTableMatch[] {
  return dbtTables.map((dbtTable): DbtTableMatch => {
    const hostTable = findMatchingKtxTable(dbtTable, hostTables, targetSchema);
    const tableDescriptionFound = Boolean(dbtTable.description);
    const identity = {
      dbtTable: dbtTable.name,
      dbtSchema: dbtTable.schema,
      dbtDatabase: dbtTable.database,
    };

    if (hostTable) {
      return {
        ...identity,
        hostTableId: hostTable.id,
        hostTableName: hostTable.name,
        matched: true,
        tableDescriptionAction: tableDescriptionFound ? 'import' : 'skip',
        tableDescriptionFound,
        ...analyzeColumns(dbtTable, hostTable),
      };
    }

    // Unmatched: nothing can be imported, but descriptions present in dbt are still counted.
    const describedColumns = dbtTable.columns.filter((column) => Boolean(column.description));
    return {
      ...identity,
      hostTableId: null,
      hostTableName: null,
      matched: false,
      tableDescriptionAction: 'skip',
      tableDescriptionFound,
      columnsToImport: 0,
      columnsMatched: 0,
      columnsTotal: dbtTable.columns.length,
      columnDescriptionsFound: describedColumns.length,
    };
  });
}
/**
 * Finds the host table that corresponds to a dbt table.
 *
 * Matching is case-insensitive and proceeds in two stages:
 * 1. When a schema is known (the dbt table's own, else `targetSchema`), the
 *    first host table with the same name and schema wins; if both the dbt
 *    database and the host catalog are set they must also agree.
 * 2. Otherwise, and only for non-source tables, a name-only match is accepted
 *    when exactly one host table has that name.
 *
 * @returns The matched host table, or `undefined` when there is no safe match.
 */
export function findMatchingKtxTable(
  dbtTable: DbtParsedTable,
  hostTables: DbtHostTableLite[],
  targetSchema?: string | null,
): DbtHostTableLite | undefined {
  const wantedName = dbtTable.name.toLowerCase();
  const hasWantedName = (candidate: DbtHostTableLite): boolean => candidate.name.toLowerCase() === wantedName;

  const effectiveSchema = dbtTable.schema ?? targetSchema ?? null;
  if (effectiveSchema) {
    const wantedSchema = effectiveSchema.toLowerCase();
    for (const candidate of hostTables) {
      if (!hasWantedName(candidate) || candidate.db?.toLowerCase() !== wantedSchema) {
        continue;
      }
      // The catalog is only a tie-breaker when both sides actually declare one.
      if (
        dbtTable.database &&
        candidate.catalog &&
        candidate.catalog.toLowerCase() !== dbtTable.database.toLowerCase()
      ) {
        continue;
      }
      return candidate;
    }
  }

  // Sources must match strictly; a name-only guess is not acceptable for them.
  if (dbtTable.resourceType === 'source') {
    return undefined;
  }

  const sameNameTables = hostTables.filter(hasWantedName);
  return sameNameTables.length === 1 ? sameNameTables[0] : undefined;
}
/**
 * Counts how the columns of a dbt table line up with a matched host table.
 *
 * Column names are compared case-insensitively. Only dbt columns that exist on
 * the host table are considered when counting descriptions, so
 * `columnsToImport` and `columnDescriptionsFound` are always equal here.
 */
function analyzeColumns(
  dbtTable: DbtParsedTable,
  hostTable: DbtHostTableLite,
): Pick<DbtTableMatch, 'columnsToImport' | 'columnsMatched' | 'columnsTotal' | 'columnDescriptionsFound'> {
  const hostColumnNames = new Set(hostTable.columns.map((column) => column.name.toLowerCase()));
  const matchedColumns = dbtTable.columns.filter((column) => hostColumnNames.has(column.name.toLowerCase()));
  const describedMatches = matchedColumns.filter((column) => Boolean(column.description)).length;

  return {
    columnsToImport: describedMatches,
    columnsMatched: matchedColumns.length,
    columnsTotal: dbtTable.columns.length,
    columnDescriptionsFound: describedMatches,
  };
}

View file

@ -0,0 +1,62 @@
import { describe, expect, it } from 'vitest';
import type { ParsedSemanticModel } from '../metricflow/deep-parse.js';
import { mergeSemanticModelTables } from './merge-semantic-model-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
// Shared fixture: a semantic model that references the dbt model `fct_orders`
// and declares one described categorical dimension and one undescribed time
// dimension.
const semanticModel: ParsedSemanticModel = {
name: 'orders_semantic',
description: 'Order facts',
modelRef: 'fct_orders',
dimensions: [
{ name: 'status', column: 'status', type: 'categorical', description: 'Order status' },
{ name: 'ordered_at', column: 'ordered_at', type: 'time' },
],
measures: [],
entities: [],
defaultTimeDimension: null,
};
// Verifies that semantic models contribute a table only when the schema
// parser has not already produced one for the same model reference.
describe('mergeSemanticModelTables', () => {
// The synthesized table takes its name from `modelRef`, its columns from the
// dimensions, and marks time dimensions as TIMESTAMP.
it('adds missing MetricFlow model refs as dbt model tables', () => {
const input: DbtSchemaParseResult = { projectName: null, dbtVersion: null, tables: [], relationships: [] };
expect(mergeSemanticModelTables(input, [semanticModel])).toEqual({
projectName: null,
dbtVersion: null,
relationships: [],
tables: [
{
name: 'fct_orders',
description: 'Order facts',
database: null,
schema: null,
resourceType: 'model',
columns: [
{ name: 'status', description: 'Order status', dataType: null },
{ name: 'ordered_at', description: null, dataType: 'TIMESTAMP' },
],
},
],
});
});
// The existing table is upper-cased on purpose: duplicate detection must be
// case-insensitive.
it('does not add a duplicate table when schema parsing already found the model ref', () => {
const input: DbtSchemaParseResult = {
projectName: null,
dbtVersion: null,
relationships: [],
tables: [
{
name: 'FCT_ORDERS',
description: 'Existing',
database: null,
schema: null,
resourceType: 'model',
columns: [],
},
],
};
expect(mergeSemanticModelTables(input, [semanticModel]).tables).toHaveLength(1);
});
});

View file

@ -0,0 +1,37 @@
import type { ParsedSemanticModel } from '../metricflow/deep-parse.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/**
 * Returns a copy of `parseResult` extended with one synthetic `model` table
 * per semantic model whose `modelRef` is not already present.
 *
 * Table names are compared case-insensitively, and a model reference that
 * appears more than once is only added the first time. Synthetic tables have
 * no database or schema; each dimension becomes a column, typed `TIMESTAMP`
 * for time dimensions and left untyped otherwise. The input is not mutated.
 */
export function mergeSemanticModelTables(
  parseResult: DbtSchemaParseResult,
  semanticModels: ParsedSemanticModel[],
): DbtSchemaParseResult {
  const tables = [...parseResult.tables];
  const knownNames = new Set<string>();
  for (const existing of tables) {
    knownNames.add(existing.name.toLowerCase());
  }

  for (const { modelRef, description, dimensions } of semanticModels) {
    const normalizedName = modelRef.toLowerCase();
    if (knownNames.has(normalizedName)) {
      continue;
    }
    knownNames.add(normalizedName);
    tables.push({
      name: modelRef,
      description,
      database: null,
      schema: null,
      columns: dimensions.map(({ column, description: columnDescription, type }) => ({
        name: column,
        description: columnDescription ?? null,
        dataType: type === 'time' ? 'TIMESTAMP' : null,
      })),
      resourceType: 'model',
    });
  }

  return {
    ...parseResult,
    tables,
    relationships: [...parseResult.relationships],
  };
}

View file

@ -0,0 +1,214 @@
import { describe, expect, it } from 'vitest';
import { parseDbtSchemaFile, parseDbtSchemaFiles } from './parse-schema.js';
describe('dbt descriptions schema parser', () => {
it('resolves shared dbt vars and defaults before parsing schema YAML', () => {
const result = parseDbtSchemaFile(
`
version: 2
sources:
- name: raw
database: "{{ var('database') }}"
schema: "{{ var('schema', 'fallback_schema') }}"
tables:
- name: orders
identifier: fct_orders
description: "Orders from {{ var('database') }}"
columns:
- name: customer_id
description: "Customer id"
tests:
- relationships:
to: ref('customers')
field: id
models:
- name: "{{ var('model_name', 'orders_model') }}"
schema: "{{ var('model_schema') }}"
columns:
- name: id
description: "Order id"
`,
{ path: 'models/schema.yml', variables: new Map([['database', 'analytics'], ['model_schema', 'mart']]) },
);
expect(result.tables).toEqual([
{
name: 'fct_orders',
description: 'Orders from analytics',
database: 'analytics',
schema: 'fallback_schema',
columns: [
{
name: 'customer_id',
description: 'Customer id',
dataType: null,
dataTests: [{ name: 'relationships', package: 'dbt', kwargs: { to: "ref('customers')", field: 'id' } }],
},
],
resourceType: 'source',
},
{
name: 'orders_model',
description: null,
database: null,
schema: 'mart',
columns: [{ name: 'id', description: 'Order id', dataType: null }],
resourceType: 'model',
},
]);
expect(result.relationships).toEqual([
{
fromTable: 'fct_orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
fromSchema: 'fallback_schema',
},
]);
});
it('deduplicates tables by database schema and name while merging columns', () => {
const result = parseDbtSchemaFiles([
{
path: 'models/a.yml',
content: `
version: 2
models:
- name: orders
description: Orders
columns:
- name: id
description: Primary key
`,
},
{
path: 'models/b.yml',
content: `
version: 2
models:
- name: orders
columns:
- name: status
description: Status
- name: id
data_type: integer
`,
},
]);
expect(result.tables).toEqual([
{
name: 'orders',
description: 'Orders',
database: null,
schema: null,
resourceType: 'model',
columns: [
{ name: 'id', description: 'Primary key', dataType: 'integer' },
{ name: 'status', description: 'Status', dataType: null },
],
},
]);
});
it('returns an empty result for malformed YAML and preserves unresolved Jinja text', () => {
expect(parseDbtSchemaFile('{{{{ invalid yaml', { path: 'broken.yml' })).toEqual({
projectName: null,
dbtVersion: null,
tables: [],
relationships: [],
});
const unresolved = parseDbtSchemaFile(
`
version: 2
models:
- name: "{{ var('missing_model') }}"
`,
{ variables: new Map() },
);
expect(unresolved.tables[0]?.name).toBe("{{ var('missing_model') }}");
});
it('extracts data tests, constraints, enum values, tags, and freshness', () => {
const result = parseDbtSchemaFile(`
version: 2
sources:
- name: raw
schema: jaffle
tags: ["raw"]
tables:
- name: customers
tags: ["core"]
loaded_at_field: updated_at
freshness:
warn_after: { count: 12, period: hour }
columns:
- name: id
tests:
- not_null
- unique
- name: status
data_tests:
- accepted_values:
values: ['active', 'inactive']
models:
- name: orders
tags: ["finance"]
loaded_at_field: run_at
columns:
- name: status
data_tests:
- dbt_utils.expression_is_true:
expression: "status is not null"
- accepted_values: ['placed', 'shipped']
`);
const customers = result.tables.find((table) => table.name === 'customers');
expect(customers?.tagsDbt).toEqual(['raw', 'core']);
expect(customers?.freshnessDbt?.loadedAtField).toBe('updated_at');
expect(customers?.freshnessDbt?.raw).toBeDefined();
const id = customers?.columns.find((column) => column.name === 'id');
expect(id?.constraints?.dbt).toEqual({ not_null: true, unique: true });
const status = customers?.columns.find((column) => column.name === 'status');
expect(status?.enumValuesDbt).toEqual(['active', 'inactive']);
const orders = result.tables.find((table) => table.name === 'orders');
expect(orders?.tagsDbt).toEqual(['finance']);
expect(orders?.freshnessDbt?.loadedAtField).toBe('run_at');
const ordersStatus = orders?.columns.find((column) => column.name === 'status');
expect(ordersStatus?.enumValuesDbt).toEqual(['placed', 'shipped']);
expect(ordersStatus?.dataTests).toEqual(
expect.arrayContaining([
expect.objectContaining({ package: 'dbt_utils', name: 'expression_is_true' }),
expect.objectContaining({ package: 'dbt', name: 'accepted_values' }),
]),
);
});
it('parses relationships from model column data tests', () => {
const result = parseDbtSchemaFile(`
version: 2
models:
- name: orders
schema: public
columns:
- name: customer_id
data_tests:
- relationships:
arguments:
to: "ref('customers')"
field: id
`);
expect(result.relationships).toEqual([
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
fromSchema: 'public',
},
]);
});
});

View file

@ -0,0 +1,655 @@
import { createHash } from 'node:crypto';
import { parse as parseYaml } from 'yaml';
import { type KtxLogger, noopLogger } from '../../../core/index.js';
import { resolveJinjaVariables } from '../../dbt-shared/project-vars.js';
/** A column declared in a dbt schema file, in the parser's normalized form. */
export interface DbtParsedColumn {
name: string;
/** Trimmed description, or null when the YAML has none (or only whitespace). */
description: string | null;
/** Value of the YAML `data_type` key, or null when it is not declared. */
dataType: string | null;
/** Data tests attached to the column; the parser omits the key when there are none. */
dataTests?: DbtDataTestRef[];
/** Constraints derived from `not_null` / `unique` tests. */
constraints?: DbtColumnConstraints;
/** Stringified values collected from `accepted_values` tests. */
enumValuesDbt?: string[];
}
/** Reference to one data test attached to a column. */
export interface DbtDataTestRef {
/** Test name without its package prefix (e.g. `expression_is_true`). */
name: string;
/** Package the test comes from; `dbt` when the test name carries no `package.` prefix. */
package: string;
/** Arguments mapping given to the test in the YAML, when it was written in mapping form. */
kwargs?: Record<string, unknown>;
}
/** Column constraints inferred from dbt's `not_null` and `unique` tests. */
export interface DbtColumnConstraints {
dbt: {
not_null?: boolean;
unique?: boolean;
};
}
/** A foreign-key style link extracted from a dbt `relationships` test. */
export interface DbtParsedRelationship {
/** Table (source table or model) that declares the test. */
fromTable: string;
/** Column that carries the `relationships` test. */
fromColumn: string;
/** Table name extracted from the test's `ref(...)` / `source(...)` target. */
toTable: string;
/** Value of the test's `field` argument. */
toColumn: string;
/** Schema of the declaring table, when known. */
fromSchema?: string;
// NOTE(review): the parser in this file never sets toSchema or description.
toSchema?: string;
description?: string;
}
/** A dbt source table or model in the parser's normalized form. */
export interface DbtParsedTable {
/** For source tables this is `identifier` when present, otherwise `name`. */
name: string;
description: string | null;
database: string | null;
/** For source tables the parser falls back to the source name when no schema is given. */
schema: string | null;
columns: DbtParsedColumn[];
/** Whether the entry came from the `sources:` or the `models:` section. */
resourceType?: 'source' | 'model';
/** De-duplicated tags; for source tables, source-level tags come before table-level tags. */
tagsDbt?: string[];
freshnessDbt?: {
/** The YAML `freshness` value, passed through unmodified. */
raw?: unknown;
/** Trimmed `loaded_at_field` value. */
loadedAtField?: string | null;
};
}
/** Result of parsing one or more dbt schema files. */
export interface DbtSchemaParseResult {
/** Project name passed in by the caller, or null. */
projectName: string | null;
/** The schema parser in this file always reports null here. */
dbtVersion: string | null;
tables: DbtParsedTable[];
relationships: DbtParsedRelationship[];
}
/** A schema file handed to the parser: its raw YAML text plus the path used for logging and hashing. */
export interface DbtSchemaFile {
content: string;
path: string;
}
/** Options accepted when parsing a single schema file. */
interface ParseDbtSchemaOptions {
/** File path, used only in log messages. */
path?: string;
/** Variables handed to `resolveJinjaVariables`; when omitted the content is parsed as-is. */
variables?: Map<string, string>;
/** Copied verbatim into the result's `projectName`. */
projectName?: string | null;
/** Logger for debug/warn output; the exported entry points fall back to `noopLogger`. */
logger?: KtxLogger;
}
/** Expected root shape of a schema YAML document (assumed via a cast, not validated). */
interface DbtSchemaYaml {
version?: number;
sources?: DbtSchemaSource[];
models?: DbtSchemaModel[];
}
/** Expected shape of one entry under `sources:`. */
interface DbtSchemaSource {
/** Also used as the schema of its tables when `schema` is not set. */
name: string;
description?: string;
database?: string;
schema?: string;
tags?: string[];
tables?: DbtSchemaTable[];
}
/** Expected shape of one table under a source's `tables:` list. */
interface DbtSchemaTable {
name: string;
description?: string;
/** When present, the parser uses this instead of `name` as the table name. */
identifier?: string;
tags?: string[];
loaded_at_field?: string;
freshness?: unknown;
columns?: DbtSchemaColumn[];
}
/** Expected shape of one entry under `models:`. */
interface DbtSchemaModel {
name: string;
description?: string;
database?: string;
schema?: string;
tags?: string[];
loaded_at_field?: string;
freshness?: unknown;
columns?: DbtSchemaColumn[];
}
/** Expected shape of one column entry of a source table or model. */
interface DbtSchemaColumn {
name: string;
description?: string;
data_type?: string;
/** Takes precedence over `tests` when both keys are present. */
data_tests?: DbtSchemaDataTest[];
tests?: DbtSchemaDataTest[];
}
/**
 * One entry of a column's test list: either a bare test name (e.g. `not_null`)
 * or a mapping whose keys are test names and whose values are their arguments.
 */
type DbtSchemaDataTest =
| string
| {
relationships?: {
to?: string;
field?: string;
// Alternative spelling where the arguments are nested one level deeper.
arguments?: { to?: string; field?: string };
};
not_null?: unknown;
unique?: unknown;
accepted_values?: { values?: unknown } | unknown;
// Any other test name, possibly package-qualified (`package.test_name`).
[key: string]: unknown;
};
/**
 * Parses the YAML text of a single dbt schema file.
 *
 * @param content Raw YAML text of the schema file.
 * @param options Optional path, Jinja variables, project name and logger.
 * @returns Tables and relationships found in the file.
 */
export function parseDbtSchemaFile(content: string, options: ParseDbtSchemaOptions = {}): DbtSchemaParseResult {
  const logger = options.logger ?? noopLogger;
  const parser = new DbtSchemaParser(logger);
  return parser.parseFile(content, options);
}
/**
 * Parses several dbt schema files and merges them into one de-duplicated result.
 *
 * @param files Schema files to parse, in order.
 * @param variables Optional Jinja variables applied to every file.
 * @param options Optional project name (defaults to null) and logger.
 * @returns The combined tables and relationships of all files.
 */
export function parseDbtSchemaFiles(
  files: DbtSchemaFile[],
  variables?: Map<string, string>,
  options: { projectName?: string | null; logger?: KtxLogger } = {},
): DbtSchemaParseResult {
  const logger = options.logger ?? noopLogger;
  const projectName = options.projectName ?? null;
  const parser = new DbtSchemaParser(logger);
  return parser.parseFiles(files, variables, projectName);
}
/**
 * Computes a short fingerprint of a set of schema files.
 *
 * Files are ordered by path before hashing, so the result does not depend on
 * the order of `files`. The input array is not modified.
 *
 * @param files Schema files to fingerprint.
 * @returns The first 16 hex characters of the SHA-256 of the `path:content` lines.
 */
export function computeDbtSchemaHash(files: DbtSchemaFile[]): string {
  const ordered = files.slice();
  ordered.sort((left, right) => left.path.localeCompare(right.path));
  const lines = ordered.map(({ path, content }) => `${path}:${content}`);
  const hasher = createHash('sha256');
  hasher.update(lines.join('\n'));
  return hasher.digest('hex').slice(0, 16);
}
/**
 * Turns dbt schema YAML (`sources:` / `models:`) into normalized tables,
 * columns, data tests and relationships.
 *
 * Parsed YAML is untyped at runtime, so every traversal tolerates missing,
 * null or wrongly shaped entries: such entries are skipped instead of causing
 * an exception.
 */
class DbtSchemaParser {
  constructor(private readonly logger: KtxLogger) {}

  /**
   * Parses a single schema file.
   *
   * @param yamlContent Raw YAML text; Jinja variables are resolved first when
   *   `options.variables` is given.
   * @param options Optional path (used in log messages), Jinja variables and project name.
   * @returns The parsed tables and relationships, or an empty result when the
   *   YAML cannot be parsed or its root is not an object.
   */
  parseFile(yamlContent: string, options: ParseDbtSchemaOptions = {}): DbtSchemaParseResult {
    this.logger.debug(`Parsing schema file: ${options.path ?? 'unknown'}`);
    const resolved = options.variables
      ? resolveJinjaVariables(yamlContent, options.variables)
      : { content: yamlContent, unresolvedVars: [] };
    if (resolved.unresolvedVars.length > 0) {
      this.logger.warn(
        `Unresolved dbt variables in ${options.path ?? 'schema file'}: ${resolved.unresolvedVars.join(', ')}`,
      );
    }
    let schema: DbtSchemaYaml;
    try {
      schema = parseYaml(resolved.content) as DbtSchemaYaml;
    } catch (error) {
      // Malformed YAML is reported and treated as an empty file rather than aborting the import.
      this.logger.warn(`Failed to parse YAML${options.path ? ` at ${options.path}` : ''}: ${error}`);
      return this.emptyResult(options.projectName ?? null);
    }
    if (!schema || typeof schema !== 'object') {
      return this.emptyResult(options.projectName ?? null);
    }
    const tables = [...this.parseSources(schema.sources), ...this.parseModels(schema.models)];
    const relationships = [
      ...this.parseSourceRelationships(schema.sources),
      ...this.parseModelRelationships(schema.models),
    ];
    return {
      projectName: options.projectName ?? null,
      dbtVersion: null,
      tables,
      relationships,
    };
  }

  /**
   * Parses several schema files and merges the results.
   *
   * Tables that share database, schema and name (case-insensitively) are merged
   * into one entry; relationships are de-duplicated by their table/column endpoints.
   *
   * @param files Schema files to parse, in order; earlier files win on conflicting values.
   * @param variables Optional Jinja variables applied to every file.
   * @param projectName Value copied into the result's `projectName`.
   */
  parseFiles(
    files: DbtSchemaFile[],
    variables?: Map<string, string>,
    projectName: string | null = null,
  ): DbtSchemaParseResult {
    const allTables: DbtParsedTable[] = [];
    const allRelationships: DbtParsedRelationship[] = [];
    for (const file of files) {
      const result = this.parseFile(file.content, { path: file.path, variables, projectName });
      allTables.push(...result.tables);
      allRelationships.push(...result.relationships);
    }
    return {
      projectName,
      dbtVersion: null,
      tables: this.deduplicateTables(allTables),
      relationships: this.deduplicateRelationships(allRelationships),
    };
  }

  /** Narrowing helper: true for non-null, non-array objects. */
  private isRecord(value: unknown): value is Record<string, unknown> {
    return value !== null && typeof value === 'object' && !Array.isArray(value);
  }

  /**
   * Converts the `sources:` section into tables. Entries that are not objects
   * and tables without a name/identifier are skipped.
   */
  private parseSources(sources: DbtSchemaSource[] | undefined): DbtParsedTable[] {
    if (!Array.isArray(sources)) {
      return [];
    }
    const tables: DbtParsedTable[] = [];
    for (const source of sources) {
      if (!source || !Array.isArray(source.tables)) {
        continue;
      }
      // A source without an explicit schema uses its own name as the schema.
      const sourceSchema = source.schema ?? source.name;
      const sourceDatabase = source.database ?? null;
      const sourceTags = this.normalizeTagList(source.tags);
      for (const table of source.tables) {
        const tableName = table?.identifier ?? table?.name;
        if (!tableName) {
          // Same rule as for models: a table without a name cannot be matched to anything.
          continue;
        }
        const tagsDbt = this.mergeTagsDbt(sourceTags, this.normalizeTagList(table.tags));
        const freshnessDbt = this.buildFreshnessDbt(table.freshness, table.loaded_at_field);
        tables.push({
          name: tableName,
          description: this.normalizeDescription(table.description),
          database: sourceDatabase,
          schema: sourceSchema,
          columns: this.parseColumns(table.columns),
          resourceType: 'source',
          ...(tagsDbt ? { tagsDbt } : {}),
          ...(freshnessDbt ? { freshnessDbt } : {}),
        });
      }
    }
    return tables;
  }

  /** Converts the `models:` section into tables; models without a name are skipped. */
  private parseModels(models: DbtSchemaModel[] | undefined): DbtParsedTable[] {
    if (!Array.isArray(models)) {
      return [];
    }
    const tables: DbtParsedTable[] = [];
    for (const model of models) {
      if (!model?.name) {
        continue;
      }
      const tagsDbt = this.mergeTagsDbt(this.normalizeTagList(model.tags));
      const freshnessDbt = this.buildFreshnessDbt(model.freshness, model.loaded_at_field);
      tables.push({
        name: model.name,
        description: this.normalizeDescription(model.description),
        database: model.database ?? null,
        schema: model.schema ?? null,
        columns: this.parseColumns(model.columns),
        resourceType: 'model',
        ...(tagsDbt ? { tagsDbt } : {}),
        ...(freshnessDbt ? { freshnessDbt } : {}),
      });
    }
    return tables;
  }

  /**
   * Normalizes a column list. Columns without a name are skipped, because
   * column merging and matching compare names case-insensitively and would
   * fail on a missing name.
   */
  private parseColumns(columns: DbtSchemaColumn[] | undefined): DbtParsedColumn[] {
    if (!Array.isArray(columns)) {
      return [];
    }
    const parsed: DbtParsedColumn[] = [];
    for (const column of columns) {
      if (!column?.name) {
        continue;
      }
      // `data_tests` takes precedence over the `tests` key when both are present.
      const { refs, constraints, enumValues } = this.parseDataTests(column.data_tests ?? column.tests);
      parsed.push({
        name: column.name,
        description: this.normalizeDescription(column.description),
        dataType: column.data_type ?? null,
        ...(refs.length > 0 ? { dataTests: refs } : {}),
        ...(constraints ? { constraints } : {}),
        ...(enumValues.length > 0 ? { enumValuesDbt: enumValues } : {}),
      });
    }
    return parsed;
  }

  /**
   * Extracts test references, `not_null`/`unique` constraints and
   * `accepted_values` enum values from a column's test list.
   *
   * Each entry may be a bare test name or a mapping of test name to arguments.
   * `accepted_values` is understood in three spellings: a bare list, a mapping
   * with `values`, and a mapping with `arguments.values`. Entries that are
   * neither a string nor a mapping (e.g. an empty list item) are ignored.
   */
  private parseDataTests(tests: DbtSchemaDataTest[] | undefined): {
    refs: DbtDataTestRef[];
    constraints: DbtColumnConstraints | undefined;
    enumValues: string[];
  } {
    const refs: DbtDataTestRef[] = [];
    const dbt: { not_null?: boolean; unique?: boolean } = {};
    const enumValues: string[] = [];
    if (!Array.isArray(tests)) {
      return { refs, constraints: undefined, enumValues };
    }
    for (const test of tests) {
      if (typeof test === 'string') {
        const parsed = this.parseTestNameString(test);
        refs.push(parsed);
        if (parsed.package === 'dbt' && parsed.name === 'not_null') {
          dbt.not_null = true;
        }
        if (parsed.package === 'dbt' && parsed.name === 'unique') {
          dbt.unique = true;
        }
        continue;
      }
      if (!this.isRecord(test)) {
        continue;
      }
      for (const [key, value] of Object.entries(test)) {
        const kwargs = this.isRecord(value) ? value : undefined;
        if (key === 'relationships') {
          refs.push({ name: 'relationships', package: 'dbt', ...(kwargs ? { kwargs } : {}) });
          continue;
        }
        if (key === 'not_null' || key === 'unique') {
          refs.push({ name: key, package: 'dbt' });
          dbt[key] = true;
          continue;
        }
        if (key === 'accepted_values' && Array.isArray(value)) {
          enumValues.push(...value.map((item) => String(item)));
          refs.push({ name: 'accepted_values', package: 'dbt', kwargs: { values: value } });
          continue;
        }
        if (key === 'accepted_values' && kwargs) {
          // Test arguments may sit directly under the test or nested under `arguments`.
          const nested = this.isRecord(kwargs.arguments) ? kwargs.arguments.values : undefined;
          const values = Array.isArray(kwargs.values) ? kwargs.values : nested;
          if (Array.isArray(values)) {
            enumValues.push(...values.map((item) => String(item)));
          }
          refs.push({ name: 'accepted_values', package: 'dbt', kwargs });
          continue;
        }
        // Any other key is a (possibly package-qualified) test name.
        refs.push({ ...this.parseTestNameString(key), ...(kwargs ? { kwargs } : {}) });
      }
    }
    const constraints = dbt.not_null || dbt.unique ? { dbt } : undefined;
    return { refs, constraints, enumValues };
  }

  /**
   * Splits `package.test_name` at the first dot; names without a dot belong to
   * the `dbt` package.
   */
  private parseTestNameString(value: string): { name: string; package: string } {
    const dot = value.indexOf('.');
    if (dot === -1) {
      return { package: 'dbt', name: value };
    }
    return { package: value.slice(0, dot), name: value.slice(dot + 1) };
  }

  /** Collects `relationships` tests declared on source table columns. */
  private parseSourceRelationships(sources: DbtSchemaSource[] | undefined): DbtParsedRelationship[] {
    if (!Array.isArray(sources)) {
      return [];
    }
    const relationships: DbtParsedRelationship[] = [];
    for (const source of sources) {
      if (!source || !Array.isArray(source.tables)) {
        continue;
      }
      const sourceSchema = source.schema ?? source.name;
      for (const table of source.tables) {
        const tableName = table?.identifier ?? table?.name;
        if (!tableName) {
          continue;
        }
        relationships.push(...this.collectColumnRelationships(table.columns, tableName, sourceSchema));
      }
    }
    return relationships;
  }

  /** Collects `relationships` tests declared on model columns. */
  private parseModelRelationships(models: DbtSchemaModel[] | undefined): DbtParsedRelationship[] {
    if (!Array.isArray(models)) {
      return [];
    }
    const relationships: DbtParsedRelationship[] = [];
    for (const model of models) {
      if (!model?.name) {
        continue;
      }
      relationships.push(...this.collectColumnRelationships(model.columns, model.name, model.schema ?? undefined));
    }
    return relationships;
  }

  /** Walks one table's columns and returns every relationship their tests declare. */
  private collectColumnRelationships(
    columns: DbtSchemaColumn[] | undefined,
    fromTable: string,
    fromSchema: string | undefined,
  ): DbtParsedRelationship[] {
    if (!Array.isArray(columns)) {
      return [];
    }
    const relationships: DbtParsedRelationship[] = [];
    for (const column of columns) {
      if (!column?.name) {
        continue;
      }
      const tests = column.data_tests ?? column.tests;
      if (!Array.isArray(tests)) {
        continue;
      }
      for (const test of tests) {
        const relationship = this.parseRelationshipTest(test, fromTable, column.name, fromSchema);
        if (relationship) {
          relationships.push(relationship);
        }
      }
    }
    return relationships;
  }

  /**
   * Builds a relationship from one test entry.
   *
   * @returns null when the entry is not a `relationships` test, lacks a string
   *   `to`/`field` (directly or under `arguments`), or its target cannot be parsed.
   */
  private parseRelationshipTest(
    test: DbtSchemaDataTest,
    fromTable: string,
    fromColumn: string,
    fromSchema?: string,
  ): DbtParsedRelationship | null {
    if (!test || typeof test !== 'object' || !test.relationships) {
      return null;
    }
    const relationship = test.relationships;
    const toRef = relationship.to ?? relationship.arguments?.to;
    const toColumn = relationship.field ?? relationship.arguments?.field;
    if (typeof toRef !== 'string' || typeof toColumn !== 'string' || !toRef || !toColumn) {
      this.logger.debug(`Skipping incomplete relationship test for ${fromTable}.${fromColumn}`);
      return null;
    }
    const toTable = this.parseRef(toRef);
    if (!toTable) {
      this.logger.debug(`Could not parse ref: ${toRef}`);
      return null;
    }
    return {
      fromTable,
      fromColumn,
      toTable,
      toColumn,
      ...(fromSchema ? { fromSchema } : {}),
    };
  }

  /**
   * Extracts the target table name from a `ref(...)` or `source(...)` expression.
   *
   * Supported forms: `ref('model')`, `ref('package', 'model')`, either of those
   * followed by extra arguments such as `v=2`, and `source('source', 'table')`.
   *
   * @returns The model/table name, or null when the expression is not recognized.
   */
  private parseRef(refString: string): string | null {
    // Optional leading package argument, then the model name, then optional trailing arguments.
    const refMatch = /ref\s*\(\s*(?:['"][^'"]+['"]\s*,\s*)?['"]([^'"]+)['"]\s*(?:,[^)]*)?\)/.exec(refString);
    if (refMatch?.[1]) {
      return refMatch[1];
    }
    const sourceMatch = /source\s*\(\s*['"][^'"]+['"]\s*,\s*['"]([^'"]+)['"]\s*\)/.exec(refString);
    return sourceMatch?.[1] ?? null;
  }

  /** Trims a description; non-strings and blank strings become null. */
  private normalizeDescription(description: string | undefined): string | null {
    if (typeof description !== 'string') {
      return null;
    }
    const trimmed = description.trim();
    return trimmed.length > 0 ? trimmed : null;
  }

  /** Normalizes `tags`, which dbt accepts as either a single string or a list. */
  private normalizeTagList(tags: string[] | string | undefined): string[] {
    if (typeof tags === 'string') {
      return [tags];
    }
    if (!Array.isArray(tags)) {
      return [];
    }
    return tags.map((tag) => String(tag));
  }

  /** Concatenates string lists, dropping duplicates while keeping first-seen order. */
  private mergeTagsDbt(...lists: Array<string[] | undefined>): string[] | undefined {
    const merged: string[] = [];
    const seen = new Set<string>();
    for (const list of lists) {
      for (const item of list ?? []) {
        if (!seen.has(item)) {
          seen.add(item);
          merged.push(item);
        }
      }
    }
    return merged.length > 0 ? merged : undefined;
  }

  /**
   * Builds the freshness descriptor of a table.
   *
   * @returns undefined when neither `freshness` nor a non-blank
   *   `loaded_at_field` is present. Otherwise `loadedAtField` is the trimmed
   *   field name, or null when it is missing or blank.
   */
  private buildFreshnessDbt(freshness: unknown, loadedAtField: string | undefined): DbtParsedTable['freshnessDbt'] {
    const loadedTrim = typeof loadedAtField === 'string' ? loadedAtField.trim() : '';
    const hasFreshness = freshness !== undefined && freshness !== null;
    if (!hasFreshness && !loadedTrim) {
      return undefined;
    }
    return {
      ...(hasFreshness ? { raw: freshness } : {}),
      loadedAtField: loadedTrim || null,
    };
  }

  /**
   * Merges tables that share database, schema and name (case-insensitive).
   * The first occurrence wins for scalar fields; columns, tags and freshness
   * are merged. Optional keys are only written when they have a value.
   */
  private deduplicateTables(tables: DbtParsedTable[]): DbtParsedTable[] {
    const seen = new Map<string, DbtParsedTable>();
    for (const table of tables) {
      const key = `${table.database ?? ''}.${table.schema ?? ''}.${table.name}`.toLowerCase();
      const existing = seen.get(key);
      if (!existing) {
        seen.set(key, table);
        continue;
      }
      const tagsDbt = this.mergeTagsDbt(existing.tagsDbt, table.tagsDbt);
      const freshnessDbt = this.mergeFreshnessDbt(existing.freshnessDbt, table.freshnessDbt);
      seen.set(key, {
        ...existing,
        description: existing.description ?? table.description,
        columns: this.mergeColumns(existing.columns, table.columns),
        ...(tagsDbt ? { tagsDbt } : {}),
        ...(freshnessDbt ? { freshnessDbt } : {}),
      });
    }
    return Array.from(seen.values());
  }

  /**
   * Merges two column lists by case-insensitive name. Existing values win;
   * tests, constraints and enum values are combined.
   */
  private mergeColumns(existing: DbtParsedColumn[], incoming: DbtParsedColumn[]): DbtParsedColumn[] {
    const seen = new Map<string, DbtParsedColumn>();
    for (const column of existing) {
      seen.set(column.name.toLowerCase(), column);
    }
    for (const column of incoming) {
      const key = column.name.toLowerCase();
      const existingColumn = seen.get(key);
      if (!existingColumn) {
        seen.set(key, column);
        continue;
      }
      const dataTests = this.mergeDbtDataTests(existingColumn.dataTests, column.dataTests);
      const constraints = this.mergeDbtConstraints(existingColumn.constraints, column.constraints);
      const enumValuesDbt = this.mergeStringList(existingColumn.enumValuesDbt, column.enumValuesDbt);
      seen.set(key, {
        ...existingColumn,
        description: existingColumn.description ?? column.description,
        dataType: existingColumn.dataType ?? column.dataType,
        ...(dataTests ? { dataTests } : {}),
        ...(constraints ? { constraints } : {}),
        ...(enumValuesDbt ? { enumValuesDbt } : {}),
      });
    }
    return Array.from(seen.values());
  }

  /**
   * Drops relationships whose table/column endpoints (case-insensitive) were
   * already seen. Schemas are not part of the comparison.
   */
  private deduplicateRelationships(relationships: DbtParsedRelationship[]): DbtParsedRelationship[] {
    const seen = new Set<string>();
    const result: DbtParsedRelationship[] = [];
    for (const relationship of relationships) {
      const key =
        `${relationship.fromTable}.${relationship.fromColumn}->${relationship.toTable}.${relationship.toColumn}`.toLowerCase();
      if (!seen.has(key)) {
        seen.add(key);
        result.push(relationship);
      }
    }
    return result;
  }

  /** Merges two freshness descriptors; values of `existing` take precedence. */
  private mergeFreshnessDbt(
    existing?: DbtParsedTable['freshnessDbt'],
    incoming?: DbtParsedTable['freshnessDbt'],
  ): DbtParsedTable['freshnessDbt'] {
    if (!existing && !incoming) {
      return undefined;
    }
    const raw = existing?.raw !== undefined ? existing.raw : incoming?.raw;
    const loadedAtField = existing?.loadedAtField ?? incoming?.loadedAtField;
    return {
      ...(raw !== undefined ? { raw } : {}),
      ...(loadedAtField !== undefined ? { loadedAtField } : {}),
    };
  }

  /** A constraint holds when either side declares it. */
  private mergeDbtConstraints(
    existing?: DbtColumnConstraints,
    incoming?: DbtColumnConstraints,
  ): DbtColumnConstraints | undefined {
    const notNull = !!(existing?.dbt.not_null || incoming?.dbt.not_null);
    const unique = !!(existing?.dbt.unique || incoming?.dbt.unique);
    if (!notNull && !unique) {
      return undefined;
    }
    return { dbt: { ...(notNull ? { not_null: true } : {}), ...(unique ? { unique: true } : {}) } };
  }

  /** Order-preserving union of two string lists. */
  private mergeStringList(existing?: string[], incoming?: string[]): string[] | undefined {
    return this.mergeTagsDbt(existing, incoming);
  }

  /**
   * Combines two test lists. Tests are considered equal when package, name and
   * the JSON form of their kwargs match; for equal tests the later entry
   * replaces the earlier one in place.
   */
  private mergeDbtDataTests(existing?: DbtDataTestRef[], incoming?: DbtDataTestRef[]): DbtDataTestRef[] | undefined {
    if (!existing?.length) {
      return incoming?.length ? [...incoming] : undefined;
    }
    if (!incoming?.length) {
      return [...existing];
    }
    const tests = new Map<string, DbtDataTestRef>();
    for (const test of [...existing, ...incoming]) {
      const kwargsKey =
        test.kwargs && Object.keys(test.kwargs).length > 0
          ? `:${createHash('sha256').update(JSON.stringify(test.kwargs)).digest('hex').slice(0, 16)}`
          : '';
      tests.set(`${test.package}:${test.name}${kwargsKey}`, test);
    }
    return [...tests.values()];
  }

  /** Result used when a file yields nothing. */
  private emptyResult(projectName: string | null): DbtSchemaParseResult {
    return {
      projectName,
      dbtVersion: null,
      tables: [],
      relationships: [],
    };
  }
}

View file

@ -0,0 +1,102 @@
import { describe, expect, it } from 'vitest';
import type { DbtSchemaParseResult } from './parse-schema.js';
import { toDescriptionUpdates } from './to-description-updates.js';
import type { DbtHostTableLite } from './match-tables.js';
// Host-side fixture: a single `orders` table with two columns. The parsed dbt
// table below also declares a `missing` column that has no counterpart here.
const hostTables: DbtHostTableLite[] = [
{
id: '1',
name: 'orders',
catalog: 'warehouse',
db: 'analytics',
columns: [
{ id: 'c1', name: 'id' },
{ id: 'c2', name: 'amount' },
],
},
];
/**
 * Builds a parse result holding one `orders` model.
 *
 * @param description Description of the table.
 * @param columnDescription Description of the `id` column; the second column
 *   (`missing`) always carries the description `not imported`.
 */
function parseResult(description: string | null, columnDescription: string | null): DbtSchemaParseResult {
  const columns = [
    { name: 'id', description: columnDescription, dataType: null },
    { name: 'missing', description: 'not imported', dataType: null },
  ];
  return {
    projectName: null,
    dbtVersion: null,
    relationships: [],
    tables: [
      {
        name: 'orders',
        description,
        database: 'warehouse',
        schema: 'analytics',
        resourceType: 'model',
        columns,
      },
    ],
  };
}
// Covers the three outcomes of toDescriptionUpdates for a matched table:
// descriptions present, nothing present, and structural metadata only.
describe('dbt descriptions update payloads', () => {
// The `missing` dbt column has no host column, so it must not appear in columnDescriptions.
it('emits dbt writes and matching ai invalidations when descriptions exist', () => {
expect(
toDescriptionUpdates({
connectionId: 'conn-1',
parseResult: parseResult('Orders table', 'Primary key'),
hostTables,
targetSchema: null,
}),
).toEqual({
dbt: [
{
connectionId: 'conn-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'dbt',
tableDescription: 'Orders table',
columnDescriptions: { id: 'Primary key' },
},
],
aiInvalidations: [
{
connectionId: 'conn-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'ai',
},
],
});
});
it('does not emit spurious dbt writes or ai invalidations when no descriptions exist', () => {
expect(
toDescriptionUpdates({
connectionId: 'conn-1',
parseResult: parseResult(null, null),
hostTables,
targetSchema: null,
}),
).toEqual({ dbt: [], aiInvalidations: [] });
});
// Tags count as a metadata change: the ai entry is invalidated although no description is written.
it('emits ai invalidation without a dbt description write when only structural metadata exists', () => {
const result = parseResult(null, null);
result.tables[0]!.tagsDbt = ['finance'];
expect(
toDescriptionUpdates({
connectionId: 'conn-1',
parseResult: result,
hostTables,
targetSchema: null,
}),
).toEqual({
dbt: [],
aiInvalidations: [
{
connectionId: 'conn-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'ai',
},
],
});
});
});

View file

@ -0,0 +1,70 @@
import type { KtxDescriptionUpdate } from '../../../scan/enrichment-types.js';
import { findMatchingKtxTable, type DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/** Description updates derived from a dbt parse result. */
export interface DbtDescriptionUpdates {
/** Updates with source `dbt`, carrying table and/or column descriptions. */
dbt: KtxDescriptionUpdate[];
/** Updates with source `ai` and no descriptions, one per table that had any description or metadata change. */
aiInvalidations: KtxDescriptionUpdate[];
}
/**
 * Converts parsed dbt tables into description updates for the matching host tables.
 *
 * For every dbt table that matches a host table:
 * - a `dbt` update is emitted when the table or any matched column has a description;
 * - an `ai` invalidation is emitted when there is a description or any dbt
 *   metadata (tags, freshness, constraints, enum values, data tests).
 *
 * Columns are matched to host columns by case-insensitive name, and the host
 * column's spelling is used as the key.
 *
 * @param input Connection id, parse result, host tables and optional target schema.
 * @returns The `dbt` writes and `ai` invalidations, in table order.
 */
export function toDescriptionUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
  targetSchema: string | null;
}): DbtDescriptionUpdates {
  const updates: DbtDescriptionUpdates = { dbt: [], aiInvalidations: [] };

  for (const dbtTable of input.parseResult.tables) {
    const hostTable = findMatchingKtxTable(dbtTable, input.hostTables, input.targetSchema);
    if (!hostTable) {
      continue;
    }

    const columnDescriptions: Record<string, string | null> = {};
    for (const dbtColumn of dbtTable.columns) {
      if (dbtColumn.description) {
        const match = hostTable.columns.find(
          (candidate) => candidate.name.toLowerCase() === dbtColumn.name.toLowerCase(),
        );
        if (match) {
          columnDescriptions[match.name] = dbtColumn.description;
        }
      }
    }

    const tableDescription = dbtTable.description ?? undefined;
    const hasColumnDescriptions = Object.keys(columnDescriptions).length > 0;
    const describesSomething = tableDescription !== undefined || hasColumnDescriptions;

    const columnsCarryMetadata = dbtTable.columns.some(
      (column) => column.constraints !== undefined || !!column.enumValuesDbt?.length || !!column.dataTests?.length,
    );
    const carriesMetadata =
      !!dbtTable.tagsDbt?.length || dbtTable.freshnessDbt !== undefined || columnsCarryMetadata;

    if (describesSomething || carriesMetadata) {
      // The same table reference object is shared by both emitted entries.
      const tableRef = { catalog: hostTable.catalog, db: hostTable.db, name: hostTable.name };
      if (describesSomething) {
        updates.dbt.push({
          connectionId: input.connectionId,
          table: tableRef,
          source: 'dbt',
          ...(tableDescription !== undefined ? { tableDescription } : {}),
          ...(hasColumnDescriptions ? { columnDescriptions } : {}),
        });
      }
      updates.aiInvalidations.push({
        connectionId: input.connectionId,
        table: tableRef,
        source: 'ai',
      });
    }
  }

  return updates;
}

View file

@ -0,0 +1,70 @@
import { describe, expect, it } from 'vitest';
import { toMetadataUpdates } from './to-metadata-updates.js';
// Verifies the field renaming done by toMetadataUpdates: tagsDbt -> tags,
// freshnessDbt.loadedAtField -> freshness.loaded_at_field, constraints.dbt ->
// constraints, enumValuesDbt -> enum_values, dataTests -> tests.
describe('toMetadataUpdates', () => {
it('emits source-keyed dbt metadata updates for matched tables and columns', () => {
const updates = toMetadataUpdates({
connectionId: 'conn_1',
targetSchema: 'analytics',
hostTables: [
{
id: 'orders-id',
name: 'orders',
catalog: 'warehouse',
db: 'analytics',
columns: [
{ id: 'status-id', name: 'status' },
{ id: 'created-id', name: 'created_at' },
],
},
],
parseResult: {
projectName: null,
dbtVersion: null,
relationships: [],
tables: [
{
name: 'orders',
description: null,
database: 'warehouse',
schema: 'analytics',
resourceType: 'model',
tagsDbt: ['finance'],
freshnessDbt: { loadedAtField: 'created_at' },
columns: [
{
name: 'status',
description: null,
dataType: null,
enumValuesDbt: ['placed', 'shipped'],
constraints: { dbt: { not_null: true } },
dataTests: [{ name: 'accepted_values', package: 'dbt', kwargs: { values: ['placed', 'shipped'] } }],
},
],
},
],
},
});
// Only `status` carries metadata, so `created_at` must not appear in columnFields.
expect(updates).toEqual([
{
connectionId: 'conn_1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'dbt',
tableFields: {
tags: ['finance'],
freshness: { loaded_at_field: 'created_at' },
},
columnFields: {
status: {
constraints: { not_null: true },
enum_values: ['placed', 'shipped'],
tests: [
{ name: 'accepted_values', package: 'dbt', kwargs: { values: ['placed', 'shipped'] } },
],
},
},
},
]);
});
});

View file

@ -0,0 +1,74 @@
import type { KtxMetadataUpdate } from '../../../scan/enrichment-types.js';
import { findMatchingKtxTable, type DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/**
 * Converts parsed dbt tables into metadata updates (tags, freshness,
 * constraints, enum values, tests) for the matching host tables.
 *
 * Tables without a host match, and tables that carry no metadata at all, do
 * not produce an update. Columns are matched to host columns by
 * case-insensitive name and keyed by the host column's spelling.
 *
 * @param input Connection id, parse result, host tables and optional target schema.
 * @returns One update with source `dbt` per table that has metadata.
 */
export function toMetadataUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
  targetSchema: string | null;
}): KtxMetadataUpdate[] {
  const updates: KtxMetadataUpdate[] = [];

  for (const dbtTable of input.parseResult.tables) {
    const hostTable = findMatchingKtxTable(dbtTable, input.hostTables, input.targetSchema);
    if (!hostTable) {
      continue;
    }

    // Table-level fields.
    const tableFields: Record<string, unknown> = {};
    if (dbtTable.tagsDbt?.length) {
      tableFields.tags = dbtTable.tagsDbt;
    }
    const freshnessDbt = dbtTable.freshnessDbt;
    if (freshnessDbt) {
      const freshness: Record<string, unknown> = {};
      if (freshnessDbt.raw !== undefined) {
        freshness.raw = freshnessDbt.raw;
      }
      if (freshnessDbt.loadedAtField !== undefined) {
        freshness.loaded_at_field = freshnessDbt.loadedAtField;
      }
      tableFields.freshness = freshness;
    }

    // Column-level fields, only for columns that exist on the host table.
    const columnFields: Record<string, Record<string, unknown>> = {};
    for (const dbtColumn of dbtTable.columns) {
      const hostColumn = hostTable.columns.find(
        (candidate) => candidate.name.toLowerCase() === dbtColumn.name.toLowerCase(),
      );
      if (hostColumn) {
        const fields: Record<string, unknown> = {};
        if (dbtColumn.constraints) {
          fields.constraints = dbtColumn.constraints.dbt;
        }
        if (dbtColumn.enumValuesDbt?.length) {
          fields.enum_values = dbtColumn.enumValuesDbt;
        }
        if (dbtColumn.dataTests?.length) {
          fields.tests = dbtColumn.dataTests.map(({ name, package: packageName, kwargs }) => ({
            name,
            package: packageName,
            ...(kwargs ? { kwargs } : {}),
          }));
        }
        if (Object.keys(fields).length > 0) {
          columnFields[hostColumn.name] = fields;
        }
      }
    }

    const hasTableFields = Object.keys(tableFields).length > 0;
    const hasColumnFields = Object.keys(columnFields).length > 0;
    if (hasTableFields || hasColumnFields) {
      updates.push({
        connectionId: input.connectionId,
        table: { catalog: hostTable.catalog, db: hostTable.db, name: hostTable.name },
        source: 'dbt',
        ...(hasTableFields ? { tableFields } : {}),
        ...(hasColumnFields ? { columnFields } : {}),
      });
    }
  }

  return updates;
}

View file

@ -0,0 +1,62 @@
import { describe, expect, it } from 'vitest';
import type { DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
import { toRelationshipUpdates } from './to-relationship-updates.js';
// Expected author email of dbt-derived joins, assembled from two fragments exactly as in the module under test.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');
// Host-side fixture: `orders` lives in `analytics`, `customers` in `staging`.
const hostTables: DbtHostTableLite[] = [
{
id: '1',
name: 'orders',
catalog: 'warehouse',
db: 'analytics',
columns: [{ id: 'c1', name: 'customer_id' }],
},
{
id: '2',
name: 'customers',
catalog: 'warehouse',
db: 'staging',
columns: [{ id: 'c2', name: 'id' }],
},
];
// Three relationships: one that resolves (despite a toSchema that differs from
// the host table's db), one with an unknown column, one with an unknown table.
const parseResult: DbtSchemaParseResult = {
projectName: null,
dbtVersion: null,
tables: [],
relationships: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
fromSchema: 'analytics',
toSchema: 'analytics',
description: 'schema intentionally differs from the host customers table',
},
{ fromTable: 'orders', fromColumn: 'missing', toTable: 'customers', toColumn: 'id' },
{ fromTable: 'orders', fromColumn: 'customer_id', toTable: 'missing_table', toColumn: 'id' },
],
};
// Pins the current contract: tables are matched by name only, and the two
// unresolvable relationships are counted in skippedNoMatch.
describe('dbt relationship update payloads', () => {
it('validates relationships using the current name-only matching behavior and dbt provenance', () => {
expect(toRelationshipUpdates({ connectionId: 'conn-1', parseResult, hostTables })).toEqual({
joins: [
{
connectionId: 'conn-1',
fromTable: 'orders',
fromColumns: ['customer_id'],
toTable: 'customers',
toColumns: ['id'],
relationship: 'many_to_one',
author: 'dbt',
authorEmail: DBT_SYSTEM_EMAIL,
},
],
skippedNoMatch: 2,
});
});
});

View file

@ -0,0 +1,57 @@
import type { KtxJoinUpdate } from '../../../scan/enrichment-types.js';
import type { DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/** Join updates derived from dbt `relationships` tests. */
export interface DbtRelationshipUpdates {
/** One join per relationship whose tables and columns were all found on the host. */
joins: KtxJoinUpdate[];
/** Number of relationships dropped because a table or a column had no host match. */
skippedNoMatch: number;
}
// Author email recorded on every dbt-derived join (assembled from two fragments).
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');

/**
 * Converts parsed dbt relationships into join updates.
 *
 * Tables are looked up by case-insensitive name only; the relationship's
 * schemas are not consulted, and when several host tables share a name the
 * last one in `hostTables` is used. Columns are matched by case-insensitive
 * name. The emitted join uses the host's spelling of table and column names.
 *
 * @param input Connection id, parse result and host tables.
 * @returns The resolved joins plus the count of relationships that could not be resolved.
 */
export function toRelationshipUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
}): DbtRelationshipUpdates {
  const hostByLowerName = new Map<string, DbtHostTableLite>();
  input.hostTables.forEach((hostTable) => {
    hostByLowerName.set(hostTable.name.toLowerCase(), hostTable);
  });

  const result: DbtRelationshipUpdates = { joins: [], skippedNoMatch: 0 };

  for (const rel of input.parseResult.relationships) {
    const source = hostByLowerName.get(rel.fromTable.toLowerCase());
    const target = hostByLowerName.get(rel.toTable.toLowerCase());
    if (source === undefined || target === undefined) {
      result.skippedNoMatch += 1;
      continue;
    }

    const sourceColumn = source.columns.find(
      (candidate) => candidate.name.toLowerCase() === rel.fromColumn.toLowerCase(),
    );
    const targetColumn = target.columns.find(
      (candidate) => candidate.name.toLowerCase() === rel.toColumn.toLowerCase(),
    );
    if (sourceColumn === undefined || targetColumn === undefined) {
      result.skippedNoMatch += 1;
      continue;
    }

    result.joins.push({
      connectionId: input.connectionId,
      fromTable: source.name,
      fromColumns: [sourceColumn.name],
      toTable: target.name,
      toColumns: [targetColumn.name],
      relationship: 'many_to_one',
      author: 'dbt',
      authorEmail: DBT_SYSTEM_EMAIL,
    });
  }

  return result;
}

View file

@ -0,0 +1,410 @@
import { describe, expect, it } from 'vitest';
import { type DbtHostTableLite, matchDbtTables } from './dbt-descriptions/match-tables.js';
import { mergeSemanticModelTables } from './dbt-descriptions/merge-semantic-model-tables.js';
import { parseDbtSchemaFiles } from './dbt-descriptions/parse-schema.js';
import { toDescriptionUpdates } from './dbt-descriptions/to-description-updates.js';
import { toRelationshipUpdates } from './dbt-descriptions/to-relationship-updates.js';
import { parseMetricflowFiles } from './metricflow/deep-parse.js';
import { mapCrossModelMetricToSource, mapSemanticModelToSource } from './metricflow/semantic-models.js';
// Author email value for this spec, assembled from two fragments.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');
// MetricFlow fixture: two semantic models (orders -> fct_orders, customers ->
// dim_customers), two simple metrics and one derived metric built from them.
const metricflowYaml = `
semantic_models:
- name: orders_semantic
description: MetricFlow order facts
model: ref('fct_orders')
defaults:
agg_time_dimension: ordered_at
entities:
- name: customer
type: foreign
expr: customer_id
description: Customer relationship
dimensions:
- name: status
type: categorical
expr: status
description: Order status
- name: ordered_at
type: time
expr: ordered_at
measures:
- name: total_revenue
agg: sum
expr: amount
description: Revenue
- name: customers_semantic
description: Customer dimension
model: ref('dim_customers')
entities:
- name: customer
type: primary
expr: id
dimensions:
- name: country
type: categorical
expr: country
description: Customer country
measures:
- name: customer_count
agg: count
expr: id
description: Customer count
metrics:
- name: total_revenue
type: simple
type_params:
measure: total_revenue
- name: customer_count
type: simple
type_params:
measure: customer_count
- name: revenue_per_customer
description: Revenue per customer
type: derived
type_params:
expr: total_revenue / NULLIF(customer_count, 0)
metrics:
- name: total_revenue
alias: total_revenue
- name: customer_count
alias: customer_count
`;
// dbt schema.yml fixture: one source table (`customers`, identifier `dim_customers`) and one model whose
// name and schema are Jinja `var(...)` expressions; the test passes matching variables to the parser.
const schemaYaml = `
version: 2
sources:
- name: raw
database: warehouse
schema: landing
tables:
- name: customers
identifier: dim_customers
description: Raw customer dimension
columns:
- name: id
description: Customer primary key
- name: country
description: Country name
models:
- name: "{{ var('orders_model', 'fct_orders') }}"
schema: "{{ var('mart_schema', 'analytics') }}"
description: Modeled orders
columns:
- name: customer_id
description: Linked customer id
tests:
- relationships:
to: ref('dim_customers')
field: id
- name: status
description: Order status
- name: amount
description: Gross amount
`;
// Host-side tables that the dbt tables in `schemaYaml` are matched against:
// `fct_orders` in warehouse.analytics and `dim_customers` in warehouse.landing.
const hostTables: DbtHostTableLite[] = [
{
id: 'orders-table',
name: 'fct_orders',
catalog: 'warehouse',
db: 'analytics',
columns: [
{ id: 'orders-customer-id', name: 'customer_id' },
{ id: 'orders-status', name: 'status' },
{ id: 'orders-amount', name: 'amount' },
// Present on the host table but not described in the dbt schema fixture.
{ id: 'orders-ordered-at', name: 'ordered_at' },
],
},
{
id: 'customers-table',
name: 'dim_customers',
catalog: 'warehouse',
db: 'landing',
columns: [
{ id: 'customers-id', name: 'id' },
{ id: 'customers-country', name: 'country' },
],
},
];
// Golden test: runs the MetricFlow parser and the dbt-description pipeline over the fixtures above and
// pins the exact output of every stage with `toEqual`, so any change in output shape fails here.
describe('dbt extraction golden parity fixture', () => {
it('freezes the relocated MetricFlow and dbt-description contract together', () => {
// Stage 1: parse the MetricFlow YAML into semantic models, cross-model metrics and relationships.
const metricflow = parseMetricflowFiles([{ path: 'semantic_models/orders.yml', content: metricflowYaml }]);
expect(metricflow).toEqual({
semanticModels: [
{
name: 'orders_semantic',
description: 'MetricFlow order facts',
modelRef: 'fct_orders',
dimensions: [
{
name: 'status',
column: 'status',
type: 'string',
label: 'Status',
description: 'Order status',
},
{
name: 'ordered_at',
column: 'ordered_at',
type: 'time',
label: 'Ordered At',
description: undefined,
},
],
measures: [
{
type: 'simple',
name: 'total_revenue',
column: 'amount',
aggregation: 'sum',
label: 'Total Revenue',
description: 'Revenue',
},
],
entities: [{ name: 'customer', type: 'foreign', expr: 'customer_id', description: 'Customer relationship' }],
defaultTimeDimension: 'ordered_at',
},
{
name: 'customers_semantic',
description: 'Customer dimension',
modelRef: 'dim_customers',
dimensions: [
{
name: 'country',
column: 'country',
type: 'string',
label: 'Country',
description: 'Customer country',
},
],
measures: [
{
type: 'simple',
name: 'customer_count',
column: 'id',
aggregation: 'count',
label: 'Customer Count',
description: 'Customer count',
},
],
entities: [{ name: 'customer', type: 'primary', expr: 'id' }],
defaultTimeDimension: null,
},
],
crossModelMetrics: [
{
name: 'revenue_per_customer',
label: null,
description: 'Revenue per customer',
type: 'derived',
expr: 'total_revenue / NULLIF(customer_count, 0)',
dependsOn: [
{ metricName: 'orders_semantic', alias: 'total_revenue' },
{ metricName: 'customers_semantic', alias: 'customer_count' },
],
filter: null,
},
],
relationships: [
{
fromTable: 'fct_orders',
fromColumn: 'customer_id',
toTable: 'dim_customers',
toColumn: 'id',
description: 'Customer relationship',
},
],
warnings: [],
});
// Stage 2: map the first semantic model onto a source definition for a given physical table.
expect(mapSemanticModelToSource(metricflow.semanticModels[0], 'analytics.fct_orders')).toEqual({
name: 'fct-orders',
table: 'analytics.fct_orders',
grain: ['status', 'ordered_at'],
columns: [
{ name: 'status', type: 'string', description: 'Order status' },
{ name: 'ordered_at', type: 'time' },
],
measures: [
{
name: 'total_revenue',
expr: 'sum(amount)',
description: 'Revenue',
},
],
joins: [],
descriptions: { dbt: 'MetricFlow order facts' },
});
// Stage 3: map the derived cross-model metric onto a SQL-backed source definition.
expect(mapCrossModelMetricToSource(metricflow.crossModelMetrics[0])).toEqual({
name: 'revenue-per-customer',
sql: 'total_revenue / NULLIF(customer_count, 0)',
descriptions: { dbt: 'Revenue per customer' },
grain: [],
columns: [],
measures: [
{
name: 'revenue_per_customer',
expr: 'total_revenue / NULLIF(customer_count, 0)',
description: 'Revenue per customer',
},
],
joins: [],
});
// Stage 4: parse schema.yml, supplying the variables referenced by the `var(...)` expressions.
const schema = parseDbtSchemaFiles(
[{ path: 'models/schema.yml', content: schemaYaml }],
new Map([
['orders_model', 'fct_orders'],
['mart_schema', 'analytics'],
]),
);
// Stage 5: merge the semantic models into the schema parse result and pin the merged shape.
const merged = mergeSemanticModelTables(schema, metricflow.semanticModels);
expect(merged).toEqual({
projectName: null,
dbtVersion: null,
tables: [
{
name: 'dim_customers',
description: 'Raw customer dimension',
database: 'warehouse',
schema: 'landing',
columns: [
{ name: 'id', description: 'Customer primary key', dataType: null },
{ name: 'country', description: 'Country name', dataType: null },
],
resourceType: 'source',
},
{
name: 'fct_orders',
description: 'Modeled orders',
database: null,
schema: 'analytics',
columns: [
{
name: 'customer_id',
description: 'Linked customer id',
dataType: null,
dataTests: [
{
name: 'relationships',
package: 'dbt',
kwargs: { to: "ref('dim_customers')", field: 'id' },
},
],
},
{ name: 'status', description: 'Order status', dataType: null },
{ name: 'amount', description: 'Gross amount', dataType: null },
],
resourceType: 'model',
},
],
relationships: [
{
fromTable: 'fct_orders',
fromColumn: 'customer_id',
toTable: 'dim_customers',
toColumn: 'id',
fromSchema: 'analytics',
},
],
});
// Stage 6: match the dbt tables against the host tables, with 'analytics' as the target schema.
expect(matchDbtTables(merged.tables, hostTables, 'analytics')).toEqual([
{
dbtTable: 'dim_customers',
dbtSchema: 'landing',
dbtDatabase: 'warehouse',
hostTableId: 'customers-table',
hostTableName: 'dim_customers',
matched: true,
tableDescriptionAction: 'import',
tableDescriptionFound: true,
columnsToImport: 2,
columnsMatched: 2,
columnsTotal: 2,
columnDescriptionsFound: 2,
},
{
dbtTable: 'fct_orders',
dbtSchema: 'analytics',
dbtDatabase: null,
hostTableId: 'orders-table',
hostTableName: 'fct_orders',
matched: true,
tableDescriptionAction: 'import',
tableDescriptionFound: true,
columnsToImport: 3,
columnsMatched: 3,
columnsTotal: 3,
columnDescriptionsFound: 3,
},
]);
// Stage 7: description updates; each matched table yields a 'dbt' update plus an 'ai' invalidation.
expect(
toDescriptionUpdates({
connectionId: 'warehouse-1',
parseResult: merged,
hostTables,
targetSchema: 'analytics',
}),
).toEqual({
dbt: [
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'landing', name: 'dim_customers' },
source: 'dbt',
tableDescription: 'Raw customer dimension',
columnDescriptions: {
id: 'Customer primary key',
country: 'Country name',
},
},
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'fct_orders' },
source: 'dbt',
tableDescription: 'Modeled orders',
columnDescriptions: {
customer_id: 'Linked customer id',
status: 'Order status',
amount: 'Gross amount',
},
},
],
aiInvalidations: [
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'landing', name: 'dim_customers' },
source: 'ai',
},
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'fct_orders' },
source: 'ai',
},
],
});
// Stage 8: relationship updates; the single dbt `relationships` test becomes one many_to_one join.
expect(toRelationshipUpdates({ connectionId: 'warehouse-1', parseResult: merged, hostTables })).toEqual({
joins: [
{
connectionId: 'warehouse-1',
fromTable: 'fct_orders',
fromColumns: ['customer_id'],
toTable: 'dim_customers',
toColumns: ['id'],
relationship: 'many_to_one',
author: 'dbt',
authorEmail: DBT_SYSTEM_EMAIL,
},
],
skippedNoMatch: 0,
});
});
});

View file

@ -0,0 +1,36 @@
import { describe, expect, it } from 'vitest';
import { chunkDbtProject } from './chunk.js';
// Specs for chunkDbtProject: the peer-index cap and diff-set handling on large (more than 25 file) projects.
describe('chunkDbtProject', () => {
// Builds a DiffSet in which only `modified` is populated.
const diffSet = (modified: string[]) => ({ added: [], modified, deleted: [], unchanged: [] });
it('caps peerFileIndex when the project has very many yaml files', () => {
// 201 model files + dbt_project.yml = 202 paths, so every unit has 201 peers (above the 200 cap).
const modelPaths = Array.from({ length: 201 }, (_, i) => `models/m${i}.yml`);
const allPaths = ['dbt_project.yml', ...modelPaths].sort();
const { workUnits } = chunkDbtProject({ allPaths });
const [first] = workUnits;
expect(first).toBeDefined();
expect(first?.peerFileIndex).toHaveLength(200);
expect(first?.notes).toMatch(/capped at 200/);
});
it('keeps large-project model work units when dbt_project.yml changes', () => {
// Only the shared dependency changed; every per-model unit must still be emitted.
const modelPaths = Array.from({ length: 30 }, (_, i) => `models/m${i}.yml`);
const allPaths = ['dbt_project.yml', ...modelPaths].sort();
const { workUnits } = chunkDbtProject({ allPaths }, { diffSet: diffSet(['dbt_project.yml']) });
expect(workUnits).toHaveLength(30);
expect(workUnits[0]?.rawFiles).toEqual(['models/m0.yml']);
expect(workUnits[0]?.dependencyPaths).toContain('dbt_project.yml');
});
it('keeps large-project model work units when non-model yaml peers change', () => {
// A touched peer file (outside models/) is promoted into dependencyPaths of each kept unit.
const modelPaths = Array.from({ length: 30 }, (_, i) => `models/m${i}.yml`);
const allPaths = ['dbt_project.yml', 'seeds/seed_properties.yml', ...modelPaths].sort();
const { workUnits } = chunkDbtProject({ allPaths }, { diffSet: diffSet(['seeds/seed_properties.yml']) });
expect(workUnits).toHaveLength(30);
expect(workUnits[0]?.rawFiles).toEqual(['models/m0.yml']);
expect(workUnits[0]?.dependencyPaths).toContain('seeds/seed_properties.yml');
});
});

View file

@ -0,0 +1,130 @@
import type { ChunkResult, DiffSet, WorkUnit } from '../../types.js';
import type { ParsedDbtProject } from './parse.js';
/** Options for `chunkDbtProject`. */
interface ChunkOptions {
/** When set, the first-run work units are filtered/adjusted against this diff; when omitted, all units are returned. */
diffSet?: DiffSet;
}
/**
 * Per-model work units (emitted when the project has more than 25 YAML files) only name `rawFiles`
 * under `models/**`. Other `.yml` files (e.g. some `seeds/` or custom layouts) still appear in
 * `peerFileIndex`, or in the small-project / no-models fallbacks; v1 does not emit one work unit
 * per non-models file.
 */
const MODELS_PREFIX = 'models/';
/** `peerFileIndex` is a hint only (agents may not read those paths). Cap to limit prompt size. */
const MAX_PEER_FILE_INDEX = 200;
/**
 * Picks the dbt project manifest from the staged YAML paths.
 *
 * `dbt_project.yml` takes precedence over `dbt_project.yaml` when both are listed. Only exact
 * matches count, so a manifest nested in a sub-directory is not recognised.
 *
 * @param allPaths - Staged paths to search.
 * @returns The manifest path, or `undefined` when neither file name is present.
 */
function projectYamlPath(allPaths: string[]): string | undefined {
  const manifestNames = ['dbt_project.yml', 'dbt_project.yaml'];
  return manifestNames.find((manifestName) => allPaths.includes(manifestName));
}
/**
 * Selects the paths that live under the models directory, sorted.
 *
 * Backslashes are treated as separators for the prefix check only; the returned entries are the
 * original, unmodified strings.
 *
 * @param allPaths - Staged paths to filter.
 * @returns A new sorted array of the paths under `MODELS_PREFIX`.
 */
function modelRelativePaths(allPaths: string[]): string[] {
  const underModels: string[] = [];
  for (const candidate of allPaths) {
    const posixCandidate = candidate.replace(/\\/g, '/');
    if (posixCandidate.startsWith(MODELS_PREFIX)) {
      underModels.push(candidate);
    }
  }
  underModels.sort();
  return underModels;
}
/**
 * Derives a work-unit key from a model file path.
 *
 * The `.yml`/`.yaml` extension is dropped, every run of non-alphanumeric characters becomes a single
 * `-`, leading/trailing dashes are trimmed, and the result is lower-cased and prefixed with `dbt-`.
 *
 * @param mf - Model file path.
 * @returns The key, e.g. `models/marts/Orders.yml` becomes `dbt-models-marts-orders`.
 */
function unitKeyForModelFile(mf: string): string {
  const withoutExtension = mf.replace(/\.(ya?ml)$/i, '');
  const posixPath = withoutExtension.replace(/\\/g, '/');
  const dashed = posixPath.replace(/[^a-zA-Z0-9]+/g, '-');
  const slug = dashed.replace(/^-+|-+$/g, '');
  return 'dbt-' + slug.toLowerCase();
}
/**
 * Builds the work units for a first (non-incremental) run over a dbt project.
 *
 * - No paths: no work units.
 * - At most 25 paths: a single `dbt-all` unit containing every path.
 * - More than 25 paths but none under `models/`: a single `dbt-all` unit with the project manifest
 *   (when known) as its dependency.
 * - Otherwise: one unit per model file, with every other path as a (capped) peer hint and the
 *   project manifest as a dependency.
 *
 * @param allPaths - Every staged YAML path.
 * @param dbtDep - Path of the dbt project manifest, if one was found.
 * @returns The work units in model-file sort order.
 */
function emitFirstRunWorkUnits(allPaths: string[], dbtDep: string | undefined): WorkUnit[] {
  if (allPaths.length === 0) {
    return [];
  }
  if (allPaths.length <= 25) {
    return [
      {
        unitKey: 'dbt-all',
        displayLabel: 'dbt project (all yaml)',
        rawFiles: [...allPaths],
        peerFileIndex: [],
        dependencyPaths: [],
        notes: 'dbt project — all YAML in one WorkUnit (≤25 files)',
      },
    ];
  }
  const modelFiles = modelRelativePaths(allPaths);
  if (modelFiles.length === 0) {
    return [
      {
        unitKey: 'dbt-all',
        displayLabel: 'dbt project (all yaml, no models/**)',
        rawFiles: [...allPaths],
        peerFileIndex: [],
        dependencyPaths: dbtDep ? [dbtDep] : [],
        notes: 'dbt: no models/**/*.yml — single slice with dbt_project as dependency if present',
      },
    ];
  }
  // Sort once up front: dropping one entry from a sorted list leaves it sorted, so each unit's peer
  // list matches what re-filtering and re-sorting per model file used to produce.
  const sortedPaths = [...allPaths].sort();
  // The manifest membership check does not depend on the model file either.
  const projectDep = dbtDep && allPaths.includes(dbtDep) ? dbtDep : undefined;
  return modelFiles.map((mf) => {
    const allPeers = sortedPaths.filter((p) => p !== mf);
    const truncated = allPeers.length > MAX_PEER_FILE_INDEX;
    const peerFileIndex = truncated ? allPeers.slice(0, MAX_PEER_FILE_INDEX) : allPeers;
    // Fresh array per unit so later per-unit edits cannot leak between units.
    const dependencyPaths = projectDep && mf !== projectDep ? [projectDep] : [];
    const notes = truncated
      ? `dbt model schema slice (peer index capped at ${MAX_PEER_FILE_INDEX} of ${allPeers.length} paths)`
      : 'dbt model schema slice';
    return {
      unitKey: unitKeyForModelFile(mf),
      displayLabel: `dbt ${mf}`,
      rawFiles: [mf],
      peerFileIndex,
      dependencyPaths,
      notes,
    };
  });
}
/**
 * Narrows first-run work units to those affected by an incremental diff.
 *
 * A unit is kept when any of its raw files, dependencies or peer-index entries was added or
 * modified. For a kept unit:
 * - `rawFiles` shrinks to the touched raw files, or stays complete when only dependencies/peers changed;
 * - untouched raw files and touched peer files are added to `dependencyPaths`.
 *
 * The input units and their arrays are never mutated.
 *
 * @param firstRunUnits - Units produced for a full run.
 * @param diffSet - Added/modified/deleted paths since the previous run.
 * @returns The kept units plus an `eviction` entry listing deleted paths (sorted) when there are any.
 */
function applyDiffSet(firstRunUnits: WorkUnit[], diffSet: DiffSet): ChunkResult {
  const touched = new Set([...diffSet.added, ...diffSet.modified]);
  const isTouched = (path: string): boolean => touched.has(path);
  const kept: WorkUnit[] = [];
  for (const wu of firstRunUnits) {
    const touchedRawFiles = wu.rawFiles.filter(isTouched);
    const touchedPeerFiles = wu.peerFileIndex.filter(isTouched);
    const dependencyTouched = wu.dependencyPaths.some(isTouched);
    if (touchedRawFiles.length === 0 && !dependencyTouched && touchedPeerFiles.length === 0) {
      continue;
    }
    const rawTouched = touchedRawFiles.length > 0;
    // Copy before sorting: sorting `wu.rawFiles` directly would reorder the caller's array.
    const rawFiles = rawTouched ? touchedRawFiles : [...wu.rawFiles];
    // Raw files that did not change remain available to the unit as context.
    const unchangedRaw = rawTouched ? wu.rawFiles.filter((p) => !touched.has(p)) : [];
    const combinedDeps = new Set<string>([...wu.dependencyPaths, ...unchangedRaw, ...touchedPeerFiles]);
    kept.push({
      ...wu,
      rawFiles: rawFiles.sort(),
      dependencyPaths: [...combinedDeps].sort(),
    });
  }
  const eviction = diffSet.deleted.length > 0 ? { deletedRawPaths: [...diffSet.deleted].sort() } : undefined;
  return { workUnits: kept, eviction };
}
/**
 * Splits a parsed dbt project into work units.
 *
 * @param project - Parsed project; only `allPaths` is read.
 * @param opts - Optional diff set. Without one, the full first-run units are returned as-is; with
 *   one, the units are narrowed to those affected by the diff.
 * @returns The work units, plus eviction info when a diff set with deletions is supplied.
 */
export function chunkDbtProject(project: ParsedDbtProject, opts: ChunkOptions = {}): ChunkResult {
  const { allPaths } = project;
  const manifestPath = projectYamlPath(allPaths);
  const firstRunUnits = emitFirstRunWorkUnits(allPaths, manifestPath);
  const { diffSet } = opts;
  return diffSet ? applyDiffSet(firstRunUnits, diffSet) : { workUnits: firstRunUnits };
}

View file

@ -0,0 +1,57 @@
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import type { SourceAdapter } from '../../types.js';
import { DbtSourceAdapter } from './dbt.adapter.js';
// Specs for DbtSourceAdapter; each test gets a fresh temp staged directory that is removed afterwards.
describe('DbtSourceAdapter', () => {
let stagedDir: string;
let adapter: SourceAdapter;
beforeEach(async () => {
stagedDir = await mkdtemp(join(tmpdir(), 'dbt-adapter-'));
adapter = new DbtSourceAdapter();
});
afterEach(async () => {
await rm(stagedDir, { recursive: true, force: true });
});
it('declares the expected source key and skill list', () => {
expect(adapter.source).toBe('dbt');
expect(adapter.skillNames).toEqual(['dbt_ingest']);
});
it('detects a staged dbt project root (dbt_project.yml)', async () => {
await writeFile(join(stagedDir, 'dbt_project.yml'), "name: 'jaffle'\nversion: '1.0.0'\n", 'utf-8');
expect(await adapter.detect(stagedDir)).toBe(true);
});
it('chunk: dbt_project.yml + models/a.yml yields one WU (≤25 files)', async () => {
await writeFile(join(stagedDir, 'dbt_project.yml'), "name: 'jaffle'\n", 'utf-8');
await mkdir(join(stagedDir, 'models'), { recursive: true });
await writeFile(
join(stagedDir, 'models/a.yml'),
'version: 2\nmodels:\n - name: orders\n description: Orders\n',
'utf-8',
);
const result = await adapter.chunk(stagedDir);
// Two YAML files is within the small-project threshold, so everything lands in the 'dbt-all' unit.
expect(result.workUnits).toHaveLength(1);
expect(result.workUnits[0].unitKey).toBe('dbt-all');
// chunk() also attaches the parsed schema as parseArtifacts.
expect(result.parseArtifacts).toMatchObject({
projectName: 'jaffle',
tables: [{ name: 'orders', description: 'Orders' }],
});
});
it('implements fetch() for git-backed dbt source setup', () => {
expect(adapter.fetch).toBeTypeOf('function');
});
it('reports mapped warehouse targets for bundle SL discovery', async () => {
// Duplicate ids in the options are expected to be collapsed.
adapter = new DbtSourceAdapter({ targetConnectionIds: ['postgres-warehouse', 'postgres-warehouse'] });
await expect(adapter.listTargetConnectionIds?.(stagedDir)).resolves.toEqual(['postgres-warehouse']);
});
});

View file

@ -0,0 +1,53 @@
import { join } from 'node:path';
import type { ChunkResult, DiffSet, SourceAdapter } from '../../types.js';
import type { FetchContext } from '../../types.js';
import { loadProjectInfo } from '../../dbt-shared/project-vars.js';
import { loadDbtSchemaFiles } from '../../dbt-shared/schema-files.js';
import { parseDbtSchemaFiles } from '../dbt-descriptions/parse-schema.js';
import { chunkDbtProject } from './chunk.js';
import { detectDbtStagedDir } from './detect.js';
import { fetchDbtRepo, type DbtPullConfig } from './fetch.js';
import { parseDbtStagedDir } from './parse.js';
/** Construction options for `DbtSourceAdapter`. */
interface DbtSourceAdapterOptions {
/** Base directory for the git cache used by `fetch`; when omitted, `'.ktx/cache'` is used. */
homeDir?: string;
/** Connection ids reported by `listTargetConnectionIds` (deduplicated and sorted there). */
targetConnectionIds?: string[];
}
/**
 * Source adapter for dbt projects: detects a staged project, fetches it from git, and chunks its
 * YAML files into work units.
 */
export class DbtSourceAdapter implements SourceAdapter {
  readonly source = 'dbt' as const;
  /** Runner merges: ingest_triage, sl_capture, wiki_capture (see ingest-bundle.runner.ts) */
  readonly skillNames: string[] = ['dbt_ingest'];

  constructor(private readonly options: DbtSourceAdapterOptions = {}) {}

  /** Resolves to `true` when `stagedDir` contains a dbt project manifest. */
  detect(stagedDir: string): Promise<boolean> {
    return detectDbtStagedDir(stagedDir);
  }

  /** Returns the configured target connection ids, deduplicated and sorted with `localeCompare`. */
  async listTargetConnectionIds(_stagedDir: string): Promise<string[]> {
    const configuredIds = this.options.targetConnectionIds ?? [];
    const uniqueIds = Array.from(new Set(configuredIds));
    uniqueIds.sort((left, right) => left.localeCompare(right));
    return uniqueIds;
  }

  /**
   * Fetches the dbt repo described by `pullConfig` and stages its YAML files into `stagedDir`.
   *
   * @throws Error when `pullConfig` has no `repoUrl`.
   */
  async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
    const config = pullConfig as DbtPullConfig | undefined;
    if (!config?.repoUrl) {
      throw new Error('dbt fetch requires repoUrl');
    }
    // One cache directory per connection, under the configured (or default) cache root.
    const cacheRoot = this.options.homeDir ?? '.ktx/cache';
    const cacheDir = join(cacheRoot, 'dbt', ctx.connectionId);
    await fetchDbtRepo({ config, cacheDir, stagedDir });
  }

  /** Chunks the staged project into work units and attaches the parsed schema as `parseArtifacts`. */
  async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    const project = await parseDbtStagedDir(stagedDir);
    const { projectName, variables } = await loadProjectInfo(stagedDir);
    const schemaFiles = await loadDbtSchemaFiles(stagedDir);
    const parseArtifacts = parseDbtSchemaFiles(schemaFiles, variables, { projectName });
    const chunked = chunkDbtProject(project, { diffSet });
    return { ...chunked, parseArtifacts };
  }
}

View file

@ -0,0 +1,12 @@
import { access } from 'node:fs/promises';
import { join } from 'node:path';
/**
 * Reports whether a staged directory looks like a dbt project root.
 *
 * Probes `dbt_project.yml` first and `dbt_project.yaml` second; any access failure is treated as
 * "not present" rather than surfaced.
 *
 * @param stagedDir - Directory to inspect.
 * @returns `true` when either manifest file is accessible.
 */
export async function detectDbtStagedDir(stagedDir: string): Promise<boolean> {
  const canAccess = async (fileName: string): Promise<boolean> => {
    try {
      await access(join(stagedDir, fileName));
    } catch {
      return false;
    }
    return true;
  };
  return (await canAccess('dbt_project.yml')) || (await canAccess('dbt_project.yaml'));
}

View file

@ -0,0 +1,38 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { fetchDbtRepo } from './fetch.js';
// Spec for fetchDbtRepo using an injected cloneOrPull stub, so no git or network access is needed.
describe('fetchDbtRepo', () => {
let tempDir: string;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-dbt-fetch-'));
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('copies dbt yaml files from a fetched repo subpath into staged dir', async () => {
const cacheDir = join(tempDir, 'cache');
const stagedDir = join(tempDir, 'staged');
// Pre-populate the cache as if the repo had already been cloned, with the project under 'analytics/'.
await mkdir(join(cacheDir, 'analytics', 'models'), { recursive: true });
await writeFile(join(cacheDir, 'analytics', 'dbt_project.yml'), 'name: analytics\n', 'utf-8');
await writeFile(join(cacheDir, 'analytics', 'models', 'orders.yml'), 'models: []\n', 'utf-8');
const cloneOrPull = vi.fn(async () => ({ commitHash: 'abc123' }));
await expect(
fetchDbtRepo({
config: { repoUrl: 'https://github.com/acme/dbt.git', path: 'analytics' },
cacheDir,
stagedDir,
deps: { cloneOrPull },
}),
).resolves.toEqual({ commitHash: 'abc123', filesCopied: 2 });
// Files are staged relative to the configured sub-path, not the repo root.
await expect(readFile(join(stagedDir, 'dbt_project.yml'), 'utf-8')).resolves.toContain('analytics');
await expect(readFile(join(stagedDir, 'models', 'orders.yml'), 'utf-8')).resolves.toContain('models');
});
});

View file

@ -0,0 +1,60 @@
import { access, copyFile, mkdir, readdir } from 'node:fs/promises';
import { dirname, join, relative } from 'node:path';
import { cloneOrPull, sanitizeRepoError } from '../../repo-fetch.js';
/** Pull configuration for a git-hosted dbt project. */
export interface DbtPullConfig {
/** Repository URL handed to `cloneOrPull`. */
repoUrl: string;
/** Branch to fetch; `fetchDbtRepo` falls back to `'main'` when omitted. */
branch?: string;
/** Sub-directory of the repository that contains the dbt project; the repo root is used when omitted. */
path?: string;
/** Token forwarded to `cloneOrPull` and passed to `sanitizeRepoError` when an error is reported. */
authToken?: string | null;
}
/** Arguments for `fetchDbtRepo`. */
export interface FetchDbtRepoParams {
/** What to fetch. */
config: DbtPullConfig;
/** Directory the repository is cloned/pulled into. */
cacheDir: string;
/** Directory that receives the copied YAML files. */
stagedDir: string;
/** Optional overrides for collaborators (the spec injects a `cloneOrPull` stub here). */
deps?: {
cloneOrPull?: typeof cloneOrPull;
};
}
/**
 * Clones or pulls the configured repository into the cache directory, then copies its YAML files
 * (from `config.path` when set, otherwise the repo root) into the staged directory.
 *
 * @param params - Fetch configuration, directories and optional dependency overrides.
 * @returns The fetched commit hash and the number of YAML files copied.
 * @throws Error whose message is `sanitizeRepoError(error, authToken)` for any failure in the
 *   clone/pull or copy steps.
 */
export async function fetchDbtRepo(params: FetchDbtRepoParams): Promise<{ commitHash: string; filesCopied: number }> {
  const { config, cacheDir, stagedDir, deps } = params;
  try {
    const syncRepo = deps?.cloneOrPull ?? cloneOrPull;
    const { commitHash } = await syncRepo({
      repoUrl: config.repoUrl,
      authToken: config.authToken,
      cacheDir,
      branch: config.branch ?? 'main',
    });
    let sourceRoot = cacheDir;
    if (config.path) {
      sourceRoot = join(cacheDir, config.path);
    }
    const filesCopied = await copyYamlFilesRecursive(sourceRoot, stagedDir);
    return { commitHash, filesCopied };
  } catch (error) {
    // Re-wrap so that only the sanitized message is propagated to callers.
    throw new Error(sanitizeRepoError(error, config.authToken));
  }
}
/**
 * Copies every `.yml`/`.yaml` file under `sourceRoot` into `destRoot`, preserving relative paths.
 *
 * @param sourceRoot - Directory to scan recursively.
 * @param destRoot - Destination root; created if needed (only when `sourceRoot` is accessible).
 * @returns Number of files copied; `0` when `sourceRoot` is not accessible.
 */
async function copyYamlFilesRecursive(sourceRoot: string, destRoot: string): Promise<number> {
  try {
    await access(sourceRoot);
  } catch {
    return 0;
  }
  await mkdir(destRoot, { recursive: true });
  const dirents = await readdir(sourceRoot, { withFileTypes: true, recursive: true });
  const yamlFiles = dirents.filter((dirent) => dirent.isFile() && /\.ya?ml$/i.test(dirent.name));
  for (const yamlFile of yamlFiles) {
    const fromPath = join(yamlFile.parentPath, yamlFile.name);
    const toPath = join(destRoot, relative(sourceRoot, fromPath));
    await mkdir(dirname(toPath), { recursive: true });
    await copyFile(fromPath, toPath);
  }
  return yamlFiles.length;
}

View file

@ -0,0 +1,8 @@
import { describe, expect, it } from 'vitest';
import { normalizeDbtPath } from './parse.js';
// Spec for normalizeDbtPath: backslash separators are rewritten to forward slashes.
describe('normalizeDbtPath', () => {
it('normalizes Windows separators to POSIX separators', () => {
expect(normalizeDbtPath('models\\marts\\orders.yml')).toBe('models/marts/orders.yml');
});
});

View file

@ -0,0 +1,32 @@
import { readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
/** Matches file names ending in `.yml` or `.yaml`, case-insensitively. */
const YAML_EXT_RE = /\.(ya?ml)$/i;
/**
 * Rewrites every backslash in a path to a forward slash.
 *
 * @param path - Path that may contain backslash separators.
 * @returns The same path using `/` separators only.
 */
export function normalizeDbtPath(path: string): string {
  return path.split('\\').join('/');
}
/**
 * Recursively lists the YAML files under a staged directory.
 *
 * @param stagedDir - Directory to scan.
 * @returns Sorted paths relative to `stagedDir`, with separators normalized by `normalizeDbtPath`.
 */
async function collectYamlFiles(stagedDir: string): Promise<string[]> {
  const dirents = await readdir(stagedDir, { withFileTypes: true, recursive: true });
  return dirents
    .filter((dirent) => dirent.isFile() && YAML_EXT_RE.test(dirent.name))
    .map((dirent) => normalizeDbtPath(relative(stagedDir, join(dirent.parentPath, dirent.name))))
    .sort();
}
/** Result of scanning a staged dbt directory; currently just the list of YAML paths. */
export interface ParsedDbtProject {
/** All `.yml` / `.yaml` paths under stagedDir, relative + sorted. */
allPaths: string[];
}
/**
 * Scans a staged dbt directory for YAML files.
 *
 * @param stagedDir - Directory to scan.
 * @returns The project descriptor holding the sorted, relative YAML paths.
 */
export async function parseDbtStagedDir(stagedDir: string): Promise<ParsedDbtProject> {
  return { allPaths: await collectYamlFiles(stagedDir) };
}

View file

@ -0,0 +1,48 @@
import { readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, SourceAdapter, WorkUnit } from '../../types.js';
/**
 * Minimal `SourceAdapter` for tests: every immediate sub-directory of the staged directory becomes
 * one work unit containing all files beneath it.
 */
export class FakeSourceAdapter implements SourceAdapter {
  readonly source = 'fake';
  readonly skillNames: string[] = [];

  /** Always claims the staged directory. */
  detect(): Promise<boolean> {
    return Promise.resolve(true);
  }

  /**
   * Emits one work unit per non-empty sub-directory, in sub-directory name order.
   *
   * @param stagedDir - Directory whose immediate sub-directories are chunked.
   * @param diffSet - When given, only sub-directories containing an added or modified file are
   *   emitted, and deleted paths are reported through `eviction`.
   * @returns Work units plus `eviction` when the diff set lists deletions.
   */
  async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    // Built once: the touched set depends only on the diff, not on the sub-directory being scanned.
    const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : undefined;
    const topLevel = await readdir(stagedDir, { withFileTypes: true });
    const subDirs = topLevel
      .filter((e) => e.isDirectory())
      .map((e) => e.name)
      .sort();
    const workUnits: WorkUnit[] = [];
    for (const subDir of subDirs) {
      const entries = await readdir(join(stagedDir, subDir), { withFileTypes: true, recursive: true });
      const rawFiles = entries
        .filter((e) => e.isFile())
        .map((e) => relative(stagedDir, join(e.parentPath, e.name)))
        .sort();
      if (rawFiles.length === 0) {
        continue;
      }
      if (touched && !rawFiles.some((p) => touched.has(p))) {
        continue;
      }
      workUnits.push({
        unitKey: `fake-${subDir}`,
        displayLabel: subDir,
        rawFiles,
        peerFileIndex: [],
        dependencyPaths: [],
      });
    }
    const eviction = diffSet && diffSet.deleted.length > 0 ? { deletedRawPaths: [...diffSet.deleted] } : undefined;
    return { workUnits, eviction };
  }
}

View file

@ -0,0 +1,158 @@
import { describe, expect, it, vi } from 'vitest';
import { BigQueryHistoricSqlQueryHistoryReader } from './bigquery-query-history-reader.js';
import { HistoricSqlGrantsMissingError } from './errors.js';
/** Shape of a canned result returned by the stub query client in these specs. */
interface FakeQueryResult {
headers: string[];
rows: unknown[][];
totalRows: number;
/** When set, the reader under test is expected to treat the result as a failure. */
error?: string;
}
/**
 * Builds a stub query client whose `executeQuery` mock hands out `results` in order.
 *
 * The `results` array is consumed (shifted) as queries are executed; a query issued after the
 * queue is empty rejects with `unexpected query`.
 */
function queryClient(results: FakeQueryResult[]) {
  return {
    executeQuery: vi.fn(async (_query: string) => {
      const queued = results.shift();
      if (queued) {
        return queued;
      }
      throw new Error('unexpected query');
    }),
  };
}
/**
 * Returns the SQL text of the first `executeQuery` call recorded on the stub client.
 *
 * @throws Error when the client has not been called yet.
 */
function firstQuery(client: ReturnType<typeof queryClient>): string {
  const [firstCall] = client.executeQuery.mock.calls;
  if (!firstCall) {
    throw new Error('expected query client to be called');
  }
  const [sql] = firstCall;
  return sql;
}
// Specs for the BigQuery historic-SQL reader: probing, error mapping, aggregation SQL and input validation.
describe('BigQueryHistoricSqlQueryHistoryReader', () => {
it('probes region-qualified INFORMATION_SCHEMA.JOBS_BY_PROJECT', async () => {
const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] });
// The region is lower-cased and prefixed with `region-` in the view path.
expect(client.executeQuery).toHaveBeenCalledWith(
'SELECT 1 FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` LIMIT 1',
);
});
it('turns probe result errors into HistoricSqlGrantsMissingError', async () => {
// Failure reported in-band through the result's `error` field.
const client = queryClient([{ headers: [], rows: [], totalRows: 0, error: 'Access Denied: jobs.listAll' }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'us-central1' });
await expect(reader.probe(client)).rejects.toMatchObject({
name: 'HistoricSqlGrantsMissingError',
dialect: 'bigquery',
remediation:
'Grant roles/bigquery.resourceViewer on the BigQuery project, or grant a custom role containing bigquery.jobs.listAll.',
});
});
it('turns thrown probe failures into HistoricSqlGrantsMissingError', async () => {
// Failure reported by the client throwing.
const client = {
executeQuery: vi.fn(async () => {
throw new Error('permission denied');
}),
};
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
it('fetches aggregated BigQuery query templates', async () => {
const client = queryClient([
{
headers: [
'template_id',
'canonical_sql',
'executions',
'distinct_users',
'first_seen',
'last_seen',
'p50_ms',
'p95_ms',
'error_rate',
'rows_produced',
'top_users',
],
rows: [
[
'hash-1',
'select status from orders',
42,
3,
'2026-05-01T00:00:00.000Z',
'2026-05-11T00:00:00.000Z',
12,
40,
0.05,
null,
JSON.stringify([{ user: 'analyst@example.test', executions: 1 }]),
],
],
totalRows: 1,
},
]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'demo', region: 'us' });
const rows = [];
for await (const row of reader.fetchAggregated(
client,
{ start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') },
{ dialect: 'bigquery', minExecutions: 5, windowDays: 90, enabledTables: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 },
)) {
rows.push(row);
}
// The generated SQL must aggregate per query hash and apply the minExecutions threshold.
const sql = firstQuery(client);
expect(sql).toContain('COUNT(*) AS executions');
expect(sql).toContain('COUNT(DISTINCT user_email) AS distinct_users');
expect(sql).toContain('GROUP BY query_hash');
expect(sql).toContain('HAVING COUNT(*) >= 5');
expect(rows).toMatchObject([
{
templateId: 'hash-1',
stats: {
executions: 42,
errorRate: 0.05,
},
topUsers: [{ user: 'analyst@example.test', executions: 1 }],
},
]);
});
it('throws a clear error when the query client cannot execute SQL', async () => {
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(async () => {
// `{}` has no executeQuery, so iteration must fail before yielding any row.
for await (const _row of reader.fetchAggregated(
{},
{ start: new Date(), end: new Date() },
{
dialect: 'bigquery',
minExecutions: 5,
windowDays: 90,
enabledTables: [],
filters: { dropTrivialProbes: true },
redactionPatterns: [],
staleArchiveAfterDays: 90,
},
)) {
throw new Error('unreachable');
}
}).rejects.toThrow('Historic SQL BigQuery reader requires a query client with executeQuery(query)');
});
it('rejects unsafe project and region identifiers before building SQL', () => {
// Both identifiers are interpolated into SQL, so characters such as backticks and semicolons must be refused.
expect(() => new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project`1', region: 'US' })).toThrow(
'Invalid BigQuery project id for historic-SQL ingest: project`1',
);
expect(() => new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US;DROP' })).toThrow(
'Invalid BigQuery region for historic-SQL ingest: US;DROP',
);
});
});

View file

@ -0,0 +1,247 @@
import { HistoricSqlGrantsMissingError } from './errors.js';
import {
aggregatedTemplateSchema,
type AggregatedTemplate,
type HistoricSqlTimeWindow,
type HistoricSqlUnifiedPullConfig,
} from './types.js';
/** Minimal result shape this reader needs from a query client. */
interface QueryResultLike {
/** Column names; row cells are looked up by case-insensitive header name. */
headers: string[];
/** Row cells, positionally aligned with `headers`. */
rows: unknown[][];
totalRows: number;
/** In-band failure message; `probe` converts a non-empty value into a grants error. */
error?: string;
}
/** Minimal query client contract: anything exposing `executeQuery(query)`. */
interface QueryClientLike {
executeQuery(query: string): Promise<QueryResultLike>;
}
/** Options for `BigQueryHistoricSqlQueryHistoryReader`. */
export interface BigQueryHistoricSqlQueryHistoryReaderOptions {
/** BigQuery project id; validated by `normalizeProjectId` (letters, digits, `_` and `-` only). */
projectId: string;
/** BigQuery region, with or without a `region-` prefix; normalized by `normalizeRegion`. */
region: string;
}
/** Remediation text attached to every grants error raised by this reader. */
const BIGQUERY_GRANTS_REMEDIATION =
'Grant roles/bigquery.resourceViewer on the BigQuery project, or grant a custom role containing bigquery.jobs.listAll.';
/**
 * Narrows an untyped client to `QueryClientLike`.
 *
 * @param client - Candidate client.
 * @returns The same object when it is a non-null object with a function-valued `executeQuery`.
 * @throws Error when the client does not expose `executeQuery`.
 */
function queryClient(client: unknown): QueryClientLike {
  if (typeof client === 'object' && client !== null && 'executeQuery' in client) {
    const { executeQuery } = client as { executeQuery?: unknown };
    if (typeof executeQuery === 'function') {
      return client as QueryClientLike;
    }
  }
  throw new Error('Historic SQL BigQuery reader requires a query client with executeQuery(query)');
}
/**
 * Wraps a probe failure in a `HistoricSqlGrantsMissingError` for the BigQuery dialect.
 *
 * The detail text is the `Error` message or the string itself; any other cause yields a generic
 * sentence. The original cause is passed through on the error.
 */
function grantsError(cause: unknown): HistoricSqlGrantsMissingError {
  let detail = 'BigQuery principal cannot query INFORMATION_SCHEMA.JOBS_BY_PROJECT.';
  if (cause instanceof Error) {
    detail = cause.message;
  } else if (typeof cause === 'string') {
    detail = cause;
  }
  return new HistoricSqlGrantsMissingError({
    dialect: 'bigquery',
    message: `Missing BigQuery audit grants for historic-SQL ingest: ${detail}`,
    remediation: BIGQUERY_GRANTS_REMEDIATION,
    cause,
  });
}
/**
 * Validates a BigQuery project id before it is interpolated into SQL.
 *
 * @param value - Project id; must be non-empty and contain only letters, digits, `_` and `-`.
 * @returns The id unchanged.
 * @throws Error for any other input.
 */
function normalizeProjectId(value: string): string {
  const isSafe = /^[A-Za-z0-9_-]+$/.test(value);
  if (isSafe) {
    return value;
  }
  throw new Error(`Invalid BigQuery project id for historic-SQL ingest: ${value}`);
}
/**
 * Normalizes a BigQuery region for use in a `region-<name>` view path.
 *
 * The value is trimmed, lower-cased and stripped of a leading `region-`; the remainder must be
 * non-empty and contain only lower-case letters, digits and `-`.
 *
 * @param value - Region as configured (e.g. `US`, `region-eu`, `us-central1`).
 * @returns The bare lower-case region name.
 * @throws Error (quoting the original input) when the normalized value is not safe.
 */
function normalizeRegion(value: string): string {
  const lowered = value.trim().toLowerCase();
  const bare = lowered.replace(/^region-/, '');
  if (/^[a-z0-9-]+$/.test(bare)) {
    return bare;
  }
  throw new Error(`Invalid BigQuery region for historic-SQL ingest: ${value}`);
}
/**
 * Renders a point in time as a BigQuery `TIMESTAMP('...')` literal in ISO-8601 UTC form.
 *
 * @param value - A `Date`, or a string accepted by the `Date` constructor.
 * @returns The SQL literal, e.g. `TIMESTAMP('2026-02-10T00:00:00.000Z')`.
 * @throws Error when the value does not represent a valid date.
 */
function timestampExpression(value: Date | string): string {
  const parsed = value instanceof Date ? value : new Date(value);
  if (Number.isNaN(parsed.getTime())) {
    throw new Error(`Invalid BigQuery query-history timestamp: ${String(value)}`);
  }
  // Escape single quotes before embedding the text in a quoted SQL literal.
  const escapedIso = parsed.toISOString().replace(/'/g, "\\'");
  return "TIMESTAMP('" + escapedIso + "')";
}
/**
 * Builds a lookup from upper-cased header name to column position.
 *
 * When two headers differ only by case, the later position wins.
 */
function indexByHeader(headers: string[]): Map<string, number> {
  const pairs = headers.map((header, position): [string, number] => [header.toUpperCase(), position]);
  return new Map<string, number>(pairs);
}
/**
 * Reads a cell from a row by case-insensitive column name.
 *
 * @returns The cell at the column's position, or `null` when the column is not in `indexes`.
 */
function value(row: unknown[], indexes: Map<string, number>, name: string): unknown {
  const position = indexes.get(name.toUpperCase());
  if (position === undefined) {
    return null;
  }
  return row[position];
}
/**
 * Converts a raw cell to a string, mapping `null`, `undefined` and the empty string to `null`.
 *
 * Non-string values are converted with `String(...)`.
 */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text === '' ? null : text;
}
/**
 * Like `nullableString`, but a missing or empty value is an error.
 *
 * @param field - Column name used in the error message.
 * @throws Error when the cell is null, undefined or empty.
 */
function requiredString(raw: unknown, field: string): string {
  const text = nullableString(raw);
  if (text) {
    return text;
  }
  throw new Error(`BigQuery JOBS_BY_PROJECT row is missing ${field}`);
}
/**
 * Converts a raw cell to a non-negative finite number.
 *
 * `null`, `undefined`, the empty string and anything that does not convert to a finite number
 * yield `null`; negative values are clamped to `0`.
 */
function nullableNumber(raw: unknown): number | null {
  const isBlank = raw === null || raw === undefined || raw === '';
  if (isBlank) {
    return null;
  }
  const parsed = typeof raw === 'number' ? raw : Number(raw);
  return Number.isFinite(parsed) ? Math.max(0, parsed) : null;
}
/**
 * Like `nullableNumber`, but a missing or non-numeric value is an error.
 *
 * @param field - Column name used in the error message.
 * @throws Error when the cell does not yield a number.
 */
function requiredNumber(raw: unknown, field: string): number {
  const parsed = nullableNumber(raw);
  if (parsed !== null) {
    return parsed;
  }
  throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${String(raw)}`);
}
/** Required numeric cell with its fractional part discarded (`Math.trunc`). */
function requiredInteger(raw: unknown, field: string): number {
  const parsed = requiredNumber(raw, field);
  return Math.trunc(parsed);
}
/** Optional numeric cell with its fractional part discarded; `null` when the cell yields no number. */
function nullableInteger(raw: unknown): number | null {
  const parsed = nullableNumber(raw);
  if (parsed === null) {
    return null;
  }
  return Math.trunc(parsed);
}
/**
 * Converts a raw timestamp cell to an ISO-8601 UTC string.
 *
 * Accepts a `Date` instance or any value whose string form is parseable by the `Date` constructor.
 *
 * @param raw - Cell value.
 * @param field - Column name used in error messages.
 * @returns The timestamp as produced by `Date.prototype.toISOString`.
 * @throws Error when the cell is missing/empty, or when it is not a valid date. This includes an
 *   invalid `Date` instance, which would otherwise escape as a bare `RangeError` from `toISOString`.
 */
function isoTimestamp(raw: unknown, field: string): string {
  if (raw instanceof Date) {
    if (Number.isNaN(raw.getTime())) {
      throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${String(raw)}`);
    }
    return raw.toISOString();
  }
  const text = requiredString(raw, field);
  const date = new Date(text);
  if (Number.isNaN(date.getTime())) {
    throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${text}`);
  }
  return date.toISOString();
}
/**
 * Parses the JSON-encoded `top_users` cell.
 *
 * Best effort: a missing cell, malformed JSON or a non-array payload all yield an empty list.
 * Entries that are not objects, or whose `executions` is not numeric, are skipped.
 *
 * @param raw - Cell value, expected to be a JSON array of `{ user, executions }` objects.
 * @returns The parsed entries; `user` is `null` when absent or empty.
 */
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
  const text = nullableString(raw);
  if (!text) {
    return [];
  }
  try {
    const parsed = JSON.parse(text) as unknown;
    if (!Array.isArray(parsed)) {
      return [];
    }
    const topUsers: Array<{ user: string | null; executions: number }> = [];
    for (const entry of parsed) {
      if (!entry || typeof entry !== 'object') {
        continue;
      }
      const user = nullableString((entry as { user?: unknown }).user);
      const executions = nullableInteger((entry as { executions?: unknown }).executions);
      if (executions !== null) {
        topUsers.push({ user, executions });
      }
    }
    return topUsers;
  } catch {
    return [];
  }
}
/**
 * Converts one positional result row into a validated `AggregatedTemplate`.
 *
 * @param row - Cell values of one result row.
 * @param indexes - Column positions keyed by upper-cased column name.
 * @returns The template as parsed by `aggregatedTemplateSchema`; `dialect` is always `'bigquery'`.
 * @throws Error when a required column is missing or invalid, or when schema parsing fails.
 */
function mapAggregatedRow(row: unknown[], indexes: Map<string, number>): AggregatedTemplate {
  const cell = (column: string): unknown => value(row, indexes, column);
  const candidate = {
    templateId: requiredString(cell('template_id'), 'template_id'),
    canonicalSql: requiredString(cell('canonical_sql'), 'canonical_sql'),
    dialect: 'bigquery',
    stats: {
      executions: requiredInteger(cell('executions'), 'executions'),
      distinctUsers: requiredInteger(cell('distinct_users'), 'distinct_users'),
      firstSeen: isoTimestamp(cell('first_seen'), 'first_seen'),
      lastSeen: isoTimestamp(cell('last_seen'), 'last_seen'),
      p50RuntimeMs: nullableNumber(cell('p50_ms')),
      p95RuntimeMs: nullableNumber(cell('p95_ms')),
      errorRate: requiredNumber(cell('error_rate'), 'error_rate'),
      rowsProduced: nullableInteger(cell('rows_produced')),
    },
    topUsers: parseTopUsers(cell('top_users')),
  };
  return aggregatedTemplateSchema.parse(candidate);
}
/**
 * Reads aggregated query-history templates from BigQuery's
 * `INFORMATION_SCHEMA.JOBS_BY_PROJECT` view for one project and region.
 */
export class BigQueryHistoricSqlQueryHistoryReader {
// Backtick-quoted path of the JOBS_BY_PROJECT view; built once in the constructor.
private readonly viewPath: string;
/**
 * @param options - Supplies `projectId` and `region`; both go through the file's
 * normalize helpers before being interpolated into the view path.
 */
constructor(options: BigQueryHistoricSqlQueryHistoryReaderOptions) {
const projectId = normalizeProjectId(options.projectId);
const region = normalizeRegion(options.region);
this.viewPath = `\`${projectId}.region-${region}.INFORMATION_SCHEMA.JOBS_BY_PROJECT\``;
}
/**
 * Checks that the view is readable by running `SELECT 1 ... LIMIT 1` against it.
 *
 * @param client - Opaque client; adapted through `queryClient` before use.
 * @returns Empty `warnings` and `info` lists when the probe query succeeds.
 * @throws The value produced by `grantsError`, both when `executeQuery` rejects
 * and when the result carries an `error`.
 */
async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> {
let result: QueryResultLike;
try {
result = await queryClient(client).executeQuery(`SELECT 1 FROM ${this.viewPath} LIMIT 1`);
} catch (error) {
throw grantsError(error);
}
if (result.error) {
throw grantsError(result.error);
}
return { warnings: [], info: [] };
}
/**
 * Runs one aggregate query over the view and yields one template per `query_hash`.
 *
 * The query keeps `QUERY` jobs whose statement type is `SELECT` or `MERGE`, created
 * in `[window.start, window.end)` with a non-null query text, and drops groups with
 * fewer than `config.minExecutions` rows. `rows_produced` is always selected as NULL.
 *
 * NOTE(review): `top_users` is built from the five most recent job rows, each with
 * `executions` fixed at 1, so it does not carry per-user execution counts — confirm
 * this is intended.
 * NOTE(review): unlike `probe`, a rejection from `executeQuery` is not passed through
 * `grantsError` here; only `result.error` is.
 *
 * @param client - Opaque client; adapted through `queryClient` before use.
 * @param window - Time window; `start` is inclusive and `end` is exclusive.
 * @param config - Only `minExecutions` is read; it is interpolated into the SQL as-is.
 * @throws The value produced by `grantsError` when the result carries an `error`;
 * any error raised by `mapAggregatedRow` for an unusable row.
 */
async *fetchAggregated(
client: unknown,
window: HistoricSqlTimeWindow,
config: HistoricSqlUnifiedPullConfig,
): AsyncIterable<AggregatedTemplate> {
const sql = `
SELECT
query_hash AS template_id,
MIN(query) AS canonical_sql,
COUNT(*) AS executions,
COUNT(DISTINCT user_email) AS distinct_users,
MIN(creation_time) AS first_seen,
MAX(creation_time) AS last_seen,
APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(50)] AS p50_ms,
APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(95)] AS p95_ms,
SAFE_DIVIDE(COUNTIF(error_result IS NOT NULL), COUNT(*)) AS error_rate,
CAST(NULL AS INT64) AS rows_produced,
TO_JSON_STRING(ARRAY_AGG(STRUCT(user_email AS user, 1 AS executions) ORDER BY creation_time DESC LIMIT 5)) AS top_users
FROM ${this.viewPath}
WHERE job_type = 'QUERY'
AND statement_type IN ('SELECT', 'MERGE')
AND creation_time >= ${timestampExpression(window.start)}
AND creation_time < ${timestampExpression(window.end)}
AND query IS NOT NULL
GROUP BY query_hash
HAVING COUNT(*) >= ${config.minExecutions}
ORDER BY executions DESC`.trim();
const result = await queryClient(client).executeQuery(sql);
if (result.error) {
throw grantsError(result.error);
}
// Header positions are resolved once and reused for every row.
const indexes = indexByHeader(result.headers);
for (const row of result.rows) {
yield mapAggregatedRow(row, indexes);
}
}
}

View file

@ -0,0 +1,59 @@
import { describe, expect, it } from 'vitest';
import {
bucketDistinctUsers,
bucketErrorRate,
bucketExecutions,
bucketFrequency,
bucketP95Runtime,
bucketRecency,
} from './buckets.js';
// Pins the label returned by each bucket helper at and around its thresholds.
describe('historic-sql bucket helpers', () => {
it('uses stable execution buckets', () => {
// Inputs come in pairs: the last value of one bucket and the first of the next.
expect([0, 9, 10, 99, 100, 999, 1000, 4999, 5000, 49999, 50000].map(bucketExecutions)).toEqual([
'<10',
'<10',
'10-100',
'10-100',
'100-1k',
'100-1k',
'1k-5k',
'1k-5k',
'5k-50k',
'5k-50k',
'>50k',
]);
});
it('uses stable distinct-user, error-rate, runtime, and recency buckets', () => {
expect([0, 1, 2, 5, 6, 10, 11].map(bucketDistinctUsers)).toEqual([
'0',
'1',
'2-5',
'2-5',
'5-10',
'5-10',
'>10',
]);
expect([0, 0.01, 0.05, 0.2].map(bucketErrorRate)).toEqual(['none', 'low', 'low', 'high']);
expect([null, 99, 100, 999, 1000, 9999, 10000].map(bucketP95Runtime)).toEqual([
'unknown',
'<100ms',
'100ms-1s',
'100ms-1s',
'1s-10s',
'1s-10s',
'>10s',
]);
// Recency is measured against an explicit "now" so the assertions stay deterministic.
expect(bucketRecency('2026-05-11T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('current');
expect(bucketRecency('2026-04-20T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('recent');
expect(bucketRecency('2026-01-01T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('stale');
});
it('maps frequency counts to high, mid, and low labels', () => {
expect(bucketFrequency(80, 100)).toBe('high');
expect(bucketFrequency(20, 100)).toBe('mid');
expect(bucketFrequency(1, 100)).toBe('low');
// A zero total must not divide by zero; it falls into the lowest tier.
expect(bucketFrequency(0, 0)).toBe('low');
});
});

View file

@ -0,0 +1,49 @@
/**
 * Maps an execution count to a coarse bucket label.
 *
 * @param value - Number of executions.
 * @returns The label of the first tier whose exclusive upper bound exceeds `value`,
 *          or `'>50k'` when no tier matches.
 */
export function bucketExecutions(value: number): string {
  const tiers: Array<[number, string]> = [
    [10, '<10'],
    [100, '10-100'],
    [1000, '100-1k'],
    [5000, '1k-5k'],
    [50000, '5k-50k'],
  ];
  for (const [upperBound, label] of tiers) {
    if (value < upperBound) {
      return label;
    }
  }
  return '>50k';
}
/**
 * Maps a distinct-user count to a coarse bucket label.
 *
 * @param value - Number of distinct users.
 * @returns `'0'`, `'1'`, `'2-5'`, `'5-10'` (covers values above 5 up to 10), or `'>10'`.
 */
export function bucketDistinctUsers(value: number): string {
  // Checks run in ascending order; the first match wins.
  return value <= 0
    ? '0'
    : value === 1
      ? '1'
      : value <= 5
        ? '2-5'
        : value <= 10
          ? '5-10'
          : '>10';
}
/**
 * Maps an error rate to a coarse bucket label.
 *
 * @param value - Error rate as a fraction.
 * @returns `'none'` at or below 0, `'low'` below 0.1, otherwise `'high'`.
 */
export function bucketErrorRate(value: number): string {
  if (value <= 0) {
    return 'none';
  }
  return value < 0.1 ? 'low' : 'high';
}
/**
 * Maps a p95 runtime in milliseconds to a coarse bucket label.
 *
 * @param value - Runtime in milliseconds, or `null` when not available.
 * @returns `'unknown'` for `null`; otherwise the label of the first tier whose
 *          exclusive upper bound exceeds `value`, or `'>10s'` when none matches.
 */
export function bucketP95Runtime(value: number | null): string {
  if (value === null) {
    return 'unknown';
  }
  const tiers: Array<[number, string]> = [
    [100, '<100ms'],
    [1000, '100ms-1s'],
    [10000, '1s-10s'],
  ];
  for (const [upperBound, label] of tiers) {
    if (value < upperBound) {
      return label;
    }
  }
  return '>10s';
}
/**
 * Classifies how recently something was last seen, relative to `now`.
 *
 * @param lastSeen - Timestamp string parseable by `new Date()`.
 * @param now - Reference point for the age calculation.
 * @returns `'unknown'` when `lastSeen` does not parse; `'current'` for an age of at
 *          most 7 days, `'recent'` for at most 45 days, otherwise `'stale'`.
 */
export function bucketRecency(lastSeen: string, now: Date): string {
  const lastSeenMs = new Date(lastSeen).getTime();
  if (Number.isNaN(lastSeenMs)) {
    return 'unknown';
  }
  const ageDays = (now.getTime() - lastSeenMs) / (24 * 60 * 60 * 1000);
  if (ageDays <= 7) {
    return 'current';
  }
  return ageDays <= 45 ? 'recent' : 'stale';
}
/**
 * Classifies a count by its share of a total.
 *
 * @param count - Occurrences of the item.
 * @param total - Occurrences overall.
 * @returns `'low'` when either input is zero or negative; otherwise `'high'` for a
 *          share of at least 0.5, `'mid'` for at least 0.1, and `'low'` below that.
 */
export function bucketFrequency(count: number, total: number): 'high' | 'mid' | 'low' {
  if (total <= 0 || count <= 0) {
    return 'low';
  }
  const share = count / total;
  if (share >= 0.5) {
    return 'high';
  }
  return share >= 0.1 ? 'mid' : 'low';
}

View file

@ -0,0 +1,182 @@
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js';
/** Creates a fresh temporary directory for one test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-unified-chunk-');
  return mkdtemp(prefix);
}
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline) to
 * `relPath` under `root`, creating parent directories as needed.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  const parentDir = join(target, '..');
  await mkdir(parentDir, { recursive: true });
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(target, payload, 'utf-8');
}
/**
 * Populates `root` with a minimal unified staged directory: a manifest, one table
 * usage file, the aggregate `patterns-input.json`, and one pattern shard.
 */
async function writeUnifiedStagedDir(root: string): Promise<void> {
await writeJson(root, 'manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 1,
touchedTableCount: 1,
parseFailures: 0,
warnings: [],
probeWarnings: [],
});
await writeJson(root, 'tables/public.orders.json', {
table: 'public.orders',
stats: {
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
errorRateBucket: 'none',
p95RuntimeBucket: '<100ms',
recencyBucket: 'current',
},
columnsByClause: { select: [['status', 'high']] },
observedJoins: [],
topTemplates: [{ id: 'orders', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }],
});
// The aggregate file and the first shard carry the same single template.
await writeJson(root, 'patterns-input.json', {
templates: [
{
id: 'orders',
canonicalSql: 'select * from public.orders join public.customers on true',
tablesTouched: ['public.orders', 'public.customers'],
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
dialect: 'postgres',
},
],
});
await writeJson(root, 'patterns-input/part-0001.json', {
templates: [
{
id: 'orders',
canonicalSql: 'select * from public.orders join public.customers on true',
tablesTouched: ['public.orders', 'public.customers'],
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
dialect: 'postgres',
},
],
});
}
// Covers WorkUnit emission, diff-set filtering, scope description, and per-shard
// pattern units for the unified historic-SQL chunker.
describe('chunkHistoricSqlUnifiedStagedDir', () => {
it('emits one table WorkUnit plus one patterns WorkUnit', async () => {
const stagedDir = await tempDir();
await writeUnifiedStagedDir(stagedDir);
// No diff set: every table file and pattern shard produces a unit.
const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir);
expect(result.workUnits).toEqual([
expect.objectContaining({
unitKey: 'historic-sql-table-public-orders',
displayLabel: 'Historic SQL usage: public.orders',
rawFiles: ['tables/public.orders.json'],
dependencyPaths: ['manifest.json'],
notes: expect.stringContaining('historic_sql_table_digest'),
}),
expect.objectContaining({
unitKey: 'historic-sql-patterns-part-0001',
displayLabel: 'Historic SQL cross-table patterns: part-0001',
rawFiles: ['patterns-input/part-0001.json'],
dependencyPaths: ['manifest.json'],
notes: expect.stringContaining('patterns-input/part-0001.json'),
}),
]);
expect(result.workUnits[0]?.notes).toContain('emit_historic_sql_evidence');
expect(result.workUnits[1]?.notes).toContain('emit_historic_sql_evidence');
expect(result.reconcileNotes).toEqual(['Historic-SQL touched tables=1 parseFailures=0']);
});
it('respects diff sets for unchanged table and patterns files', async () => {
const stagedDir = await tempDir();
await writeUnifiedStagedDir(stagedDir);
// Only the table file changed.
await expect(
chunkHistoricSqlUnifiedStagedDir(stagedDir, {
added: [],
modified: ['tables/public.orders.json'],
deleted: [],
unchanged: ['manifest.json', 'patterns-input.json', 'patterns-input/part-0001.json'],
}),
).resolves.toMatchObject({
workUnits: [expect.objectContaining({ unitKey: 'historic-sql-table-public-orders' })],
});
// Only the pattern shard changed.
await expect(
chunkHistoricSqlUnifiedStagedDir(stagedDir, {
added: [],
modified: ['patterns-input/part-0001.json'],
deleted: [],
unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'],
}),
).resolves.toMatchObject({
workUnits: [expect.objectContaining({ unitKey: 'historic-sql-patterns-part-0001' })],
});
// A change to the aggregate patterns-input.json alone yields no units.
await expect(
chunkHistoricSqlUnifiedStagedDir(stagedDir, {
added: [],
modified: ['patterns-input.json'],
deleted: [],
unchanged: ['manifest.json', 'patterns-input/part-0001.json', 'tables/public.orders.json'],
}),
).resolves.toMatchObject({
workUnits: [],
});
});
it('describes unified staged scope', async () => {
const stagedDir = await tempDir();
await writeUnifiedStagedDir(stagedDir);
const scope = await describeHistoricSqlUnifiedScope(stagedDir);
expect(scope.isPathInScope('manifest.json')).toBe(true);
expect(scope.isPathInScope('patterns-input.json')).toBe(true);
expect(scope.isPathInScope('patterns-input/part-0001.json')).toBe(true);
// Shard names that do not follow the zero-padded form are out of scope.
expect(scope.isPathInScope('patterns-input/part-1.json')).toBe(false);
expect(scope.isPathInScope('tables/public.orders.json')).toBe(true);
expect(scope.isPathInScope('templates/old/page.md')).toBe(false);
});
it('emits one patterns WorkUnit per changed shard', async () => {
const stagedDir = await tempDir();
await writeUnifiedStagedDir(stagedDir);
// Add a second shard on top of the base fixture.
await writeJson(stagedDir, 'patterns-input/part-0002.json', {
templates: [
{
id: 'line-items',
canonicalSql: 'select * from public.orders join public.line_items on true',
tablesTouched: ['public.orders', 'public.line_items'],
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
dialect: 'postgres',
},
],
});
const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir, {
added: ['patterns-input/part-0002.json'],
modified: ['patterns-input/part-0001.json'],
deleted: [],
unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'],
});
expect(result.workUnits.map((unit) => unit.unitKey)).toEqual([
'historic-sql-patterns-part-0001',
'historic-sql-patterns-part-0002',
]);
expect(result.workUnits.map((unit) => unit.rawFiles)).toEqual([
['patterns-input/part-0001.json'],
['patterns-input/part-0002.json'],
]);
});
});

View file

@ -0,0 +1,99 @@
import { createHash } from 'node:crypto';
import { readFile, readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
import { isHistoricSqlPatternInputShardPath } from './pattern-inputs.js';
import { stagedManifestSchema, stagedPatternsInputSchema, stagedTableInputSchema } from './types.js';
/**
 * Lists every regular file under `root`, recursively.
 *
 * @param root - Directory to scan.
 * @returns Paths relative to `root`, using `/` separators, in sorted order.
 */
async function walk(root: string): Promise<string[]> {
  const entries = await readdir(root, { withFileTypes: true, recursive: true });
  const files: string[] = [];
  for (const entry of entries) {
    if (!entry.isFile()) {
      continue;
    }
    const absolute = join(entry.parentPath, entry.name);
    // Normalize Windows separators so callers can match on forward slashes.
    files.push(relative(root, absolute).replace(/\\/g, '/'));
  }
  return files.sort();
}
/**
 * Reads and JSON-parses a UTF-8 file located at `relPath` under `stagedDir`.
 * The result is cast to `T` without validation; callers validate it themselves.
 */
async function readJson<T>(stagedDir: string, relPath: string): Promise<T> {
  const contents = await readFile(join(stagedDir, relPath), 'utf-8');
  return JSON.parse(contents) as T;
}
/**
 * Reduces a label to ASCII letters and digits joined by single hyphens.
 *
 * @param value - Arbitrary label, e.g. a qualified table name.
 * @returns The alphanumeric runs of `value` joined with `-`; empty when there are none.
 */
function safeUnitKey(value: string): string {
  const alphanumericRuns = value.split(/[^a-zA-Z0-9]+/).filter((run) => run.length > 0);
  return alphanumericRuns.join('-');
}
/**
 * Tells whether `path` should be processed.
 *
 * @param path - Staged-relative file path.
 * @param touched - Set of changed paths, or `null` when no diff filter applies.
 * @returns `true` when there is no filter or when `path` is in the set.
 */
function touchedPath(path: string, touched: Set<string> | null): boolean {
  if (touched === null) {
    return true;
  }
  return touched.has(path);
}
/**
 * Turns a unified historic-SQL staged directory into WorkUnits.
 *
 * One unit is emitted per `tables/*.json` file and one per pattern-input shard.
 * When `diffSet` is given, only files listed as added or modified produce units,
 * and deleted table/shard paths are reported for eviction. Each file is validated
 * against its schema before its unit is emitted.
 *
 * @param stagedDir - Root of the staged directory; must contain `manifest.json`.
 * @param diffSet - Optional change set restricting which files are chunked.
 * @returns The WorkUnits, optional eviction list, a reconcile note built from the
 *          manifest counters, and the manifest's warnings.
 * @throws When the manifest or any processed file cannot be read or fails validation.
 */
export async function chunkHistoricSqlUnifiedStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const files = await walk(stagedDir);
  const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
  const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;

  const isTablePath = (file: string): boolean => /^tables\/.+\.json$/.test(file);
  // Every other staged file except the manifest is listed as a peer of the unit's own file.
  const peersOf = (own: string): string[] =>
    files.filter((file) => file !== own && file !== 'manifest.json').sort();

  const workUnits: WorkUnit[] = [];

  for (const path of files.filter((file) => isTablePath(file))) {
    if (touchedPath(path, touched)) {
      const table = stagedTableInputSchema.parse(await readJson(stagedDir, path));
      workUnits.push({
        unitKey: `historic-sql-table-${safeUnitKey(table.table)}`,
        displayLabel: `Historic SQL usage: ${table.table}`,
        rawFiles: [path],
        dependencyPaths: ['manifest.json'],
        peerFileIndex: peersOf(path),
        notes:
          'Use historic_sql_table_digest. Read this table usage JSON and emit exactly one table_usage object with emit_historic_sql_evidence. Do not call wiki_write or sl_write_source.',
      });
    }
  }

  for (const path of files.filter(isHistoricSqlPatternInputShardPath)) {
    if (touchedPath(path, touched)) {
      // Parsed only for validation; the unit references the shard by path.
      stagedPatternsInputSchema.parse(await readJson(stagedDir, path));
      const shardLabel = path.replace(/^patterns-input\//, '').replace(/\.json$/, '');
      workUnits.push({
        unitKey: `historic-sql-patterns-${safeUnitKey(shardLabel)}`,
        displayLabel: `Historic SQL cross-table patterns: ${shardLabel}`,
        rawFiles: [path],
        dependencyPaths: ['manifest.json'],
        peerFileIndex: peersOf(path),
        notes:
          `Use historic_sql_patterns. Read ${path} and emit pattern objects with emit_historic_sql_evidence using rawPath "${path}". Do not call wiki_write or sl_write_source.`,
      });
    }
  }

  const deleted = diffSet?.deleted
    .filter((path) => isHistoricSqlPatternInputShardPath(path) || isTablePath(path))
    .sort();
  const eviction = deleted && deleted.length > 0 ? { deletedRawPaths: deleted } : undefined;

  return {
    workUnits,
    eviction,
    reconcileNotes: [`Historic-SQL touched tables=${manifest.touchedTableCount} parseFailures=${manifest.parseFailures}`],
    contextReport: {
      capped: false,
      warnings: [...manifest.probeWarnings, ...manifest.warnings],
    },
  };
}
/**
 * Describes which staged paths belong to a unified historic-SQL snapshot.
 *
 * @param stagedDir - Root of the staged directory; must contain `manifest.json`.
 * @returns A fingerprint (SHA-256 hex of the manifest's connection id, dialect, and
 *          window bounds) and a predicate accepting `manifest.json`,
 *          `patterns-input.json`, pattern-input shards, and `tables/*.json`.
 * @throws When the manifest cannot be read or fails validation.
 */
export async function describeHistoricSqlUnifiedScope(stagedDir: string): Promise<ScopeDescriptor> {
  const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
  // Key order is part of the hashed text, so it must stay fixed.
  const identity = JSON.stringify({
    connectionId: manifest.connectionId,
    dialect: manifest.dialect,
    windowStart: manifest.windowStart,
    windowEnd: manifest.windowEnd,
  });
  const hasher = createHash('sha256');
  hasher.update(identity);
  const fingerprint = hasher.digest('hex');

  const isPathInScope = (rawPath: string): boolean => {
    if (rawPath === 'manifest.json' || rawPath === 'patterns-input.json') {
      return true;
    }
    return isHistoricSqlPatternInputShardPath(rawPath) || /^tables\/.+\.json$/.test(rawPath);
  };

  return { fingerprint, isPathInScope };
}

View file

@ -0,0 +1,57 @@
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { detectHistoricSqlStagedDir } from './detect.js';
import { HISTORIC_SQL_SOURCE_KEY, stagedManifestSchema } from './types.js';
/** Creates a fresh temporary directory for one detection test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-detect-');
  return mkdtemp(prefix);
}
/**
 * Serializes `value` as indented JSON with a trailing newline and writes it to
 * `relPath` under `root`, creating missing parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  const parentDir = join(target, '..');
  await mkdir(parentDir, { recursive: true });
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(target, payload, 'utf-8');
}
/** Builds a schema-validated manifest fixture for an empty historic-SQL snapshot. */
function manifest() {
  const fixture = {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: 'conn_1',
    dialect: 'postgres',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    snapshotRowCount: 0,
    touchedTableCount: 0,
    parseFailures: 0,
    warnings: [],
    probeWarnings: [],
  };
  return stagedManifestSchema.parse(fixture);
}
// Covers the two detection paths (manifest source key, directory structure) and the
// negative case where a manifest names a different source.
describe('historic-sql staged dir detection', () => {
it('detects manifest source', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', manifest());
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
it('detects unified table and patterns structure without manifest', async () => {
const stagedDir = await tempDir();
// An unrelated file must not interfere with structural detection.
await writeFile(join(stagedDir, 'not-a-match.txt'), 'x', 'utf-8');
await writeJson(stagedDir, 'patterns-input.json', { templates: [] });
await writeJson(stagedDir, 'tables/public.orders.json', { table: 'public.orders' });
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
it('does not detect unrelated directories', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', { source: 'notion' });
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(false);
});
});

View file

@ -0,0 +1,25 @@
import { readFile, readdir } from 'node:fs/promises';
import { join } from 'node:path';
import { HISTORIC_SQL_SOURCE_KEY } from './types.js';
/**
 * Decides whether a staged directory holds historic-SQL data.
 *
 * A manifest that declares a `source` is authoritative: the result is `true` only
 * when it equals the historic-SQL source key. When the manifest is missing,
 * unreadable, or has no `source`, the directory is accepted if it contains a
 * readable `patterns-input.json` and at least one `.json` file directly under `tables/`.
 *
 * @param stagedDir - Directory to inspect.
 * @returns `true` when the directory is recognized; never rejects on I/O or parse errors.
 */
export async function detectHistoricSqlStagedDir(stagedDir: string): Promise<boolean> {
  let declaredSource: unknown;
  try {
    const manifestText = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    declaredSource = (JSON.parse(manifestText) as { source?: unknown }).source;
  } catch {
    // Fall through to structural detection for stage-only fixtures.
    declaredSource = undefined;
  }
  if (declaredSource === HISTORIC_SQL_SOURCE_KEY) {
    return true;
  }
  if (declaredSource !== undefined) {
    return false;
  }

  try {
    // Read purely as an existence/readability check; the content is not used.
    await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8');
    const entries = await readdir(join(stagedDir, 'tables'), { withFileTypes: true });
    for (const entry of entries) {
      if (entry.isFile() && entry.name.endsWith('.json')) {
        return true;
      }
    }
    return false;
  } catch {
    return false;
  }
}

View file

@ -0,0 +1,61 @@
import type { HistoricSqlDialect } from './types.js';
/** Constructor options for `HistoricSqlGrantsMissingError`. */
interface HistoricSqlGrantsMissingErrorOptions {
// Dialect the error relates to; exposed on the error instance.
dialect: HistoricSqlDialect;
// Used as the error's `message`.
message: string;
// Remediation text exposed on the error instance.
remediation: string;
// Optional underlying error, forwarded as the standard `cause` when defined.
cause?: unknown;
}
/**
 * Error carrying a dialect and remediation text alongside its message.
 * An optional `cause` is forwarded to `Error` only when it is not `undefined`.
 */
export class HistoricSqlGrantsMissingError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly remediation: string;

  constructor(options: HistoricSqlGrantsMissingErrorOptions) {
    super(options.message, options.cause !== undefined ? { cause: options.cause } : undefined);
    this.remediation = options.remediation;
    this.dialect = options.dialect;
    this.name = 'HistoricSqlGrantsMissingError';
  }
}
/** Constructor options for `HistoricSqlExtensionMissingError`. */
interface HistoricSqlExtensionMissingErrorOptions {
// Dialect the error relates to; exposed on the error instance.
dialect: HistoricSqlDialect;
// Used as the error's `message`.
message: string;
// Remediation text exposed on the error instance.
remediation: string;
// Optional underlying error, forwarded as the standard `cause` when defined.
cause?: unknown;
}
/**
 * Error carrying a dialect and remediation text alongside its message.
 * An optional `cause` is forwarded to `Error` only when it is not `undefined`.
 */
export class HistoricSqlExtensionMissingError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly remediation: string;

  constructor(options: HistoricSqlExtensionMissingErrorOptions) {
    super(options.message, options.cause !== undefined ? { cause: options.cause } : undefined);
    this.remediation = options.remediation;
    this.dialect = options.dialect;
    this.name = 'HistoricSqlExtensionMissingError';
  }
}
/** Constructor options for `HistoricSqlVersionUnsupportedError`. */
interface HistoricSqlVersionUnsupportedErrorOptions {
// Dialect whose version was checked; appears in the error message.
dialect: HistoricSqlDialect;
// Version that was found; appears in the error message.
detectedVersion: string;
// Lowest supported version; appears in the error message.
minimumVersion: string;
}
/**
 * Error whose message is generated from the dialect, the detected version, and the
 * minimum required version; all three are also exposed as fields.
 */
export class HistoricSqlVersionUnsupportedError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly detectedVersion: string;
  readonly minimumVersion: string;

  constructor(options: HistoricSqlVersionUnsupportedErrorOptions) {
    super(
      `Unsupported ${options.dialect} version for historic-SQL ingest: detected ${options.detectedVersion}; requires ${options.minimumVersion} or newer.`,
    );
    this.minimumVersion = options.minimumVersion;
    this.detectedVersion = options.detectedVersion;
    this.dialect = options.dialect;
    this.name = 'HistoricSqlVersionUnsupportedError';
  }
}

View file

@ -0,0 +1,89 @@
import { describe, expect, it, vi } from 'vitest';
import { asSchema } from 'ai';
import { createEmitHistoricSqlEvidenceTool } from './evidence-tool.js';
// Covers the tool's input schema shape, the evidence file it writes, and its refusal
// to run outside a historic-sql ingest session.
describe('emit_historic_sql_evidence tool', () => {
it('exposes an AI SDK v6 tool input schema with top-level object type', async () => {
const tool = createEmitHistoricSqlEvidenceTool();
expect(await asSchema(tool.inputSchema).jsonSchema).toMatchObject({
type: 'object',
});
});
it('writes table usage evidence to the ignored run evidence directory', async () => {
// Stub for configService.writeFile; the assertions below inspect its call arguments.
const writeFile = vi.fn(async () => ({ success: true, commitHash: null }));
const tool = createEmitHistoricSqlEvidenceTool();
const result = await tool.execute!(
{
kind: 'table_usage',
table: 'public.orders',
rawPath: 'tables/public.orders.json',
usage: {
narrative: 'Orders are repeatedly queried by paid status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [],
staleSince: null,
},
},
{
toolCallId: 'call-1',
messages: [],
abortSignal: new AbortController().signal,
// The tool reads its session and connection id from experimental_context.
experimental_context: {
connectionId: 'warehouse',
session: {
ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'historic-sql' },
configService: { writeFile },
},
},
} as never,
);
expect(result).toBe('Recorded historic-SQL table_usage evidence for public.orders.');
expect(writeFile).toHaveBeenCalledWith(
'.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json',
expect.stringContaining('"kind": "table_usage"'),
'System User',
'system@example.com',
'Record historic-SQL evidence: historic-sql-table-public-orders',
{ skipLock: true },
);
});
it('rejects non-historic ingest sessions', async () => {
const tool = createEmitHistoricSqlEvidenceTool();
// The rejection is reported as a resolved error string, not a thrown error.
await expect(
tool.execute!(
{
kind: 'pattern',
rawPath: 'patterns-input.json',
pattern: {
slug: 'orders',
title: 'Orders',
narrative: 'Orders pattern.',
definitionSql: 'select * from public.orders',
tablesInvolved: ['public.orders'],
slRefs: ['orders'],
constituentTemplateIds: ['pg:1'],
},
},
{
toolCallId: 'call-1',
messages: [],
abortSignal: new AbortController().signal,
experimental_context: {
connectionId: 'warehouse',
session: {
ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'notion' },
configService: { writeFile: vi.fn() },
},
},
} as never,
),
).resolves.toContain('Error: emit_historic_sql_evidence is only available during historic-sql ingest');
});
});

View file

@ -0,0 +1,121 @@
import { tool } from 'ai';
import { z } from 'zod';
import { historicSqlEvidencePath, serializeHistoricSqlEvidence } from './evidence.js';
import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js';
// Author name and email passed to configService.writeFile when recording evidence.
const SYSTEM_AUTHOR = 'System User';
const SYSTEM_EMAIL = 'system@example.com';
/**
 * Input schema for the evidence tool. It is a single flat object rather than a
 * union; the `superRefine` step enforces which optional fields each `kind` needs:
 * `table_usage` requires `table` and `usage`, `pattern` requires `pattern`.
 */
const emitHistoricSqlEvidenceInputSchema = z
.object({
kind: z.enum(['table_usage', 'pattern']),
table: z.string().min(1).optional(),
rawPath: z.string().min(1),
usage: tableUsageOutputSchema.optional(),
pattern: patternOutputSchema.optional(),
})
.superRefine((input, ctx) => {
if (input.kind === 'table_usage') {
// Both issues can be reported for the same input.
if (!input.table) {
ctx.addIssue({
code: 'custom',
path: ['table'],
message: 'table is required when kind is table_usage',
});
}
if (!input.usage) {
ctx.addIssue({
code: 'custom',
path: ['usage'],
message: 'usage is required when kind is table_usage',
});
}
}
if (input.kind === 'pattern' && !input.pattern) {
ctx.addIssue({
code: 'custom',
path: ['pattern'],
message: 'pattern is required when kind is pattern',
});
}
});
/** Parsed input of the evidence tool, as inferred from the schema above. */
type EmitHistoricSqlEvidenceInput = z.infer<typeof emitHistoricSqlEvidenceInputSchema>;
/**
 * The part of the tool context that the evidence tool reads. Every field is
 * optional; the tool returns an error string when a required one is absent.
 */
interface EmitHistoricSqlEvidenceToolContext {
// Connection the evidence is recorded for.
connectionId?: string | null;
session?: {
// Current ingest run; the tool only proceeds when sourceKey is 'historic-sql'.
ingest?: { runId: string; sourceKey: string };
// Sink used to persist the evidence file.
configService?: {
writeFile(
path: string,
content: string,
author: string,
authorEmail: string,
commitMessage: string,
options?: { skipLock?: boolean },
): Promise<unknown>;
};
};
}
/**
 * Derives the evidence unit key for a tool input.
 *
 * @param input - Tool input; `table` is used for `table_usage`, `pattern.slug` otherwise.
 * @returns `historic-sql-table-<name>` or `historic-sql-pattern-<slug>`, where the
 *          suffix has every run of non-alphanumeric characters replaced by `-` and
 *          leading/trailing hyphens removed.
 */
function unitKeyForEvidence(input: EmitHistoricSqlEvidenceInput): string {
  const sanitize = (text: string): string => text.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '');
  return input.kind === 'table_usage'
    ? `historic-sql-table-${sanitize(String(input.table))}`
    : `historic-sql-pattern-${sanitize(String(input.pattern?.slug))}`;
}
/**
 * Builds the evidence envelope for a tool input.
 *
 * @param input - Tool input; only the fields relevant to its `kind` are copied.
 * @param connectionId - Connection id stored on the envelope.
 * @returns A `table_usage` or `pattern` envelope.
 * @throws Error when the fields required for the input's `kind` are missing.
 */
function evidenceEnvelope(input: EmitHistoricSqlEvidenceInput, connectionId: string) {
  if (input.kind !== 'table_usage') {
    const { pattern, rawPath } = input;
    if (!pattern) {
      throw new Error('Invalid historic-SQL pattern evidence input.');
    }
    return {
      kind: 'pattern' as const,
      connectionId,
      rawPath,
      pattern,
    };
  }

  const { table, usage, rawPath } = input;
  if (!table || !usage) {
    throw new Error('Invalid historic-SQL table usage evidence input.');
  }
  return {
    kind: 'table_usage' as const,
    connectionId,
    table,
    rawPath,
    usage,
  };
}
/**
 * Creates the `emit_historic_sql_evidence` tool.
 *
 * On execution the tool resolves its context from `options.experimental_context`,
 * falling back to `defaultContext`, and writes one serialized evidence file through
 * the session's `configService`.
 *
 * @param defaultContext - Context used when the call supplies none.
 * @returns A tool whose `execute` resolves to a confirmation message, or to an
 *          `Error: ...` string when the session is not a historic-sql ingest or the
 *          context lacks a config service or connection id.
 */
export function createEmitHistoricSqlEvidenceTool(defaultContext?: EmitHistoricSqlEvidenceToolContext) {
  return tool({
    description:
      'Record typed historic-SQL evidence for deterministic projection. Use this instead of wiki_write, sl_write_source, sl_edit_source, or context_candidate_write during historic-SQL WorkUnits.',
    inputSchema: emitHistoricSqlEvidenceInputSchema,
    execute: async (input, options): Promise<string> => {
      const perCallContext = options.experimental_context as EmitHistoricSqlEvidenceToolContext | undefined;
      const context = perCallContext ?? defaultContext;
      const ingest = context?.session?.ingest;
      const configService = context?.session?.configService;
      const connectionId = context?.connectionId;
      if (!ingest || ingest.sourceKey !== 'historic-sql' || !configService || !connectionId) {
        return 'Error: emit_historic_sql_evidence is only available during historic-sql ingest.';
      }

      const unitKey = unitKeyForEvidence(input);
      const evidence = evidenceEnvelope(input, connectionId);
      const targetPath = historicSqlEvidencePath(ingest.runId, unitKey);
      await configService.writeFile(
        targetPath,
        serializeHistoricSqlEvidence(evidence),
        SYSTEM_AUTHOR,
        SYSTEM_EMAIL,
        `Record historic-SQL evidence: ${unitKey}`,
        { skipLock: true },
      );

      const label = evidence.kind === 'table_usage' ? evidence.table : evidence.pattern.slug;
      return `Recorded historic-SQL ${input.kind} evidence for ${label}.`;
    },
  });
}

View file

@ -0,0 +1,57 @@
import { describe, expect, it } from 'vitest';
import {
historicSqlEvidenceEnvelopeSchema,
historicSqlEvidencePath,
historicSqlPatternEvidenceSchema,
historicSqlTableUsageEvidenceSchema,
} from './evidence.js';
// Pins the evidence schemas and the evidence file path format.
describe('historic-sql evidence contracts', () => {
it('validates table usage evidence emitted by table digest WorkUnits', () => {
const parsed = historicSqlTableUsageEvidenceSchema.parse({
kind: 'table_usage',
connectionId: 'warehouse',
table: 'public.orders',
rawPath: 'tables/public.orders.json',
usage: {
narrative: 'Orders are repeatedly queried for paid/refunded lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
},
});
expect(parsed.table).toBe('public.orders');
expect(parsed.usage.frequencyTier).toBe('high');
});
it('validates pattern evidence emitted by the patterns WorkUnit', () => {
// Parsed twice: through the discriminated envelope, then the pattern-specific schema.
const parsed = historicSqlPatternEvidenceSchema.parse(
historicSqlEvidenceEnvelopeSchema.parse({
kind: 'pattern',
connectionId: 'warehouse',
rawPath: 'patterns-input.json',
pattern: {
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Analysts compare order status changes by customer segment.',
definitionSql: 'select status, count(*) from public.orders group by status',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['pg:1', 'pg:2'],
},
}),
);
expect(parsed.kind).toBe('pattern');
expect(parsed.pattern.slug).toBe('order-lifecycle-analysis');
});
it('builds a stable ignored evidence path from run and WorkUnit identity', () => {
expect(historicSqlEvidencePath('run-1', 'historic-sql-table-public-orders')).toBe(
'.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json',
);
});
});

View file

@ -0,0 +1,41 @@
import { z } from 'zod';
import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js';
/**
 * Converts an identifier into a single safe path segment.
 *
 * Runs of characters other than ASCII letters, digits, `.`, `_`, and `-` are
 * replaced by one hyphen, and leading/trailing hyphens are removed.
 *
 * @param value - Identifier to sanitize (a run id or unit key).
 * @returns The sanitized segment.
 * @throws Error when nothing usable remains, or when the result is `.` or `..`.
 */
function safeEvidenceSegment(value: string): string {
  const segment = value.replace(/[^a-zA-Z0-9._-]+/g, '-').replace(/^-+|-+$/g, '');
  // '.' and '..' survive the character filter but are directory references, not
  // names; accepting them would let the built path leave the evidence directory.
  if (!segment || segment === '.' || segment === '..') {
    throw new Error(`Invalid historic-SQL evidence path segment: ${value}`);
  }
  return segment;
}
/** Evidence for one table: the table name, the raw file it came from, and its usage summary. */
export const historicSqlTableUsageEvidenceSchema = z.object({
kind: z.literal('table_usage'),
connectionId: z.string().min(1),
table: z.string().min(1),
rawPath: z.string().min(1),
usage: tableUsageOutputSchema,
});
export type HistoricSqlTableUsageEvidence = z.infer<typeof historicSqlTableUsageEvidenceSchema>;
/** Evidence for one pattern: the raw file it came from and the pattern itself. */
export const historicSqlPatternEvidenceSchema = z.object({
kind: z.literal('pattern'),
connectionId: z.string().min(1),
rawPath: z.string().min(1),
pattern: patternOutputSchema,
});
export type HistoricSqlPatternEvidence = z.infer<typeof historicSqlPatternEvidenceSchema>;
/** Either evidence shape, discriminated by `kind`. */
export const historicSqlEvidenceEnvelopeSchema = z.discriminatedUnion('kind', [
historicSqlTableUsageEvidenceSchema,
historicSqlPatternEvidenceSchema,
]);
export type HistoricSqlEvidenceEnvelope = z.infer<typeof historicSqlEvidenceEnvelopeSchema>;
/**
 * Builds the repository-relative path of an evidence file.
 *
 * @param runId - Ingest run id; sanitized into one path segment.
 * @param unitKey - Evidence unit key; sanitized and used as the file's base name.
 * @returns `.ktx/ingest-evidence/historic-sql/<run>/<unit>.json`.
 * @throws Error when either value is rejected by `safeEvidenceSegment`.
 */
export function historicSqlEvidencePath(runId: string, unitKey: string): string {
  const runSegment = safeEvidenceSegment(runId);
  const unitSegment = safeEvidenceSegment(unitKey);
  return ['.ktx', 'ingest-evidence', 'historic-sql', runSegment, `${unitSegment}.json`].join('/');
}
/**
 * Validates an evidence envelope and renders it as file content.
 *
 * @param evidence - Envelope to serialize.
 * @returns The schema-parsed envelope as JSON with 2-space indentation and a trailing newline.
 * @throws When the envelope fails `historicSqlEvidenceEnvelopeSchema`.
 */
export function serializeHistoricSqlEvidence(evidence: HistoricSqlEvidenceEnvelope): string {
  const validated = historicSqlEvidenceEnvelopeSchema.parse(evidence);
  const json = JSON.stringify(validated, null, 2);
  return `${json}\n`;
}

View file

@ -0,0 +1,110 @@
import { mkdtemp } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import type { SourceAdapter } from '../../types.js';
import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js';
import type { HistoricSqlReader } from './types.js';
/** Creates a fresh temporary directory for one adapter test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-adapter-');
  return mkdtemp(prefix);
}
// Stub SQL-analysis port: batch analysis returns nothing, read-only validation always
// passes, and any use of the single-query fingerprint path fails the test.
const sqlAnalysis: SqlAnalysisPort = {
async analyzeForFingerprint() {
throw new Error('analyzeForFingerprint must not be used');
},
async analyzeBatch() {
return new Map();
},
async validateReadOnly() {
return { ok: true };
},
};
// Stub reader: the probe reports nothing and the aggregate stream is empty.
const reader: HistoricSqlReader = {
async probe() {
return { warnings: [], info: [] };
},
async *fetchAggregated() {},
};
// Covers the adapter's declared metadata and a fetch -> detect -> chunk round trip.
describe('HistoricSqlSourceAdapter', () => {
it('declares canonical adapter metadata', () => {
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
expect(adapter.source).toBe('historic-sql');
expect(adapter.skillNames).toEqual(['historic_sql_table_digest', 'historic_sql_patterns']);
expect(adapter.reconcileSkillNames).toEqual([]);
expect((adapter as SourceAdapter).evidenceIndexing).toBeUndefined();
expect(adapter.triageSupported).toBe(false);
});
it('fetches a unified aggregate snapshot and emits unified WorkUnits', async () => {
const stagedDir = await tempDir();
// Reader that yields a single template joining orders and customers.
const aggregateReader: HistoricSqlReader = {
async probe() {
return { warnings: [], info: [] };
},
async *fetchAggregated() {
yield {
templateId: 'pg:1',
canonicalSql:
'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status',
dialect: 'postgres',
stats: {
executions: 25,
distinctUsers: 3,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 10,
p95RuntimeMs: 20,
errorRate: 0,
rowsProduced: 10,
},
topUsers: [{ user: 'analyst', executions: 25 }],
};
},
};
// Batch analysis result keyed by the template id yielded above.
const batchSqlAnalysis: SqlAnalysisPort = {
async analyzeForFingerprint() {
throw new Error('analyzeForFingerprint must not be used');
},
async analyzeBatch() {
return new Map([
[
'pg:1',
{
tablesTouched: ['public.orders', 'public.customers'],
columnsByClause: { select: ['status'], join: ['customer_id', 'id'], groupBy: ['status'] },
},
],
]);
},
async validateReadOnly() {
return { ok: true };
},
};
const adapter = new HistoricSqlSourceAdapter({
sqlAnalysis: batchSqlAnalysis,
reader: aggregateReader,
queryClient: {},
// Fixed clock keeps the run deterministic.
now: () => new Date('2026-05-11T00:00:00.000Z'),
});
await adapter.fetch({ dialect: 'postgres', minExecutions: 5 }, stagedDir, {
connectionId: 'warehouse',
sourceKey: 'historic-sql',
});
await expect(adapter.detect(stagedDir)).resolves.toBe(true);
// One unit per touched table, then one for the single pattern shard.
await expect(adapter.chunk(stagedDir)).resolves.toMatchObject({
workUnits: [
{ unitKey: 'historic-sql-table-public-customers' },
{ unitKey: 'historic-sql-table-public-orders' },
{ unitKey: 'historic-sql-patterns-part-0001' },
],
});
});
});

View file

@ -0,0 +1,65 @@
import type {
ChunkResult,
DeterministicFinalizationContext,
DiffSet,
FetchContext,
FinalizationResult,
ScopeDescriptor,
SourceAdapter,
} from '../../types.js';
import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js';
import { detectHistoricSqlStagedDir } from './detect.js';
import { projectHistoricSqlEvidence } from './projection.js';
import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js';
import { type HistoricSqlSourceAdapterDeps } from './types.js';
/**
 * Source adapter for historic SQL. Each lifecycle method delegates to a helper module of this
 * adapter: staging, detection, chunking, scope description and evidence projection.
 */
export class HistoricSqlSourceAdapter implements SourceAdapter {
  readonly source = 'historic-sql';
  readonly skillNames = ['historic_sql_table_digest', 'historic_sql_patterns'];
  readonly reconcileSkillNames: string[] = [];
  readonly triageSupported = false;

  /** @param deps - Reader, query client, SQL analysis port and optional clock used by `fetch`. */
  constructor(private readonly deps: HistoricSqlSourceAdapterDeps) {}

  /** Reports whether `stagedDir` holds a historic-SQL staged snapshot. */
  detect(stagedDir: string): Promise<boolean> {
    return detectHistoricSqlStagedDir(stagedDir);
  }

  /** Stages an aggregated snapshot into `stagedDir` for the connection named in `ctx`. */
  async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
    const { connectionId } = ctx;
    const { queryClient, reader, sqlAnalysis } = this.deps;
    // Called on `deps` (not destructured) so an injected clock keeps its receiver.
    const now = this.deps.now?.();
    await stageHistoricSqlAggregatedSnapshot({
      stagedDir,
      connectionId,
      queryClient,
      reader,
      sqlAnalysis,
      pullConfig,
      now,
    });
  }

  /** Splits the staged snapshot into work units, optionally restricted by `diffSet`. */
  chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    return chunkHistoricSqlUnifiedStagedDir(stagedDir, diffSet);
  }

  /** Describes the scope covered by the staged snapshot. */
  describeScope(stagedDir: string): Promise<ScopeDescriptor> {
    return describeHistoricSqlUnifiedScope(stagedDir);
  }

  /**
   * Projects the recorded evidence and wraps the projection as a finalization result.
   * `errors` is always empty here; warnings come from the projection.
   */
  async finalize(ctx: DeterministicFinalizationContext): Promise<FinalizationResult> {
    const { workdir, connectionId, syncId, runId, overrideReplay } = ctx;
    const projection = await projectHistoricSqlEvidence({
      workdir,
      connectionId,
      syncId,
      runId,
      overrideReplay,
    });
    const { warnings, touchedSources, changedWikiPageKeys, actions } = projection;
    return {
      result: projection,
      warnings,
      errors: [],
      touchedSources,
      changedWikiPageKeys,
      actions,
    };
  }
}

View file

@ -0,0 +1,291 @@
import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import YAML from 'yaml';
import type { AgentRunnerPort, RunLoopParams } from '../../../llm/index.js';
import { initKtxProject, loadKtxProject, type KtxLocalProject } from '../../../project/index.js';
import {
type SqlAnalysisBatchItem,
type SqlAnalysisBatchResult,
type SqlAnalysisDialect,
type SqlAnalysisPort,
} from '../../../sql-analysis/index.js';
import { searchLocalSlSources } from '../../../sl/local-sl.js';
import { searchLocalKnowledgePages } from '../../../wiki/local-knowledge.js';
import { runLocalIngest } from '../../local-ingest.js';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js';
import type { AggregatedTemplate, HistoricSqlReader, HistoricSqlUnifiedPullConfig } from './types.js';
/** Reader fixture that always streams one orders/customers lifecycle template. */
class AcceptanceHistoricSqlReader implements HistoricSqlReader {
  /** Probe stub: nothing to warn about, nothing to report. */
  async probe() {
    return { warnings: [], info: [] };
  }

  /** Yields the single fixture template; the client, window and config are ignored. */
  async *fetchAggregated(
    _client: unknown,
    _window: { start: Date; end: Date },
    _config: HistoricSqlUnifiedPullConfig,
  ): AsyncIterable<AggregatedTemplate> {
    const ordersLifecycle: AggregatedTemplate = {
      templateId: 'pg:orders-lifecycle',
      canonicalSql:
        'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.status = $1 group by o.status, c.segment',
      dialect: 'postgres',
      stats: {
        executions: 42,
        distinctUsers: 4,
        firstSeen: '2026-05-01T00:00:00.000Z',
        lastSeen: '2026-05-11T00:00:00.000Z',
        p50RuntimeMs: 18,
        p95RuntimeMs: 84,
        errorRate: 0,
        rowsProduced: 420,
      },
      topUsers: [{ user: 'analyst@example.test', executions: 42 }],
    };
    yield ordersLifecycle;
  }
}
/**
 * Scripted agent runner for the acceptance test. For work-unit runs it calls the
 * `emit_historic_sql_evidence` tool with canned evidence chosen by the unit key, and throws
 * if the tool's markdown acknowledgement is not the expected one.
 */
class HistoricSqlAcceptanceAgentRunner implements AgentRunnerPort {
runLoop = vi.fn(async (params: RunLoopParams) => {
// Any run that is not a work-unit run ends immediately without emitting evidence.
if (params.telemetryTags?.operationName !== 'ingest-bundle-wu') {
return { stopReason: 'natural' as const };
}
const emitEvidence = params.toolSet.emit_historic_sql_evidence;
if (!emitEvidence?.execute) {
throw new Error('emit_historic_sql_evidence tool was not available to the historic-SQL WorkUnit');
}
// Table digest for public.orders.
if (params.telemetryTags.unitKey === 'historic-sql-table-public-orders') {
const result = await emitEvidence.execute({
kind: 'table_usage',
table: 'public.orders',
rawPath: 'tables/public.orders.json',
usage: {
narrative: 'Analysts repeatedly inspect paid order lifecycle by customer segment.',
frequencyTier: 'high',
commonFilters: ['status'],
commonGroupBys: ['status', 'segment'],
commonJoins: [{ table: 'public.customers', on: ['customer_id', 'id'] }],
staleSince: null,
},
});
if (!result.markdown.includes('Recorded historic-SQL table_usage evidence')) {
throw new Error(`Unexpected orders evidence result: ${result.markdown}`);
}
}
// Table digest for public.customers.
if (params.telemetryTags.unitKey === 'historic-sql-table-public-customers') {
const result = await emitEvidence.execute({
kind: 'table_usage',
table: 'public.customers',
rawPath: 'tables/public.customers.json',
usage: {
narrative: 'Customers provide segment context for paid order lifecycle analysis.',
frequencyTier: 'mid',
commonFilters: [],
commonGroupBys: ['segment'],
commonJoins: [{ table: 'public.orders', on: ['id', 'customer_id'] }],
staleSince: null,
},
});
if (!result.markdown.includes('Recorded historic-SQL table_usage evidence')) {
throw new Error(`Unexpected customers evidence result: ${result.markdown}`);
}
}
// Cross-table pattern evidence for the first pattern shard.
if (params.telemetryTags.unitKey === 'historic-sql-patterns-part-0001') {
const result = await emitEvidence.execute({
kind: 'pattern',
rawPath: 'patterns-input/part-0001.json',
pattern: {
slug: 'paid-order-lifecycle',
title: 'Paid Order Lifecycle',
narrative: 'Analysts join orders and customers to compare paid order lifecycle by segment.',
definitionSql:
'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status, c.segment',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['pg:orders-lifecycle'],
},
});
if (!result.markdown.includes('Recorded historic-SQL pattern evidence')) {
throw new Error(`Unexpected pattern evidence result: ${result.markdown}`);
}
}
return { stopReason: 'natural' as const };
});
}
/**
 * Builds a SQL-analysis port for the acceptance test. Batch analysis is a spy that maps every
 * submitted item id to the same orders/customers analysis; fingerprint analysis must not be used.
 */
function acceptanceSqlAnalysis(): SqlAnalysisPort {
  const analyzeBatch = vi.fn(
    async (
      items: SqlAnalysisBatchItem[],
      _dialect: SqlAnalysisDialect,
    ): Promise<Map<string, SqlAnalysisBatchResult>> => {
      const analyses = new Map<string, SqlAnalysisBatchResult>();
      for (const item of items) {
        // A fresh object per item, so results never share references.
        analyses.set(item.id, {
          tablesTouched: ['public.orders', 'public.customers'],
          columnsByClause: {
            select: ['status', 'segment'],
            where: ['status'],
            join: ['customer_id', 'id'],
            groupBy: ['status', 'segment'],
          },
        });
      }
      return analyses;
    },
  );
  const validateReadOnly = vi.fn(async () => ({ ok: true }));
  return {
    analyzeForFingerprint: async () => {
      throw new Error('analyzeForFingerprint should not be used by unified historic-SQL ingest');
    },
    analyzeBatch,
    validateReadOnly,
  };
}
/**
 * Configures an initialized project for the acceptance test: overwrites `ktx.yaml` with a
 * Postgres `warehouse` connection that has historic SQL enabled, reloads the project, and
 * seeds a semantic-layer schema shard describing `public.orders` and `public.customers`.
 *
 * @param project - Freshly initialized project; only its `projectDir` is used.
 * @returns The project reloaded after the configuration was written.
 */
async function writeHistoricSqlProject(project: KtxLocalProject): Promise<KtxLocalProject> {
await writeFile(
join(project.projectDir, 'ktx.yaml'),
[
'connections:',
' warehouse:',
' driver: postgres',
' historicSql:',
' enabled: true',
' dialect: postgres',
' minExecutions: 2',
'ingest:',
' adapters:',
' - historic-sql',
' embeddings:',
' backend: none',
'storage:',
' state: sqlite',
' search: sqlite-fts5',
' git:',
' auto_commit: false',
' author: KTX Test <system@ktx.local>',
'',
].join('\n'),
'utf-8',
);
// Reload so the returned project reflects the configuration written above.
const loaded = await loadKtxProject({ projectDir: project.projectDir });
// Seed the schema shard through the project's file store.
await loaded.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',
YAML.stringify({
tables: {
orders: {
table: 'public.orders',
columns: [
{ name: 'id', type: 'string' },
{ name: 'status', type: 'string' },
{ name: 'customer_id', type: 'string' },
],
},
customers: {
table: 'public.customers',
columns: [
{ name: 'id', type: 'string' },
{ name: 'segment', type: 'string' },
],
},
},
}),
'KTX Test',
'system@ktx.local',
'Seed schema shard',
);
return loaded;
}
// End-to-end acceptance test: run a local historic-SQL ingest with scripted collaborators,
// then check the projected evidence in the schema shard, the wiki page and both searches.
describe('historic-SQL local ingest retrieval acceptance', () => {
let tempDir: string;
// Each test gets its own scratch directory, removed afterwards.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-historic-sql-acceptance-'));
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('projects table and pattern evidence into semantic-layer and wiki retrieval surfaces', async () => {
const initialized = await initKtxProject({ projectDir: join(tempDir, 'project') });
const project = await writeHistoricSqlProject(initialized);
const sqlAnalysis = acceptanceSqlAnalysis();
const agentRunner = new HistoricSqlAcceptanceAgentRunner();
const adapter = new HistoricSqlSourceAdapter({
reader: new AcceptanceHistoricSqlReader(),
queryClient: {},
sqlAnalysis,
now: () => new Date('2026-05-11T00:00:00.000Z'),
});
const result = await runLocalIngest({
project,
adapters: [adapter],
adapter: 'historic-sql',
connectionId: 'warehouse',
jobId: 'historic-sql-retrieval-acceptance',
agentRunner,
});
// One batch analysis call, and one agent loop per work unit (two tables + one pattern shard).
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1);
expect(result.result.failedWorkUnits).toEqual([]);
expect(result.result.workUnitCount).toBe(3);
expect(agentRunner.runLoop).toHaveBeenCalledTimes(3);
const finalization = result.report.body.finalization;
expect(finalization).toBeDefined();
// Explicit guard so TypeScript narrows `finalization` for the assertions below.
if (!finalization) {
throw new Error('Expected historic-SQL finalization result');
}
expect(finalization).toMatchObject({
sourceKey: 'historic-sql',
status: 'success',
result: {
tableUsageMerged: 2,
patternPagesWritten: 1,
},
});
expect(finalization.declaredTouchedSources).toEqual(
expect.arrayContaining([
{ connectionId: 'warehouse', sourceName: 'customers' },
{ connectionId: 'warehouse', sourceName: 'orders' },
]),
);
// The usage narrative must land in the schema shard, and the pattern must become a wiki page.
await expect(readFile(join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves
.toContain('Analysts repeatedly inspect paid order lifecycle by customer segment.');
await expect(readFile(join(project.projectDir, 'wiki/global/historic-sql-paid-order-lifecycle.md'), 'utf-8'))
.resolves.toContain('Paid Order Lifecycle');
// Reload the project before searching so the searches run against a freshly loaded project.
const reloaded = await loadKtxProject({ projectDir: project.projectDir });
await expect(
searchLocalSlSources(reloaded, { connectionId: 'warehouse', query: 'paid order lifecycle', limit: 5 }),
).resolves.toEqual(expect.arrayContaining([
expect.objectContaining({
name: 'orders',
frequencyTier: 'high',
snippet: expect.stringContaining('<mark>'),
matchReasons: expect.arrayContaining(['lexical']),
}),
]));
// The wiki search is expected to return exactly one page: the generated pattern page.
await expect(
searchLocalKnowledgePages(reloaded, { query: 'paid order lifecycle', userId: 'local', limit: 5 }),
).resolves.toEqual([
expect.objectContaining({
key: 'historic-sql-paid-order-lifecycle',
summary: 'Paid Order Lifecycle',
matchReasons: expect.arrayContaining(['lexical']),
}),
]);
});
});

View file

@ -0,0 +1,89 @@
import { describe, expect, it } from 'vitest';
import {
HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES,
isHistoricSqlPatternInputShardPath,
serializedStagedPatternsInputByteLength,
splitHistoricSqlPatternInputs,
} from './pattern-inputs.js';
import type { StagedPatternsInput } from './types.js';
type PatternTemplate = StagedPatternsInput['templates'][number];

/**
 * Builds a pattern-template fixture with fixed bucket and dialect values.
 *
 * @param id - Template id.
 * @param tablesTouched - Tables the template references, kept in the given order.
 * @param canonicalSql - SQL text; defaults to `select 1`.
 */
function template(id: string, tablesTouched: string[], canonicalSql = 'select 1'): PatternTemplate {
  const fixture: PatternTemplate = {
    id,
    canonicalSql,
    tablesTouched,
    executionsBucket: '10-100',
    distinctUsersBucket: '2-5',
    dialect: 'postgres',
  };
  return fixture;
}
// Tests for splitting staged pattern inputs into byte-bounded shards.
describe('historic-SQL pattern input sharding', () => {
it('keeps the audit input complete while sharding only cross-table pattern candidates', () => {
// SQL long enough that, with maxBytes 760, every cross-table template needs its own shard.
const largeSql = `select * from public.orders join public.customers on true where marker = '${'x'.repeat(260)}'`;
const input: StagedPatternsInput = {
templates: [
template('single-table-orders', ['public.orders']),
template('orders-customers-2', ['public.orders', 'public.customers'], largeSql),
template('orders-customers-1', ['public.customers', 'public.orders'], largeSql),
template('orders-customers-payments', ['public.orders', 'public.customers', 'public.payments'], largeSql),
],
};
const result = splitHistoricSqlPatternInputs(input, { maxBytes: 760 });
// The audit input keeps every template, ordered by id.
expect(result.auditInput.templates.map((entry) => entry.id)).toEqual([
'orders-customers-1',
'orders-customers-2',
'orders-customers-payments',
'single-table-orders',
]);
expect(result.shards.length).toBeGreaterThan(1);
expect(result.shards.map((shard) => shard.path)).toEqual([
'patterns-input/part-0001.json',
'patterns-input/part-0002.json',
'patterns-input/part-0003.json',
]);
// Shard order: templates touching more tables first, then by table signature, then by id.
expect(result.shards.flatMap((shard) => shard.input.templates.map((entry) => entry.id))).toEqual([
'orders-customers-payments',
'orders-customers-1',
'orders-customers-2',
]);
expect(result.shards.every((shard) => shard.byteLength <= 760)).toBe(true);
// Single-table templates are never pattern candidates.
expect(result.shards.flatMap((shard) => shard.input.templates).some((entry) => entry.id === 'single-table-orders')).toBe(false);
expect(result.warnings).toEqual([]);
});
it('omits a single oversized template from shards and reports a manifest warning', () => {
const input: StagedPatternsInput = {
templates: [
template(
'oversized-cross-table',
['public.orders', 'public.customers'],
`select * from public.orders join public.customers on true where payload = '${'x'.repeat(500)}'`,
),
],
};
const result = splitHistoricSqlPatternInputs(input, { maxBytes: 240 });
// The template stays in the audit input even though it cannot fit in any shard.
expect(result.auditInput.templates.map((entry) => entry.id)).toEqual(['oversized-cross-table']);
expect(result.shards).toEqual([]);
expect(result.warnings).toEqual(['patterns_input_template_too_large:oversized-cross-table']);
});
it('recognizes only generated pattern shard paths', () => {
expect(isHistoricSqlPatternInputShardPath('patterns-input/part-0001.json')).toBe(true);
expect(isHistoricSqlPatternInputShardPath('patterns-input/part-0012.json')).toBe(true);
expect(isHistoricSqlPatternInputShardPath('patterns-input.json')).toBe(false);
// The ordinal must be exactly four digits.
expect(isHistoricSqlPatternInputShardPath('patterns-input/part-1.json')).toBe(false);
expect(isHistoricSqlPatternInputShardPath('patterns-input/readme.md')).toBe(false);
});
it('uses a production byte budget below read_raw_file maximum size', () => {
expect(HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES).toBeLessThan(120_000);
expect(serializedStagedPatternsInputByteLength({ templates: [] })).toBeGreaterThan(0);
});
});

View file

@ -0,0 +1,99 @@
import { Buffer } from 'node:buffer';
import type { StagedPatternsInput } from './types.js';
/** Directory segment used as the prefix of every generated pattern shard path. */
export const HISTORIC_SQL_PATTERN_WORKUNIT_DIR = 'patterns-input';
/**
 * Default byte budget for one serialized pattern shard, used when no `maxBytes` option is given.
 * NOTE(review): the accompanying test requires this to stay below 120_000, described there as
 * the read_raw_file maximum size — confirm against that tool's actual limit.
 */
export const HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES = 110_000;
/** Matches exactly `patterns-input/part-NNNN.json`, where NNNN is four decimal digits. */
export const HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE = /^patterns-input\/part-\d{4}\.json$/;
/** One template entry of a staged patterns input. */
type PatternTemplate = StagedPatternsInput['templates'][number];
/** One byte-bounded slice of pattern candidates together with its generated path. */
export interface HistoricSqlPatternInputShard {
/** Generated shard path of the form `patterns-input/part-NNNN.json`. */
path: string;
/** Templates assigned to this shard. */
input: StagedPatternsInput;
/** UTF-8 byte length of `input` as serialized by `serializeStagedPatternsInput`. */
byteLength: number;
}
/** Result of splitting a staged patterns input into shards. */
export interface HistoricSqlPatternInputSplitResult {
/** Every input template, sorted by id. */
auditInput: StagedPatternsInput;
/** Shards holding the cross-table pattern candidates, in generation order. */
shards: HistoricSqlPatternInputShard[];
/** One `patterns_input_template_too_large:<id>` entry per template that exceeds the budget alone. */
warnings: string[];
}
/** Options for `splitHistoricSqlPatternInputs`. */
export interface HistoricSqlPatternInputSplitOptions {
/** Byte budget per shard; defaults to `HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES`. */
maxBytes?: number;
}
/**
 * Tells whether `path` has the exact shape of a generated pattern shard path
 * (`patterns-input/part-NNNN.json` with a four-digit ordinal).
 */
export function isHistoricSqlPatternInputShardPath(path: string): boolean {
  const match = HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE.exec(path);
  return match !== null;
}
/**
 * Renders a staged patterns input as two-space-indented JSON followed by a single newline.
 *
 * @param input - Patterns input to serialize.
 * @returns The pretty-printed JSON text.
 */
export function serializeStagedPatternsInput(input: StagedPatternsInput): string {
  const pretty = JSON.stringify(input, null, 2);
  return pretty + '\n';
}
/**
 * Measures how many UTF-8 bytes `input` occupies once serialized by `serializeStagedPatternsInput`.
 *
 * @param input - Patterns input to measure.
 * @returns Byte length of the serialized text, trailing newline included.
 */
export function serializedStagedPatternsInputByteLength(input: StagedPatternsInput): number {
  const serialized = serializeStagedPatternsInput(input);
  return Buffer.byteLength(serialized, 'utf-8');
}
/**
 * Returns a copy of `templates` ordered by id; the input array is left untouched.
 *
 * Ids are compared by UTF-16 code unit rather than with `localeCompare`, so the order of the
 * audit input does not depend on the default locale or ICU data of the machine running it.
 *
 * @param templates - Templates to order.
 * @returns A new array sorted ascending by `id`.
 */
function sortedAuditTemplates(templates: readonly PatternTemplate[]): PatternTemplate[] {
  return [...templates].sort((left, right) => {
    if (left.id < right.id) return -1;
    if (left.id > right.id) return 1;
    return 0;
  });
}
/**
 * Selects the cross-table templates (two or more touched tables) and orders them for sharding.
 *
 * Each returned template is a shallow copy whose `tablesTouched` is sorted; the input is not
 * mutated. Ordering is: more tables first, then table signature, then id.
 *
 * Signatures and ids are compared by UTF-16 code unit, matching the default `sort()` applied to
 * `tablesTouched`. `localeCompare` is avoided: its result depends on the runtime locale, and
 * locale collation can disregard the `\0` separator, letting different table sets compare equal.
 *
 * @param templates - All staged templates.
 * @returns The ordered pattern candidates.
 */
function sortedPatternCandidates(templates: readonly PatternTemplate[]): PatternTemplate[] {
  const compareCodeUnits = (a: string, b: string): number => (a < b ? -1 : a > b ? 1 : 0);
  return templates
    .filter((template) => template.tablesTouched.length >= 2)
    .map((template) => ({ ...template, tablesTouched: [...template.tablesTouched].sort() }))
    .sort((left, right) => {
      const cardinality = right.tablesTouched.length - left.tablesTouched.length;
      if (cardinality !== 0) return cardinality;
      // '\0' sorts below every character a table name can contain, so comparing the joined
      // strings is equivalent to comparing the table lists element by element.
      const tableSignature = compareCodeUnits(left.tablesTouched.join('\0'), right.tablesTouched.join('\0'));
      if (tableSignature !== 0) return tableSignature;
      return compareCodeUnits(left.id, right.id);
    });
}
/**
 * Builds the shard path for a 1-based shard ordinal, zero-padded to at least four digits
 * (`patterns-input/part-0001.json`).
 */
function shardPath(index: number): string {
  const ordinal = index.toString().padStart(4, '0');
  return [HISTORIC_SQL_PATTERN_WORKUNIT_DIR, `part-${ordinal}.json`].join('/');
}
/**
 * Splits a staged patterns input into a complete audit input plus byte-bounded shards.
 *
 * The audit input keeps every template, sorted by id. Only cross-table candidates are sharded:
 * they are packed greedily in candidate order, and a shard is sealed as soon as adding the next
 * template would push its serialized size over the budget. A template that exceeds the budget
 * on its own goes into no shard and is reported as a warning.
 *
 * @param input - Staged patterns input to split.
 * @param options - Optional `maxBytes` override; defaults to `HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES`.
 * @returns The audit input, the shards in generation order, and any warnings.
 */
export function splitHistoricSqlPatternInputs(
  input: StagedPatternsInput,
  options: HistoricSqlPatternInputSplitOptions = {},
): HistoricSqlPatternInputSplitResult {
  const byteBudget = options.maxBytes ?? HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES;
  const auditInput: StagedPatternsInput = { templates: sortedAuditTemplates(input.templates) };
  const warnings: string[] = [];
  const shards: HistoricSqlPatternInputShard[] = [];
  let pending: PatternTemplate[] = [];

  // Turns the pending templates (if any) into the next numbered shard and starts a new batch.
  const sealPendingShard = (): void => {
    if (pending.length > 0) {
      const shardInput: StagedPatternsInput = { templates: pending };
      shards.push({
        path: shardPath(shards.length + 1),
        input: shardInput,
        byteLength: serializedStagedPatternsInputByteLength(shardInput),
      });
      pending = [];
    }
  };

  for (const candidate of sortedPatternCandidates(input.templates)) {
    const fitsAlone = serializedStagedPatternsInputByteLength({ templates: [candidate] }) <= byteBudget;
    if (!fitsAlone) {
      warnings.push(`patterns_input_template_too_large:${candidate.id}`);
      continue;
    }
    const wouldOverflow =
      pending.length > 0 &&
      serializedStagedPatternsInputByteLength({ templates: [...pending, candidate] }) > byteBudget;
    if (wouldOverflow) {
      sealPendingShard();
    }
    pending.push(candidate);
  }
  sealPendingShard();

  return { auditInput, shards, warnings };
}

View file

@ -0,0 +1,242 @@
import { describe, expect, it, vi } from 'vitest';
import {
HistoricSqlExtensionMissingError,
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import { PostgresPgssReader } from './postgres-pgss-reader.js';
/** Shape of a scripted query result handed back by the fake query client. */
interface FakeQueryResult {
/** Column names, positionally aligned with each row. */
headers: string[];
/** Result rows as positional value arrays. */
rows: unknown[][];
totalRows?: number;
/** Optional in-band error message. */
error?: string;
}
/**
 * Builds a fake query client whose `executeQuery` spy replays `results` in order: an `Error`
 * entry is thrown, any other entry is returned, and running out of entries throws.
 * The `results` array is consumed (shifted) as queries run.
 */
function queryClient(results: Array<FakeQueryResult | Error>) {
  const executeQuery = vi.fn(async (_query: string, _params?: unknown[]) => {
    const scripted = results.shift();
    if (scripted instanceof Error) {
      throw scripted;
    }
    if (!scripted) {
      throw new Error('unexpected query');
    }
    return scripted;
  });
  return { executeQuery };
}
/**
 * Returns the SQL text passed to the `index`-th recorded `executeQuery` call.
 *
 * @throws Error when fewer than `index + 1` calls were recorded.
 */
function executedSql(client: ReturnType<typeof queryClient>, index: number): string {
  const recorded = client.executeQuery.mock.calls[index];
  if (recorded) {
    return recorded[0];
  }
  throw new Error(`expected query client call ${index}`);
}
// Tests for PostgresPgssReader. The fake client replays scripted results in order, so each
// list of results below mirrors the order in which the reader issues its queries.
describe('PostgresPgssReader aggregate path', () => {
it('probes version, extension presence, grants, and tracking state', async () => {
// Scripted in probe order: version, extension probe, grants, track setting, max setting.
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4 on x86_64-apple-darwin']],
},
{ headers: ['?column?'], rows: [[1]] },
{ headers: ['has_role'], rows: [[true]] },
{ headers: ['track'], rows: [['top']] },
{ headers: ['max'], rows: [['5000']] },
]);
const reader = new PostgresPgssReader();
await expect(reader.probe(client)).resolves.toEqual({
pgServerVersion: 'PostgreSQL 16.4 on x86_64-apple-darwin',
warnings: [],
info: [],
});
expect(executedSql(client, 0)).toContain("current_setting('server_version_num')::int");
expect(executedSql(client, 1)).toBe('SELECT 1 FROM pg_stat_statements LIMIT 1');
expect(executedSql(client, 2)).toBe(
"SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role",
);
expect(executedSql(client, 3)).toBe("SELECT current_setting('pg_stat_statements.track') AS track");
expect(executedSql(client, 4)).toBe("SELECT current_setting('pg_stat_statements.max') AS max");
});
it('rejects PostgreSQL versions older than 14 without probing the extension', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[130012, 'PostgreSQL 13.12']],
},
]);
const reader = new PostgresPgssReader();
// The promise is kept so both the error's fields and its class can be asserted.
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
name: 'HistoricSqlVersionUnsupportedError',
dialect: 'postgres',
detectedVersion: 'PostgreSQL 13.12',
minimumVersion: 'PostgreSQL 14',
});
await expect(promise).rejects.toBeInstanceOf(HistoricSqlVersionUnsupportedError);
// Only the version query may run before the probe gives up.
expect(client.executeQuery).toHaveBeenCalledTimes(1);
});
it('maps a missing pg_stat_statements relation to HistoricSqlExtensionMissingError', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
new Error('relation "pg_stat_statements" does not exist'),
]);
const reader = new PostgresPgssReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
name: 'HistoricSqlExtensionMissingError',
dialect: 'postgres',
});
await expect(promise).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
});
it('maps pg_stat_statements preload failures to HistoricSqlExtensionMissingError with preload remediation', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
new Error('pg_stat_statements must be loaded via shared_preload_libraries'),
]);
const reader = new PostgresPgssReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
name: 'HistoricSqlExtensionMissingError',
dialect: 'postgres',
message: 'pg_stat_statements is installed but not loaded via shared_preload_libraries.',
remediation: expect.stringContaining("shared_preload_libraries includes 'pg_stat_statements'"),
});
await expect(promise).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
});
it('maps missing pg_read_all_stats membership to HistoricSqlGrantsMissingError', async () => {
// No track/max results are scripted: the probe is expected to fail at the grants check.
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
{ headers: ['?column?'], rows: [[1]] },
{ headers: ['has_role'], rows: [[false]] },
]);
const reader = new PostgresPgssReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
name: 'HistoricSqlGrantsMissingError',
dialect: 'postgres',
remediation: 'GRANT pg_read_all_stats TO <connection role>;',
});
await expect(promise).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
it('returns a warning instead of failing when pg_stat_statements.track is none', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
{ headers: ['?column?'], rows: [[1]] },
{ headers: ['has_role'], rows: [[true]] },
{ headers: ['track'], rows: [['none']] },
{ headers: ['max'], rows: [['5000']] },
]);
const reader = new PostgresPgssReader();
await expect(reader.probe(client)).resolves.toEqual({
pgServerVersion: 'PostgreSQL 16.4',
warnings: [
"pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config",
],
info: [],
});
});
it('returns an info note when pg_stat_statements.max is below the recommended floor', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
{ headers: ['?column?'], rows: [[1]] },
{ headers: ['has_role'], rows: [[true]] },
{ headers: ['track'], rows: [['top']] },
{ headers: ['max'], rows: [['1000']] },
]);
const reader = new PostgresPgssReader();
await expect(reader.probe(client)).resolves.toEqual({
pgServerVersion: 'PostgreSQL 16.4',
warnings: [],
info: [
'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn',
],
});
});
it('aggregates pg_stat_statements rows by queryid and query', async () => {
// Routes by SQL text rather than call order: the stats-info query gets the reset metadata,
// anything else is treated as the aggregate query and checked inline.
const executeQuery = vi.fn(async (sql: string, params?: unknown[]) => {
if (sql.includes('pg_stat_statements_info')) {
return { headers: ['stats_reset', 'dealloc'], rows: [['2026-05-01T00:00:00.000Z', 1]] };
}
expect(sql).toContain('GROUP BY queryid, query');
expect(sql).toContain('HAVING SUM(calls) >= $1');
expect(params).toEqual([5]);
// Numeric columns are returned as strings here, so the reader has to convert them.
return {
headers: ['template_id', 'canonical_sql', 'executions', 'distinct_users', 'mean_ms', 'rows_produced', 'top_users'],
rows: [
[
'123',
'select status from public.orders',
'42',
'3',
'11.5',
'100',
JSON.stringify([{ user: 'analyst', executions: 40 }]),
],
],
};
});
const reader = new PostgresPgssReader();
const rows = [];
for await (const row of reader.fetchAggregated(
{ executeQuery },
{ start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') },
{ dialect: 'postgres', minExecutions: 5, enabledTables: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 },
)) {
rows.push(row);
}
// Expected mapping: firstSeen equals the scripted stats_reset, lastSeen equals the window
// end, and both runtime percentiles carry the mean_ms value.
expect(rows).toEqual([
{
templateId: '123',
canonicalSql: 'select status from public.orders',
dialect: 'postgres',
stats: {
executions: 42,
distinctUsers: 3,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 11.5,
p95RuntimeMs: 11.5,
errorRate: 0,
rowsProduced: 100,
},
topUsers: [{ user: 'analyst', executions: 40 }],
},
]);
});
});

View file

@ -0,0 +1,293 @@
import {
HistoricSqlExtensionMissingError,
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import {
aggregatedTemplateSchema,
type AggregatedTemplate,
type HistoricSqlTimeWindow,
type HistoricSqlUnifiedPullConfig,
type KtxPostgresQueryClient,
type PostgresPgssProbeResult,
} from './types.js';
/** Minimal shape of a query result consumed by this reader. */
interface QueryResultLike {
/** Column names, positionally aligned with each row; looked up case-insensitively. */
headers: string[];
/** Result rows as positional value arrays. */
rows: unknown[][];
totalRows?: number;
/** In-band error message; `execute` throws when this is a non-empty string. */
error?: string;
}
// Reads the statistics reset time and the deallocation counter from pg_stat_statements_info.
const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info';
// Fetches the numeric server version (compared against 140000 by the probe) and the
// human-readable version string.
const VERSION_SQL = `
SELECT current_setting('server_version_num')::int AS server_version_num,
version() AS server_version
`.trim();
// Cheapest possible read of the view; it fails when pg_stat_statements is unavailable.
const EXTENSION_PROBE_SQL = 'SELECT 1 FROM pg_stat_statements LIMIT 1';
// Checks whether the current role has USAGE on pg_read_all_stats.
const GRANTS_PROBE_SQL = "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role";
const TRACKING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.track') AS track";
const MAX_SETTING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.max') AS max";
// NOTE(review): presumably the floor below which the probe emits its
// "set it to at least 5000" info note — confirm against the probe implementation.
const RECOMMENDED_PGSS_MAX = 5000;
// Aggregates top-level statements per (queryid, query): total calls, distinct users, mean
// execution time, rows produced, and a JSON list of per-row user call counts. $1 is the minimum
// total call count a template needs to be returned; results are ordered by total execution
// time, descending.
const AGGREGATE_SQL = `
SELECT queryid::text AS template_id,
query AS canonical_sql,
SUM(calls)::bigint AS executions,
COUNT(DISTINCT userid) AS distinct_users,
SUM(total_exec_time) / NULLIF(SUM(calls), 0) AS mean_ms,
SUM(rows)::bigint AS rows_produced,
COALESCE(
json_agg(json_build_object('user', rolname, 'executions', calls) ORDER BY calls DESC)
FILTER (WHERE userid IS NOT NULL),
'[]'::json
)::text AS top_users
FROM pg_stat_statements
LEFT JOIN pg_roles ON pg_roles.oid = pg_stat_statements.userid
WHERE toplevel = true
GROUP BY queryid, query
HAVING SUM(calls) >= $1
ORDER BY SUM(total_exec_time) DESC
`.trim();
// Remediation text attached to extension-missing errors (covers both install and preload).
const POSTGRES_EXTENSION_REMEDIATION = [
'Run CREATE EXTENSION pg_stat_statements; against the connection database.',
"Ensure shared_preload_libraries includes 'pg_stat_statements' in the Postgres parameter group or config.",
].join(' ');
// Remediation text attached to grants-missing errors.
const POSTGRES_GRANTS_REMEDIATION = 'GRANT pg_read_all_stats TO <connection role>;';
/**
 * Narrows an untyped client to `KtxPostgresQueryClient` by checking that it is a non-null
 * object exposing an `executeQuery` function.
 *
 * @param client - Candidate query client.
 * @returns The same object, typed as a Postgres query client.
 * @throws Error when `client` does not expose `executeQuery`.
 */
function queryClient(client: unknown): KtxPostgresQueryClient {
  const rejection = 'Historic SQL Postgres PGSS reader requires a query client with executeQuery(sql, params?)';
  if (client === null || typeof client !== 'object') {
    throw new Error(rejection);
  }
  const candidate = client as { executeQuery?: unknown };
  if (!('executeQuery' in candidate) || typeof candidate.executeQuery !== 'function') {
    throw new Error(rejection);
  }
  return client as KtxPostgresQueryClient;
}
/**
 * Runs one query through the client and turns an in-band error into a thrown `Error`.
 *
 * @param client - Query client to execute against.
 * @param sql - SQL text.
 * @param params - Optional positional parameters.
 * @returns The client's result when it carries no non-empty `error` string.
 * @throws Error carrying the result's `error` text when that is a non-empty string.
 */
async function execute(client: KtxPostgresQueryClient, sql: string, params?: unknown[]): Promise<QueryResultLike> {
  const outcome = await client.executeQuery(sql, params);
  const failure = 'error' in outcome ? outcome.error : undefined;
  if (typeof failure === 'string' && failure.length > 0) {
    throw new Error(failure);
  }
  return outcome;
}
/**
 * Maps each lower-cased header name to its column position. When two headers differ only by
 * case, the later position wins.
 */
function indexByHeader(headers: string[]): Map<string, number> {
  return headers.reduce(
    (positions, header, position) => positions.set(header.toLowerCase(), position),
    new Map<string, number>(),
  );
}
/**
 * Reads the cell of `row` under `header` (case-insensitive).
 *
 * @returns The cell value, or `null` when the header is not in `headerIndexes`.
 */
function value(row: unknown[], headerIndexes: Map<string, number>, header: string): unknown {
  const position = headerIndexes.get(header.toLowerCase());
  if (position === undefined) {
    return null;
  }
  return row[position];
}
/**
 * Converts a raw cell to a string, treating `null`, `undefined` and values that stringify to
 * the empty string as absent.
 */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text === '' ? null : text;
}
/**
 * Converts a raw cell to a non-empty string.
 *
 * @param raw - Raw cell value.
 * @param field - Field name used in the error message.
 * @throws Error when the value is absent or stringifies to the empty string.
 */
function requiredString(raw: unknown, field: string): string {
  const text = nullableString(raw);
  if (text) {
    return text;
  }
  throw new Error(`Postgres pg_stat_statements row is missing ${field}`);
}
/**
 * Converts a raw cell to a finite number, rejecting anything that does not carry one.
 *
 * `Number()` coerces `null`, `''`, whitespace-only strings and `false` to 0, which would let a
 * missing column (header lookups yield `null` for an absent header) pass as a real zero.
 * Those inputs are rejected up front, as are booleans in general.
 *
 * @param raw - Raw cell value: a number, a bigint, or a numeric string.
 * @param field - Field name used in the error message.
 * @returns The parsed finite number.
 * @throws Error when the value is absent, blank, boolean, non-numeric, NaN or infinite.
 */
function requiredFiniteNumber(raw: unknown, field: string): number {
  const carriesNoNumber =
    raw === null ||
    raw === undefined ||
    typeof raw === 'boolean' ||
    (typeof raw === 'string' && raw.trim() === '');
  const parsed = carriesNoNumber ? Number.NaN : typeof raw === 'number' ? raw : Number(raw);
  if (!Number.isFinite(parsed)) {
    throw new Error(`Postgres pg_stat_statements row has invalid ${field}: ${String(raw)}`);
  }
  return parsed;
}
/**
 * Converts a raw cell to a finite number and drops its fractional part (truncation toward zero).
 *
 * @throws Error when `requiredFiniteNumber` rejects the value.
 */
function requiredInteger(raw: unknown, field: string): number {
  const parsed = requiredFiniteNumber(raw, field);
  return Math.trunc(parsed);
}
/**
 * Converts a raw cell to a finite number, or `null` when the cell is `null`, `undefined`, the
 * empty string, or does not convert to a finite number.
 */
function nullableNumber(raw: unknown): number | null {
  const isAbsent = raw === null || raw === undefined || raw === '';
  if (isAbsent) {
    return null;
  }
  const parsed = typeof raw === 'number' ? raw : Number(raw);
  if (Number.isFinite(parsed)) {
    return parsed;
  }
  return null;
}
/**
 * Like `nullableNumber`, but drops the fractional part of the result (truncation toward zero).
 */
function nullableInteger(raw: unknown): number | null {
  const parsed = nullableNumber(raw);
  if (parsed === null) {
    return null;
  }
  return Math.trunc(parsed);
}
/**
 * Normalizes a raw timestamp cell to an ISO-8601 UTC string.
 *
 * `Date` instances are formatted directly. Any other value is stringified and parsed.
 *
 * @returns The ISO string, or `null` when the cell is `null`, `undefined`, the empty string,
 *   or does not parse as a date.
 */
function nullableIsoTimestamp(raw: unknown): string | null {
  if (raw instanceof Date) {
    return raw.toISOString();
  }
  if (raw == null || raw === '') {
    return null;
  }
  const epochMs = Date.parse(String(raw));
  return Number.isNaN(epochMs) ? null : new Date(epochMs).toISOString();
}
/**
 * Extracts the first row of a result together with a header-name-to-position lookup.
 *
 * @param result - Query result to read from.
 * @param context - Short description of the query, used in the error message.
 * @throws Error when the result has no rows.
 */
function firstRow(result: QueryResultLike, context: string): { row: unknown[]; headers: Map<string, number> } {
  const head = result.rows[0];
  if (head) {
    return { row: head, headers: indexByHeader(result.headers) };
  }
  throw new Error(`Postgres historic-SQL ${context} query returned no rows`);
}
/**
 * Tells whether an error reports that the `pg_stat_statements` relation does not exist.
 * The match is case-insensitive and accepts the relation name bare or quoted.
 */
function isMissingPgssRelation(error: unknown): boolean {
  const missingRelation = /relation ["']?pg_stat_statements["']? does not exist/i;
  let message: string;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }
  return missingRelation.test(message);
}
/**
 * Tells whether an error message mentions `pg_stat_statements` followed, on the same line, by
 * `shared_preload_libraries` (case-insensitive).
 */
function isPgssPreloadRequired(error: unknown): boolean {
  const preloadRequired = /pg_stat_statements.*shared_preload_libraries/i;
  let message: string;
  if (error instanceof Error) {
    message = error.message;
  } else {
    message = String(error);
  }
  return preloadRequired.test(message);
}
/**
 * Builds the extension-missing error for Postgres, carrying the shared remediation text.
 *
 * @param cause - Underlying error that triggered the diagnosis.
 * @param message - Optional override; defaults to the "extension is not installed" message.
 */
function extensionMissingError(cause: unknown, message?: string): HistoricSqlExtensionMissingError {
  const resolvedMessage = message ?? 'pg_stat_statements extension is not installed in the connection database.';
  return new HistoricSqlExtensionMissingError({
    dialect: 'postgres',
    message: resolvedMessage,
    remediation: POSTGRES_EXTENSION_REMEDIATION,
    cause,
  });
}
/**
 * Builds the Postgres "grants missing" error raised when the connection role
 * lacks `pg_read_all_stats`.
 */
function grantsMissingError(): HistoricSqlGrantsMissingError {
  const message = 'Postgres connection role lacks pg_read_all_stats for historic-SQL ingest.';
  const remediation = POSTGRES_GRANTS_REMEDIATION;
  return new HistoricSqlGrantsMissingError({ dialect: 'postgres', message, remediation });
}
/**
 * Decodes the `top_users` cell into a list of `{ user, executions }` entries.
 *
 * The cell may arrive as JSON text or as an array the driver has already
 * decoded; both shapes are accepted. Decoding is best-effort: anything that is
 * not an array, fails to parse, or lacks a numeric `executions` is dropped
 * rather than raised, so one bad cell cannot fail the whole row.
 *
 * @param raw - Cell value as returned by the driver.
 * @returns The usable entries, or an empty array.
 */
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
  let candidate: unknown = raw;
  if (!Array.isArray(raw)) {
    const text = nullableString(raw);
    if (!text) {
      return [];
    }
    candidate = text;
  }
  try {
    const parsed: unknown = typeof candidate === 'string' ? JSON.parse(candidate) : candidate;
    if (!Array.isArray(parsed)) {
      return [];
    }
    return parsed.flatMap((entry) => {
      if (!entry || typeof entry !== 'object') {
        return [];
      }
      const user = nullableString((entry as { user?: unknown }).user);
      const executions = nullableInteger((entry as { executions?: unknown }).executions);
      // Entries without a usable execution count carry no signal; skip them.
      return executions === null ? [] : [{ user, executions }];
    });
  } catch {
    return [];
  }
}
/**
 * Reader for Postgres query history exposed through pg_stat_statements.
 *
 * `probe` checks that a connection can be used for historic-SQL ingest, and
 * `fetchAggregated` streams one schema-validated template per aggregated row.
 */
export class PostgresPgssReader {
  /**
   * Runs the readiness probes in a fixed order: server version, extension
   * availability, role grant, then the tracking and max settings.
   *
   * @param client - Opaque connection handle, adapted via `queryClient`.
   * @returns The reported server version plus advisory warnings and info notes.
   * @throws HistoricSqlVersionUnsupportedError when `server_version_num` is below 140000.
   * @throws HistoricSqlExtensionMissingError when the extension probe fails with a
   *   missing-relation or shared_preload_libraries error; other probe errors are rethrown.
   * @throws HistoricSqlGrantsMissingError when the grant probe's `has_role` is not `true`.
   */
  async probe(client: unknown): Promise<PostgresPgssProbeResult> {
    const pg = queryClient(client);

    const version = firstRow(await execute(pg, VERSION_SQL), 'version probe');
    const serverVersionNum = requiredFiniteNumber(
      value(version.row, version.headers, 'server_version_num'),
      'server_version_num',
    );
    const pgServerVersion = requiredString(value(version.row, version.headers, 'server_version'), 'server_version');
    if (serverVersionNum < 140000) {
      throw new HistoricSqlVersionUnsupportedError({
        dialect: 'postgres',
        detectedVersion: pgServerVersion,
        minimumVersion: 'PostgreSQL 14',
      });
    }

    try {
      await execute(pg, EXTENSION_PROBE_SQL);
    } catch (error) {
      // Translate the two recognizable failure modes; anything else is unexpected.
      if (isMissingPgssRelation(error)) {
        throw extensionMissingError(error);
      }
      if (isPgssPreloadRequired(error)) {
        throw extensionMissingError(
          error,
          'pg_stat_statements is installed but not loaded via shared_preload_libraries.',
        );
      }
      throw error;
    }

    const grants = firstRow(await execute(pg, GRANTS_PROBE_SQL), 'grant probe');
    const hasRole = value(grants.row, grants.headers, 'has_role') === true;
    if (!hasRole) {
      throw grantsMissingError();
    }

    const tracking = firstRow(await execute(pg, TRACKING_PROBE_SQL), 'tracking probe');
    const track = nullableString(value(tracking.row, tracking.headers, 'track'));

    const maxSetting = firstRow(await execute(pg, MAX_SETTING_PROBE_SQL), 'max-setting probe');
    const pgssMax = nullableInteger(value(maxSetting.row, maxSetting.headers, 'max'));

    const warnings: string[] =
      track === 'none'
        ? ['pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config']
        : [];
    const info: string[] =
      pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX
        ? [
            `pg_stat_statements.max is ${pgssMax}; set it to at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`,
          ]
        : [];
    return { pgServerVersion, warnings, info };
  }

  /**
   * Streams aggregated query templates for the connection.
   *
   * `firstSeen` is the `stats_reset` timestamp when one is available and the
   * window start otherwise; `lastSeen` is always the window end.
   *
   * @param client - Opaque connection handle, adapted via `queryClient`.
   * @param window - Time window supplying the `firstSeen` fallback and `lastSeen`.
   * @param config - Pull configuration; `minExecutions` is bound as the single query parameter.
   * @returns One `aggregatedTemplateSchema`-validated template per result row.
   */
  async *fetchAggregated(
    client: unknown,
    window: HistoricSqlTimeWindow,
    config: HistoricSqlUnifiedPullConfig,
  ): AsyncIterable<AggregatedTemplate> {
    const pg = queryClient(client);

    const statsInfo = firstRow(await execute(pg, STATS_INFO_SQL), 'stats-info');
    const statsReset = nullableIsoTimestamp(value(statsInfo.row, statsInfo.headers, 'stats_reset'));
    const firstSeen = statsReset ?? window.start.toISOString();

    const aggregate = await execute(pg, AGGREGATE_SQL, [config.minExecutions]);
    const columns = indexByHeader(aggregate.headers);
    for (const row of aggregate.rows) {
      yield aggregatedTemplateSchema.parse({
        templateId: requiredString(value(row, columns, 'template_id'), 'template_id'),
        canonicalSql: requiredString(value(row, columns, 'canonical_sql'), 'canonical_sql'),
        dialect: 'postgres',
        stats: {
          executions: requiredInteger(value(row, columns, 'executions'), 'executions'),
          distinctUsers: requiredInteger(value(row, columns, 'distinct_users'), 'distinct_users'),
          firstSeen,
          lastSeen: window.end.toISOString(),
          // Both percentile fields are filled from the same mean_ms column.
          p50RuntimeMs: nullableNumber(value(row, columns, 'mean_ms')),
          p95RuntimeMs: nullableNumber(value(row, columns, 'mean_ms')),
          // Error rate is reported as a constant zero here.
          errorRate: 0,
          rowsProduced: nullableInteger(value(row, columns, 'rows_produced')),
        },
        topUsers: parseTopUsers(value(row, columns, 'top_users')),
      });
    }
  }
}

View file

@ -0,0 +1,457 @@
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import YAML from 'yaml';
import { describe, expect, it } from 'vitest';
import { projectHistoricSqlEvidence } from './projection.js';
/** Creates a fresh temporary directory for one test case and returns its path. */
async function tempWorkdir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-projection-');
  return mkdtemp(prefix);
}
/**
 * Writes a UTF-8 text fixture at `relPath` under `root`, creating any missing
 * parent directories first.
 */
async function writeText(root: string, relPath: string, content: string): Promise<void> {
  const destination = join(root, relPath);
  const parentDir = join(destination, '..');
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, content, 'utf-8');
}
/** Writes `value` as two-space-indented JSON with a trailing newline. */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const serialized = JSON.stringify(value, null, 2);
  await writeText(root, relPath, `${serialized}\n`);
}
describe('projectHistoricSqlEvidence', () => {
// Scenario: a schema shard already carries generated usage plus an extra key
// (`ownerNote`); fresh table_usage evidence must replace the generated fields
// while the extra key survives the merge.
it('merges table usage into matching _schema shards and preserves external usage keys', async () => {
const workdir = await tempWorkdir();
await writeText(
workdir,
'semantic-layer/warehouse/_schema/public.yaml',
YAML.stringify({
tables: {
orders: {
table: 'public.orders',
usage: {
narrative: 'Old generated usage.',
frequencyTier: 'low',
commonFilters: ['old_status'],
commonJoins: [],
ownerNote: 'keep me',
},
columns: [{ name: 'id', type: 'string' }],
},
},
}),
);
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 1,
touchedTableCount: 1,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 90,
});
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
// Evidence emitted by the current run for the same table.
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/orders.json', {
kind: 'table_usage',
connectionId: 'warehouse',
table: 'public.orders',
rawPath: 'tables/public.orders.json',
usage: {
narrative: 'Orders are repeatedly queried for lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
},
});
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]);
expect(result.actions).toEqual(
expect.arrayContaining([
expect.objectContaining({
target: 'sl',
key: 'orders',
rawPaths: ['tables/public.orders.json'],
}),
]),
);
const shard = YAML.parse(await readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8'));
expect(shard.tables.orders.usage).toEqual({
ownerNote: 'keep me',
narrative: 'Orders are repeatedly queried for lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
});
});
// Scenario: one incoming pattern overlaps an existing page (same tables, one
// shared fingerprint) and must be written under that page's key instead of a
// new slug; a second existing page with no matching evidence must be marked
// stale with the manifest's fetchedAt.
it('writes pattern pages, reuses similar slugs, and marks missing old pattern pages stale', async () => {
const workdir = await tempWorkdir();
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 2,
touchedTableCount: 2,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 90,
});
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
// Existing page that the incoming pattern should reuse.
await writeText(
workdir,
'wiki/global/historic-sql-old-order-lifecycle.md',
[
'---',
YAML.stringify({
summary: 'Old order lifecycle page',
tags: ['historic-sql', 'pattern'],
refs: [],
sl_refs: ['orders'],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.orders', 'public.customers'],
fingerprints: ['pg:1'],
}).trimEnd(),
'---',
'',
'Old body',
'',
].join('\n'),
);
// Existing page with no counterpart in the current evidence.
await writeText(
workdir,
'wiki/global/historic-sql-retired-pattern.md',
[
'---',
YAML.stringify({
summary: 'Retired pattern',
tags: ['historic-sql', 'pattern'],
refs: [],
sl_refs: [],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.tickets'],
fingerprints: ['pg:9'],
}).trimEnd(),
'---',
'',
'Retired body',
'',
].join('\n'),
);
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', {
kind: 'pattern',
connectionId: 'warehouse',
rawPath: 'patterns-input.json',
pattern: {
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Analysts compare order status with customer segment.',
definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['pg:1', 'pg:2'],
},
});
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.patternPagesWritten).toBe(1);
expect(result.changedWikiPageKeys).toContain('historic-sql-old-order-lifecycle');
expect(result.actions).toEqual(
expect.arrayContaining([
expect.objectContaining({
target: 'wiki',
key: 'historic-sql-old-order-lifecycle',
rawPaths: ['patterns-input.json'],
}),
]),
);
await expect(readFile(join(workdir, 'wiki/global/historic-sql-old-order-lifecycle.md'), 'utf-8')).resolves.toContain(
'Order Lifecycle Analysis',
);
await expect(readFile(join(workdir, 'wiki/global/historic-sql-retired-pattern.md'), 'utf-8')).resolves.toContain(
'stale_since: "2026-05-11T00:00:00.000Z"',
);
});
// Scenario: an archived page already sits at the flat slug the incoming
// pattern maps to; the pattern must overwrite it, dropping the archived body
// and the `archived` tag.
it('rewrites a reappearing archived pattern at the flat slug', async () => {
const workdir = await tempWorkdir();
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 2,
touchedTableCount: 2,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 30,
});
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
await writeText(
workdir,
'wiki/global/historic-sql-order-lifecycle-analysis.md',
[
'---',
YAML.stringify({
summary: 'Archived order lifecycle page',
tags: ['historic-sql', 'pattern', 'archived'],
refs: [],
sl_refs: ['orders'],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.orders', 'public.customers'],
fingerprints: ['pg:1'],
stale_since: '2026-01-01T00:00:00.000Z',
}).trimEnd(),
'---',
'',
'Archived body',
'',
].join('\n'),
);
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', {
kind: 'pattern',
connectionId: 'warehouse',
rawPath: 'patterns-input.json',
pattern: {
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Analysts compare order status with customer segment again.',
definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['pg:1', 'pg:2'],
},
});
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.patternPagesWritten).toBe(1);
const page = await readFile(join(workdir, 'wiki/global/historic-sql-order-lifecycle-analysis.md'), 'utf-8');
expect(page).toContain('Analysts compare order status with customer segment again.');
expect(page).not.toContain('Archived body');
expect(page).not.toContain('archived');
});
// Scenario: a page that is already tagged `archived` and has no matching
// evidence must be left untouched — neither re-archived nor re-marked stale.
it('leaves already archived pattern pages stable when they are still absent', async () => {
const workdir = await tempWorkdir();
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 0,
touchedTableCount: 0,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 30,
});
await writeText(
workdir,
'wiki/global/historic-sql-retired-pattern.md',
[
'---',
YAML.stringify({
summary: 'Retired pattern',
tags: ['historic-sql', 'pattern', 'archived'],
refs: [],
sl_refs: [],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.tickets'],
fingerprints: ['pg:9'],
stale_since: '2026-01-01T00:00:00.000Z',
}).trimEnd(),
'---',
'',
'Archived retired body',
'',
].join('\n'),
);
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.archivedPatternPages).toBe(0);
expect(result.stalePatternPagesMarked).toBe(0);
await expect(readFile(join(workdir, 'wiki/global/historic-sql-retired-pattern.md'), 'utf-8')).resolves.toContain(
'Archived retired body',
);
});
// Scenario: the shard has usage for `orders`, but the current run only emitted
// evidence for `customers` and staged no tables. `orders` must be rewritten to
// the stale placeholder (keeping `ownerNote`), and a wiki page tagged
// `query-pattern` rather than `pattern` must be left alone.
it('marks missing table usage stale without deleting old query pages', async () => {
const workdir = await tempWorkdir();
await writeText(
workdir,
'semantic-layer/warehouse/_schema/public.yaml',
YAML.stringify({
tables: {
orders: {
table: 'public.orders',
usage: {
narrative: 'Orders were active before.',
frequencyTier: 'high',
commonFilters: ['status'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
ownerNote: 'keep analyst annotation',
},
columns: [{ name: 'id', type: 'string' }],
},
},
}),
);
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 0,
touchedTableCount: 0,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 90,
});
// Evidence exists for the run, but for a different table than the shard's.
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/customers.json', {
kind: 'table_usage',
connectionId: 'warehouse',
table: 'public.customers',
rawPath: 'tables/public.customers.json',
usage: {
narrative: 'Customers were queried.',
frequencyTier: 'low',
commonFilters: [],
commonJoins: [],
staleSince: null,
},
});
await writeText(
workdir,
'wiki/global/historic-sql-old-template.md',
[
'---',
YAML.stringify({
summary: 'Old template page',
tags: ['historic-sql', 'query-pattern'],
refs: [],
sl_refs: ['orders'],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.orders'],
fingerprints: ['old:1'],
}).trimEnd(),
'---',
'',
'Old body',
'',
].join('\n'),
);
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.staleTablesMarked).toBe(1);
expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]);
const staleAction = result.actions.find((action) => action.target === 'sl' && action.key === 'orders');
expect(staleAction).toEqual(expect.objectContaining({ target: 'sl', key: 'orders' }));
// Stale markers are not tied to a raw file, so no rawPaths are attached.
expect(staleAction?.rawPaths).toBeUndefined();
const shard = YAML.parse(await readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8'));
expect(shard.tables.orders.usage).toEqual({
ownerNote: 'keep analyst annotation',
narrative: 'No recent historic SQL usage was observed in the latest snapshot.',
frequencyTier: 'unused',
commonFilters: [],
commonGroupBys: [],
commonJoins: [],
staleSince: '2026-05-11T00:00:00.000Z',
});
await expect(readFile(join(workdir, 'wiki/global/historic-sql-old-template.md'), 'utf-8')).resolves.toContain(
'Old body',
);
});
// Scenario: an override replay runs with no evidence files for its run id.
// The projection must be a no-op: every counter stays at zero and nothing is
// reported as touched or changed.
it('does not mark stale or archive pages when override replay has no current-run evidence', async () => {
const workdir = await tempWorkdir();
await writeText(
workdir,
'semantic-layer/warehouse/_schema/public.yaml',
YAML.stringify({
tables: {
orders: {
table: 'public.orders',
usage: {
narrative: 'Orders were active before.',
frequencyTier: 'high',
commonFilters: ['status'],
commonGroupBys: ['status'],
commonJoins: [],
},
columns: [{ name: 'id', type: 'string' }],
},
},
}),
);
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/override-sync/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 0,
touchedTableCount: 0,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 90,
});
const result = await projectHistoricSqlEvidence({
workdir,
connectionId: 'warehouse',
syncId: 'override-sync',
runId: 'override-run',
overrideReplay: {
priorJobId: 'prior-job',
priorRunId: 'prior-run',
priorSyncId: 'prior-sync',
evictionRawPaths: ['tables/public/orders.json'],
},
});
expect(result.tableUsageMerged).toBe(0);
expect(result.staleTablesMarked).toBe(0);
expect(result.patternPagesWritten).toBe(0);
expect(result.stalePatternPagesMarked).toBe(0);
expect(result.archivedPatternPages).toBe(0);
expect(result.touchedSources).toEqual([]);
expect(result.changedWikiPageKeys).toEqual([]);
expect(result.actions).toEqual([]);
});
});

View file

@ -0,0 +1,385 @@
import { access, mkdir, readdir, readFile, rename, writeFile } from 'node:fs/promises';
import { dirname, join, relative } from 'node:path';
import YAML from 'yaml';
import type { MemoryAction } from '../../../memory/index.js';
import { rawSourcesDirForSync } from '../../raw-sources-paths.js';
import type { FinalizationOverrideReplay } from '../../types.js';
import { mergeUsagePreservingExternal } from '../live-database/manifest.js';
import { historicSqlEvidenceEnvelopeSchema, type HistoricSqlEvidenceEnvelope } from './evidence.js';
import type { TableUsageOutput } from './skill-schemas.js';
import { stagedManifestSchema } from './types.js';
/** Arguments for `projectHistoricSqlEvidence`. */
export interface HistoricSqlProjectionInput {
/** Root directory under which the semantic-layer, wiki, raw-sources and evidence paths are resolved. */
workdir: string;
/** Connection whose schema shards and raw-source directory are read. */
connectionId: string;
/** Sync whose staged raw-source directory (manifest and table files) is read. */
syncId: string;
/** Run whose evidence directory under `.ktx/ingest-evidence/historic-sql` is loaded. */
runId: string;
/** When set and the run produced no evidence, the projection returns early with an override-replay warning. */
overrideReplay?: FinalizationOverrideReplay;
}
/** Summary returned by `projectHistoricSqlEvidence`. */
export interface HistoricSqlProjectionResult {
/** Number of shard table entries whose usage changed after merging evidence. */
tableUsageMerged: number;
/** Number of shard table entries whose usage was replaced by the stale placeholder. */
staleTablesMarked: number;
/** Number of pattern pages written from pattern evidence. */
patternPagesWritten: number;
/** Number of existing pattern pages tagged `stale`. */
stalePatternPagesMarked: number;
/** Number of existing pattern pages tagged `archived`. */
archivedPatternPages: number;
/** Semantic-layer sources whose usage changed, listed once per connection/source pair. */
touchedSources: Array<{ connectionId: string; sourceName: string }>;
/** Keys of wiki pages written, marked stale or archived; de-duplicated and sorted before return. */
changedWikiPageKeys: string[];
/** One action record per semantic-layer or wiki change. */
actions: MemoryAction[];
/** Human-readable notes, e.g. why the projection was skipped. */
warnings: string[];
}
/**
 * Shape of a parsed `_schema` YAML shard as far as this module reads it:
 * an optional map from table name to an entry with an optional qualified
 * `table` reference and an optional `usage` record. Other keys are carried
 * through untouched.
 */
interface ManifestShard {
tables?: Record<string, { table?: string; usage?: Record<string, unknown>; columns?: unknown[]; [key: string]: unknown }>;
}
/** A wiki markdown page split into its frontmatter and body. */
interface HistoricSqlPatternPage {
/** File name without the `.md` extension. */
key: string;
/** Path of the file on disk. */
path: string;
/** Parsed YAML frontmatter. */
frontmatter: Record<string, unknown>;
/** Everything after the closing frontmatter fence. */
content: string;
}
/**
 * Turns arbitrary text into a slug: lower-cased, every run of characters
 * outside `a-z`, `0-9`, `_` and `-` collapsed to a single dash, and leading or
 * trailing dashes removed.
 */
function safeKnowledgeSlug(value: string): string {
  const lowered = value.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9_-]+/g, '-');
  return dashed.replace(/^-+|-+$/g, '');
}
/** Resolves to `true` when `access` succeeds for `path` and `false` when it rejects. */
async function pathExists(path: string): Promise<boolean> {
  return access(path).then(
    () => true,
    () => false,
  );
}
/**
 * Lists every regular file beneath `root`, recursively.
 *
 * @returns Paths relative to `root`, with forward slashes, in sorted order;
 *   an empty array when `root` does not exist.
 */
async function walkFiles(root: string): Promise<string[]> {
  const rootPresent = await pathExists(root);
  if (!rootPresent) return [];

  const found: string[] = [];
  const pending: string[] = [root];
  // Explicit work list instead of recursion; the final sort makes visit order irrelevant.
  for (let dir = pending.pop(); dir !== undefined; dir = pending.pop()) {
    const entries = await readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
      const absolute = join(dir, entry.name);
      if (entry.isDirectory()) {
        pending.push(absolute);
      } else if (entry.isFile()) {
        found.push(relative(root, absolute).replace(/\\/g, '/'));
      }
    }
  }
  return found.sort();
}
/** Reads a UTF-8 file and parses it as JSON; the result is left untyped for the caller to validate. */
async function readJson(path: string): Promise<unknown> {
  const text = await readFile(path, 'utf-8');
  const parsed: unknown = JSON.parse(text);
  return parsed;
}
/**
 * Serializes `value` as YAML and replaces `path` with it by writing to
 * `<path>.tmp` first and renaming that file over the target.
 * Parent directories are created as needed.
 */
async function writeYamlAtomic(path: string, value: unknown): Promise<void> {
  const parentDir = dirname(path);
  const stagingPath = `${path}.tmp`;
  await mkdir(parentDir, { recursive: true });
  const serialized = YAML.stringify(value, { indent: 2, lineWidth: 0, version: '1.1' });
  await writeFile(stagingPath, serialized, 'utf-8');
  await rename(stagingPath, path);
}
/**
 * Returns the last non-empty dot-separated segment of a table reference, or
 * the reference itself when it has no non-empty segment.
 */
function tableSourceName(tableRef: string): string {
  const segments = tableRef.split('.').filter((segment) => segment.length > 0);
  const last = segments[segments.length - 1];
  return last ?? tableRef;
}
/**
 * Builds the placeholder usage record written for a table that no longer
 * appears in the latest snapshot.
 *
 * @param fetchedAt - Timestamp stored as `staleSince`.
 */
function staleUsage(fetchedAt: string) {
  const frequencyTier = 'unused' as const;
  const placeholder = {
    narrative: 'No recent historic SQL usage was observed in the latest snapshot.',
    frequencyTier,
    commonFilters: [],
    commonGroupBys: [],
    commonJoins: [],
    staleSince: fetchedAt,
  };
  return placeholder;
}
/**
 * Loads and validates every `.json` evidence file for a run.
 *
 * Files are read from `.ktx/ingest-evidence/historic-sql/<runId>` under
 * `workdir`, recursively and in sorted path order; each is parsed with
 * `historicSqlEvidenceEnvelopeSchema`, so an invalid file rejects the call.
 */
async function loadEvidence(workdir: string, runId: string): Promise<HistoricSqlEvidenceEnvelope[]> {
  const evidenceRoot = join(workdir, '.ktx/ingest-evidence/historic-sql', runId);
  const jsonFiles = (await walkFiles(evidenceRoot)).filter((name) => name.endsWith('.json'));
  const envelopes: HistoricSqlEvidenceEnvelope[] = [];
  for (const name of jsonFiles) {
    const payload = await readJson(join(evidenceRoot, name));
    envelopes.push(historicSqlEvidenceEnvelopeSchema.parse(payload));
  }
  return envelopes;
}
/**
 * Renders the markdown body of a pattern page: title heading, narrative,
 * the representative SQL in a fenced block, then bullet lists of the tables
 * involved and the constituent template ids.
 */
function renderPatternMarkdown(pattern: HistoricSqlEvidenceEnvelope & { kind: 'pattern' }): string {
  const { title, narrative, definitionSql, tablesInvolved, constituentTemplateIds } = pattern.pattern;
  const bullet = (item: string): string => `- ${item}`;
  const lines: string[] = [
    `# ${title}`,
    '',
    narrative,
    '',
    '## Representative SQL',
    '',
    '```sql',
    definitionSql,
    '```',
    '',
    '## Tables',
    '',
  ];
  for (const table of tablesInvolved) {
    lines.push(bullet(table));
  }
  lines.push('', '## Constituent Templates', '');
  for (const id of constituentTemplateIds) {
    lines.push(bullet(id));
  }
  lines.push('');
  return lines.join('\n');
}
/**
 * Fraction of `left` entries that also occur in `right`.
 *
 * The ratio is relative to `left` only, and repeated `left` entries are each
 * counted. An empty `left` yields 0.
 */
function overlapRatio(left: string[], right: string[]): number {
  if (left.length === 0) {
    return 0;
  }
  const lookup = new Set(right);
  let shared = 0;
  for (const candidate of left) {
    if (lookup.has(candidate)) {
      shared += 1;
    }
  }
  return shared / left.length;
}
/**
 * Splits a markdown file into YAML frontmatter and body.
 *
 * Every markdown file in the wiki directory passes through here, so the parser
 * is deliberately tolerant: a page that cannot be read as "frontmatter + body"
 * is reported as `null` instead of aborting the caller.
 *
 * @param key - Page key to attach to the result.
 * @param path - File path to attach to the result.
 * @param raw - Full file contents; LF and CRLF line endings are both accepted.
 * @returns The parsed page, or `null` when the file has no frontmatter fence
 *   or its frontmatter is not valid YAML. Frontmatter that parses to something
 *   other than a mapping (scalar, list, empty) is treated as an empty mapping.
 */
function parseMarkdownPage(key: string, path: string, raw: string): HistoricSqlPatternPage | null {
  const match = raw.match(/^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n)?([\s\S]*)$/);
  if (!match) return null;

  let parsed: unknown;
  try {
    parsed = YAML.parse(match[1] ?? '');
  } catch {
    // Malformed frontmatter: treat the file like any other non-page.
    return null;
  }

  const isMapping = typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed);
  const frontmatter: Record<string, unknown> = isMapping ? (parsed as Record<string, unknown>) : {};
  return {
    key,
    path,
    frontmatter,
    content: match[2] ?? '',
  };
}
/**
 * A page counts as a historic-SQL pattern page when its frontmatter has
 * `source: historic-sql` and its `tags` list contains both `historic-sql`
 * and `pattern`.
 */
function isHistoricPatternPage(page: HistoricSqlPatternPage): boolean {
  if (page.frontmatter.source !== 'historic-sql') {
    return false;
  }
  const rawTags = page.frontmatter.tags;
  if (!Array.isArray(rawTags)) {
    return false;
  }
  return rawTags.includes('historic-sql') && rawTags.includes('pattern');
}
/** True when the page's frontmatter `tags` is a list containing `archived`. */
function isArchivedPatternPage(page: HistoricSqlPatternPage): boolean {
  const { tags } = page.frontmatter;
  return Array.isArray(tags) && tags.includes('archived');
}
/** Returns the string elements of `value` when it is an array, and an empty array otherwise. */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const strings: string[] = [];
  for (const entry of value) {
    if (typeof entry === 'string') {
      strings.push(entry);
    }
  }
  return strings;
}
/**
 * Renders a markdown page as a YAML frontmatter block followed by the trimmed
 * body and a trailing newline.
 *
 * When `stale_since` is a string, the first `stale_since: <value>` occurrence
 * in the serialized YAML is rewritten so the value is double-quoted.
 */
function renderMarkdownPage(frontmatter: Record<string, unknown>, content: string): string {
  const { stale_since: staleSince } = frontmatter;
  const serialized = YAML.stringify(frontmatter, { indent: 2, lineWidth: 0 }).trimEnd();
  const header =
    typeof staleSince === 'string'
      ? serialized.replace(`stale_since: ${staleSince}`, `stale_since: "${staleSince}"`)
      : serialized;
  return ['---', header, '---', '', content.trim(), ''].join('\n');
}
/** Collects a page's matching signals: its `tables` strings followed by its `fingerprints` strings. */
function existingPageSignals(page: HistoricSqlPatternPage): string[] {
  const tables = stringArray(page.frontmatter.tables);
  const fingerprints = stringArray(page.frontmatter.fingerprints);
  return tables.concat(fingerprints);
}
/**
 * Decides whether a stale page is old enough to archive.
 *
 * @param staleSince - Frontmatter value; anything but a parseable date string yields `false`.
 * @param fetchedAt - Timestamp of the current snapshot.
 * @param days - Threshold; the page must have been stale for strictly longer than this.
 */
function shouldArchive(staleSince: unknown, fetchedAt: string, days: number): boolean {
  if (typeof staleSince !== 'string') {
    return false;
  }
  const staleMs = Date.parse(staleSince);
  const fetchedMs = Date.parse(fetchedAt);
  const bothValid = Number.isFinite(staleMs) && Number.isFinite(fetchedMs);
  if (!bothValid) {
    return false;
  }
  const thresholdMs = days * 24 * 60 * 60 * 1000;
  return fetchedMs - staleMs > thresholdMs;
}
/**
 * Loads the markdown pages that sit directly in `root`.
 *
 * Files in sub-directories are ignored, and so are files that
 * `parseMarkdownPage` cannot split into frontmatter and body.
 * The page key is the file name without `.md`.
 */
async function loadPatternPages(root: string): Promise<HistoricSqlPatternPage[]> {
  const topLevelMarkdown = (await walkFiles(root)).filter((file) => file.endsWith('.md') && !file.includes('/'));
  const pages: HistoricSqlPatternPage[] = [];
  for (const file of topLevelMarkdown) {
    const absolute = join(root, file);
    const parsed = parseMarkdownPage(file.replace(/\.md$/, ''), absolute, await readFile(absolute, 'utf-8'));
    if (parsed) {
      pages.push(parsed);
    }
  }
  return pages;
}
/** Builds the wiki page key for a pattern slug: `historic-sql-` plus the sanitized slug. */
function historicSqlFlatKey(slug: string): string {
  const slugPart = safeKnowledgeSlug(slug);
  return 'historic-sql-' + slugPart;
}
/**
 * Collects the table references staged for the current sync.
 *
 * Every `.json` file under `<rawDir>/tables` is read; a file contributes its
 * `table` value when it is a JSON object with a string `table` property.
 */
async function currentStagedTables(rawDir: string): Promise<Set<string>> {
  const tablesRoot = join(rawDir, 'tables');
  const staged = new Set<string>();
  const jsonFiles = (await walkFiles(tablesRoot)).filter((file) => file.endsWith('.json'));
  for (const file of jsonFiles) {
    const payload = await readJson(join(tablesRoot, file));
    if (typeof payload !== 'object' || payload === null) continue;
    if (!('table' in payload)) continue;
    const { table } = payload;
    if (typeof table === 'string') {
      staged.add(table);
    }
  }
  return staged;
}
/**
 * Projects one run's historic-SQL evidence onto the semantic layer and wiki.
 *
 * Steps, in order:
 * 1. Read the staged manifest, the staged table list and the run's evidence.
 *    With no evidence the function returns early with a warning and changes
 *    nothing (a distinct warning is used during an override replay).
 * 2. For every table in every `_schema` YAML shard of the connection: merge
 *    matching `table_usage` evidence into its usage, or — when the table has
 *    usage but is neither in the evidence nor staged — replace the generated
 *    usage with the stale placeholder. Shards are rewritten only when their
 *    serialized form changed.
 * 3. Write one wiki page per `pattern` evidence entry. An existing,
 *    non-archived pattern page whose tables/fingerprints cover at least 60% of
 *    the incoming signals is overwritten in place; otherwise the page key is
 *    derived from the pattern slug. A page is reused by at most one incoming
 *    pattern per run so two patterns cannot overwrite each other.
 * 4. For remaining non-archived pattern pages: archive those whose
 *    `stale_since` is older than `staleArchiveAfterDays`, and mark the others
 *    stale. A page that is already marked stale keeps its original
 *    `stale_since` and is left untouched, so the archive threshold measures
 *    time since the page first went stale rather than time since the last run.
 *
 * @param input - Working directory and the connection/sync/run identifiers.
 * @returns Counters, touched sources, changed wiki keys, actions and warnings.
 * @throws When the manifest or an evidence file is missing or fails schema validation.
 */
export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInput): Promise<HistoricSqlProjectionResult> {
  const result: HistoricSqlProjectionResult = {
    tableUsageMerged: 0,
    staleTablesMarked: 0,
    patternPagesWritten: 0,
    stalePatternPagesMarked: 0,
    archivedPatternPages: 0,
    touchedSources: [],
    changedWikiPageKeys: [],
    actions: [],
    warnings: [],
  };
  const touchedKeys = new Set<string>();
  // Records a semantic-layer source once, however many shard entries map to it.
  const touchSource = (sourceName: string): void => {
    const key = `${input.connectionId}:${sourceName}`;
    if (touchedKeys.has(key)) return;
    touchedKeys.add(key);
    result.touchedSources.push({ connectionId: input.connectionId, sourceName });
  };

  const rawDir = join(input.workdir, rawSourcesDirForSync(input.connectionId, 'historic-sql', input.syncId));
  const manifest = stagedManifestSchema.parse(await readJson(join(rawDir, 'manifest.json')));
  const currentTables = await currentStagedTables(rawDir);
  const evidence = await loadEvidence(input.workdir, input.runId);
  if (input.overrideReplay && evidence.length === 0) {
    result.warnings.push(
      'historic-sql finalization skipped stale/archive cleanup during override replay without current-run evidence',
    );
    return result;
  }
  if (evidence.length === 0) {
    result.warnings.push('historic-sql finalization skipped because no current-run evidence was emitted');
    return result;
  }
  const tableEvidence = evidence.filter(
    (entry): entry is HistoricSqlEvidenceEnvelope & { kind: 'table_usage' } => entry.kind === 'table_usage',
  );
  const patternEvidence = evidence.filter(
    (entry): entry is HistoricSqlEvidenceEnvelope & { kind: 'pattern' } => entry.kind === 'pattern',
  );

  // --- Semantic-layer shards -------------------------------------------------
  const schemaRoot = join(input.workdir, 'semantic-layer', input.connectionId, '_schema');
  const shardFiles = (await walkFiles(schemaRoot)).filter(
    (candidate) => candidate.endsWith('.yaml') || candidate.endsWith('.yml'),
  );
  for (const file of shardFiles) {
    const path = join(schemaRoot, file);
    const before = await readFile(path, 'utf-8');
    const shard = (YAML.parse(before) ?? {}) as ManifestShard;
    if (!shard.tables) continue;
    for (const [tableName, entry] of Object.entries(shard.tables)) {
      const tableRef = entry.table ?? tableName;
      // Evidence matches on the full reference or on the unqualified table name.
      const matchingEvidence = tableEvidence.find(
        (candidate) => candidate.table === tableRef || tableSourceName(candidate.table) === tableName,
      );
      if (matchingEvidence) {
        const merged = mergeUsagePreservingExternal(entry.usage as TableUsageOutput | undefined, matchingEvidence.usage);
        if (JSON.stringify(entry.usage ?? null) !== JSON.stringify(merged ?? null)) {
          entry.usage = merged as Record<string, unknown>;
          result.tableUsageMerged += 1;
          const sourceName = tableSourceName(matchingEvidence.table);
          touchSource(sourceName);
          result.actions.push({
            target: 'sl',
            type: 'updated',
            key: sourceName,
            targetConnectionId: input.connectionId,
            detail: `Merged historic-SQL usage for ${matchingEvidence.table}`,
            rawPaths: [matchingEvidence.rawPath],
          });
        }
      } else if (entry.usage && !currentTables.has(tableRef)) {
        const merged = mergeUsagePreservingExternal(entry.usage as TableUsageOutput | undefined, staleUsage(manifest.fetchedAt));
        if (JSON.stringify(entry.usage ?? null) !== JSON.stringify(merged ?? null)) {
          entry.usage = merged as Record<string, unknown>;
          result.staleTablesMarked += 1;
          const sourceName = tableSourceName(tableRef);
          touchSource(sourceName);
          result.actions.push({
            target: 'sl',
            type: 'updated',
            key: sourceName,
            targetConnectionId: input.connectionId,
            detail: `Marked historic-SQL usage stale for ${tableRef}`,
          });
        }
      }
    }
    const after = YAML.stringify(shard, { indent: 2, lineWidth: 0, version: '1.1' });
    if (after !== before) {
      await writeYamlAtomic(path, shard);
    }
  }

  // --- Wiki pattern pages ----------------------------------------------------
  const wikiRoot = join(input.workdir, 'wiki/global');
  await mkdir(wikiRoot, { recursive: true });
  const allPages = await loadPatternPages(wikiRoot);
  const activePages = allPages.filter((page) => !isArchivedPatternPage(page));
  const patternPages = activePages.filter(isHistoricPatternPage);
  const writtenKeys = new Set<string>();
  for (const pattern of patternEvidence) {
    const incomingSignals = [...pattern.pattern.tablesInvolved, ...pattern.pattern.constituentTemplateIds];
    // Skip pages already claimed in this run so a later pattern cannot clobber an earlier one.
    const reusable = patternPages.find(
      (page) => !writtenKeys.has(page.key) && overlapRatio(incomingSignals, existingPageSignals(page)) >= 0.6,
    );
    const key = reusable?.key ?? historicSqlFlatKey(pattern.pattern.slug);
    const pagePath = join(wikiRoot, `${key}.md`);
    const frontmatter = {
      summary: pattern.pattern.title,
      tags: ['historic-sql', 'pattern'],
      refs: [],
      sl_refs: pattern.pattern.slRefs,
      usage_mode: 'auto',
      source: 'historic-sql',
      tables: pattern.pattern.tablesInvolved,
      representative_sql: pattern.pattern.definitionSql,
      fingerprints: pattern.pattern.constituentTemplateIds,
    };
    await mkdir(dirname(pagePath), { recursive: true });
    await writeFile(pagePath, renderMarkdownPage(frontmatter, renderPatternMarkdown(pattern)), 'utf-8');
    writtenKeys.add(key);
    result.patternPagesWritten += 1;
    result.changedWikiPageKeys.push(key);
    result.actions.push({
      target: 'wiki',
      type: reusable ? 'updated' : 'created',
      key,
      detail: `Projected historic-SQL pattern ${pattern.pattern.title}`,
      rawPaths: [pattern.rawPath],
    });
  }

  // --- Stale / archive sweep over pages not written in this run ---------------
  for (const page of patternPages) {
    if (writtenKeys.has(page.key)) continue;
    const existingTags = stringArray(page.frontmatter.tags);
    if (shouldArchive(page.frontmatter.stale_since, manifest.fetchedAt, manifest.staleArchiveAfterDays)) {
      const tags = [...new Set([...existingTags, 'archived'])];
      await writeFile(
        page.path,
        renderMarkdownPage({ ...page.frontmatter, tags, archived_since: manifest.fetchedAt }, page.content),
        'utf-8',
      );
      result.archivedPatternPages += 1;
      result.changedWikiPageKeys.push(page.key);
      result.actions.push({
        target: 'wiki',
        type: 'updated',
        key: page.key,
        detail: `Archived stale historic-SQL pattern page ${page.key}`,
      });
      continue;
    }
    // Keep the moment the page first went stale; resetting it every run would
    // keep the archive threshold permanently out of reach.
    const existingStaleSince = page.frontmatter.stale_since;
    const hasValidStaleSince =
      typeof existingStaleSince === 'string' && Number.isFinite(Date.parse(existingStaleSince));
    if (hasValidStaleSince && existingTags.includes('stale')) {
      // Already marked stale and not yet due for archiving: nothing to change.
      continue;
    }
    const staleSince = hasValidStaleSince ? existingStaleSince : manifest.fetchedAt;
    const tags = [...new Set([...existingTags, 'stale'])];
    await writeFile(
      page.path,
      renderMarkdownPage({ ...page.frontmatter, tags, stale_since: staleSince }, page.content),
      'utf-8',
    );
    result.stalePatternPagesMarked += 1;
    result.changedWikiPageKeys.push(page.key);
    result.actions.push({
      target: 'wiki',
      type: 'updated',
      key: page.key,
      detail: `Marked historic-SQL pattern page ${page.key} stale`,
    });
  }
  result.changedWikiPageKeys = [...new Set(result.changedWikiPageKeys)].sort();
  return result;
}

View file

@ -0,0 +1,36 @@
import { describe, expect, it } from 'vitest';
import { compileHistoricSqlRedactionPatterns, redactHistoricSqlText } from './redaction.js';
// Covers pattern compilation (including the `(?i)` prefix and the two
// rejection paths) and replacement of matches with the redaction token.
describe('historic-SQL redaction', () => {
// Matching is case-sensitive by default; the `(?i)` prefix opts a pattern into case-insensitive matching.
it('redacts regex matches and supports the (?i) case-insensitive prefix', () => {
const redactors = compileHistoricSqlRedactionPatterns([
'sk_live_[A-Za-z0-9]+',
'(?i)secret_token_[a-z0-9]+',
]);
const sql =
"select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'"; // pragma: allowlist secret
expect(redactHistoricSqlText(sql, redactors)).toBe(
"select * from public.api_events where api_key = '[REDACTED]' and note = '[REDACTED]'",
);
});
it('returns the original SQL text when no redaction patterns are configured', () => {
const sql = "select * from public.orders where status = 'paid'";
expect(redactHistoricSqlText(sql, compileHistoricSqlRedactionPatterns([]))).toBe(sql);
});
// The error names the offending config entry verbatim.
it('throws a config-focused error for invalid redaction regex patterns', () => {
expect(() => compileHistoricSqlRedactionPatterns(['[broken'])).toThrow(
'Invalid historicSql.redactionPatterns entry "[broken"',
);
});
// Whitespace-only entries are rejected as empty; the message shows the untrimmed entry.
it('throws a config-focused error for empty redaction regex patterns', () => {
expect(() => compileHistoricSqlRedactionPatterns([' '])).toThrow(
'Invalid historicSql.redactionPatterns entry " "',
);
});
});

View file

@ -0,0 +1,37 @@
/** A redaction rule paired with the regular expression compiled from it. */
export interface HistoricSqlRedactionPattern {
/** The pattern exactly as supplied by the caller (untrimmed, including any `(?i)` prefix). */
pattern: string;
/** Compiled global expression used to find the text to redact. */
expression: RegExp;
}
/** Leading marker on a pattern that requests case-insensitive matching. */
const CASE_INSENSITIVE_PREFIX = '(?i)';
/** Text substituted for every redacted match. */
const REDACTION_TOKEN = '[REDACTED]';

/**
 * Compiles configured redaction patterns into global regular expressions.
 *
 * Each entry is trimmed; a leading `(?i)` is stripped and turned into the `i` flag.
 *
 * @param patterns Raw pattern strings from configuration.
 * @returns One compiled entry per input pattern, in input order, keeping the raw pattern text.
 * @throws Error naming the offending entry when it is empty after trimming/prefix removal
 *   or is not a valid regular expression.
 */
export function compileHistoricSqlRedactionPatterns(patterns: readonly string[]): HistoricSqlRedactionPattern[] {
  const compiled: HistoricSqlRedactionPattern[] = [];
  for (const pattern of patterns) {
    let source = pattern.trim();
    let flags = 'g';
    if (source.startsWith(CASE_INSENSITIVE_PREFIX)) {
      source = source.slice(CASE_INSENSITIVE_PREFIX.length);
      flags = 'gi';
    }
    if (source.length === 0) {
      throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": pattern must not be empty`);
    }
    let expression: RegExp;
    try {
      expression = new RegExp(source, flags);
    } catch (error) {
      // Re-throw with the config key so the user knows which setting to fix.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": ${reason}`);
    }
    compiled.push({ pattern, expression });
  }
  return compiled;
}
/**
 * Applies every redactor to the text in order, replacing each match with the redaction token.
 *
 * @param text SQL (or any text) to scrub.
 * @param redactors Compiled patterns; an empty list returns `text` unchanged.
 * @returns The text after all replacements.
 */
export function redactHistoricSqlText(text: string, redactors: readonly HistoricSqlRedactionPattern[]): string {
  return redactors.reduce((current, { expression }) => {
    // The expressions are shared and global, so clear any leftover match position first.
    expression.lastIndex = 0;
    return current.replace(expression, REDACTION_TOKEN);
  }, text);
}

View file

@ -0,0 +1,74 @@
import { describe, expect, it } from 'vitest';
import { z } from 'zod';
import {
patternOutputSchema,
patternsArraySchema,
tableUsageOutputSchema,
} from './skill-schemas.js';
// Covers the zod output schemas exported by ./skill-schemas.js.
describe('historic-sql skill schemas', () => {
// `analystNote` is not declared on the schema; it must survive parsing.
it('accepts table usage output and preserves future keys', () => {
const parsed = tableUsageOutputSchema.parse({
narrative: 'Orders are queried for paid/refunded lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
analystNote: 'preserve me',
});
expect(parsed).toMatchObject({
narrative: 'Orders are queried for paid/refunded lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
analystNote: 'preserve me',
});
});
// 'sometimes' is not one of the enumerated tiers, so parsing must fail.
it('rejects invalid frequency tiers', () => {
const result = tableUsageOutputSchema.safeParse({
narrative: 'Orders are queried often.',
frequencyTier: 'sometimes',
commonFilters: [],
commonJoins: [],
});
expect(result.success).toBe(false);
});
// A fully populated pattern must round-trip through the array schema unchanged.
it('accepts pattern outputs used for wiki projection', () => {
const parsed = patternsArraySchema.parse([
{
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Teams inspect order status by customer and month.',
definitionSql: 'select status, count(*) from public.orders group by status',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['template_1', 'template_2'],
},
]);
expect(parsed[0]).toEqual({
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Teams inspect order status by customer and month.',
definitionSql: 'select status, count(*) from public.orders group by status',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['template_1', 'template_2'],
});
});
// Both schemas must be convertible with z.toJSONSchema and describe an object.
it('exports zod schemas that can produce JSON schema for prompt prefixes', () => {
const tableUsageJsonSchema = z.toJSONSchema(tableUsageOutputSchema);
const patternJsonSchema = z.toJSONSchema(patternOutputSchema);
expect(tableUsageJsonSchema).toMatchObject({ type: 'object' });
expect(patternJsonSchema).toMatchObject({ type: 'object' });
});
});

View file

@ -0,0 +1,31 @@
import { z } from 'zod';
/**
 * Schema for the per-table usage summary.
 *
 * `narrative`, `frequencyTier`, `commonFilters` and `commonJoins` are required;
 * `commonGroupBys` and `staleSince` may be omitted. `.passthrough()` keeps keys
 * that are not declared here instead of stripping them.
 */
export const tableUsageOutputSchema = z
.object({
narrative: z.string(),
frequencyTier: z.enum(['high', 'mid', 'low', 'unused']),
commonFilters: z.array(z.string()),
commonGroupBys: z.array(z.string()).optional(),
// Each join names the other table and the columns joined on.
commonJoins: z.array(
z.object({
table: z.string(),
on: z.array(z.string()),
}),
),
// ISO datetime string, explicit null, or absent.
staleSince: z.iso.datetime().nullable().optional(),
})
.passthrough();
/** Parsed value type of {@link tableUsageOutputSchema}. */
export type TableUsageOutput = z.infer<typeof tableUsageOutputSchema>;
/** Schema for a single query pattern; every field is required. */
export const patternOutputSchema = z.object({
slug: z.string(),
title: z.string(),
narrative: z.string(),
definitionSql: z.string(),
tablesInvolved: z.array(z.string()),
slRefs: z.array(z.string()),
constituentTemplateIds: z.array(z.string()),
});
/** Parsed value type of {@link patternOutputSchema}. */
export type PatternOutput = z.infer<typeof patternOutputSchema>;
/** Schema for a list of patterns. */
export const patternsArraySchema = z.array(patternOutputSchema);

View file

@ -0,0 +1,148 @@
import { describe, expect, it, vi } from 'vitest';
import { HistoricSqlGrantsMissingError } from './errors.js';
import { SnowflakeHistoricSqlQueryHistoryReader } from './snowflake-query-history-reader.js';
/** Canned result handed back by the fake query client in these tests. */
interface FakeQueryResult {
headers: string[];
rows: unknown[][];
totalRows: number;
// When set, the tests expect the reader to treat the result as a failure.
error?: string;
}
/**
 * Builds a fake query client whose `executeQuery` mock hands out the queued
 * results one per call (consuming the given array) and throws once the queue is empty.
 */
function queryClient(results: FakeQueryResult[]) {
  return {
    executeQuery: vi.fn(async (_query: string) => {
      const queued = results.shift();
      if (!queued) {
        throw new Error('unexpected query');
      }
      return queued;
    }),
  };
}
/**
 * Returns the SQL text passed to the first `executeQuery` call on the fake client.
 *
 * @throws Error when the client was never called.
 */
function firstQuery(client: ReturnType<typeof queryClient>): string {
  const [firstCall] = client.executeQuery.mock.calls;
  if (!firstCall) {
    throw new Error('expected query client to be called');
  }
  const [sql] = firstCall;
  return sql;
}
// Exercises probe() and fetchAggregated() against fake query clients.
describe('SnowflakeHistoricSqlQueryHistoryReader', () => {
// A successful probe issues exactly the one-row QUERY_HISTORY select and reports nothing.
it('probes SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY', async () => {
const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] });
expect(client.executeQuery).toHaveBeenCalledWith(
'SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY LIMIT 1',
);
});
// An `error` field on the returned result (no throw) must surface as a grants error with remediation.
it('turns probe result errors into HistoricSqlGrantsMissingError', async () => {
const client = queryClient([{ headers: [], rows: [], totalRows: 0, error: 'Object does not exist or not authorized' }]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(reader.probe(client)).rejects.toMatchObject({
name: 'HistoricSqlGrantsMissingError',
dialect: 'snowflake',
remediation: 'GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE <connection role>;',
});
});
// A rejection from executeQuery must be converted to the same error class.
it('turns thrown probe failures into HistoricSqlGrantsMissingError', async () => {
const client = {
executeQuery: vi.fn(async () => {
throw new Error('permission denied');
}),
};
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
// One canned row is mapped to an aggregated template; the generated SQL is checked by substring.
it('fetches aggregated Snowflake query templates', async () => {
const client = queryClient([
{
headers: [
'template_id',
'canonical_sql',
'executions',
'distinct_users',
'first_seen',
'last_seen',
'p50_ms',
'p95_ms',
'error_rate',
'rows_produced',
'top_users',
],
rows: [
[
'hash-1',
'select status from orders',
42,
3,
'2026-05-01T00:00:00.000Z',
'2026-05-11T00:00:00.000Z',
12,
40,
0.05,
100,
JSON.stringify([{ user: 'ANALYST', executions: 1 }]),
],
],
totalRows: 1,
},
]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
const rows = [];
for await (const row of reader.fetchAggregated(
client,
{ start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') },
{ dialect: 'snowflake', minExecutions: 5, windowDays: 90, enabledTables: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 },
)) {
rows.push(row);
}
const sql = firstQuery(client);
expect(sql).toContain('SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY');
expect(sql).toContain('COUNT(*) AS executions');
expect(sql).toContain('GROUP BY query_hash');
// minExecutions: 5 from the config above is interpolated into the HAVING clause.
expect(sql).toContain('HAVING COUNT(*) >= 5');
expect(rows).toMatchObject([
{
templateId: 'hash-1',
stats: {
executions: 42,
errorRate: 0.05,
},
topUsers: [{ user: 'ANALYST', executions: 1 }],
},
]);
});
// A client object without executeQuery must be rejected with the reader's own message.
it('throws a clear error when the query client cannot execute SQL', async () => {
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(async () => {
for await (const _row of reader.fetchAggregated(
{},
{ start: new Date(), end: new Date() },
{
dialect: 'snowflake',
minExecutions: 5,
windowDays: 90,
enabledTables: [],
filters: { dropTrivialProbes: true },
redactionPatterns: [],
staleArchiveAfterDays: 90,
},
)) {
// The iterator is expected to fail before yielding anything.
throw new Error('unreachable');
}
}).rejects.toThrow('Historic SQL Snowflake reader requires a query client with executeQuery(query)');
});
});

View file

@ -0,0 +1,220 @@
import { HistoricSqlGrantsMissingError } from './errors.js';
import {
aggregatedTemplateSchema,
type AggregatedTemplate,
type HistoricSqlTimeWindow,
type HistoricSqlUnifiedPullConfig,
} from './types.js';
/** Minimal shape of a query result that this reader consumes. */
interface QueryResultLike {
/** Column names; looked up case-insensitively when rows are mapped. */
headers: string[];
/** Positional cells, indexed by the position of the matching header. */
rows: unknown[][];
totalRows: number;
/** When truthy, the reader treats the query as failed. */
error?: string;
}
/** The only capability the reader needs from a warehouse client: run one SQL string. */
interface QueryClientLike {
executeQuery(query: string): Promise<QueryResultLike>;
}
/** One-row select used by probe() to check that the audit view is queryable. */
const PROBE_SQL = 'SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY LIMIT 1';
/** Remediation statement attached to every grants error raised by this reader. */
const SNOWFLAKE_GRANTS_REMEDIATION =
'GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE <connection role>;';
/**
 * Narrows an untyped client to one that can execute SQL.
 *
 * @param client Any value; accepted only if it is a non-null object with a callable `executeQuery`.
 * @returns The same object, typed as a query client.
 * @throws Error when the value does not expose `executeQuery` as a function.
 */
function queryClient(client: unknown): QueryClientLike {
  const usable =
    typeof client === 'object' &&
    client !== null &&
    'executeQuery' in client &&
    typeof (client as { executeQuery?: unknown }).executeQuery === 'function';
  if (!usable) {
    throw new Error('Historic SQL Snowflake reader requires a query client with executeQuery(query)');
  }
  return client as QueryClientLike;
}
/**
 * Wraps a probe/query failure in a HistoricSqlGrantsMissingError for the Snowflake dialect.
 *
 * The detail text is the Error's message, the string itself, or a generic
 * fallback for any other kind of cause. The original cause is passed through.
 */
function grantsError(cause: unknown): HistoricSqlGrantsMissingError {
  let detail = 'Snowflake role cannot query SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY.';
  if (cause instanceof Error) {
    detail = cause.message;
  } else if (typeof cause === 'string') {
    detail = cause;
  }
  return new HistoricSqlGrantsMissingError({
    dialect: 'snowflake',
    message: `Missing Snowflake audit grants for historic-SQL ingest: ${detail}`,
    remediation: SNOWFLAKE_GRANTS_REMEDIATION,
    cause,
  });
}
/**
 * Renders a date as a quoted Snowflake `TIMESTAMP_TZ` literal in ISO-8601 UTC form.
 *
 * @param value A Date, or a string accepted by the Date constructor.
 * @returns e.g. `'2026-02-10T00:00:00.000Z'::TIMESTAMP_TZ`.
 * @throws Error when the value does not represent a valid date.
 */
function timestampLiteral(value: Date | string): string {
  const parsed = value instanceof Date ? value : new Date(value);
  const epochMs = parsed.getTime();
  if (Number.isNaN(epochMs)) {
    throw new Error(`Invalid Snowflake query-history timestamp: ${String(value)}`);
  }
  // Double any single quote so the text stays a well-formed SQL string literal.
  const escapedIso = parsed.toISOString().replace(/'/g, "''");
  return "'" + escapedIso + "'::TIMESTAMP_TZ";
}
/**
 * Maps each upper-cased header name to its column position.
 * When two headers collide after upper-casing, the later position wins.
 */
function indexByHeader(headers: string[]): Map<string, number> {
  return new Map(headers.map((header, position): [string, number] => [header.toUpperCase(), position]));
}
/**
 * Reads the cell for a named column from a positional row.
 *
 * @param row Positional cells of one result row.
 * @param indexes Upper-cased header name to position, as built from the result headers.
 * @param name Column name; matched case-insensitively.
 * @returns The cell, or `null` when the column is unknown or the row has no cell at
 *   that position (a short row), so callers see one "absent" value instead of a
 *   mix of `null` and `undefined`.
 */
function value(row: unknown[], indexes: Map<string, number>, name: string): unknown {
  const index = indexes.get(name.toUpperCase());
  if (index === undefined) {
    return null;
  }
  return row[index] ?? null;
}
/**
 * Converts a cell to a string, treating null, undefined and the empty string as absent.
 *
 * @returns The stringified value, or `null` when absent.
 */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text.length === 0 ? null : text;
}
/**
 * Like nullableString, but an absent value is an error.
 *
 * @param field Column name used in the error message.
 * @throws Error when the cell is null, undefined or empty.
 */
function requiredString(raw: unknown, field: string): string {
  const text = nullableString(raw);
  if (text === null) {
    throw new Error(`Snowflake QUERY_HISTORY row is missing ${field}`);
  }
  return text;
}
/**
 * Converts a cell to a finite number.
 *
 * @param raw Cell value; numbers are used as-is, everything else goes through `Number()`.
 * @returns The number, or `null` when the cell is null/undefined, a blank string, or
 *   does not convert to a finite number.
 */
function nullableNumber(raw: unknown): number | null {
  if (raw === null || raw === undefined) {
    return null;
  }
  // Number('   ') is 0, so whitespace-only text must be rejected before conversion;
  // otherwise a blank cell would be reported as a real zero.
  if (typeof raw === 'string' && raw.trim() === '') {
    return null;
  }
  const number = typeof raw === 'number' ? raw : Number(raw);
  return Number.isFinite(number) ? number : null;
}
/**
 * Like nullableNumber, but a missing or non-numeric cell is an error.
 *
 * @param field Column name used in the error message.
 * @throws Error when the cell does not convert to a finite number.
 */
function requiredNumber(raw: unknown, field: string): number {
  const parsed = nullableNumber(raw);
  if (parsed !== null) {
    return parsed;
  }
  throw new Error(`Snowflake QUERY_HISTORY row has invalid ${field}: ${String(raw)}`);
}
/** Required numeric cell with any fractional part truncated toward zero. */
function requiredInteger(raw: unknown, field: string): number {
  const parsed = requiredNumber(raw, field);
  return Math.trunc(parsed);
}
/** Optional numeric cell with any fractional part truncated toward zero; `null` when absent or invalid. */
function nullableInteger(raw: unknown): number | null {
  const parsed = nullableNumber(raw);
  if (parsed === null) {
    return null;
  }
  return Math.trunc(parsed);
}
/**
 * Normalizes a timestamp cell to an ISO-8601 UTC string.
 *
 * @param raw A Date, or a value whose string form the Date constructor can parse.
 * @param field Column name used in error messages.
 * @returns The timestamp as produced by `Date.prototype.toISOString`.
 * @throws Error when the cell is missing, or when it is a string or a Date instance
 *   that does not represent a valid time.
 */
function isoTimestamp(raw: unknown, field: string): string {
  const date = raw instanceof Date ? raw : new Date(requiredString(raw, field));
  // Validate Date instances too: toISOString() on an invalid Date throws a bare
  // RangeError that says nothing about which column was bad.
  if (Number.isNaN(date.getTime())) {
    throw new Error(`Snowflake QUERY_HISTORY row has invalid ${field}: ${String(raw)}`);
  }
  return date.toISOString();
}
/**
 * Parses the JSON text of the `top_users` column into user/execution pairs.
 *
 * Best-effort: an absent cell, unparseable JSON or a non-array payload yields an
 * empty list. Entries that are not objects, or whose `executions` is not numeric,
 * are skipped.
 */
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
  const text = nullableString(raw);
  if (!text) {
    return [];
  }
  try {
    const parsed = JSON.parse(text) as unknown;
    if (!Array.isArray(parsed)) {
      return [];
    }
    const users: Array<{ user: string | null; executions: number }> = [];
    for (const entry of parsed) {
      if (!entry || typeof entry !== 'object') {
        continue;
      }
      const candidate = entry as { user?: unknown; executions?: unknown };
      const user = nullableString(candidate.user);
      const executions = nullableInteger(candidate.executions);
      if (executions !== null) {
        users.push({ user, executions });
      }
    }
    return users;
  } catch {
    return [];
  }
}
/**
 * Converts one positional result row into an AggregatedTemplate and validates it
 * with aggregatedTemplateSchema.
 *
 * @param row Positional cells of one result row.
 * @param indexes Upper-cased header name to position.
 * @throws Error when a required column is missing or malformed, or when schema parsing fails.
 */
function mapAggregatedRow(row: unknown[], indexes: Map<string, number>): AggregatedTemplate {
  const cell = (name: string): unknown => value(row, indexes, name);
  const stats = {
    executions: requiredInteger(cell('executions'), 'executions'),
    distinctUsers: requiredInteger(cell('distinct_users'), 'distinct_users'),
    firstSeen: isoTimestamp(cell('first_seen'), 'first_seen'),
    lastSeen: isoTimestamp(cell('last_seen'), 'last_seen'),
    p50RuntimeMs: nullableNumber(cell('p50_ms')),
    p95RuntimeMs: nullableNumber(cell('p95_ms')),
    errorRate: requiredNumber(cell('error_rate'), 'error_rate'),
    rowsProduced: nullableInteger(cell('rows_produced')),
  };
  return aggregatedTemplateSchema.parse({
    templateId: requiredString(cell('template_id'), 'template_id'),
    canonicalSql: requiredString(cell('canonical_sql'), 'canonical_sql'),
    dialect: 'snowflake',
    stats,
    topUsers: parseTopUsers(cell('top_users')),
  });
}
/** Reads aggregated query templates from SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY. */
export class SnowflakeHistoricSqlQueryHistoryReader {
  /**
   * Checks that the connection can read the audit view by running a one-row select.
   *
   * @param client Any object exposing `executeQuery(query)`.
   * @returns Empty warning/info lists on success.
   * @throws HistoricSqlGrantsMissingError when the probe throws or returns an `error`.
   *
   * NOTE(review): an unusable client is also reported as a grants error here because
   * the client check runs inside the try block, whereas fetchAggregated lets that
   * error through unwrapped — confirm whether the difference is intended.
   */
  async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> {
    let outcome: QueryResultLike;
    try {
      outcome = await queryClient(client).executeQuery(PROBE_SQL);
    } catch (cause) {
      throw grantsError(cause);
    }
    const { error: failure } = outcome;
    if (failure) {
      throw grantsError(failure);
    }
    return { warnings: [], info: [] };
  }

  /**
   * Runs a single aggregate query over QUERY_HISTORY and yields one template per result row.
   *
   * Rows are grouped by `query_hash`, limited to SELECT and MERGE statements inside
   * `[window.start, window.end)`, and kept only when executed at least
   * `config.minExecutions` times. `top_users` is aggregated as one
   * `{ user, executions: 1 }` object per execution.
   *
   * @param client Any object exposing `executeQuery(query)`.
   * @param window Inclusive start / exclusive end of the time range.
   * @param config Pull configuration; only `minExecutions` is read here.
   * @throws Error when the client cannot execute SQL or a row is malformed.
   * @throws HistoricSqlGrantsMissingError when the result carries an `error`.
   */
  async *fetchAggregated(
    client: unknown,
    window: HistoricSqlTimeWindow,
    config: HistoricSqlUnifiedPullConfig,
  ): AsyncIterable<AggregatedTemplate> {
    const sql = `
SELECT
query_hash AS template_id,
MIN(query_text) AS canonical_sql,
COUNT(*) AS executions,
COUNT(DISTINCT user_name) AS distinct_users,
MIN(start_time) AS first_seen,
MAX(start_time) AS last_seen,
APPROX_PERCENTILE(total_elapsed_time, 0.50) AS p50_ms,
APPROX_PERCENTILE(total_elapsed_time, 0.95) AS p95_ms,
DIV0(COUNT_IF(execution_status != 'SUCCESS'), COUNT(*)) AS error_rate,
SUM(rows_produced) AS rows_produced,
ARRAY_AGG(OBJECT_CONSTRUCT('user', user_name, 'executions', 1)) WITHIN GROUP (ORDER BY start_time DESC)::string AS top_users
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
WHERE query_text IS NOT NULL
AND query_type IN ('SELECT', 'MERGE')
AND start_time >= ${timestampLiteral(window.start)}
AND start_time < ${timestampLiteral(window.end)}
GROUP BY query_hash
HAVING COUNT(*) >= ${config.minExecutions}
ORDER BY executions DESC`.trim();
    const outcome = await queryClient(client).executeQuery(sql);
    if (outcome.error) {
      throw grantsError(outcome.error);
    }
    const columnPositions = indexByHeader(outcome.headers);
    for (const cells of outcome.rows) {
      yield mapAggregatedRow(cells, columnPositions);
    }
  }
}

View file

@ -0,0 +1,436 @@
import { mkdtemp, readFile, readdir } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it, vi } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js';
import type { AggregatedTemplate, HistoricSqlReader } from './types.js';
/** Creates a fresh uniquely named directory under the OS temp dir and returns its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-unified-stage-');
  return mkdtemp(prefix);
}
/** Reads `relPath` under `root` as UTF-8 and returns the parsed JSON, cast (unchecked) to `T`. */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const raw = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(raw) as T;
}
/**
 * Test fixture builder: returns an AggregatedTemplate with the given id and SQL,
 * filling dialect, stats and topUsers with defaults unless overridden.
 */
function aggregate(overrides: Partial<AggregatedTemplate> & { templateId: string; canonicalSql: string }): AggregatedTemplate {
  const defaultStats: AggregatedTemplate['stats'] = {
    executions: 42,
    distinctUsers: 3,
    firstSeen: '2026-05-01T00:00:00.000Z',
    lastSeen: '2026-05-11T00:00:00.000Z',
    p50RuntimeMs: 20,
    p95RuntimeMs: 80,
    errorRate: 0,
    rowsProduced: 100,
  };
  const defaultTopUsers: AggregatedTemplate['topUsers'] = [{ user: 'analyst', executions: 40 }];
  return {
    templateId: overrides.templateId,
    canonicalSql: overrides.canonicalSql,
    dialect: overrides.dialect ?? 'postgres',
    stats: overrides.stats ?? defaultStats,
    topUsers: overrides.topUsers ?? defaultTopUsers,
  };
}
// End-to-end staging tests: a stub reader and a stub SQL analyzer feed
// stageHistoricSqlAggregatedSnapshot, and the files written to a temp dir are inspected.
describe('stageHistoricSqlAggregatedSnapshot', () => {
// Three templates are read; the service-account-only one is excluded before analysis
// and the one whose analysis reports an error is counted as a parse failure.
it('batch parses templates and writes stable table and patterns artifacts', async () => {
const stagedDir = await tempDir();
const reader: HistoricSqlReader = {
async probe() {
return { warnings: ['pg_stat_statements.track is none; aggregation still proceeds'], info: [] };
},
async *fetchAggregated() {
yield aggregate({
templateId: 'orders-by-status',
canonicalSql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status',
});
yield aggregate({
templateId: 'service-account-only',
canonicalSql: 'select * from public.orders where id = $1',
stats: {
executions: 20,
distinctUsers: 1,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 5,
p95RuntimeMs: 10,
errorRate: 0,
rowsProduced: 1,
},
topUsers: [{ user: 'svc_loader', executions: 20 }],
});
yield aggregate({
templateId: 'bad-parse',
canonicalSql: 'select broken from',
});
},
};
const sqlAnalysis: SqlAnalysisPort = {
analyzeForFingerprint: vi.fn(),
analyzeBatch: vi.fn(async () => new Map([
[
'orders-by-status',
{
tablesTouched: ['public.orders', 'public.customers'],
columnsByClause: {
select: ['status'],
where: ['created_at'],
join: ['customer_id'],
groupBy: ['status'],
},
},
],
['bad-parse', { tablesTouched: [], columnsByClause: {}, error: 'parse failed' }],
])),
validateReadOnly: vi.fn(async () => ({ ok: true })),
};
await stageHistoricSqlAggregatedSnapshot({
stagedDir,
connectionId: 'warehouse',
queryClient: {},
reader,
sqlAnalysis,
pullConfig: {
dialect: 'postgres',
filters: {
serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' },
},
},
now: new Date('2026-05-11T12:00:00.000Z'),
});
// The analyzer is called once, with the two surviving templates only.
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1);
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith(
[
{
id: 'orders-by-status',
sql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status',
},
{ id: 'bad-parse', sql: 'select broken from' },
],
'postgres',
);
expect(await readdir(join(stagedDir, 'tables'))).toEqual(['public.customers.json', 'public.orders.json']);
const manifest = await readJson<Record<string, unknown>>(stagedDir, 'manifest.json');
expect(manifest).toMatchObject({
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
snapshotRowCount: 3,
touchedTableCount: 2,
parseFailures: 1,
warnings: ['parse_failed:bad-parse'],
probeWarnings: ['pg_stat_statements.track is none; aggregation still proceeds'],
staleArchiveAfterDays: 90,
});
const orders = await readJson<Record<string, any>>(stagedDir, 'tables/public.orders.json');
// Staged stats are expected as bucket labels, not raw numbers.
expect(orders).toMatchObject({
table: 'public.orders',
stats: {
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
errorRateBucket: 'none',
p95RuntimeBucket: '<100ms',
recencyBucket: 'current',
},
columnsByClause: {
select: [['status', 'high']],
where: [['created_at', 'high']],
join: [['customer_id', 'high']],
groupBy: [['status', 'high']],
},
observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }],
topTemplates: [
{
id: 'orders-by-status',
topUsers: [{ user: 'analyst' }],
},
],
});
expect(orders.topTemplates[0].canonicalSql).toContain('group by o.status');
const patterns = await readJson<Record<string, any>>(stagedDir, 'patterns-input.json');
expect(patterns.templates).toEqual([
{
id: 'orders-by-status',
canonicalSql: expect.stringContaining('public.orders'),
tablesTouched: ['public.customers', 'public.orders'],
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
dialect: 'postgres',
},
]);
});
// The analyzer must receive the unredacted SQL, while the staged files must not contain the secrets.
it('redacts configured SQL substrings in staged artifacts while analyzing original SQL', async () => {
const stagedDir = await tempDir();
const originalSql =
"select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'"; // pragma: allowlist secret
const reader: HistoricSqlReader = {
async probe() {
return { warnings: [], info: [] };
},
async *fetchAggregated() {
yield aggregate({
templateId: 'api-events-with-secret',
canonicalSql: originalSql,
stats: {
executions: 15,
distinctUsers: 2,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 12,
p95RuntimeMs: 25,
errorRate: 0,
rowsProduced: 15,
},
});
},
};
const sqlAnalysis: SqlAnalysisPort = {
analyzeForFingerprint: vi.fn(),
analyzeBatch: vi.fn(async () => new Map([
[
'api-events-with-secret',
{
tablesTouched: ['public.api_events'],
columnsByClause: {
select: [],
where: ['api_key', 'note'],
join: [],
groupBy: [],
},
},
],
])),
validateReadOnly: vi.fn(async () => ({ ok: true })),
};
await stageHistoricSqlAggregatedSnapshot({
stagedDir,
connectionId: 'warehouse',
queryClient: {},
reader,
sqlAnalysis,
pullConfig: {
dialect: 'postgres',
redactionPatterns: ['sk_live_[A-Za-z0-9]+', '(?i)secret_token_[a-z0-9]+'],
},
now: new Date('2026-05-11T12:00:00.000Z'),
});
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith(
[{ id: 'api-events-with-secret', sql: originalSql }],
'postgres',
);
// Read the raw file text so the checks cover every field, not just canonicalSql.
const tableJson = await readFile(join(stagedDir, 'tables/public.api_events.json'), 'utf-8');
const patternsJson = await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8');
expect(tableJson).not.toContain('sk_live_abc123');
expect(tableJson).not.toContain('Secret_Token_9f');
expect(patternsJson).not.toContain('sk_live_abc123');
expect(patternsJson).not.toContain('Secret_Token_9f');
expect(tableJson).toContain('[REDACTED]');
expect(patternsJson).toContain('[REDACTED]');
});
// enabledTables lists two qualified names; one query references its table unqualified
// and must still be kept, while the table outside the list must be dropped.
it('limits staged table artifacts to configured enabled tables', async () => {
const stagedDir = await tempDir();
const reader: HistoricSqlReader = {
async probe() {
return { warnings: [], info: [] };
},
async *fetchAggregated() {
yield aggregate({
templateId: 'selected-qualified',
canonicalSql: 'select count(*) from orbit_analytics.int_active_contract_arr',
});
yield aggregate({
templateId: 'selected-unqualified',
canonicalSql: 'select count(*) from int_customer_health_signals',
});
yield aggregate({
templateId: 'unselected',
canonicalSql: 'select count(*) from orbit_raw.accounts',
});
},
};
const sqlAnalysis: SqlAnalysisPort = {
analyzeForFingerprint: vi.fn(),
analyzeBatch: vi.fn(async () => new Map([
[
'selected-qualified',
{
tablesTouched: ['orbit_analytics.int_active_contract_arr'],
columnsByClause: { select: [], where: [], join: [], groupBy: [] },
},
],
[
'selected-unqualified',
{
tablesTouched: ['int_customer_health_signals'],
columnsByClause: { select: [], where: [], join: [], groupBy: [] },
},
],
[
'unselected',
{
tablesTouched: ['orbit_raw.accounts'],
columnsByClause: { select: [], where: [], join: [], groupBy: [] },
},
],
])),
validateReadOnly: vi.fn(async () => ({ ok: true })),
};
await stageHistoricSqlAggregatedSnapshot({
stagedDir,
connectionId: 'warehouse',
queryClient: {},
reader,
sqlAnalysis,
pullConfig: {
dialect: 'postgres',
enabledTables: [
'orbit_analytics.int_active_contract_arr',
'orbit_analytics.int_customer_health_signals',
],
},
now: new Date('2026-05-11T12:00:00.000Z'),
});
// The staged file name follows the table name as reported by the analyzer (unqualified here).
expect(await readdir(join(stagedDir, 'tables'))).toEqual([
'int_customer_health_signals.json',
'orbit_analytics.int_active_contract_arr.json',
]);
const manifest = await readJson<Record<string, any>>(stagedDir, 'manifest.json');
expect(manifest.touchedTableCount).toBe(2);
const patterns = await readJson<Record<string, any>>(stagedDir, 'patterns-input.json');
expect(patterns.templates.map((entry: any) => entry.id)).toEqual(['selected-qualified', 'selected-unqualified']);
});
// patterns-input.json must list every template, while the first shard must hold only
// the two templates that touch more than one table.
it('preserves full patterns audit input and writes bounded cross-table pattern shards', async () => {
const stagedDir = await tempDir();
// ~8 KB literal makes the two join templates large.
const largeSql = `select * from public.orders o join public.customers c on c.id = o.customer_id where payload = '${'x'.repeat(8000)}'`;
const reader: HistoricSqlReader = {
async probe() {
return { warnings: [], info: [] };
},
async *fetchAggregated() {
yield aggregate({
templateId: 'orders-customers-a',
canonicalSql: largeSql,
stats: {
executions: 25,
distinctUsers: 4,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 15,
p95RuntimeMs: 90,
errorRate: 0,
rowsProduced: 250,
},
});
yield aggregate({
templateId: 'orders-customers-b',
canonicalSql: largeSql.replace('payload', 'payload_b'),
stats: {
executions: 22,
distinctUsers: 3,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 20,
p95RuntimeMs: 95,
errorRate: 0,
rowsProduced: 220,
},
});
yield aggregate({
templateId: 'orders-single-table',
canonicalSql: 'select count(*) from public.orders',
stats: {
executions: 30,
distinctUsers: 2,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 10,
p95RuntimeMs: 20,
errorRate: 0,
rowsProduced: 30,
},
});
},
};
const sqlAnalysis: SqlAnalysisPort = {
analyzeForFingerprint: vi.fn(),
analyzeBatch: vi.fn(async () => new Map([
[
'orders-customers-a',
{
tablesTouched: ['public.orders', 'public.customers'],
columnsByClause: {
select: [],
where: ['payload'],
join: ['customer_id', 'id'],
groupBy: [],
},
},
],
[
'orders-customers-b',
{
tablesTouched: ['public.orders', 'public.customers'],
columnsByClause: {
select: [],
where: ['payload_b'],
join: ['customer_id', 'id'],
groupBy: [],
},
},
],
[
'orders-single-table',
{
tablesTouched: ['public.orders'],
columnsByClause: {
select: [],
where: [],
join: [],
groupBy: [],
},
},
],
])),
validateReadOnly: vi.fn(async () => ({ ok: true })),
};
await stageHistoricSqlAggregatedSnapshot({
stagedDir,
connectionId: 'warehouse',
queryClient: {},
reader,
sqlAnalysis,
pullConfig: { dialect: 'postgres' },
now: new Date('2026-05-11T12:00:00.000Z'),
});
const audit = await readJson<Record<string, any>>(stagedDir, 'patterns-input.json');
expect(audit.templates.map((entry: any) => entry.id)).toEqual([
'orders-customers-a',
'orders-customers-b',
'orders-single-table',
]);
const firstShard = await readJson<Record<string, any>>(stagedDir, 'patterns-input/part-0001.json');
expect(firstShard.templates.map((entry: any) => entry.id)).toEqual(['orders-customers-a', 'orders-customers-b']);
expect(firstShard.templates.some((entry: any) => entry.id === 'orders-single-table')).toBe(false);
const manifest = await readJson<Record<string, any>>(stagedDir, 'manifest.json');
expect(manifest.warnings).toEqual([]);
});
});

View file

@ -0,0 +1,360 @@
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import {
bucketDistinctUsers,
bucketErrorRate,
bucketExecutions,
bucketFrequency,
bucketP95Runtime,
bucketRecency,
} from './buckets.js';
import { splitHistoricSqlPatternInputs } from './pattern-inputs.js';
import {
compileHistoricSqlRedactionPatterns,
redactHistoricSqlText,
type HistoricSqlRedactionPattern,
} from './redaction.js';
import {
HISTORIC_SQL_SOURCE_KEY,
aggregatedTemplateSchema,
historicSqlUnifiedPullConfigSchema,
type AggregatedTemplate,
type HistoricSqlReader,
type HistoricSqlUnifiedPullConfig,
type StagedPatternsInput,
type StagedTableInput,
} from './types.js';
/** Arguments for staging one aggregated historic-SQL snapshot. */
interface StageHistoricSqlAggregatedSnapshotInput {
/** Directory the staged JSON artifacts are written under. */
stagedDir: string;
connectionId: string;
/** Opaque warehouse client; its shape is not constrained by this module. */
queryClient: unknown;
reader: HistoricSqlReader;
sqlAnalysis: SqlAnalysisPort;
/** Untyped pull configuration. NOTE(review): presumably validated with historicSqlUnifiedPullConfigSchema — confirm in the staging function. */
pullConfig: unknown;
/** Optional clock override; the tests in this package pass a fixed date. */
now?: Date;
}
/** An aggregated template together with the table/column information derived for it. */
interface ParsedTemplate {
template: AggregatedTemplate;
tablesTouched: string[];
// NOTE(review): presumably the subset of tablesTouched that passed the enabled-table filter — confirm at the producer.
includedTables: string[];
/** Column names keyed by clause name. */
columnsByClause: Record<string, string[]>;
}
/** Lookup sets derived from the configured list of enabled tables. */
interface EnabledTableFilter {
/** Normalized (trimmed, lower-cased) enabled table identifiers. */
exact: Set<string>;
/** Last dot-separated segments that belong to exactly one enabled identifier. */
uniqueUnqualified: Set<string>;
}
/** Running per-table totals built up as templates are added. */
interface TableAccumulator {
table: string;
/** Sum of executions over all added templates. */
executions: number;
/** Largest distinctUsers value seen on any added template (a maximum, not a sum). */
distinctUsers: number;
/** Sum of errorRate × executions, i.e. the numerator of an execution-weighted error rate. */
errorRateNumerator: number;
p95RuntimeMs: number | null;
/** ISO timestamp; starts at the Unix epoch for a fresh accumulator. */
lastSeen: string;
/** clause → column → accumulated executions. */
columnsByClause: Map<string, Map<string, number>>;
/** other table → comma-joined sorted join columns → accumulated executions. */
observedJoins: Map<string, Map<string, number>>;
topTemplates: AggregatedTemplate[];
}
// Whole statement is a bare `SELECT 1`, `SELECT NOW()`, `SELECT CURRENT_TIMESTAMP` or `SELECT VERSION()`.
const TRIVIAL_SQL_RE = /^\s*SELECT\s+(1|NOW\(\)|CURRENT_TIMESTAMP|VERSION\(\))\s*;?\s*$/i;
// Statement starts with one of these keywords.
const NOISE_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i;
// Statement mentions a catalog/system object anywhere in its text.
// NOTE(review): `pg_` matches any identifier that starts with pg_, including user tables — confirm that is acceptable.
const SYSTEM_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i;
/**
 * Writes `value` as pretty-printed JSON (two-space indent, trailing newline) to
 * `relPath` under `root`, creating missing parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  await mkdir(dirname(target), { recursive: true });
  await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8');
}
/**
 * Compiles each pattern string into a flag-less regular expression, preserving order.
 *
 * @throws SyntaxError (from the RegExp constructor) on an invalid pattern.
 */
function compilePatterns(patterns: string[]): RegExp[] {
  const compiled: RegExp[] = [];
  for (const source of patterns) {
    compiled.push(new RegExp(source));
  }
  return compiled;
}
/** True when `value` is a non-empty string matched by at least one pattern. */
function matchesAny(value: string | null, patterns: RegExp[]): boolean {
  if (!value) {
    return false;
  }
  for (const pattern of patterns) {
    if (pattern.test(value)) {
      return true;
    }
  }
  return false;
}
/**
 * Decides from the SQL text alone whether a template is noise.
 *
 * Noise-prefixed statements and statements touching system objects are always
 * dropped; trivial probes are dropped unless `filters.dropTrivialProbes` is
 * explicitly `false`.
 */
function shouldDropBySql(sql: string, config: HistoricSqlUnifiedPullConfig): boolean {
  const isNoise = NOISE_PREFIX_RE.test(sql) || SYSTEM_TABLE_RE.test(sql);
  if (isNoise) {
    return true;
  }
  return config.filters.dropTrivialProbes !== false && TRIVIAL_SQL_RE.test(sql);
}
/**
 * Applies the service-account filter to a template.
 *
 * Nothing is dropped when the filter is absent, in `mark-only` mode, or has no
 * patterns. Otherwise a template counts as "service only" when it has recorded
 * top-user executions and all of them belong to users matching a pattern. Mode
 * `exclude` drops service-only templates; any other mode drops everything else.
 */
function shouldDropByUsers(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean {
  const service = config.filters.serviceAccounts;
  if (!service) {
    return false;
  }
  if (service.mode === 'mark-only' || service.patterns.length === 0) {
    return false;
  }
  const patterns = compilePatterns(service.patterns);
  let serviceExecutions = 0;
  for (const entry of template.topUsers) {
    if (matchesAny(entry.user, patterns)) {
      serviceExecutions += entry.executions;
    }
  }
  let totalExecutions = 0;
  for (const entry of template.topUsers) {
    totalExecutions += entry.executions;
  }
  const serviceOnly = totalExecutions > 0 && serviceExecutions >= totalExecutions;
  if (service.mode === 'exclude') {
    return serviceOnly;
  }
  return !serviceOnly;
}
/**
 * Drops templates that fail often and run rarely: error rate strictly above, and
 * executions strictly below, the `filters.dropFailedBelow` thresholds. With no
 * thresholds configured nothing is dropped.
 */
function shouldDropByFailure(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean {
  const threshold = config.filters.dropFailedBelow;
  if (!threshold) {
    return false;
  }
  const { errorRate, executions } = template.stats;
  return errorRate > threshold.errorRate && executions < threshold.executions;
}
/** True when any of the SQL-text, user or failure filters rejects the template (checked in that order). */
function shouldDropTemplate(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean {
  return (
    shouldDropBySql(template.canonicalSql, config) ||
    shouldDropByUsers(template, config) ||
    shouldDropByFailure(template, config)
  );
}
/** Canonical comparison form of a table identifier: surrounding whitespace removed, lower-cased. */
function normalizeTableIdentifier(value: string): string {
  const trimmed = value.trim();
  return trimmed.toLowerCase();
}
/**
 * Returns the last non-empty dot-separated segment of the normalized identifier,
 * or `''` when there is none.
 */
function unqualifiedTableIdentifier(value: string): string {
  const segments = normalizeTableIdentifier(value)
    .split('.')
    .filter((segment) => segment.length > 0);
  const last = segments[segments.length - 1];
  return last ?? '';
}
/**
 * Builds the lookup sets used to decide whether a table is enabled.
 *
 * @param enabledTables Configured table identifiers.
 * @returns `null` for an empty list. Otherwise the normalized identifiers, plus the
 *   unqualified names that map to exactly one enabled identifier.
 */
function buildEnabledTableFilter(enabledTables: string[]): EnabledTableFilter | null {
  if (enabledTables.length === 0) {
    return null;
  }
  const exact = new Set<string>();
  for (const raw of enabledTables) {
    const normalized = normalizeTableIdentifier(raw);
    if (normalized.length > 0) {
      exact.add(normalized);
    }
  }
  // Count how many enabled identifiers share each unqualified name.
  const occurrences = new Map<string, number>();
  for (const table of exact) {
    const shortName = unqualifiedTableIdentifier(table);
    if (shortName.length === 0) {
      continue;
    }
    occurrences.set(shortName, (occurrences.get(shortName) ?? 0) + 1);
  }
  // Only unambiguous short names are kept for matching.
  const uniqueUnqualified = new Set<string>();
  for (const [shortName, count] of occurrences) {
    if (count === 1) {
      uniqueUnqualified.add(shortName);
    }
  }
  return { exact, uniqueUnqualified };
}
/**
 * Checks a table against the enabled-table filter.
 *
 * @param table - Table identifier as reported by SQL analysis.
 * @param filter - Filter from `buildEnabledTableFilter`, or `null` for "everything enabled".
 * @returns `true` when there is no filter, when the normalized identifier is listed
 *   exactly, or when its unqualified name is an unambiguous configured name.
 */
function isEnabledTable(table: string, filter: EnabledTableFilter | null): boolean {
  if (!filter) return true;

  const candidate = normalizeTableIdentifier(table);
  if (filter.exact.has(candidate)) return true;
  return filter.uniqueUnqualified.has(unqualifiedTableIdentifier(candidate));
}
/**
 * Resolves the look-back window in days: the config's own `windowDays` when the
 * config variant has that property, otherwise a fixed 90 days.
 */
function historicSqlWindowDays(config: HistoricSqlUnifiedPullConfig): number {
  if ('windowDays' in config) {
    return config.windowDays;
  }
  return 90;
}
/**
 * Returns the template with its `canonicalSql` passed through the redactors.
 *
 * @param template - Template to redact; never mutated.
 * @param redactors - Compiled redaction patterns.
 * @returns The very same `template` object when there are no redactors, otherwise
 *   a shallow copy whose `canonicalSql` is the redacted text.
 */
function redactTemplateSql(
  template: AggregatedTemplate,
  redactors: readonly HistoricSqlRedactionPattern[],
): AggregatedTemplate {
  if (redactors.length > 0) {
    const canonicalSql = redactHistoricSqlText(template.canonicalSql, redactors);
    return { ...template, canonicalSql };
  }
  return template;
}
/**
 * Adds `executions` to the usage counter of `column` within `clause` on the
 * accumulator, creating the per-clause counter map on first use. Mutates `acc`.
 */
function recordColumn(acc: TableAccumulator, clause: string, column: string, executions: number): void {
  let counts = acc.columnsByClause.get(clause);
  if (counts === undefined) {
    counts = new Map<string, number>();
    acc.columnsByClause.set(clause, counts);
  }
  const previous = counts.get(column) ?? 0;
  counts.set(column, previous + executions);
}
/**
 * Adds `executions` to the counter for a join between the accumulator's table and
 * `otherTable` on the given column set. Mutates `acc`.
 *
 * The column set is de-duplicated, sorted and joined with `,` to form the counter
 * key; when that key is empty (no columns) nothing is recorded.
 */
function recordJoin(acc: TableAccumulator, otherTable: string, columns: string[], executions: number): void {
  const key = Array.from(new Set(columns)).sort().join(',');
  if (key.length === 0) {
    return;
  }
  const counts = acc.observedJoins.get(otherTable) ?? new Map<string, number>();
  counts.set(key, (counts.get(key) ?? 0) + executions);
  acc.observedJoins.set(otherTable, counts);
}
/**
 * Creates an empty accumulator for `table`: zeroed counters, no runtime sample,
 * the Unix epoch as the "never seen" timestamp, and fresh empty collections.
 */
function accumulatorFor(table: string): TableAccumulator {
  const zeroedStats = {
    executions: 0,
    distinctUsers: 0,
    errorRateNumerator: 0,
    p95RuntimeMs: null,
    // Epoch start sorts before any real timestamp, so the first template always wins.
    lastSeen: '1970-01-01T00:00:00.000Z',
  };
  return {
    table,
    ...zeroedStats,
    columnsByClause: new Map(),
    observedJoins: new Map(),
    topTemplates: [],
  };
}
/**
 * Folds one parsed template into a table's running accumulator (mutates `acc`).
 *
 * - `executions` and the error-rate numerator are summed, the latter weighted by executions.
 * - `distinctUsers` keeps the maximum across templates (a lower bound on the true count).
 * - `p95RuntimeMs` keeps the largest non-null value seen.
 * - `lastSeen` keeps the most recent timestamp.
 * - Every column is counted per clause, and the template's `join` columns are
 *   recorded against every other table the template touches.
 *
 * @param acc - Accumulator for a single table.
 * @param parsed - Template plus its analysis results.
 */
function addTemplate(acc: TableAccumulator, parsed: ParsedTemplate): void {
  const { stats } = parsed.template;
  const executions = stats.executions;

  acc.executions += executions;
  acc.distinctUsers = Math.max(acc.distinctUsers, stats.distinctUsers);
  acc.errorRateNumerator += stats.errorRate * executions;

  if (stats.p95RuntimeMs !== null) {
    acc.p95RuntimeMs =
      acc.p95RuntimeMs === null ? stats.p95RuntimeMs : Math.max(acc.p95RuntimeMs, stats.p95RuntimeMs);
  }

  // Compare instants rather than raw strings: ISO timestamps with different
  // sub-second precision (e.g. "...00Z" vs "...00.500Z") do not sort correctly
  // as text. Fall back to the textual comparison if either value fails to parse.
  const incomingMs = Date.parse(stats.lastSeen);
  const currentMs = Date.parse(acc.lastSeen);
  const incomingIsNewer =
    Number.isNaN(incomingMs) || Number.isNaN(currentMs)
      ? stats.lastSeen > acc.lastSeen
      : incomingMs > currentMs;
  if (incomingIsNewer) {
    acc.lastSeen = stats.lastSeen;
  }

  for (const [clause, columns] of Object.entries(parsed.columnsByClause)) {
    for (const column of columns) {
      recordColumn(acc, clause, column, executions);
    }
  }

  // The join columns are attributed to every other table in the statement; the
  // accumulator's own table is excluded so self-references are not recorded.
  const joinColumns = parsed.columnsByClause.join ?? [];
  for (const otherTable of parsed.tablesTouched) {
    if (otherTable === acc.table) continue;
    recordJoin(acc, otherTable, joinColumns, executions);
  }

  acc.topTemplates.push(parsed.template);
}
/**
 * Converts a table accumulator into the staged, bucketed representation.
 *
 * Raw counts are never emitted: each statistic goes through its `bucket*` helper,
 * and column/join counts are passed to `bucketFrequency` together with the
 * table's total executions. All collections are sorted so the output is stable:
 * clauses alphabetically, columns by descending count then name, joins by table
 * then column list, and the top five templates by descending executions then id.
 *
 * @param acc - Fully populated accumulator for one table.
 * @param now - Reference time handed to `bucketRecency`.
 * @returns The staged table payload.
 */
function toStagedTable(acc: TableAccumulator, now: Date): StagedTableInput {
  const totalExecutions = acc.executions;

  const clausesInOrder = [...acc.columnsByClause.entries()].sort(([a], [b]) => a.localeCompare(b));
  const columnsByClause: Record<string, Array<[string, string]>> = Object.fromEntries(
    clausesInOrder.map(([clause, counts]): [string, Array<[string, string]>] => {
      const ranked = [...counts.entries()].sort(
        ([leftColumn, leftCount], [rightColumn, rightCount]) =>
          rightCount - leftCount || leftColumn.localeCompare(rightColumn),
      );
      const bucketed = ranked.map(
        ([column, count]): [string, string] => [column, bucketFrequency(count, totalExecutions)],
      );
      return [clause, bucketed];
    }),
  );

  const observedJoins: Array<{ withTable: string; on: string[]; freq: string }> = [];
  for (const [withTable, byColumns] of acc.observedJoins) {
    for (const [columns, count] of byColumns) {
      observedJoins.push({
        withTable,
        // Keys were built by joining column names with ','; split them back apart.
        on: columns.split(',').filter(Boolean),
        freq: bucketFrequency(count, totalExecutions),
      });
    }
  }
  observedJoins.sort((a, b) => {
    const byTable = a.withTable.localeCompare(b.withTable);
    return byTable || a.on.join(',').localeCompare(b.on.join(','));
  });

  const rankedTemplates = [...acc.topTemplates].sort(
    (a, b) => b.stats.executions - a.stats.executions || a.templateId.localeCompare(b.templateId),
  );
  const topTemplates = rankedTemplates.slice(0, 5).map((template) => {
    // Only user names are kept; per-user execution counts are deliberately left out.
    const topUsers = template.topUsers.slice(0, 5).map(({ user }) => ({ user }));
    return { id: template.templateId, canonicalSql: template.canonicalSql, topUsers };
  });

  // Execution-weighted mean error rate; 0 when the table has no executions.
  const errorRate = totalExecutions > 0 ? acc.errorRateNumerator / totalExecutions : 0;

  return {
    table: acc.table,
    stats: {
      executionsBucket: bucketExecutions(totalExecutions),
      distinctUsersBucket: bucketDistinctUsers(acc.distinctUsers),
      errorRateBucket: bucketErrorRate(errorRate),
      p95RuntimeBucket: bucketP95Runtime(acc.p95RuntimeMs),
      recencyBucket: bucketRecency(acc.lastSeen, now),
    },
    columnsByClause,
    observedJoins,
    topTemplates,
  };
}
/**
 * Builds the pattern-mining input: one entry per parsed template with its SQL,
 * sorted list of touched tables, bucketed execution/user counts and dialect.
 * Entries are ordered by template id.
 */
function toPatternsInput(parsedTemplates: ParsedTemplate[]): StagedPatternsInput {
  const templates: StagedPatternsInput['templates'] = [];
  for (const { template, tablesTouched } of parsedTemplates) {
    templates.push({
      id: template.templateId,
      canonicalSql: template.canonicalSql,
      // Copy before sorting so the parsed template's own array is left untouched.
      tablesTouched: [...tablesTouched].sort(),
      executionsBucket: bucketExecutions(template.stats.executions),
      distinctUsersBucket: bucketDistinctUsers(template.stats.distinctUsers),
      dialect: template.dialect,
    });
  }
  templates.sort((a, b) => a.id.localeCompare(b.id));
  return { templates };
}
/**
 * Pulls aggregated historic-SQL templates from the reader, filters and analyzes
 * them, and writes the staged artifacts under `input.stagedDir`.
 *
 * Steps, in order:
 * 1. Validate `input.pullConfig` and derive the table filter, redactors and time window.
 * 2. Probe the reader, then stream rows for the window, validating each row and
 *    discarding those rejected by `shouldDropTemplate`.
 * 3. Analyze the surviving templates' SQL in a single batch.
 * 4. Keep templates that parsed and touch at least one enabled table, redacting their SQL.
 * 5. Aggregate per enabled table and write `tables/<table>.json`, `patterns-input.json`,
 *    the pattern shards and `manifest.json`.
 *
 * @param input - Reader, query client, SQL-analysis port, raw pull config, staging
 *   directory, connection id and an optional `now` override.
 * @returns Resolves once every artifact has been written.
 * @throws When `input.pullConfig` or a fetched row fails schema validation, or when
 *   the reader, the analysis port or a file write rejects.
 */
export async function stageHistoricSqlAggregatedSnapshot(input: StageHistoricSqlAggregatedSnapshotInput): Promise<void> {
const config = historicSqlUnifiedPullConfigSchema.parse(input.pullConfig);
const enabledTableFilter = buildEnabledTableFilter(config.enabledTables);
const redactors = compileHistoricSqlRedactionPatterns(config.redactionPatterns);
const now = input.now ?? new Date();
// The window always ends at `now` and reaches back `historicSqlWindowDays(config)` days.
const windowStart = new Date(now.getTime() - historicSqlWindowDays(config) * 24 * 60 * 60 * 1000);
const probe = await input.reader.probe(input.queryClient);
const snapshot: AggregatedTemplate[] = [];
let snapshotRowCount = 0;
for await (const row of input.reader.fetchAggregated(input.queryClient, { start: windowStart, end: now }, config)) {
// Counts every fetched row, including the ones dropped by the filters below.
snapshotRowCount += 1;
const parsed = aggregatedTemplateSchema.parse(row);
if (!shouldDropTemplate(parsed, config)) {
snapshot.push(parsed);
}
}
// Analysis runs on the un-redacted SQL; redaction is applied afterwards when
// the parsed templates are assembled.
const analysis = await input.sqlAnalysis.analyzeBatch(
snapshot.map((template) => ({ id: template.templateId, sql: template.canonicalSql })),
config.dialect,
);
const warnings: string[] = [];
const parsedTemplates: ParsedTemplate[] = [];
for (const template of snapshot) {
const parsed = analysis.get(template.templateId);
// A missing or failed analysis is reported as a warning and the template is skipped.
if (!parsed || parsed.error) {
warnings.push(`parse_failed:${template.templateId}`);
continue;
}
const tablesTouched = [...new Set(parsed.tablesTouched)].filter((table) => table.length > 0).sort();
const includedTables = tablesTouched.filter((table) => isEnabledTable(table, enabledTableFilter));
// Templates that touch no enabled table are skipped silently (no warning).
if (includedTables.length === 0) {
continue;
}
parsedTemplates.push({
template: redactTemplateSql(template, redactors),
// `tablesTouched` keeps every table (enabled or not); `includedTables` is the enabled subset.
tablesTouched,
includedTables,
columnsByClause: Object.fromEntries(
Object.entries(parsed.columnsByClause).map(([clause, columns]) => [clause, [...new Set(columns)].sort()]),
),
});
}
// A template contributes to the accumulator of every enabled table it touches.
const byTable = new Map<string, TableAccumulator>();
for (const parsed of parsedTemplates) {
for (const table of parsed.includedTables) {
const acc = byTable.get(table) ?? accumulatorFor(table);
addTemplate(acc, parsed);
byTable.set(table, acc);
}
}
await mkdir(input.stagedDir, { recursive: true });
// Tables are written in sorted order, one file at a time.
// NOTE(review): `table` originates from parsed SQL and is interpolated into the
// file path; confirm `writeJson` (or identifier rules) prevents names containing
// path separators or `..` from escaping `stagedDir`.
for (const [table, acc] of [...byTable.entries()].sort(([left], [right]) => left.localeCompare(right))) {
await writeJson(input.stagedDir, `tables/${table}.json`, toStagedTable(acc, now));
}
const patternsInput = toPatternsInput(parsedTemplates);
const patternInputSplit = splitHistoricSqlPatternInputs(patternsInput);
const allWarnings = [...warnings, ...patternInputSplit.warnings];
await writeJson(input.stagedDir, 'patterns-input.json', patternInputSplit.auditInput);
for (const shard of patternInputSplit.shards) {
await writeJson(input.stagedDir, shard.path, shard.input);
}
// The manifest is written last, after all other artifacts.
await writeJson(input.stagedDir, 'manifest.json', {
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: input.connectionId,
dialect: config.dialect,
fetchedAt: now.toISOString(),
windowStart: windowStart.toISOString(),
windowEnd: now.toISOString(),
snapshotRowCount,
touchedTableCount: byTable.size,
// Counted over all warnings, so `parse_failed:` entries from the split step are included too.
parseFailures: allWarnings.filter((warning) => warning.startsWith('parse_failed:')).length,
warnings: allWarnings,
probeWarnings: probe.warnings,
staleArchiveAfterDays: config.staleArchiveAfterDays,
});
}

View file

@ -0,0 +1,110 @@
import { describe, expect, it } from 'vitest';
import {
aggregatedTemplateSchema,
historicSqlUnifiedPullConfigSchema,
stagedManifestSchema,
stagedPatternsInputSchema,
stagedTableInputSchema,
} from './types.js';
// Contract tests for the zod schemas shared between warehouse readers and the staging step.
describe('historic-sql unified contracts', () => {
  it('parses minExecutions and service-account filters', () => {
    // A minimal postgres config: defaults are filled in, and the postgres variant
    // must not grow windowed-only or unknown keys.
    const withDefaults = historicSqlUnifiedPullConfigSchema.parse({ dialect: 'postgres', minExecutions: 9 });
    expect(withDefaults).toMatchObject({
      dialect: 'postgres',
      minExecutions: 9,
      redactionPatterns: [],
      staleArchiveAfterDays: 90,
    });
    expect(withDefaults).not.toHaveProperty('windowDays');
    expect(withDefaults).not.toHaveProperty('concurrency');

    const withFilters = historicSqlUnifiedPullConfigSchema.parse({
      dialect: 'postgres',
      minExecutions: 7,
      filters: {
        serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' },
      },
    });
    expect(withFilters.minExecutions).toBe(7);
    expect(withFilters.filters.serviceAccounts).toEqual({ patterns: ['^svc_'], mode: 'exclude' });
  });

  it('validates aggregate templates from warehouse readers', () => {
    const readerRow = {
      templateId: 'pg:123',
      canonicalSql: 'select status, count(*) from public.orders group by status',
      dialect: 'postgres',
      stats: {
        executions: 42,
        distinctUsers: 3,
        firstSeen: '2026-05-01T00:00:00.000Z',
        lastSeen: '2026-05-11T00:00:00.000Z',
        p50RuntimeMs: 12.5,
        p95RuntimeMs: 40,
        errorRate: 0,
        rowsProduced: 100,
      },
      topUsers: [{ user: 'analyst', executions: 40 }],
    };
    const template = aggregatedTemplateSchema.parse(readerRow);
    expect(template.templateId).toBe('pg:123');
    expect(template.topUsers).toEqual([{ user: 'analyst', executions: 40 }]);
  });

  it('validates staged table, patterns, and manifest artifacts', () => {
    const stagedTable = {
      table: 'public.orders',
      stats: {
        executionsBucket: '10-100',
        distinctUsersBucket: '2-5',
        errorRateBucket: 'none',
        p95RuntimeBucket: '<100ms',
        recencyBucket: 'current',
      },
      columnsByClause: {
        select: [['status', 'high']],
        where: [['created_at', 'mid']],
      },
      observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }],
      topTemplates: [{ id: 'pg:123', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }],
    };
    expect(stagedTableInputSchema.parse(stagedTable).table).toBe('public.orders');

    const stagedPatterns = {
      templates: [
        {
          id: 'pg:123',
          canonicalSql: 'select * from public.orders',
          tablesTouched: ['public.orders'],
          executionsBucket: '10-100',
          distinctUsersBucket: '2-5',
          dialect: 'postgres',
        },
      ],
    };
    expect(stagedPatternsInputSchema.parse(stagedPatterns).templates).toHaveLength(1);

    const stagedManifest = {
      source: 'historic-sql',
      connectionId: 'warehouse',
      dialect: 'postgres',
      fetchedAt: '2026-05-11T00:00:00.000Z',
      windowStart: '2026-02-10T00:00:00.000Z',
      windowEnd: '2026-05-11T00:00:00.000Z',
      snapshotRowCount: 2,
      touchedTableCount: 1,
      parseFailures: 1,
      warnings: ['parse_failed:bad'],
      probeWarnings: [],
      staleArchiveAfterDays: 90,
    };
    expect(stagedManifestSchema.parse(stagedManifest).staleArchiveAfterDays).toBe(90);
  });
});

View file

@ -0,0 +1,154 @@
import { z } from 'zod';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
/** Source identifier written into (and required by) the staged manifest's `source` field. */
export const HISTORIC_SQL_SOURCE_KEY = 'historic-sql' as const;
/** Warehouse dialects for which historic-SQL ingestion is defined. */
const historicSqlDialectSchema = z.enum(['snowflake', 'bigquery', 'postgres']);
/** Union of the dialect names accepted by `historicSqlDialectSchema`. */
export type HistoricSqlDialect = z.infer<typeof historicSqlDialectSchema>;
/** Mode selector shared by the `serviceAccounts` and `orchestrators` filters. */
const filterModeSchema = z.enum(['exclude', 'include', 'mark-only']);
/** Options shared by every dialect's pull config; each field has a default or is optional. */
const historicSqlCommonPullConfigSchema = z.object({
// Minimum execution count (defaults to 5). NOTE(review): presumably applied by
// the readers when aggregating; its consumer is not in this module, so confirm.
minExecutions: z.number().int().nonnegative().default(5),
// Tables to stage; an empty list disables table filtering.
enabledTables: z.array(z.string().min(1)).default([]),
filters: z.object({
// Patterns matched against template users, plus how a match is treated.
serviceAccounts: z.object({
patterns: z.array(z.string()).default([]),
mode: filterModeSchema.default('exclude'),
}).optional(),
orchestrators: z.object({
mode: filterModeSchema.default('mark-only'),
}).optional(),
dropTrivialProbes: z.boolean().default(true),
// Templates with an error rate above `errorRate` AND fewer than `executions` runs are dropped.
dropFailedBelow: z.object({
errorRate: z.number().min(0).max(1),
executions: z.number().int().nonnegative(),
}).optional(),
}).default({ dropTrivialProbes: true }),
// Patterns applied to each template's canonical SQL before it is staged.
redactionPatterns: z.array(z.string()).default([]),
// Copied verbatim into the staged manifest.
staleArchiveAfterDays: z.number().int().positive().default(90),
});
/** Config for dialects that take a configurable look-back window (`windowDays`, default 90). */
const historicSqlWindowedPullConfigSchema = historicSqlCommonPullConfigSchema.extend({
dialect: z.enum(['snowflake', 'bigquery']),
windowDays: z.number().int().positive().default(90),
});
/** Postgres config: the common options only, with no `windowDays` property. */
const historicSqlPostgresPullConfigSchema = historicSqlCommonPullConfigSchema.extend({
dialect: z.literal('postgres'),
});
/** Pull config for any supported dialect, discriminated on the `dialect` field. */
export const historicSqlUnifiedPullConfigSchema = z.discriminatedUnion('dialect', [
historicSqlWindowedPullConfigSchema,
historicSqlPostgresPullConfigSchema,
]);
/** Parsed (defaults applied) form of `historicSqlUnifiedPullConfigSchema`. */
export type HistoricSqlUnifiedPullConfig = z.infer<typeof historicSqlUnifiedPullConfigSchema>;
/**
 * One aggregated query template as yielded by a `HistoricSqlReader`: the canonical
 * SQL text, its dialect, usage statistics and the users who ran it.
 */
export const aggregatedTemplateSchema = z.object({
templateId: z.string().min(1),
canonicalSql: z.string().min(1),
dialect: historicSqlDialectSchema,
stats: z.object({
executions: z.number().int().nonnegative(),
distinctUsers: z.number().int().nonnegative(),
firstSeen: z.iso.datetime(),
lastSeen: z.iso.datetime(),
// Runtime percentiles and row counts are nullable for sources that cannot report them.
p50RuntimeMs: z.number().nonnegative().nullable(),
p95RuntimeMs: z.number().nonnegative().nullable(),
// Fraction in [0, 1].
errorRate: z.number().min(0).max(1),
rowsProduced: z.number().int().nonnegative().nullable(),
}),
// Per-user execution counts; `user` may be null. Defaults to an empty list.
topUsers: z.array(z.object({
user: z.string().nullable(),
executions: z.number().int().nonnegative(),
})).default([]),
});
/** Parsed form of `aggregatedTemplateSchema`. */
export type AggregatedTemplate = z.infer<typeof aggregatedTemplateSchema>;
/**
 * Shape of a staged per-table artifact. Statistics and frequencies are carried as
 * bucket labels (plain strings), not raw numbers.
 */
export const stagedTableInputSchema = z.object({
table: z.string().min(1),
stats: z.object({
executionsBucket: z.string(),
distinctUsersBucket: z.string(),
errorRateBucket: z.string(),
p95RuntimeBucket: z.string(),
recencyBucket: z.string(),
}),
// Clause name -> list of [column, frequency bucket] pairs.
columnsByClause: z.record(z.string(), z.array(z.tuple([z.string(), z.string()]))),
// Joins observed with other tables: the join columns and a frequency bucket.
observedJoins: z.array(z.object({
withTable: z.string(),
on: z.array(z.string()),
freq: z.string(),
})),
// Representative templates; users are listed by name only, without execution counts.
topTemplates: z.array(z.object({
id: z.string(),
canonicalSql: z.string(),
topUsers: z.array(z.object({ user: z.string().nullable() })),
})),
});
/** Parsed form of `stagedTableInputSchema`. */
export type StagedTableInput = z.infer<typeof stagedTableInputSchema>;
/**
 * Shape of the staged pattern-mining input: one entry per template with its SQL,
 * the tables it touches, bucketed usage labels and its dialect.
 */
export const stagedPatternsInputSchema = z.object({
templates: z.array(z.object({
id: z.string(),
canonicalSql: z.string(),
tablesTouched: z.array(z.string()),
executionsBucket: z.string(),
distinctUsersBucket: z.string(),
dialect: historicSqlDialectSchema,
})),
});
/** Parsed form of `stagedPatternsInputSchema`. */
export type StagedPatternsInput = z.infer<typeof stagedPatternsInputSchema>;
/**
 * Shape of the staged `manifest.json`: where the snapshot came from, the time
 * window it covers, row/table counts and the warnings collected while staging.
 */
export const stagedManifestSchema = z.object({
source: z.literal(HISTORIC_SQL_SOURCE_KEY),
connectionId: z.string().min(1),
dialect: historicSqlDialectSchema,
fetchedAt: z.iso.datetime(),
windowStart: z.iso.datetime(),
windowEnd: z.iso.datetime(),
snapshotRowCount: z.number().int().nonnegative(),
touchedTableCount: z.number().int().nonnegative(),
parseFailures: z.number().int().nonnegative(),
warnings: z.array(z.string()),
probeWarnings: z.array(z.string()),
// Optional on input; defaults to 90 when absent.
staleArchiveAfterDays: z.number().int().positive().default(90),
});
/** Parsed form of `stagedManifestSchema`. */
export type StagedManifest = z.infer<typeof stagedManifestSchema>;
/** Outcome of probing a warehouse before pulling query history. */
export interface HistoricSqlProbeResult {
/** Warning messages reported by the probe. */
warnings: string[];
/** Optional informational messages. */
info?: string[];
}
/**
 * Dialect-specific access to a warehouse's query history. The client is typed as
 * `unknown` here; each implementation decides what client it accepts.
 */
export interface HistoricSqlReader {
/** Checks the warehouse and reports warnings/info. */
probe(client: unknown): Promise<HistoricSqlProbeResult>;
/** Streams aggregated templates for the given time window and pull config. */
fetchAggregated(
client: unknown,
window: HistoricSqlTimeWindow,
config: HistoricSqlUnifiedPullConfig,
): AsyncIterable<AggregatedTemplate>;
}
/** Time range of query history to fetch, delimited by `start` and `end`. */
export interface HistoricSqlTimeWindow {
start: Date;
end: Date;
}
/** Minimal query client: runs SQL with optional parameters and returns headers plus row arrays. */
export interface KtxPostgresQueryClient {
executeQuery(sql: string, params?: unknown[]): Promise<{ headers: string[]; rows: unknown[][]; totalRows?: number }>;
}
/**
 * Probe result that additionally carries the Postgres server version and makes
 * `info` mandatory.
 */
export interface PostgresPgssProbeResult extends HistoricSqlProbeResult {
pgServerVersion: string;
warnings: string[];
info: string[];
}
/** Dependencies supplied to the historic-SQL source adapter. */
export interface HistoricSqlSourceAdapterDeps {
/** SQL analysis service. */
sqlAnalysis: SqlAnalysisPort;
/** Reader used to pull query history. */
reader: HistoricSqlReader;
/** Client object handed to the reader's methods. */
queryClient: unknown;
/** Optional clock override. */
now?: () => Date;
}

View file

@ -0,0 +1,107 @@
import { mkdtemp } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { KtxSchemaSnapshot } from '../../../scan/types.js';
import { chunkLiveDatabaseStagedDir } from './chunk.js';
import { liveDatabaseTablePath, writeLiveDatabaseSnapshot } from './stage.js';
/**
 * Test fixture: a postgres schema snapshot with two tables in the `public`
 * schema, `orders` and `customers`, each having a single integer primary key `id`.
 */
function snapshot(): KtxSchemaSnapshot {
  // Both tables share the same shape and differ only by name; every call builds
  // fresh objects so the tables never alias each other's arrays.
  const publicTable = (name: string): KtxSchemaSnapshot['tables'][number] => ({
    name,
    catalog: null,
    db: 'public',
    kind: 'table',
    comment: null,
    estimatedRows: null,
    columns: [
      {
        name: 'id',
        nativeType: 'integer',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey: true,
        comment: null,
      },
    ],
    foreignKeys: [],
  });

  return {
    connectionId: 'conn-1',
    driver: 'postgres',
    extractedAt: '2026-04-27T00:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: {},
    tables: [publicTable('orders'), publicTable('customers')],
  };
}
// Covers how staged live-database snapshots are split into work units, both on a
// first run and when a diff of changed/deleted raw files is supplied.
describe('chunkLiveDatabaseStagedDir', () => {
  // Staged file path of a table in the fixture's `public` schema.
  const publicTablePath = (name: string): string => liveDatabaseTablePath({ catalog: null, db: 'public', name });

  // Creates a fresh temp directory with the given prefix and stages the fixture into it.
  const stageFixture = async (prefix: string): Promise<string> => {
    const stagedDir = await mkdtemp(join(tmpdir(), prefix));
    await writeLiveDatabaseSnapshot(stagedDir, snapshot());
    return stagedDir;
  };

  it('emits one work unit per table on the first run', async () => {
    const stagedDir = await stageFixture('ktx-live-db-chunk-');

    const chunked = await chunkLiveDatabaseStagedDir(stagedDir);

    const unitKeys = chunked.workUnits.map((wu) => wu.unitKey);
    expect(unitKeys).toEqual(['live-database-public-customers', 'live-database-public-orders']);
    const firstUnit = chunked.workUnits[0];
    expect(firstUnit?.dependencyPaths).toEqual(['connection.json', 'foreign-keys.json']);
    expect(firstUnit?.peerFileIndex).toContain(publicTablePath('orders'));
  });

  it('keeps only changed tables during incremental syncs and records table evictions', async () => {
    const stagedDir = await stageFixture('ktx-live-db-diff-');
    const ordersPath = publicTablePath('orders');
    const customersPath = publicTablePath('customers');

    const chunked = await chunkLiveDatabaseStagedDir(stagedDir, {
      added: [],
      modified: [ordersPath],
      deleted: [customersPath],
      unchanged: ['connection.json', 'foreign-keys.json'],
    });

    const unitKeys = chunked.workUnits.map((wu) => wu.unitKey);
    expect(unitKeys).toEqual(['live-database-public-orders']);
    expect(chunked.eviction?.deletedRawPaths).toEqual([customersPath]);
  });

  it('fans out all table work units when the foreign-key index changes', async () => {
    const stagedDir = await stageFixture('ktx-live-db-fk-');

    const chunked = await chunkLiveDatabaseStagedDir(stagedDir, {
      added: [],
      modified: ['foreign-keys.json'],
      deleted: [],
      unchanged: [],
    });

    expect(chunked.workUnits).toHaveLength(2);
  });
});

Some files were not shown because too many files have changed in this diff Show more