Initial open-source release

This commit is contained in:
Andrey Avtomonov 2026-05-10 23:12:26 +02:00
commit 1a42152e6f
1199 changed files with 257054 additions and 0 deletions

View file

@ -0,0 +1,330 @@
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
// Replace the `ai` module with lightweight stubs: `generateText` becomes a spy that each test
// programs, while `stepCountIs` and `tool` hand their argument straight back so assertions can
// inspect the raw step budget and tool definitions.
vi.mock('ai', () => ({
generateText: vi.fn(),
stepCountIs: (n: number) => n,
tool: (def: unknown) => def,
}));
import { generateText } from 'ai';
import { AgentRunnerService, type RunLoopStepInfo } from './agent-runner.service.js';
/**
 * Behavioral tests for `AgentRunnerService.runLoop`, run against the mocked `ai` module.
 *
 * Shared fixtures (`loopParams`, `createRunnerWithTelemetry`, `firstGenerateTextOptions`) replace
 * setup that used to be copy-pasted into individual tests.
 */
describe('AgentRunnerService.runLoop', () => {
  type RunLoopInput = Parameters<AgentRunnerService['runLoop']>[0];

  let runner: AgentRunnerService;

  const llmProvider = {
    getModel: vi.fn().mockReturnValue({ modelId: 'claude-sonnet-4-6', provider: 'anthropic' }),
    getModelByName: vi.fn(),
    cacheMarker: vi.fn(),
    repairToolCallHandler: vi.fn(),
    thinkingProviderOptions: vi.fn(),
    telemetryConfig: vi.fn(),
    promptCachingConfig: vi.fn(() => ({
      enabled: false,
      systemTtl: '1h',
      toolsTtl: '1h',
      historyTtl: '5m',
      cacheSystem: true,
      cacheTools: true,
      cacheHistory: true,
      vertexFallbackTo5m: false,
    })),
    activeBackend: vi.fn(() => 'anthropic'),
  };

  // `ai` is mocked at the top of this file, so `generateText` is a vitest spy at runtime.
  const generateTextMock = generateText as any;

  /** Minimal successful `generateText` result used by tests that do not care about the payload. */
  const okResult = () => ({ text: 'ok', toolCalls: [], steps: [] });

  /** Baseline `runLoop` params; each test overrides only what it exercises. */
  function loopParams(overrides: Partial<RunLoopInput> = {}): RunLoopInput {
    return {
      modelRole: 'candidateExtraction',
      systemPrompt: '',
      userPrompt: '',
      toolSet: {},
      stepBudget: 10,
      telemetryTags: {},
      ...overrides,
    };
  }

  /** Runner whose telemetry port copies the tags into `metadata`, defaulting `source` to RESEARCH. */
  function createRunnerWithTelemetry(): AgentRunnerService {
    return new AgentRunnerService({
      llmProvider: llmProvider as any,
      telemetry: {
        createTelemetry: (tags) => ({
          isEnabled: true,
          metadata: {
            source: tags.source ?? 'RESEARCH',
            jobId: tags.jobId,
            unitKey: tags.unitKey,
          },
        }),
      },
    });
  }

  /** Options object passed to the first `generateText` invocation of the current test. */
  const firstGenerateTextOptions = () => generateTextMock.mock.calls[0][0];

  beforeEach(() => {
    vi.clearAllMocks();
    runner = new AgentRunnerService({ llmProvider: llmProvider as any });
  });

  afterEach(() => vi.clearAllMocks());

  it('passes systemPrompt, userPrompt, tools, and step budget through to generateText', async () => {
    generateTextMock.mockResolvedValue(okResult());
    const tools = { noop: { description: 'noop', inputSchema: {}, execute: vi.fn() } };

    await runner.runLoop(
      loopParams({
        systemPrompt: 'SYS',
        userPrompt: 'USR',
        toolSet: tools as any,
        stepBudget: 17,
        telemetryTags: { source: 'test' },
      }),
    );

    const call = firstGenerateTextOptions();
    expect(call.messages).toEqual([
      { role: 'system', content: 'SYS' },
      { role: 'user', content: 'USR' },
    ]);
    expect(call.system).toBeUndefined();
    expect(call.prompt).toBeUndefined();
    expect(call.tools).toEqual(tools);
    // The mocked `stepCountIs` returns its argument, so `stopWhen` is the raw budget.
    expect(call.stopWhen).toBe(17);
    expect(call.temperature).toBe(0);
    expect(llmProvider.getModel).toHaveBeenCalledWith('candidateExtraction');
  });

  it('returns stopReason=natural when the loop completes without error', async () => {
    generateTextMock.mockResolvedValue({ text: 'done', toolCalls: [], steps: [] });

    const result = await runner.runLoop(loopParams({ systemPrompt: 'system', userPrompt: 'user' }));

    expect(result.stopReason).toBe('natural');
    expect(result.error).toBeUndefined();
    expect(llmProvider.getModel).toHaveBeenCalledWith('candidateExtraction');
    expect(generateText).toHaveBeenCalledWith(
      expect.objectContaining({
        messages: [
          { role: 'system', content: 'system' },
          { role: 'user', content: 'user' },
        ],
      }),
    );
  });

  it('returns stopReason=error with the error on generateText failure', async () => {
    const err = new Error('LLM unavailable');
    generateTextMock.mockRejectedValue(err);

    const result = await runner.runLoop(loopParams());

    expect(result.stopReason).toBe('error');
    expect(result.error).toBe(err);
  });

  it('invokes caller onStepFinish with incrementing stepIndex and total budget', async () => {
    const calls: RunLoopStepInfo[] = [];
    // Simulate three finished steps by driving the runner-supplied callback directly.
    generateTextMock.mockImplementation(async (opts: any) => {
      for (let i = 0; i < 3; i++) {
        await opts.onStepFinish({});
      }
      return okResult();
    });

    await runner.runLoop(
      loopParams({
        onStepFinish: (info) => {
          calls.push(info);
        },
      }),
    );

    expect(calls).toEqual([
      { stepIndex: 1, stepBudget: 10 },
      { stepIndex: 2, stepBudget: 10 },
      { stepIndex: 3, stepBudget: 10 },
    ]);
  });

  it('swallows errors thrown from caller onStepFinish without aborting the loop', async () => {
    generateTextMock.mockImplementation(async (opts: any) => {
      await opts.onStepFinish({});
      return okResult();
    });

    const result = await runner.runLoop(
      loopParams({
        onStepFinish: () => {
          throw new Error('boom');
        },
      }),
    );

    expect(result.stopReason).toBe('natural');
  });

  it('forwards telemetryTags.source through experimental_telemetry metadata', async () => {
    generateTextMock.mockResolvedValue(okResult());

    await createRunnerWithTelemetry().runLoop(
      loopParams({ telemetryTags: { source: 'metabase', jobId: 'job-123', unitKey: 'u/1' } }),
    );

    expect(firstGenerateTextOptions().experimental_telemetry.metadata.source).toBe('metabase');
  });

  it('defaults to source=RESEARCH when telemetryTags omits source', async () => {
    generateTextMock.mockResolvedValue(okResult());

    await createRunnerWithTelemetry().runLoop(
      loopParams({ telemetryTags: { operationName: 'memory-agent-ingest' } }),
    );

    expect(firstGenerateTextOptions().experimental_telemetry.metadata.source).toBe('RESEARCH');
  });

  it('forwards jobId and unitKey through experimental_telemetry metadata', async () => {
    generateTextMock.mockResolvedValue(okResult());

    await createRunnerWithTelemetry().runLoop(
      loopParams({ telemetryTags: { source: 'metabase', jobId: 'job-777', unitKey: 'sources/users' } }),
    );

    const call = firstGenerateTextOptions();
    expect(call.experimental_telemetry.metadata.jobId).toBe('job-777');
    expect(call.experimental_telemetry.metadata.unitKey).toBe('sources/users');
  });

  it('records a sanitized LLM debug request when a recorder is injected', async () => {
    generateTextMock.mockResolvedValue(okResult());
    const record = vi.fn();
    // Enable prompt caching so the builder attaches provider options that the summary reports.
    const provider = {
      ...llmProvider,
      cacheMarker: vi.fn((ttl: '5m' | '1h') => ({
        anthropic: { cacheControl: { type: 'ephemeral' as const, ttl } },
      })),
      promptCachingConfig: vi.fn(() => ({
        enabled: true,
        systemTtl: '1h',
        toolsTtl: '1h',
        historyTtl: '5m',
        cacheSystem: true,
        cacheTools: true,
        cacheHistory: true,
        vertexFallbackTo5m: false,
      })),
    };
    const runnerWithDebug = new AgentRunnerService({
      llmProvider: provider as any,
      debugRequestRecorder: { record },
    });

    await runnerWithDebug.runLoop(
      loopParams({
        systemPrompt: 'SECRET SYSTEM PROMPT',
        userPrompt: 'SECRET USER PROMPT',
        toolSet: {
          emit_candidate: {
            description: 'SECRET TOOL DESCRIPTION',
            inputSchema: {},
            execute: vi.fn(),
          } as any,
        },
        telemetryTags: { operationName: 'ingest-bundle-wu', source: 'metabase', jobId: 'job-1', unitKey: 'cards/1' },
      }),
    );

    expect(record).toHaveBeenCalledTimes(1);
    expect(record).toHaveBeenCalledWith(
      expect.objectContaining({
        operationName: 'ingest-bundle-wu',
        source: 'metabase',
        jobId: 'job-1',
        unitKey: 'cards/1',
        modelRole: 'candidateExtraction',
        modelId: 'claude-sonnet-4-6',
        messageCount: 2,
        toolNames: ['emit_candidate'],
      }),
    );

    const providerOptions = record.mock.calls[0][0].providerOptions;
    expect(providerOptions).toEqual(
      expect.arrayContaining([
        expect.objectContaining({ target: 'message', index: 0, role: 'system' }),
        expect.objectContaining({ target: 'message-part', index: 1, role: 'user', partIndex: 0 }),
        expect.objectContaining({ target: 'tool', name: 'emit_candidate' }),
      ]),
    );
    expect(providerOptions).toHaveLength(3);

    // The recorded summary must not leak prompt or tool text.
    const serialized = JSON.stringify(record.mock.calls[0][0]);
    expect(serialized).not.toContain('SECRET SYSTEM PROMPT');
    expect(serialized).not.toContain('SECRET USER PROMPT');
    expect(serialized).not.toContain('SECRET TOOL DESCRIPTION');
  });
});

View file

@ -0,0 +1,101 @@
import { KloMessageBuilder, type KloLlmProvider, type KloModelRole } from '@klo/llm';
import { generateText, stepCountIs, type TelemetrySettings, type Tool } from 'ai';
import { noopLogger, type KloLogger } from '../core/index.js';
import { summarizeKloLlmDebugRequest, type KloLlmDebugRequestRecorder } from '../llm/index.js';
/** Reason a run loop ended, as reported in `RunLoopResult.stopReason`. */
export type RunLoopStopReason = 'budget' | 'natural' | 'error';
/** Progress snapshot handed to the caller's `onStepFinish` callback. */
export interface RunLoopStepInfo {
/** Number of steps finished so far; the first callback receives 1. */
stepIndex: number;
/** The `stepBudget` the loop was started with. */
stepBudget: number;
}
/** Inputs for a single `AgentRunnerService.runLoop` invocation. */
export interface RunLoopParams {
/** Role used to look up the model via `llmProvider.getModel`. */
modelRole: KloModelRole;
/** Text sent as the system message. */
systemPrompt: string;
/** Text sent as the single user message. */
userPrompt: string;
/** Tools made available to the model, keyed by tool name. */
toolSet: Record<string, Tool>;
/** Step limit handed to `stepCountIs` as the loop's stop condition. */
stepBudget: number;
/**
 * Free-form tags forwarded to the telemetry port. The keys `operationName`, `source`, `jobId`
 * and `unitKey` are additionally read when summarizing the request for the debug recorder.
 */
telemetryTags: Record<string, string>;
/** Optional per-step callback; errors it throws are logged and ignored by the runner. */
onStepFinish?: (info: RunLoopStepInfo) => void | Promise<void>;
}
/** Outcome of a run loop; `error` is populated when `stopReason` is `'error'`. */
export interface RunLoopResult {
stopReason: RunLoopStopReason;
error?: Error;
}
/** Port that turns telemetry tags into the `ai` SDK's telemetry settings. */
export interface AgentTelemetryPort {
createTelemetry(tags: Record<string, string>): TelemetrySettings;
}
/** Collaborators injected into `AgentRunnerService`; everything except `llmProvider` is optional. */
export interface AgentRunnerServiceDeps {
llmProvider: KloLlmProvider;
telemetry?: AgentTelemetryPort;
debugRequestRecorder?: KloLlmDebugRequestRecorder;
/** Defaults to `noopLogger` when omitted. */
logger?: KloLogger;
}
/**
 * Drives one tool-using LLM loop through the `ai` SDK's `generateText`, bounded by a step budget.
 *
 * Failures are reported through the returned `RunLoopResult` instead of being thrown.
 */
export class AgentRunnerService {
  private readonly logger: KloLogger;

  /**
   * @param deps Collaborators for the runner; `logger` falls back to `noopLogger` when omitted.
   */
  constructor(private readonly deps: AgentRunnerServiceDeps) {
    this.logger = deps.logger ?? noopLogger;
  }

  /**
   * Builds the request, optionally records a debug summary of it, and runs the loop.
   *
   * @param params Model role, prompts, tools, step budget, telemetry tags and optional step callback.
   * @returns `{ stopReason: 'budget' }` when the loop used every budgeted step and the model was
   *   still requesting tool calls, `{ stopReason: 'natural' }` for any other successful finish, and
   *   `{ stopReason: 'error', error }` when building the request or `generateText` throws.
   */
  async runLoop(params: RunLoopParams): Promise<RunLoopResult> {
    let stepIndex = 0;
    try {
      const model = this.deps.llmProvider.getModel(params.modelRole);
      const builder = new KloMessageBuilder(this.deps.llmProvider);
      const built = builder.wrapSimple({
        system: params.systemPrompt,
        messages: [{ role: 'user', content: params.userPrompt }],
        tools: params.toolSet,
        model,
      });

      // Debug recording is diagnostics only: a failing recorder must not abort the run, in the
      // same way that a throwing `onStepFinish` callback does not.
      try {
        await this.deps.debugRequestRecorder?.record(
          summarizeKloLlmDebugRequest({
            operationName: params.telemetryTags.operationName ?? 'klo-agent-runner',
            source: params.telemetryTags.source,
            jobId: params.telemetryTags.jobId,
            unitKey: params.telemetryTags.unitKey,
            modelRole: params.modelRole,
            modelId: (model as { modelId?: string }).modelId ?? params.modelRole,
            messages: built.messages,
            tools: built.tools as Record<string, { providerOptions?: unknown }>,
          }),
        );
      } catch (err) {
        this.logger.warn(`[agent-runner] debug request recorder threw; ignoring: ${this.describeError(err)}`);
      }

      const result = await generateText({
        model,
        temperature: 0,
        stopWhen: stepCountIs(params.stepBudget),
        experimental_telemetry: this.deps.telemetry?.createTelemetry(params.telemetryTags),
        messages: built.messages,
        tools: built.tools as Record<string, Tool>,
        onStepFinish: async () => {
          stepIndex += 1;
          if (!params.onStepFinish) {
            return;
          }
          try {
            await params.onStepFinish({ stepIndex, stepBudget: params.stepBudget });
          } catch (err) {
            this.logger.warn(`[agent-runner] onStepFinish callback threw; ignoring: ${this.describeError(err)}`);
          }
        },
      });

      // The budget cut the loop short only if every step was used AND the final step still asked
      // for tool calls; a final step that ended on its own is a natural stop even on the last step.
      const exhaustedBudget = stepIndex >= params.stepBudget && result.finishReason === 'tool-calls';
      return { stopReason: exhaustedBudget ? 'budget' : 'natural' };
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      this.logger.warn(`[agent-runner] loop failed: ${err.message}`);
      return { stopReason: 'error', error: err };
    }
  }

  /** Renders a thrown value as a log-friendly message. */
  private describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }
}

View file

@ -0,0 +1,9 @@
export type {
AgentRunnerServiceDeps,
AgentTelemetryPort,
RunLoopParams,
RunLoopResult,
RunLoopStepInfo,
RunLoopStopReason,
} from './agent-runner.service.js';
export { AgentRunnerService } from './agent-runner.service.js';

View file

@ -0,0 +1,28 @@
import { z } from 'zod';
/** Zod enum of every connection type identifier this schema accepts (all upper-case). */
export const connectionTypeSchema = z.enum([
'POSTGRESQL',
'SQLITE',
'SQLSERVER',
'BIGQUERY',
'SNOWFLAKE',
'CENTRALREACH',
'EPIC',
'CERNER',
'ATHENA',
'QUICKBOOKS',
'WORKDAY',
'REST',
'S3',
'SLACK',
'METABASE',
'LOOKER',
'NOTION',
'POSTHOG',
'MYSQL',
'CLICKHOUSE',
'PLAIN',
'BETTERSTACK',
]);
/** Union of the string literals accepted by `connectionTypeSchema`. */
export type ConnectionType = z.infer<typeof connectionTypeSchema>;

View file

@ -0,0 +1,27 @@
export type {
KloSqlQueryExecutionInput,
KloSqlQueryExecutionResult,
KloSqlQueryExecutorPort,
} from './query-executor.js';
export { createDefaultLocalQueryExecutor, type DefaultLocalQueryExecutorOptions } from './local-query-executor.js';
export { normalizeQueryRows } from './query-executor.js';
export { createPostgresQueryExecutor } from './postgres-query-executor.js';
export { assertReadOnlySql, limitSqlForExecution } from './read-only-sql.js';
export { createSqliteQueryExecutor, sqliteDatabasePathFromConnection } from './sqlite-query-executor.js';
export { connectionTypeSchema, type ConnectionType } from './connection-type.js';
export {
localConnectionInfoFromConfig,
localConnectionToWarehouseDescriptor,
localConnectionTypeForConfig,
type LocalConnectionInfo,
type LocalWarehouseDescriptor,
} from './local-warehouse-descriptor.js';
export {
KLO_NOTION_ORG_KNOWLEDGE_WARNING,
notionConnectionToPullConfig,
parseNotionConnectionConfig,
redactNotionConnectionConfig,
resolveNotionAuthToken,
type KloNotionConnectionConfig,
type RedactedKloNotionConnectionConfig,
} from './notion-config.js';

View file

@ -0,0 +1,59 @@
import { describe, expect, it, vi } from 'vitest';
import { createDefaultLocalQueryExecutor } from './local-query-executor.js';
// Verifies that the default local executor routes by `connection.driver` and rejects drivers it
// has no executor for. Both backing executors are injected as stubs.
describe('createDefaultLocalQueryExecutor', () => {
it('dispatches postgres and sqlite drivers to their executors', async () => {
// Each stub returns a distinct header so the assertion can tell which executor handled the call.
const postgres = {
execute: vi.fn(async () => ({
headers: ['pg'],
rows: [[1]],
totalRows: 1,
command: 'SELECT',
rowCount: 1,
})),
};
const sqlite = {
execute: vi.fn(async () => ({
headers: ['sqlite'],
rows: [[2]],
totalRows: 1,
command: 'SELECT',
rowCount: 1,
})),
};
const executor = createDefaultLocalQueryExecutor({ postgres, sqlite });
await expect(
executor.execute({
connectionId: 'pg',
connection: { driver: 'postgres', readonly: true },
sql: 'select 1',
}),
).resolves.toMatchObject({ headers: ['pg'] });
await expect(
executor.execute({
connectionId: 'local',
connection: { driver: 'sqlite', readonly: true },
sql: 'select 1',
}),
).resolves.toMatchObject({ headers: ['sqlite'] });
// Exactly one call each: neither request leaked to the other executor.
expect(postgres.execute).toHaveBeenCalledTimes(1);
expect(sqlite.execute).toHaveBeenCalledTimes(1);
});
it('rejects unsupported local execution drivers', async () => {
const executor = createDefaultLocalQueryExecutor({
postgres: { execute: vi.fn() },
sqlite: { execute: vi.fn() },
});
// The error message quotes the driver exactly as it was configured.
await expect(
executor.execute({
connectionId: 'warehouse',
connection: { driver: 'snowflake', readonly: true },
sql: 'select 1',
}),
).rejects.toThrow('No local query executor is configured for driver "snowflake".');
});
});

View file

@ -0,0 +1,34 @@
import { createPostgresQueryExecutor } from './postgres-query-executor.js';
import type {
KloSqlQueryExecutionInput,
KloSqlQueryExecutionResult,
KloSqlQueryExecutorPort,
} from './query-executor.js';
import { createSqliteQueryExecutor } from './sqlite-query-executor.js';
/**
 * Optional executor overrides for `createDefaultLocalQueryExecutor`; an omitted entry is
 * replaced by the corresponding default factory (`createPostgresQueryExecutor` /
 * `createSqliteQueryExecutor`).
 */
export interface DefaultLocalQueryExecutorOptions {
postgres?: KloSqlQueryExecutorPort;
sqlite?: KloSqlQueryExecutorPort;
}
/**
 * Extracts the connection's driver as a lower-case string for dispatching.
 *
 * @returns `''` when the connection or its driver is missing.
 */
function driverFor(input: KloSqlQueryExecutionInput): string {
  const rawDriver = input.connection?.driver ?? '';
  const driverText = String(rawDriver);
  return driverText.toLowerCase();
}
/**
 * Creates an executor that routes each query to the Postgres or SQLite executor based on the
 * connection's driver name (case-insensitive).
 *
 * @param options Executor overrides; defaults are created for whichever is not supplied.
 * @returns An executor whose `execute` rejects for any driver other than
 *   `postgres`/`postgresql`/`sqlite`/`sqlite3`.
 */
export function createDefaultLocalQueryExecutor(options: DefaultLocalQueryExecutorOptions = {}): KloSqlQueryExecutorPort {
  const postgresExecutor = options.postgres ?? createPostgresQueryExecutor();
  const sqliteExecutor = options.sqlite ?? createSqliteQueryExecutor();

  const execute = async (input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult> => {
    switch (driverFor(input)) {
      case 'postgres':
      case 'postgresql':
        return postgresExecutor.execute(input);
      case 'sqlite':
      case 'sqlite3':
        return sqliteExecutor.execute(input);
      default:
        // Report the driver as configured (not lower-cased) so the message matches the config.
        throw new Error(`No local query executor is configured for driver "${input.connection?.driver ?? 'unknown'}".`);
    }
  };

  return { execute };
}

View file

@ -0,0 +1,63 @@
import { describe, expect, it } from 'vitest';
import {
localConnectionInfoFromConfig,
localConnectionToWarehouseDescriptor,
localConnectionTypeForConfig,
} from './local-warehouse-descriptor.js';
// Covers how local connection configs are translated into warehouse descriptors.
describe('localConnectionToWarehouseDescriptor', () => {
it('maps local Postgres URLs to canonical warehouse descriptors', () => {
// Host and database are taken from the URL because no explicit fields are configured.
expect(
localConnectionToWarehouseDescriptor('warehouse', {
driver: 'postgres',
url: 'postgresql://readonly@db.example.test/analytics',
}),
).toMatchObject({
id: 'warehouse',
connection_type: 'POSTGRESQL',
host: 'db.example.test',
database: 'analytics',
});
});
it('maps BigQuery project and dataset from explicit fields', () => {
expect(
localConnectionToWarehouseDescriptor('bq', {
driver: 'bigquery',
project_id: 'acme',
dataset_id: 'warehouse',
}),
).toMatchObject({
id: 'bq',
connection_type: 'BIGQUERY',
project_id: 'acme',
dataset_id: 'warehouse',
});
});
it('returns null for non-warehouse adapters', () => {
// `looker` has no entry in the driver-to-connection-type table.
expect(localConnectionToWarehouseDescriptor('looker', { driver: 'looker' })).toBeNull();
});
});
// Covers the display-oriented helpers built on top of the warehouse descriptor mapping.
describe('local connection info helpers', () => {
it('returns canonical warehouse connection types for local catalogs', () => {
expect(localConnectionTypeForConfig('warehouse', { driver: 'postgres' })).toBe('POSTGRESQL');
expect(localConnectionTypeForConfig('bq', { driver: 'bigquery', project_id: 'acme' })).toBe('BIGQUERY');
expect(localConnectionTypeForConfig('snowflake', { driver: 'snowflake' })).toBe('SNOWFLAKE');
});
it('keeps non-warehouse adapter labels for display-only local connection surfaces', () => {
// Unmapped drivers are echoed back as-is; a missing driver becomes 'unknown'.
expect(localConnectionTypeForConfig('prod-metabase', { driver: 'metabase' })).toBe('metabase');
expect(localConnectionTypeForConfig('missing-driver', {} as never)).toBe('unknown');
});
it('builds nullable local connection info records', () => {
// The connection id doubles as the display name.
expect(localConnectionInfoFromConfig('warehouse', { driver: 'postgres' })).toEqual({
id: 'warehouse',
name: 'warehouse',
connectionType: 'POSTGRESQL',
});
expect(localConnectionInfoFromConfig('missing', undefined)).toBeNull();
});
});

View file

@ -0,0 +1,102 @@
import type { KloProjectConnectionConfig } from '../project/config.js';
import type { ConnectionType } from './connection-type.js';
/** Warehouse-oriented view of a local connection, produced by `localConnectionToWarehouseDescriptor`. */
export interface LocalWarehouseDescriptor {
/** Connection id as passed by the caller. */
id: string;
connection_type: ConnectionType;
host?: string | null;
database?: string | null;
account?: string | null;
project_id?: string | null;
dataset_id?: string | null;
/** Shallow copy of the original connection config. */
connection_params: Record<string, unknown>;
}
/** Minimal connection summary; `name` is set to the connection id. */
export interface LocalConnectionInfo {
id: string;
name: string;
/** A `ConnectionType` for warehouse drivers, otherwise the raw driver label or `'unknown'`. */
connectionType: string;
}
// Lower-case driver name -> canonical connection type. Drivers without an entry here are not
// treated as warehouses and yield no descriptor.
const DRIVER_TO_CONNECTION_TYPE: Record<string, ConnectionType> = {
postgres: 'POSTGRESQL',
postgresql: 'POSTGRESQL',
sqlite: 'SQLITE',
sqlserver: 'SQLSERVER',
mssql: 'SQLSERVER',
mysql: 'MYSQL',
clickhouse: 'CLICKHOUSE',
snowflake: 'SNOWFLAKE',
bigquery: 'BIGQUERY',
};
/**
 * Translates a local connection config into a warehouse descriptor.
 *
 * Explicit config fields (`host`, `database`, `account`, `project_id`, `dataset_id`) win over
 * values parsed from `url`. URLs starting with `env:` or `file:` are references, not addresses,
 * and are never parsed.
 *
 * @param id Connection id copied onto the descriptor.
 * @param connection Connection config, or `undefined` when the id is not configured.
 * @returns The descriptor, or `null` when the connection is missing or its driver is not a
 *   known warehouse driver.
 */
export function localConnectionToWarehouseDescriptor(
  id: string,
  connection: KloProjectConnectionConfig | undefined,
): LocalWarehouseDescriptor | null {
  if (!connection) {
    return null;
  }

  // Own-property lookup only: a driver such as "constructor" or "toString" must not resolve to
  // a member inherited from Object.prototype.
  const driverKey = String(connection.driver ?? '').toLowerCase();
  const connectionType = Object.prototype.hasOwnProperty.call(DRIVER_TO_CONNECTION_TYPE, driverKey)
    ? DRIVER_TO_CONNECTION_TYPE[driverKey]
    : undefined;
  if (!connectionType) {
    return null;
  }

  const info: LocalWarehouseDescriptor = {
    id,
    connection_type: connectionType,
    connection_params: { ...connection },
  };

  const url = typeof connection.url === 'string' ? connection.url : null;
  if (url && !url.startsWith('env:') && !url.startsWith('file:')) {
    try {
      const parsed = new URL(url);
      info.host = parsed.hostname || null;
      if (parsed.pathname.length > 1) {
        const [first, second] = parsed.pathname.slice(1).split('/');
        if (connectionType === 'BIGQUERY') {
          // `hostname` is '' (never null) when the URL has no authority, so go through
          // `stringField` to let the first path segment act as the fallback.
          info.project_id =
            stringField(connection.project_id) ?? stringField(parsed.hostname) ?? stringField(first) ?? null;
          info.dataset_id = stringField(connection.dataset_id) ?? stringField(second) ?? null;
        } else {
          // An empty leading segment (e.g. "//db") is reported as "no database", not ''.
          info.database = stringField(first) ?? null;
        }
      }
    } catch {
      // Unparseable URL: rely solely on the explicit fields.
      info.host = stringField(connection.host);
    }
  }

  info.host = stringField(connection.host) ?? info.host ?? null;
  info.database = stringField(connection.database) ?? info.database ?? null;
  info.account = stringField(connection.account) ?? null;
  info.project_id = stringField(connection.project_id) ?? info.project_id ?? null;
  info.dataset_id = stringField(connection.dataset_id) ?? info.dataset_id ?? null;
  return info;
}
/**
 * Resolves the connection type label for a local connection.
 *
 * @returns The canonical connection type for warehouse drivers; otherwise the trimmed driver
 *   string, or `'unknown'` when the driver is absent, not a string, or blank.
 */
export function localConnectionTypeForConfig(id: string, connection: KloProjectConnectionConfig | undefined): string {
  const warehouse = localConnectionToWarehouseDescriptor(id, connection);
  if (warehouse !== null) {
    return warehouse.connection_type;
  }

  const rawDriver = connection?.driver;
  if (typeof rawDriver !== 'string') {
    return 'unknown';
  }
  const label = rawDriver.trim();
  return label === '' ? 'unknown' : label;
}
/**
 * Builds the summary record for a local connection, using the id as its name.
 *
 * @returns `null` when no connection config is supplied.
 */
export function localConnectionInfoFromConfig(
  id: string,
  connection: KloProjectConnectionConfig | undefined,
): LocalConnectionInfo | null {
  return connection
    ? { id, name: id, connectionType: localConnectionTypeForConfig(id, connection) }
    : null;
}
/** Returns the trimmed string, or `null` when the value is not a string or is blank. */
function stringField(value: unknown): string | null {
  if (typeof value !== 'string') {
    return null;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}

View file

@ -0,0 +1,120 @@
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import {
notionConnectionToPullConfig,
parseNotionConnectionConfig,
redactNotionConnectionConfig,
resolveNotionAuthToken,
} from './notion-config.js';
// Exercises parsing, redaction, token resolution and pull-config conversion for Notion
// connections. A fresh temp directory per test hosts the token file used by the file: case.
describe('standalone Notion connection config', () => {
let tempDir: string;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'klo-notion-config-'));
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('parses selected-root Notion config with safe defaults', () => {
const parsed = parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'env:NOTION_AUTH_TOKEN',
crawl_mode: 'selected_roots',
root_page_ids: ['page-1'],
});
// Omitted list fields become empty arrays and omitted limits take their defaults.
expect(parsed).toEqual({
driver: 'notion',
auth_token_ref: 'env:NOTION_AUTH_TOKEN',
crawl_mode: 'selected_roots',
root_page_ids: ['page-1'],
root_database_ids: [],
root_data_source_ids: [],
max_pages_per_run: 1000,
max_knowledge_creates_per_run: 5,
max_knowledge_updates_per_run: 20,
last_successful_cursor: null,
});
});
it('redacts token references from display output', () => {
// The redacted view exposes only whether a token reference exists, never the reference itself.
expect(
redactNotionConnectionConfig(
parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'file:/Users/example/.config/notion-token',
crawl_mode: 'all_accessible',
max_pages_per_run: 80,
}),
),
).toEqual({
driver: 'notion',
hasAuthToken: true,
crawlMode: 'all_accessible',
rootPageIds: [],
rootDatabaseIds: [],
rootDataSourceIds: [],
maxPagesPerRun: 80,
maxKnowledgeCreatesPerRun: 5,
maxKnowledgeUpdatesPerRun: 20,
warning: 'Anything accessible to this Notion integration can become organization knowledge.',
});
});
it('requires at least one selected root in selected_roots mode', () => {
expect(() =>
parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'env:NOTION_AUTH_TOKEN',
crawl_mode: 'selected_roots',
}),
).toThrow('selected_roots requires at least one root page, database, or data source id');
});
it('resolves env and file token references without exposing the reference in errors', async () => {
// The file content ends with a newline to confirm that the resolved token is trimmed.
const tokenPath = join(tempDir, 'notion-token.txt');
await writeFile(tokenPath, 'ntn_file_token\n', 'utf-8');
await expect(
resolveNotionAuthToken('env:NOTION_AUTH_TOKEN', {
env: { NOTION_AUTH_TOKEN: 'ntn_env_token' },
}),
).resolves.toBe('ntn_env_token');
await expect(resolveNotionAuthToken(`file:${tokenPath}`)).resolves.toBe('ntn_file_token');
await expect(resolveNotionAuthToken('env:MISSING_NOTION_TOKEN', { env: {} })).rejects.toThrow(
'Notion token environment variable MISSING_NOTION_TOKEN is not set',
);
});
it('converts standalone config into adapter pull config', async () => {
// snake_case connection fields map onto the camelCase pull config, with the token resolved.
const pullConfig = await notionConnectionToPullConfig(
parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'env:NOTION_AUTH_TOKEN',
crawl_mode: 'all_accessible',
max_pages_per_run: 12,
max_knowledge_creates_per_run: 2,
max_knowledge_updates_per_run: 7,
last_successful_cursor: '{"phase":"all_accessible_pages","cursor":"cursor-1"}',
}),
{ env: { NOTION_AUTH_TOKEN: 'ntn_env_token' } },
);
expect(pullConfig).toEqual({
authToken: 'ntn_env_token',
crawlMode: 'all_accessible',
rootPageIds: [],
rootDatabaseIds: [],
rootDataSourceIds: [],
maxPagesPerRun: 12,
maxKnowledgeCreatesPerRun: 2,
maxKnowledgeUpdatesPerRun: 7,
lastSuccessfulCursor: '{"phase":"all_accessible_pages","cursor":"cursor-1"}',
});
});
});

View file

@ -0,0 +1,196 @@
import { readFile } from 'node:fs/promises';
import { homedir } from 'node:os';
import { resolve } from 'node:path';
import { type NotionPullConfig, notionPullConfigSchema } from '../ingest/adapters/notion/types.js';
import type { KloProjectConnectionConfig } from '../project/config.js';
/** Warning text attached to every redacted Notion connection config. */
export const KLO_NOTION_ORG_KNOWLEDGE_WARNING =
'Anything accessible to this Notion integration can become organization knowledge.';
/** Crawl scopes accepted by `parseNotionConnectionConfig`. */
type KloNotionCrawlMode = 'all_accessible' | 'selected_roots';
/** Validated Notion connection config as returned by `parseNotionConnectionConfig`. */
export interface KloNotionConnectionConfig extends KloProjectConnectionConfig {
driver: 'notion';
/** Reference to the token, in `env:NAME` or `file:/path` form — never the token itself. */
auth_token_ref: string;
/** Defaults to `selected_roots` when not configured. */
crawl_mode: KloNotionCrawlMode;
root_page_ids: string[];
root_database_ids: string[];
root_data_source_ids: string[];
/** Integer in 1..10000; defaults to 1000. */
max_pages_per_run: number;
/** Integer in 0..25; defaults to 5. */
max_knowledge_creates_per_run: number;
/** Integer in 0..100; defaults to 20. */
max_knowledge_updates_per_run: number;
/** `null` when no cursor is configured. */
last_successful_cursor: string | null;
}
/** Display-safe projection of a Notion config: reports only whether a token reference exists. */
export interface RedactedKloNotionConnectionConfig {
driver: 'notion';
hasAuthToken: boolean;
crawlMode: KloNotionCrawlMode;
rootPageIds: string[];
rootDatabaseIds: string[];
rootDataSourceIds: string[];
maxPagesPerRun: number;
maxKnowledgeCreatesPerRun: number;
maxKnowledgeUpdatesPerRun: number;
warning: typeof KLO_NOTION_ORG_KNOWLEDGE_WARNING;
}
/** Overrides for token resolution. */
interface ResolveNotionTokenOptions {
/** Environment to read `env:` references from; `process.env` is used when omitted. */
env?: Record<string, string | undefined>;
/** File reader for `file:` references; a UTF-8 `readFile` is used when omitted. */
readTextFile?: (path: string) => Promise<string>;
}
/** Type guard for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
/**
 * Returns the value unchanged when it is a plain record.
 *
 * @throws Error when the value is not a non-null, non-array object.
 */
function record(value: unknown): Record<string, unknown> {
  if (isRecord(value)) {
    return value;
  }
  throw new Error('Notion connection config must be an object');
}
/** Returns the trimmed string, or `fallback` when the value is not a string or is blank. */
function stringValue(value: unknown, fallback: string): string {
  if (typeof value !== 'string') {
    return fallback;
  }
  const trimmed = value.trim();
  return trimmed.length === 0 ? fallback : trimmed;
}
/** Returns the trimmed string, or `null` when the value is not a string or is blank. */
function optionalString(value: unknown): string | null {
  if (typeof value !== 'string') {
    return null;
  }
  const trimmed = value.trim();
  return trimmed.length === 0 ? null : trimmed;
}
/**
 * Extracts the non-blank string entries of an array, trimmed.
 *
 * Entries are trimmed (not just tested for blankness) so ids carrying stray whitespace are
 * normalized the same way the scalar string helpers in this module normalize their input.
 *
 * @returns `[]` when the value is not an array; non-string and blank entries are dropped.
 */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  return value
    .filter((item): item is string => typeof item === 'string')
    .map((item) => item.trim())
    .filter((item) => item.length > 0);
}
/**
 * Returns `fallback` for `undefined`/`null`, the value itself when it is an integer number.
 *
 * @param name Field name used in the error message.
 * @throws Error when the value is present but not an integer number.
 */
function integerWithFallback(value: unknown, fallback: number, name: string): number {
  if (value == null) {
    return fallback;
  }
  if (typeof value === 'number' && Number.isInteger(value)) {
    return value;
  }
  throw new Error(`${name} must be an integer`);
}
/**
 * Parses an optional integer and enforces an inclusive `[min, max]` range.
 *
 * @param name Field name used in error messages.
 * @throws Error when the value is not an integer or lies outside the range.
 */
function boundedInteger(value: unknown, fallback: number, name: string, min: number, max: number): number {
  const candidate = integerWithFallback(value, fallback, name);
  const outOfRange = candidate < min || candidate > max;
  if (outOfRange) {
    throw new Error(`${name} must be between ${min} and ${max}`);
  }
  return candidate;
}
/**
 * Validates a raw connection config and normalizes it into a `KloNotionConnectionConfig`.
 *
 * Unrecognized keys on the input are carried over unchanged; the known fields are replaced by
 * their validated values.
 *
 * @param raw Untrusted config value.
 * @throws Error when the input is not an object, the driver is not `notion`, the token reference
 *   is missing or uses an unsupported scheme, the crawl mode is unknown, `selected_roots` has no
 *   roots, or a numeric limit is not an integer within its range.
 */
export function parseNotionConnectionConfig(raw: unknown): KloNotionConnectionConfig {
  const input = record(raw);
  if (input.driver !== 'notion') {
    throw new Error('Notion connection config requires driver: notion');
  }

  const authTokenRef = stringValue(input.auth_token_ref, '');
  if (authTokenRef === '') {
    throw new Error('Notion connection config requires auth_token_ref');
  }
  const usesSupportedScheme = ['env:', 'file:'].some((scheme) => authTokenRef.startsWith(scheme));
  if (!usesSupportedScheme) {
    throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
  }

  const crawlMode = stringValue(input.crawl_mode, 'selected_roots');
  if (crawlMode !== 'selected_roots' && crawlMode !== 'all_accessible') {
    throw new Error(`Unsupported Notion crawl_mode: ${crawlMode}`);
  }

  const rootPageIds = stringArray(input.root_page_ids);
  const rootDatabaseIds = stringArray(input.root_database_ids);
  const rootDataSourceIds = stringArray(input.root_data_source_ids);
  const rootCount = rootPageIds.length + rootDatabaseIds.length + rootDataSourceIds.length;
  if (crawlMode === 'selected_roots' && rootCount === 0) {
    throw new Error('selected_roots requires at least one root page, database, or data source id');
  }

  // Limits are validated in declaration order so the first offending field is the one reported.
  const maxPagesPerRun = boundedInteger(input.max_pages_per_run, 1000, 'max_pages_per_run', 1, 10_000);
  const maxKnowledgeCreatesPerRun = boundedInteger(
    input.max_knowledge_creates_per_run,
    5,
    'max_knowledge_creates_per_run',
    0,
    25,
  );
  const maxKnowledgeUpdatesPerRun = boundedInteger(
    input.max_knowledge_updates_per_run,
    20,
    'max_knowledge_updates_per_run',
    0,
    100,
  );
  const lastSuccessfulCursor = optionalString(input.last_successful_cursor);

  return {
    ...input,
    driver: 'notion',
    auth_token_ref: authTokenRef,
    crawl_mode: crawlMode,
    root_page_ids: rootPageIds,
    root_database_ids: rootDatabaseIds,
    root_data_source_ids: rootDataSourceIds,
    max_pages_per_run: maxPagesPerRun,
    max_knowledge_creates_per_run: maxKnowledgeCreatesPerRun,
    max_knowledge_updates_per_run: maxKnowledgeUpdatesPerRun,
    last_successful_cursor: lastSuccessfulCursor,
  };
}
/**
 * Projects a Notion config onto its display-safe form: the token reference is reduced to a
 * boolean and the organization-knowledge warning is attached.
 */
export function redactNotionConnectionConfig(config: KloNotionConnectionConfig): RedactedKloNotionConnectionConfig {
  const {
    auth_token_ref: authTokenRef,
    crawl_mode: crawlMode,
    root_page_ids: rootPageIds,
    root_database_ids: rootDatabaseIds,
    root_data_source_ids: rootDataSourceIds,
    max_pages_per_run: maxPagesPerRun,
    max_knowledge_creates_per_run: maxKnowledgeCreatesPerRun,
    max_knowledge_updates_per_run: maxKnowledgeUpdatesPerRun,
  } = config;

  return {
    driver: 'notion',
    hasAuthToken: Boolean(authTokenRef),
    crawlMode,
    rootPageIds,
    rootDatabaseIds,
    rootDataSourceIds,
    maxPagesPerRun,
    maxKnowledgeCreatesPerRun,
    maxKnowledgeUpdatesPerRun,
    warning: KLO_NOTION_ORG_KNOWLEDGE_WARNING,
  };
}
/**
 * Expands a leading `~` or `~/` to the current user's home directory; any other path is
 * returned unchanged.
 */
function expandHome(path: string): string {
  const refersToHome = path === '~' || path.startsWith('~/');
  if (!refersToHome) {
    return path;
  }
  // Dropping two characters removes "~/"; for a bare "~" it leaves '' and resolves to home.
  return resolve(homedir(), path.slice(2));
}
/**
 * Resolves an `env:NAME` or `file:/path` token reference to the trimmed token.
 *
 * Both sources are trimmed before the emptiness check, so a whitespace-only environment
 * variable is rejected just like a whitespace-only file instead of yielding an empty token.
 *
 * @param authTokenRef Reference in `env:NAME` or `file:/path` form (`~` is expanded for files).
 * @param options Optional environment and file-reader overrides.
 * @returns The non-empty, trimmed token.
 * @throws Error when the variable is unset or blank, the file is blank, or the reference uses
 *   an unsupported scheme. Errors raised by the file reader propagate unchanged.
 */
export async function resolveNotionAuthToken(
  authTokenRef: string,
  options: ResolveNotionTokenOptions = {},
): Promise<string> {
  if (authTokenRef.startsWith('env:')) {
    const envName = authTokenRef.slice('env:'.length);
    const env = options.env ?? process.env;
    const value = env[envName]?.trim();
    if (!value) {
      throw new Error(`Notion token environment variable ${envName} is not set`);
    }
    return value;
  }

  if (authTokenRef.startsWith('file:')) {
    const path = expandHome(authTokenRef.slice('file:'.length));
    const readTextFile = options.readTextFile ?? ((filePath: string) => readFile(filePath, 'utf-8'));
    const value = (await readTextFile(path)).trim();
    if (!value) {
      throw new Error(`Notion token file is empty: ${path}`);
    }
    return value;
  }

  throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
}
/**
 * Converts a validated connection config into the adapter's pull config, resolving the token
 * reference to the actual token first.
 *
 * @param options Forwarded to `resolveNotionAuthToken`.
 * @returns The result of parsing the mapped fields with `notionPullConfigSchema`.
 */
export async function notionConnectionToPullConfig(
  config: KloNotionConnectionConfig,
  options: ResolveNotionTokenOptions = {},
): Promise<NotionPullConfig> {
  const authToken = await resolveNotionAuthToken(config.auth_token_ref, options);
  const candidate = {
    authToken,
    crawlMode: config.crawl_mode,
    rootPageIds: config.root_page_ids,
    rootDatabaseIds: config.root_database_ids,
    rootDataSourceIds: config.root_data_source_ids,
    maxPagesPerRun: config.max_pages_per_run,
    maxKnowledgeCreatesPerRun: config.max_knowledge_creates_per_run,
    maxKnowledgeUpdatesPerRun: config.max_knowledge_updates_per_run,
    lastSuccessfulCursor: config.last_successful_cursor,
  };
  return notionPullConfigSchema.parse(candidate);
}

View file

@ -0,0 +1,111 @@
import { describe, expect, it, vi } from 'vitest';
import { createPostgresQueryExecutor } from './postgres-query-executor.js';
/**
 * Creates a fake pg-style client for the happy-path test.
 *
 * Every `query` input is appended to `calls`. Transaction control statements get an
 * empty result; anything else gets a fixed two-row SELECT result.
 */
function makeClient() {
  const calls: unknown[] = [];
  // Result shape returned for BEGIN/COMMIT: no rows, no fields.
  const controlResult = (command: string) => ({ rows: [], fields: [], rowCount: null, command });
  const query = vi.fn(async (input: unknown) => {
    calls.push(input);
    switch (input) {
      case 'BEGIN READ ONLY':
        return controlResult('BEGIN');
      case 'COMMIT':
        return controlResult('COMMIT');
      default:
        return {
          rows: [
            ['paid', 2],
            ['open', 1],
          ],
          fields: [{ name: 'status' }, { name: 'order_count' }],
          rowCount: 2,
          command: 'SELECT',
        };
    }
  });
  const client = {
    connect: vi.fn(async () => undefined),
    query,
    end: vi.fn(async () => undefined),
  };
  return { client, calls };
}
// Unit tests for createPostgresQueryExecutor. No real database is used: every test
// injects a fake client through `clientFactory`.
describe('createPostgresQueryExecutor', () => {
  it('runs a read-only transaction in array row mode and closes the client', async () => {
    const { client, calls } = makeClient();
    const executor = createPostgresQueryExecutor({
      clientFactory: vi.fn(() => client),
    });
    const result = await executor.execute({
      connectionId: 'warehouse',
      connection: { driver: 'postgres', url: 'postgres://example/db', readonly: true },
      sql: 'select status, count(*) as order_count from public.orders group by status',
      maxRows: 50,
    });
    expect(client.connect).toHaveBeenCalledTimes(1);
    // Expected statement order: BEGIN READ ONLY, the wrapped/limited query, COMMIT.
    expect(calls[0]).toBe('BEGIN READ ONLY');
    expect(calls[1]).toEqual({
      text: 'select * from (select status, count(*) as order_count from public.orders group by status) as klo_query_result limit 50',
      rowMode: 'array',
    });
    expect(calls[2]).toBe('COMMIT');
    expect(client.end).toHaveBeenCalledTimes(1);
    // Headers come from the result's field names; totalRows is the number of returned rows.
    expect(result).toEqual({
      headers: ['status', 'order_count'],
      rows: [
        ['paid', 2],
        ['open', 1],
      ],
      totalRows: 2,
      command: 'SELECT',
      rowCount: 2,
    });
  });
  it('rolls back and closes the client when query execution fails', async () => {
    // This fake succeeds for transaction control statements and throws for the user query.
    const client = {
      connect: vi.fn(async () => undefined),
      query: vi.fn(async (input: unknown) => {
        if (input === 'BEGIN READ ONLY' || input === 'ROLLBACK') {
          return { rows: [], fields: [], rowCount: null, command: String(input) };
        }
        throw new Error('syntax error');
      }),
      end: vi.fn(async () => undefined),
    };
    const executor = createPostgresQueryExecutor({
      clientFactory: vi.fn(() => client),
    });
    await expect(
      executor.execute({
        connectionId: 'warehouse',
        connection: { driver: 'postgres', url: 'postgres://example/db', readonly: true },
        sql: 'select * from broken',
        maxRows: 10,
      }),
    ).rejects.toThrow('syntax error');
    // The original error must surface, after a ROLLBACK and exactly one end().
    expect(client.query).toHaveBeenCalledWith('ROLLBACK');
    expect(client.end).toHaveBeenCalledTimes(1);
  });
  it('requires a Postgres url and read-only connection config', async () => {
    const executor = createPostgresQueryExecutor({ clientFactory: vi.fn() });
    // Missing url.
    await expect(
      executor.execute({
        connectionId: 'warehouse',
        connection: { driver: 'postgres', readonly: true },
        sql: 'select 1',
      }),
    ).rejects.toThrow('Local Postgres execution requires connections.warehouse.url');
    // readonly explicitly false.
    await expect(
      executor.execute({
        connectionId: 'warehouse',
        connection: { driver: 'postgres', url: 'postgres://example/db', readonly: false },
        sql: 'select 1',
      }),
    ).rejects.toThrow('Local query execution requires connections.warehouse.readonly: true');
  });
});

View file

@ -0,0 +1,80 @@
import { Client, type ClientConfig } from 'pg';
import type {
KloSqlQueryExecutionInput,
KloSqlQueryExecutionResult,
KloSqlQueryExecutorPort,
} from './query-executor.js';
import { limitSqlForExecution } from './read-only-sql.js';
/**
 * Minimal client surface this module needs: connect, query, end.
 * It exists so tests can inject a fake client through `clientFactory`.
 */
interface PgClientLike {
  /** Opens the connection. */
  connect(): Promise<unknown>;
  /**
   * Runs either a plain SQL string or a `{ text, rowMode: 'array' }` query object.
   * Rows are typed as arrays of column values.
   */
  query(input: string | { text: string; rowMode: 'array' }): Promise<{
    fields: Array<{ name: string }>;
    rows: unknown[][];
    command: string;
    rowCount: number | null;
  }>;
  /** Closes the connection. */
  end(): Promise<void>;
}
/** Options accepted by `createPostgresQueryExecutor`. All fields are optional. */
interface PostgresQueryExecutorOptions {
  /** Passed to the client as `statement_timeout`; the executor defaults it to 30_000. */
  statementTimeoutMs?: number;
  /** Passed to the client as `query_timeout`; the executor defaults it to 35_000. */
  queryTimeoutMs?: number;
  /** Passed to the client as `connectionTimeoutMillis`; the executor defaults it to 5_000. */
  connectionTimeoutMs?: number;
  /** Builds the client for a given config; defaults to constructing a real `pg` `Client`. */
  clientFactory?: (config: ClientConfig) => PgClientLike;
}
/**
 * Returns the connection's `driver` value as a lower-cased string.
 * A missing connection or driver yields the empty string.
 */
function connectionDriver(input: KloSqlQueryExecutionInput): string {
  const rawDriver = input.connection?.driver ?? '';
  return String(rawDriver).toLowerCase();
}
/** Default `clientFactory`: constructs a real `pg` `Client` from the given config. */
function createDefaultClient(config: ClientConfig): PgClientLike {
  const client: PgClientLike = new Client(config);
  return client;
}
/**
 * Creates a query executor that runs one SQL statement against Postgres inside a
 * `BEGIN READ ONLY` transaction and returns the rows in array row mode.
 *
 * `execute` rejects when:
 * - the connection driver is neither `postgres` nor `postgresql`,
 * - `connection.readonly` is not exactly `true`,
 * - `connection.url` is missing or blank,
 * - `limitSqlForExecution` rejects the SQL or `maxRows`,
 * - connecting or running the query fails (a ROLLBACK is attempted for query failures).
 *
 * @param options Timeouts and an optional client factory (used by tests).
 * @returns An executor implementing `KloSqlQueryExecutorPort`.
 */
export function createPostgresQueryExecutor(options: PostgresQueryExecutorOptions = {}): KloSqlQueryExecutorPort {
  const clientFactory = options.clientFactory ?? createDefaultClient;
  return {
    async execute(input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult> {
      const driver = connectionDriver(input);
      if (driver !== 'postgres' && driver !== 'postgresql') {
        throw new Error(`Local Postgres execution cannot run driver "${input.connection?.driver ?? 'unknown'}".`);
      }
      if (input.connection?.readonly !== true) {
        throw new Error(`Local query execution requires connections.${input.connectionId}.readonly: true.`);
      }
      if (typeof input.connection.url !== 'string' || input.connection.url.trim().length === 0) {
        throw new Error(`Local Postgres execution requires connections.${input.connectionId}.url.`);
      }
      // Validate and wrap the SQL before any connection is opened, so a rejected
      // statement never costs a connect/BEGIN/ROLLBACK round trip. This matches the
      // SQLite executor, which also validates before opening the database.
      const text = limitSqlForExecution(input.sql, input.maxRows);
      const client = clientFactory({
        connectionString: input.connection.url,
        statement_timeout: options.statementTimeoutMs ?? 30_000,
        query_timeout: options.queryTimeoutMs ?? 35_000,
        connectionTimeoutMillis: options.connectionTimeoutMs ?? 5_000,
        application_name: 'klo-local-query',
      });
      await client.connect();
      try {
        await client.query('BEGIN READ ONLY');
        const result = await client.query({ text, rowMode: 'array' });
        await client.query('COMMIT');
        return {
          headers: result.fields.map((field) => field.name),
          rows: result.rows,
          totalRows: result.rows.length,
          command: result.command,
          rowCount: result.rowCount,
        };
      } catch (error) {
        // Best-effort rollback: a rollback failure must not mask the original error.
        await client.query('ROLLBACK').catch(() => undefined);
        throw error;
      } finally {
        await client.end();
      }
    },
  };
}

View file

@ -0,0 +1,25 @@
import type { KloProjectConnectionConfig } from '../project/index.js';
/** Input for a single local SQL execution request. */
export interface KloSqlQueryExecutionInput {
  /** Connection identifier; executors interpolate it into `connections.<id>.…` error messages. */
  connectionId: string;
  /** Base directory for relative database paths (the SQLite executor falls back to `process.cwd()`). */
  projectDir?: string;
  /** Connection configuration; may be undefined, in which case executors reject the request. */
  connection: KloProjectConnectionConfig | undefined;
  /** SQL text to execute. */
  sql: string;
  /** Optional row limit applied by wrapping the query (see `limitSqlForExecution`). */
  maxRows?: number;
}
/** Tabular result of a local SQL execution. */
export interface KloSqlQueryExecutionResult {
  /** Column names, in result order. */
  headers: string[];
  /** Result rows, each an array of column values. */
  rows: unknown[][];
  /** Number of rows in `rows`. */
  totalRows: number;
  /** Command tag reported by the executor (e.g. 'SELECT'). */
  command: string;
  /** Row count reported by the driver; may be null. */
  rowCount: number | null;
}
/** Port implemented by each driver-specific query executor. */
export interface KloSqlQueryExecutorPort {
  /** Executes the request and resolves with its tabular result. */
  execute(input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult>;
}
/**
 * Converts driver rows to arrays of column values.
 * Array rows are passed through as-is; object rows are flattened with `Object.values`.
 */
export function normalizeQueryRows(rows: unknown[]): unknown[][] {
  const toArrayRow = (row: unknown): unknown[] => {
    if (Array.isArray(row)) {
      return row;
    }
    return Object.values(row as Record<string, unknown>);
  };
  return rows.map(toArrayRow);
}

View file

@ -0,0 +1,30 @@
import { describe, expect, it } from 'vitest';
import { assertReadOnlySql, limitSqlForExecution } from './read-only-sql.js';
// assertReadOnlySql accepts statements that start with SELECT/WITH and rejects the rest.
describe('assertReadOnlySql', () => {
  it('allows select and with queries', () => {
    expect(assertReadOnlySql('select * from orders')).toBe('select * from orders');
    expect(assertReadOnlySql('with paid as (select * from orders) select * from paid')).toContain('with paid');
  });
  it('rejects mutating statements before opening a database connection', () => {
    // The check is pure string validation, so no connection is involved here.
    expect(() => assertReadOnlySql('delete from orders')).toThrow(
      'Only read-only SELECT/WITH queries can be executed locally',
    );
    expect(() => assertReadOnlySql('create table x(id int)')).toThrow(
      'Only read-only SELECT/WITH queries can be executed locally',
    );
  });
});
// limitSqlForExecution strips trailing semicolons and optionally wraps the query in a LIMIT.
describe('limitSqlForExecution', () => {
  it('wraps compiled SQL and strips trailing semicolons', () => {
    expect(limitSqlForExecution('select * from public.orders; ', 25)).toBe(
      'select * from (select * from public.orders) as klo_query_result limit 25',
    );
  });
  it('returns the trimmed SQL when no maxRows value is provided', () => {
    expect(limitSqlForExecution('select * from orders; ', undefined)).toBe('select * from orders');
  });
});

View file

@ -0,0 +1,22 @@
// Statement prefixes that are never allowed to run locally.
const MUTATING_SQL =
  /^\s*(insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh)\b/i;
// Statement prefixes accepted for local execution.
const READ_SQL = /^\s*(select|with)\b/i;

/**
 * Ensures the SQL starts with SELECT or WITH and returns it trimmed.
 *
 * NOTE(review): this is a prefix check only. It does not inspect the rest of the
 * statement, so a second statement after a `;` or a data-modifying CTE is not
 * detected here; callers must still execute in a read-only context. Because both
 * patterns are anchored at the start, the MUTATING_SQL test cannot match a string
 * that already matched READ_SQL; it is kept as a defensive check.
 *
 * @param sql Raw SQL text.
 * @returns The trimmed SQL.
 * @throws Error when the statement does not start with SELECT/WITH.
 */
export function assertReadOnlySql(sql: string): string {
  const trimmed = sql.trim();
  if (!READ_SQL.test(trimmed) || MUTATING_SQL.test(trimmed)) {
    throw new Error('Only read-only SELECT/WITH queries can be executed locally.');
  }
  return trimmed;
}

/**
 * Validates the SQL as read-only, strips trailing semicolons, and (when `maxRows`
 * is given) wraps it in `select * from (…) as klo_query_result limit N`.
 *
 * @param sql Raw SQL text.
 * @param maxRows Optional row limit; must be a positive integer when provided.
 * @returns The SQL to send to the database.
 * @throws Error when the SQL is not read-only or `maxRows` is not a positive integer.
 */
export function limitSqlForExecution(sql: string, maxRows: number | undefined): string {
  const trimmed = assertReadOnlySql(sql).replace(/;+\s*$/, '');
  // Only an absent limit means "no limit". A falsy check here would let 0 and NaN
  // skip the validation below and silently run the query unbounded.
  if (maxRows == null) {
    return trimmed;
  }
  if (!Number.isInteger(maxRows) || maxRows <= 0) {
    throw new Error('maxRows must be a positive integer.');
  }
  return `select * from (${trimmed}) as klo_query_result limit ${maxRows}`;
}

View file

@ -0,0 +1,148 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import Database from 'better-sqlite3';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { createSqliteQueryExecutor, sqliteDatabasePathFromConnection } from './sqlite-query-executor.js';
// Integration-style tests: each test gets a real SQLite file in a fresh temp directory.
describe('createSqliteQueryExecutor', () => {
  let tempDir: string;
  let dbPath: string;
  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-sqlite-query-'));
    dbPath = join(tempDir, 'warehouse.db');
    // Seed an `orders` table with two 'paid' rows and one 'open' row.
    const db = new Database(dbPath);
    db.exec(`
CREATE TABLE orders (
id INTEGER PRIMARY KEY,
status TEXT NOT NULL,
amount INTEGER NOT NULL
);
INSERT INTO orders (status, amount) VALUES
('paid', 20),
('paid', 30),
('open', 10);
`);
    db.close();
  });
  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });
  it('executes read-only SELECT SQL against a relative SQLite path', async () => {
    const executor = createSqliteQueryExecutor();
    // `path` is relative, so it is resolved against `projectDir`.
    const result = await executor.execute({
      connectionId: 'warehouse',
      projectDir: tempDir,
      connection: { driver: 'sqlite', path: 'warehouse.db', readonly: true },
      sql: 'select status, count(*) as order_count from orders group by status order by status',
      maxRows: 10,
    });
    expect(result).toEqual({
      headers: ['status', 'order_count'],
      rows: [
        ['open', 1],
        ['paid', 2],
      ],
      totalRows: 2,
      command: 'SELECT',
      rowCount: 2,
    });
  });
  it('supports file urls for SQLite database paths', async () => {
    expect(
      sqliteDatabasePathFromConnection({
        connectionId: 'warehouse',
        projectDir: tempDir,
        connection: { driver: 'sqlite', url: `file://${dbPath}`, readonly: true },
        sql: 'select 1',
      }),
    ).toBe(dbPath);
  });
  it('resolves file references for SQLite path fields', async () => {
    // For the `path` key, `file:<pointer>` means "read the database path from that file".
    const pointerPath = join(tempDir, 'sqlite-path.txt');
    writeFileSync(pointerPath, dbPath, 'utf-8');
    expect(
      sqliteDatabasePathFromConnection({
        connectionId: 'warehouse',
        projectDir: tempDir,
        connection: { driver: 'sqlite', path: `file:${pointerPath}`, readonly: true },
        sql: 'select 1',
      }),
    ).toBe(dbPath);
  });
  it('resolves env references for SQLite database urls', async () => {
    // Save and restore the variable so the test does not leak process state.
    const originalDatabaseUrl = process.env.KLO_SQLITE_TEST_URL;
    process.env.KLO_SQLITE_TEST_URL = `sqlite:${dbPath}`;
    try {
      expect(
        sqliteDatabasePathFromConnection({
          connectionId: 'warehouse',
          projectDir: tempDir,
          connection: { driver: 'sqlite', url: 'env:KLO_SQLITE_TEST_URL', readonly: true },
          sql: 'select 1',
        }),
      ).toBe(dbPath);
    } finally {
      if (originalDatabaseUrl === undefined) {
        delete process.env.KLO_SQLITE_TEST_URL;
      } else {
        process.env.KLO_SQLITE_TEST_URL = originalDatabaseUrl;
      }
    }
  });
  it('rejects mutating SQL before opening the database', async () => {
    const executor = createSqliteQueryExecutor();
    await expect(
      executor.execute({
        connectionId: 'warehouse',
        projectDir: tempDir,
        connection: { driver: 'sqlite', path: 'warehouse.db', readonly: true },
        sql: 'delete from orders',
      }),
    ).rejects.toThrow('Only read-only SELECT/WITH queries can be executed locally');
  });
  it('requires a SQLite driver, read-only config, and a database path', async () => {
    const executor = createSqliteQueryExecutor();
    // Wrong driver.
    await expect(
      executor.execute({
        connectionId: 'warehouse',
        projectDir: tempDir,
        connection: { driver: 'postgres', path: 'warehouse.db', readonly: true },
        sql: 'select 1',
      }),
    ).rejects.toThrow('Local SQLite execution cannot run driver "postgres"');
    // readonly explicitly false.
    await expect(
      executor.execute({
        connectionId: 'warehouse',
        projectDir: tempDir,
        connection: { driver: 'sqlite', path: 'warehouse.db', readonly: false },
        sql: 'select 1',
      }),
    ).rejects.toThrow('Local query execution requires connections.warehouse.readonly: true');
    // Neither path nor url configured.
    await expect(
      executor.execute({
        connectionId: 'warehouse',
        projectDir: tempDir,
        connection: { driver: 'sqlite', readonly: true },
        sql: 'select 1',
      }),
    ).rejects.toThrow('Local SQLite execution requires connections.warehouse.path or connections.warehouse.url');
  });
});

View file

@ -0,0 +1,94 @@
import { isAbsolute, resolve } from 'node:path';
import { fileURLToPath } from 'node:url';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { homedir } from 'node:os';
import type {
KloSqlQueryExecutionInput,
KloSqlQueryExecutionResult,
KloSqlQueryExecutorPort,
} from './query-executor.js';
import { normalizeQueryRows } from './query-executor.js';
import { limitSqlForExecution } from './read-only-sql.js';
/** Loosely-typed view of a connection config, used for keyed lookups of `path` / `url`. */
type SqliteConnectionConfig = Record<string, unknown> | undefined;
/**
 * Returns the connection's `driver` value as a lower-cased string.
 * A missing connection or driver yields the empty string.
 */
function connectionDriver(input: KloSqlQueryExecutionInput): string {
  const configuredDriver = input.connection?.driver ?? '';
  return String(configuredDriver).toLowerCase();
}
/**
 * Reads a string setting from the connection config and resolves `env:` / `file:`
 * references through `resolveStringReference`.
 *
 * @param connection Connection config, possibly undefined.
 * @param key Setting name to read (e.g. 'path' or 'url').
 * @returns The resolved, trimmed value, or `undefined` when the setting is missing,
 *   not a string, blank, or resolves to a blank value (e.g. an unset env variable).
 */
function stringConfigValue(connection: SqliteConnectionConfig, key: string): string | undefined {
  const value = connection?.[key];
  if (typeof value !== 'string') {
    return undefined;
  }
  const literal = value.trim();
  if (literal.length === 0) {
    return undefined;
  }
  // A reference can resolve to an empty string; report that as "not configured"
  // rather than handing callers '' to treat as a real path.
  const resolved = resolveStringReference(key, literal).trim();
  return resolved.length > 0 ? resolved : undefined;
}
/**
 * Resolves an indirection in a config value.
 *
 * - `env:NAME` returns the environment variable, or '' when it is unset.
 * - `file:PATH` (for every key except 'url') returns the trimmed content of that file.
 *   A leading `~` or `~/` in PATH is expanded to the user's home directory.
 * - Anything else is returned unchanged. For the 'url' key, `file:` values are left
 *   alone because there they are file URLs, not pointer files.
 *
 * @param key Config key the value belongs to.
 * @param value Trimmed, non-empty config value.
 * @returns The resolved string.
 * @throws Whatever `readFileSync` throws when the referenced file cannot be read.
 */
function resolveStringReference(key: string, value: string): string {
  if (value.startsWith('env:')) {
    return process.env[value.slice('env:'.length)] ?? '';
  }
  if (key !== 'url' && value.startsWith('file:')) {
    const rawPath = value.slice('file:'.length);
    let path = rawPath;
    if (rawPath === '~') {
      path = homedir();
    } else if (rawPath.startsWith('~/')) {
      // Drop both '~' and '/': passing '/rest' to resolve() would be treated as an
      // absolute segment and discard the home directory.
      path = resolve(homedir(), rawPath.slice(2));
    }
    return readFileSync(path, 'utf-8').trim();
  }
  return value;
}
/**
 * Extracts a filesystem path from a SQLite connection URL.
 *
 * - `file:` URLs are converted with `fileURLToPath`.
 * - `sqlite:` URLs yield their URL-decoded pathname when it is non-empty.
 * - Everything else (including a `sqlite:` URL with an empty pathname) is returned as-is.
 */
function sqlitePathFromUrl(url: string): string {
  if (url.startsWith('file:')) {
    return fileURLToPath(url);
  }
  if (!url.startsWith('sqlite:')) {
    return url;
  }
  const { pathname } = new URL(url);
  return pathname.length > 0 ? decodeURIComponent(pathname) : url;
}
/**
 * Validates a SQLite connection config and returns the absolute database path.
 *
 * `path` takes precedence over `url`. A relative result is resolved against
 * `input.projectDir`, falling back to `process.cwd()`.
 *
 * @param input Execution input carrying the connection config.
 * @returns Absolute filesystem path of the database file.
 * @throws Error when the driver is not `sqlite`/`sqlite3`, when `readonly` is not
 *   exactly `true`, or when neither `path` nor `url` yields a usable value.
 */
export function sqliteDatabasePathFromConnection(input: KloSqlQueryExecutionInput): string {
  const driver = connectionDriver(input);
  if (driver !== 'sqlite' && driver !== 'sqlite3') {
    throw new Error(`Local SQLite execution cannot run driver "${input.connection?.driver ?? 'unknown'}".`);
  }
  if (input.connection?.readonly !== true) {
    throw new Error(`Local query execution requires connections.${input.connectionId}.readonly: true.`);
  }
  const pathValue = stringConfigValue(input.connection, 'path');
  const urlValue = stringConfigValue(input.connection, 'url');
  if (!pathValue && !urlValue) {
    throw new Error(
      `Local SQLite execution requires connections.${input.connectionId}.path or connections.${input.connectionId}.url.`,
    );
  }
  // Use a truthiness test, not `??`: `path` may resolve to '' (e.g. an unset env
  // reference), and '' must fall through to `url` instead of resolving to the
  // project directory itself.
  const candidate = pathValue ? pathValue : sqlitePathFromUrl(urlValue as string);
  return isAbsolute(candidate) ? candidate : resolve(input.projectDir ?? process.cwd(), candidate);
}
/**
 * Creates a query executor backed by better-sqlite3.
 *
 * Each `execute` call validates and limits the SQL, resolves the database path, opens
 * the file read-only (it must already exist), runs the statement, and closes the
 * database again, also on failure.
 */
export function createSqliteQueryExecutor(): KloSqlQueryExecutorPort {
  const execute = async (input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult> => {
    // SQL validation runs first so invalid statements fail before any file is opened.
    const limitedSql = limitSqlForExecution(input.sql, input.maxRows);
    const databaseFile = sqliteDatabasePathFromConnection(input);
    const database = new Database(databaseFile, { readonly: true, fileMustExist: true });
    try {
      const prepared = database.prepare(limitedSql);
      const resultRows = prepared.all() as unknown[];
      const headers = prepared.columns().map(({ name }) => name);
      const rowTotal = resultRows.length;
      return {
        headers,
        rows: normalizeQueryRows(resultRows),
        totalRows: rowTotal,
        command: 'SELECT',
        rowCount: rowTotal,
      };
    } finally {
      database.close();
    }
  };
  return { execute };
}

View file

@ -0,0 +1,34 @@
import { mkdir, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { resolveKloConfigReference, resolveKloHomePath } from './config-reference.js';
// Tests for resolveKloConfigReference (env:/file:/literal values) and resolveKloHomePath.
describe('KLO config references', () => {
  it('resolves env references without returning empty values', () => {
    // Surrounding whitespace is trimmed; blank or missing variables resolve to undefined.
    expect(resolveKloConfigReference('env:AI_GATEWAY_API_KEY', { AI_GATEWAY_API_KEY: ' gateway-key ' })).toBe(
      'gateway-key',
    );
    expect(resolveKloConfigReference('env:AI_GATEWAY_API_KEY', { AI_GATEWAY_API_KEY: ' ' })).toBeUndefined();
    expect(resolveKloConfigReference('env:AI_GATEWAY_API_KEY', {})).toBeUndefined();
  });
  it('resolves file references and trims file content', async () => {
    // NOTE(review): this directory is created under the OS temp dir and never removed.
    const dir = join(tmpdir(), `klo-config-reference-${process.pid}`);
    await mkdir(dir, { recursive: true });
    const keyPath = join(dir, 'gateway-key.txt');
    await writeFile(keyPath, 'file-gateway-key\n', 'utf8');
    expect(resolveKloConfigReference(`file:${keyPath}`, {})).toBe('file-gateway-key');
  });
  it('returns literal values unchanged after trimming blank-only values', () => {
    expect(resolveKloConfigReference('provider/model', {})).toBe('provider/model');
    expect(resolveKloConfigReference(' ', {})).toBeUndefined();
    expect(resolveKloConfigReference(undefined, {})).toBeUndefined();
  });
  it('resolves home-prefixed paths', () => {
    // Only the suffix is asserted because the home directory differs per machine.
    expect(resolveKloHomePath('~/klo/key.txt')).toContain('/klo/key.txt');
  });
});

View file

@ -0,0 +1,36 @@
import { readFileSync } from 'node:fs';
import { homedir } from 'node:os';
import { resolve } from 'node:path';
/**
 * Resolves a path to an absolute one, expanding a leading `~` or `~/` to the
 * current user's home directory. Other paths are resolved with `path.resolve`.
 */
export function resolveKloHomePath(path: string): string {
  if (path === '~') {
    return homedir();
  }
  const segments = path.startsWith('~/') ? [homedir(), path.slice(2)] : [path];
  return resolve(...segments);
}
/**
 * Resolves a config value that may be a literal, an `env:NAME` reference, or a
 * `file:PATH` reference.
 *
 * @param value Raw config value; undefined or empty yields undefined.
 * @param env Environment map consulted for `env:` references.
 * @returns The trimmed resolved value, or undefined when it is missing or blank.
 * @throws Whatever `readFileSync` throws when a referenced file cannot be read.
 */
export function resolveKloConfigReference(value: string | undefined, env: NodeJS.ProcessEnv): string | undefined {
  if (!value) {
    return undefined;
  }
  let raw: string | undefined;
  if (value.startsWith('env:')) {
    raw = env[value.slice('env:'.length).trim()];
  } else if (value.startsWith('file:')) {
    // The path part goes through home expansion before the file is read.
    raw = readFileSync(resolveKloHomePath(value.slice('file:'.length).trim()), 'utf8');
  } else {
    raw = value;
  }
  const cleaned = raw?.trim();
  return cleaned ? cleaned : undefined;
}

View file

@ -0,0 +1,42 @@
/** Filesystem locations used by the core. All fields are optional. */
export interface KloStorageConfig {
  /** Explicit config directory; `resolveConfigDir` otherwise derives `<homeDir>/klo/config`. */
  configDir?: string;
  /** Base directory for derived defaults; the resolvers fall back to '/tmp' when omitted. */
  homeDir?: string;
  /** Explicit worktrees directory; `resolveWorktreesDir` otherwise derives `<homeDir>/.worktrees`. */
  worktreesDir?: string;
}
/** Git identity settings. */
export interface KloGitConfig {
  /** Git user name. */
  userName: string;
  /** Git user email. */
  userEmail: string;
  // The bootstrap* fields presumably describe the initial bootstrap commit
  // (message and author identity) — confirm against the git service.
  bootstrapMessage?: string;
  bootstrapAuthor?: string;
  bootstrapAuthorEmail?: string;
}
/** Top-level core configuration: storage locations plus git identity. */
export interface KloCoreConfig {
  storage: KloStorageConfig;
  git: KloGitConfig;
}
/** Minimal logging contract; see `noopLogger` for an implementation that discards everything. */
export interface KloLogger {
  debug(message: string): void;
  log(message: string): void;
  warn(message: string): void;
  /** Logs an error message, optionally with the underlying error value. */
  error(message: string, error?: unknown): void;
}
/** Logger that silently discards every message. */
export const noopLogger: KloLogger = {
  debug() {},
  log() {},
  warn() {},
  error() {},
};
/**
 * Returns the config directory: `storage.configDir` when set, otherwise
 * `<homeDir>/klo/config`, with `homeDir` defaulting to '/tmp'.
 */
export function resolveConfigDir(config: KloCoreConfig): string {
  const { configDir, homeDir } = config.storage;
  if (configDir != null) {
    return configDir;
  }
  return `${homeDir ?? '/tmp'}/klo/config`;
}
/**
 * Returns the worktrees directory: `storage.worktreesDir` when set, otherwise
 * `<homeDir>/.worktrees`, with `homeDir` defaulting to '/tmp'.
 */
export function resolveWorktreesDir(config: KloCoreConfig): string {
  const { worktreesDir, homeDir } = config.storage;
  if (worktreesDir != null) {
    return worktreesDir;
  }
  return `${homeDir ?? '/tmp'}/.worktrees`;
}

View file

@ -0,0 +1,5 @@
/** Port for computing text embeddings. */
export interface KloEmbeddingPort {
  // Presumably the largest number of texts accepted by one computeEmbeddingsBulk
  // call — confirm with the implementations.
  maxBatchSize: number;
  /** Computes the embedding vector for one text. */
  computeEmbedding(text: string): Promise<number[]>;
  /** Computes embedding vectors for several texts. */
  computeEmbeddingsBulk(texts: string[]): Promise<number[][]>;
}

View file

@ -0,0 +1,43 @@
/** Result of a write or delete; implementations may attach extra fields. */
export interface KloFileWriteResult {
  /** Hash of the resulting commit; may be absent or null. */
  commitHash?: string | null;
  [key: string]: unknown;
}
/** Result of reading a file; implementations may attach extra fields. */
export interface KloFileReadResult {
  /** File content as a string. */
  content: string;
  [key: string]: unknown;
}
/** Result of listing files under a path. */
export interface KloFileListResult {
  /** Listed file paths. */
  files: string[];
}
/** One entry in a file's history. Every field is optional; extra fields are allowed. */
export interface KloFileHistoryEntry {
  sha?: string;
  message?: string;
  author?: string;
  /** Entry date, as either a string or a Date. */
  date?: string | Date;
  [key: string]: unknown;
}
/**
 * Port for a file store whose write and delete operations carry author and commit
 * message information.
 *
 * `TSelf` is the type returned by `forWorktree`.
 */
export interface KloFileStorePort<TSelf = unknown> {
  /** Writes `content` to `path` with the given author and commit message. */
  writeFile(
    path: string,
    content: string,
    author: string,
    authorEmail: string,
    commitMessage: string,
    // skipLock presumably bypasses the store's own locking — confirm with the implementation.
    options?: { skipLock?: boolean },
  ): Promise<KloFileWriteResult>;
  /** Reads the file at `path`. */
  readFile(path: string): Promise<KloFileReadResult>;
  /** Deletes the file at `path`; may resolve with null. */
  deleteFile(
    path: string,
    author: string,
    authorEmail: string,
    commitMessage: string,
    options?: { skipLock?: boolean },
  ): Promise<KloFileWriteResult | null>;
  /** Lists files under `path`, optionally recursively. */
  listFiles(path: string, recursive?: boolean): Promise<KloFileListResult>;
  // NOTE(review): a union with `unknown` collapses to `unknown`, so callers get no
  // static typing from this return type.
  getFileHistory(path: string): Promise<KloFileHistoryEntry[] | unknown>;
  /** Returns a store bound to the given working directory. */
  forWorktree(workdir: string): TSelf;
}

View file

@ -0,0 +1,29 @@
import { simpleGit, type SimpleGit } from 'simple-git';
// Environment variables removed from the environment handed to simple-git
// (see sanitizedGitEnv). The name suggests these are variables inherited when running
// inside a git hook, which would otherwise redirect or alter nested git commands —
// presumably the motivation; confirm.
const GIT_HOOK_ENV_KEYS = [
  'GIT_ALTERNATE_OBJECT_DIRECTORIES',
  'GIT_DIR',
  'GIT_INDEX_FILE',
  'GIT_OBJECT_DIRECTORY',
  'GIT_PREFIX',
  'GIT_QUARANTINE_PATH',
  'GIT_WORK_TREE',
  'GIT_EDITOR',
  'GIT_EXEC_PATH',
  'GIT_PAGER',
  'PAGER',
  'VISUAL',
  'EDITOR',
] as const;
/**
 * Returns a copy of `env` (default: `process.env`) without the variables listed in
 * GIT_HOOK_ENV_KEYS. The input object is not modified.
 */
function sanitizedGitEnv(env: NodeJS.ProcessEnv = process.env): NodeJS.ProcessEnv {
  const blocked = new Set<string>(GIT_HOOK_ENV_KEYS);
  const kept = Object.entries(env).filter(([name]) => !blocked.has(name));
  return Object.fromEntries(kept);
}
/**
 * Creates a simple-git instance rooted at `baseDir` whose child processes run with
 * the sanitized environment from `sanitizedGitEnv`.
 */
export function createSimpleGit(baseDir: string): SimpleGit {
  const git = simpleGit({ baseDir });
  return git.env(sanitizedGitEnv());
}

View file

@ -0,0 +1,75 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import type { KloCoreConfig } from './config.js';
import { createSimpleGit } from './git-env.js';
import { GitService } from './git.service.js';
// Tests for GitService.assertWorktreeClean against a real git repository in a temp dir.
describe('GitService.assertWorktreeClean', () => {
  let workdir: string;
  let git: SimpleGit;
  let gitService: GitService;
  beforeEach(async () => {
    // Fresh repository with a single commit.
    workdir = await mkdtemp(join(tmpdir(), 'gitsvc-clean-'));
    git = createSimpleGit(workdir);
    await git.init();
    await git.addConfig('user.email', 't@test');
    await git.addConfig('user.name', 'Test');
    await writeFile(join(workdir, 'init'), 'init');
    await git.add('.');
    await git.commit('init');
    const coreConfig: KloCoreConfig = {
      storage: { configDir: workdir, homeDir: workdir },
      git: { userName: 'Test', userEmail: 't@test' },
    };
    gitService = new GitService(coreConfig);
    // Point the service's internal fields at the prepared repository.
    (gitService as any).git = git;
    (gitService as any).configDir = workdir;
  });
  afterEach(async () => rm(workdir, { recursive: true, force: true }));
  it('does not throw on a clean worktree', async () => {
    await expect(gitService.assertWorktreeClean()).resolves.toBeUndefined();
  });
  // The next tests fake an in-progress operation by creating git's marker files directly.
  it('throws when MERGE_HEAD exists', async () => {
    await writeFile(join(workdir, '.git', 'MERGE_HEAD'), 'deadbeef\n');
    await expect(gitService.assertWorktreeClean()).rejects.toThrow(/MERGE_HEAD/);
  });
  it('throws when CHERRY_PICK_HEAD exists', async () => {
    await writeFile(join(workdir, '.git', 'CHERRY_PICK_HEAD'), 'deadbeef\n');
    await expect(gitService.assertWorktreeClean()).rejects.toThrow(/CHERRY_PICK_HEAD/);
  });
  it('throws when REVERT_HEAD exists', async () => {
    await writeFile(join(workdir, '.git', 'REVERT_HEAD'), 'deadbeef\n');
    await expect(gitService.assertWorktreeClean()).rejects.toThrow(/REVERT_HEAD/);
  });
  it('throws when sequencer/todo exists (interrupted multi-commit revert/cherry-pick)', async () => {
    await mkdir(join(workdir, '.git', 'sequencer'), { recursive: true });
    await writeFile(join(workdir, '.git', 'sequencer', 'todo'), 'pick deadbeef foo\n');
    await expect(gitService.assertWorktreeClean()).rejects.toThrow(/sequencer/);
  });
  it('throws when the index has unmerged paths', async () => {
    // Two branches write different content to the same file, then get merged.
    await git.checkoutLocalBranch('a');
    await writeFile(join(workdir, 'shared'), 'A version');
    await git.add('.');
    await git.commit('a');
    // The default branch name depends on the local git configuration.
    await git.checkout('master').catch(() => git.checkout('main'));
    await git.checkoutLocalBranch('b');
    await writeFile(join(workdir, 'shared'), 'B version');
    await git.add('.');
    await git.commit('b');
    // The merge is expected to fail with a conflict; its rejection is ignored on purpose.
    await git.raw(['merge', 'a']).catch(() => undefined);
    await expect(gitService.assertWorktreeClean()).rejects.toThrow();
  });
});

View file

@ -0,0 +1,78 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { mkdir, mkdtemp, readdir, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import type { KloCoreConfig } from './config.js';
import { createSimpleGit } from './git-env.js';
import { GitService } from './git.service.js';
// Tests for GitService.deleteDirectories against a real git repository in a temp dir.
describe('GitService.deleteDirectories', () => {
  let workdir: string;
  let git: SimpleGit;
  let gitService: GitService;
  beforeEach(async () => {
    // Fresh repository with a single commit containing `keep`.
    workdir = await mkdtemp(join(tmpdir(), 'gitsvc-dd-'));
    git = createSimpleGit(workdir);
    await git.init();
    await git.addConfig('user.email', 't@test');
    await git.addConfig('user.name', 'Test');
    await writeFile(join(workdir, 'keep'), 'k');
    await git.add('.');
    await git.commit('init');
    const coreConfig: KloCoreConfig = {
      storage: { configDir: workdir, homeDir: workdir },
      git: { userName: 'Test', userEmail: 't@test' },
    };
    gitService = new GitService(coreConfig);
    // Point the service's internal fields at the prepared repository.
    (gitService as any).git = git;
    (gitService as any).configDir = workdir;
  });
  afterEach(async () => rm(workdir, { recursive: true, force: true }));
  it('removes multiple directories in a single commit', async () => {
    for (const name of ['a', 'b', 'c']) {
      await mkdir(join(workdir, name), { recursive: true });
      await writeFile(join(workdir, name, 'f.txt'), name);
    }
    await git.add('.');
    await git.commit('seed 3 dirs');
    const beforeCommits = (await git.log()).total;
    const result = await gitService.deleteDirectories(['a', 'b'], 'gc: drop a+b', 'System User', 'system@example.com');
    expect(result.commitHash).toBeTruthy();
    const entries = await readdir(workdir);
    expect(entries).not.toContain('a');
    expect(entries).not.toContain('b');
    expect(entries).toContain('c');
    // Exactly one new commit covers both deletions.
    const afterCommits = (await git.log()).total;
    expect(afterCommits).toBe(beforeCommits + 1);
  });
  // Title corrected: the assertion below expects an empty-string hash, not null.
  it('no-ops and returns an empty hash when the input list is empty', async () => {
    const result = await gitService.deleteDirectories([], 'empty', 'X', 'x@example.com');
    expect(result.commitHash).toBe('');
    expect(result.created).toBe(false);
  });
  it('ignores paths that have already been deleted — commits only the remaining ones', async () => {
    await mkdir(join(workdir, 'stale'), { recursive: true });
    await writeFile(join(workdir, 'stale', 'x'), 'x');
    await git.add('.');
    await git.commit('seed stale');
    // 'missing' never existed; the call must still succeed and remove 'stale'.
    const result = await gitService.deleteDirectories(
      ['stale', 'missing'],
      'gc: drop stale + missing',
      'System User',
      'system@example.com',
    );
    expect(result.commitHash).toBeTruthy();
    const entries = await readdir(workdir);
    expect(entries).not.toContain('stale');
  });
});

View file

@ -0,0 +1,56 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import type { KloCoreConfig } from './config.js';
import { createSimpleGit } from './git-env.js';
import { GitService } from './git.service.js';
// Tests for GitService.resetHardTo against a real git repository in a temp dir.
describe('GitService.resetHardTo', () => {
  let workdir: string;
  let git: SimpleGit;
  let gitService: GitService;
  beforeEach(async () => {
    // Fresh repository with a single commit.
    workdir = await mkdtemp(join(tmpdir(), 'gitsvc-reset-'));
    git = createSimpleGit(workdir);
    await git.init();
    await git.addConfig('user.email', 't@test');
    await git.addConfig('user.name', 'Test');
    await writeFile(join(workdir, 'init'), 'init');
    await git.add('.');
    await git.commit('init');
    const coreConfig: KloCoreConfig = {
      storage: { configDir: workdir, homeDir: workdir },
      git: { userName: 'Test', userEmail: 't@test' },
    };
    gitService = new GitService(coreConfig);
    // Point the service's internal fields at the prepared repository.
    (gitService as any).git = git;
    (gitService as any).configDir = workdir;
  });
  afterEach(async () => rm(workdir, { recursive: true, force: true }));
  it('rewinds HEAD to the target SHA, removing later commits and their files', async () => {
    const baseSha = (await git.revparse(['HEAD'])).trim();
    await writeFile(join(workdir, 'a'), 'a1');
    await git.add('.');
    await git.commit('a');
    await writeFile(join(workdir, 'b'), 'b1');
    await git.add('.');
    await git.commit('b');
    await gitService.resetHardTo(baseSha);
    expect((await git.revparse(['HEAD'])).trim()).toBe(baseSha);
    // Files introduced after baseSha must be gone; a failed read maps to null.
    expect(await readFile(join(workdir, 'a'), 'utf-8').catch(() => null)).toBeNull();
    expect(await readFile(join(workdir, 'b'), 'utf-8').catch(() => null)).toBeNull();
  });
  it('is a no-op when target SHA equals current HEAD', async () => {
    const sha = (await git.revparse(['HEAD'])).trim();
    await gitService.resetHardTo(sha);
    expect((await git.revparse(['HEAD'])).trim()).toBe(sha);
  });
});

View file

@ -0,0 +1,358 @@
import { mkdtemp, realpath, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import type { KloCoreConfig } from './config.js';
import { GitService } from './git.service.js';
// These tests drive a real git repo inside a temp directory — simple-git shells out to the
// system `git` binary. They are fast enough to run as unit tests and catch real issues that
// would be invisible with mocked git.
describe('GitService', () => {
let service: GitService;
let tempDir: string;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'git-service-spec-'));
const coreConfig: KloCoreConfig = {
storage: { configDir: tempDir, homeDir: tempDir },
git: {
userName: 'Test User',
userEmail: 'test@example.com',
bootstrapMessage: 'Initialize test config repo',
bootstrapAuthor: 'test-system',
bootstrapAuthorEmail: 'system@example.com',
},
};
service = new GitService(coreConfig);
await service.onModuleInit();
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
const writeAndCommit = async (filePath: string, content: string, message = 'msg') => {
await writeFile(join(tempDir, filePath), content, 'utf-8');
return service.commitFile(filePath, message, 'Test', 'test@example.com');
};
describe('cold-start bootstrap commit', () => {
it('writes an empty commit on init so HEAD always resolves', async () => {
// beforeEach already ran onModuleInit() against an empty temp dir.
const head = await service.revParseHead();
expect(head).toMatch(/^[0-9a-f]{40}$/);
});
it('does not double-commit when re-initialized', async () => {
const before = await service.revParseHead();
await service.onModuleInit();
const after = await service.revParseHead();
expect(after).toBe(before);
});
});
describe('commitFile `created` flag', () => {
it('is true for a real commit', async () => {
const info = await writeAndCommit('a.md', '# Hello');
expect(info.created).toBe(true);
});
it('is false on a no-op write (content unchanged)', async () => {
await writeAndCommit('a.md', '# Hello');
const second = await writeAndCommit('a.md', '# Hello', 'unused');
expect(second.created).toBe(false);
});
});
describe('addNote / getNote', () => {
it('attaches a note and reads it back', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, 'Rich message from LLM');
expect(await service.getNote(info.commitHash)).toBe('Rich message from LLM');
});
it('returns undefined when no note exists', async () => {
const info = await writeAndCommit('a.md', '# Hello');
expect(await service.getNote(info.commitHash)).toBeUndefined();
});
it('overwrites an existing note (idempotent retries)', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, 'First');
await service.addNote(info.commitHash, 'Second');
expect(await service.getNote(info.commitHash)).toBe('Second');
});
it('skips empty/whitespace messages silently', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, ' ');
expect(await service.getNote(info.commitHash)).toBeUndefined();
});
});
// History entries expose the commit's git note (when present) as `enhancedMessage`.
describe('getFileHistory', () => {
  const FILE = 'a.md';

  it('surfaces enhancedMessage when a note is present', async () => {
    const { commitHash } = await writeAndCommit(FILE, '# Hello');
    await service.addNote(commitHash, 'Note body');

    const [newest] = await service.getFileHistory(FILE);

    expect(newest?.enhancedMessage).toBe('Note body');
  });

  it('leaves enhancedMessage undefined when no note is attached', async () => {
    await writeAndCommit(FILE, '# Hello');

    const [newest] = await service.getFileHistory(FILE);

    expect(newest?.enhancedMessage).toBeUndefined();
  });
});
// getCommitDiff returns the patch body of a commit, optionally limited to one path.
describe('getCommitDiff', () => {
  it('returns the patch scoped to the requested path', async () => {
    const { commitHash } = await writeAndCommit('a.md', '# Hello');

    const patch = await service.getCommitDiff(commitHash, 'a.md');

    for (const fragment of ['diff --git', 'Hello']) {
      expect(patch).toContain(fragment);
    }
  });

  it('handles the repository initial commit without throwing', async () => {
    const { commitHash } = await writeAndCommit('first.md', 'first');
    const pending = service.getCommitDiff(commitHash, 'first.md');
    await expect(pending).resolves.toBeDefined();
  });
});
// squashTo collapses the commits after a recorded "pre-head" into one commit,
// unless a commit by a different author sits in that range.
describe('squashTo', () => {
  const SYSTEM_AUTHOR = 'System User';
  const SYSTEM_EMAIL = 'system@example.com';

  /** Build squash options attributed to the system author. */
  const squashOptions = (message: string) => ({
    message,
    author: SYSTEM_AUTHOR,
    authorEmail: SYSTEM_EMAIL,
  });

  /** Write a file into the repo and commit it as the system author. */
  const writeAsSystem = async (filePath: string, content: string, message = 'msg') => {
    await writeFile(join(tempDir, filePath), content, 'utf-8');
    return service.commitFile(filePath, message, SYSTEM_AUTHOR, SYSTEM_EMAIL);
  };

  it('collapses 3 commits after preHead into a single commit', async () => {
    const { commitHash: preHead } = await writeAsSystem('a.md', 'v1');
    await writeAsSystem('b.md', 'b', 'add b');
    await writeAsSystem('c.md', 'c', 'add c');
    await writeAsSystem('a.md', 'v2', 'update a');

    const result = await service.squashTo(preHead, squashOptions('Ingest: bundle 3 writes'));

    expect(result.squashed).toBe(true);
    expect(result.squashedCount).toBe(3);
    expect(result.commitHash).toBeTruthy();
    expect(result.commitHash).not.toBe(preHead);
    const squashedSha = result.commitHash;
    if (!squashedSha) {
      throw new Error('Expected squash commit hash');
    }
    // The squashed commit should preserve the final tree state.
    expect(await service.getFileAtCommit('a.md', squashedSha)).toBe('v2');
    expect(await service.getFileAtCommit('b.md', squashedSha)).toBe('b');
  });

  it('is a no-op when preHead equals HEAD', async () => {
    const { commitHash: preHead } = await writeAsSystem('a.md', 'v1');

    const result = await service.squashTo(preHead, squashOptions('nothing to squash'));

    expect(result.squashed).toBe(false);
    expect(result.commitHash).toBe(preHead);
  });

  it('skips squash when a foreign-author commit sits between preHead and HEAD', async () => {
    const { commitHash: preHead } = await writeAsSystem('a.md', 'v1');
    await writeAsSystem('b.md', 'from us', 'ours');
    // Foreign commit: written through the shared helper rather than as the system author.
    await writeAndCommit('c.md', 'from someone else', 'foreign');
    await writeAsSystem('d.md', 'ours again', 'ours 2');

    const result = await service.squashTo(preHead, squashOptions('should be skipped'));

    expect(result.squashed).toBe(false);
    expect(result.reason).toContain('foreign');
    expect(result.squashedCount).toBe(3);
  });

  it('returns cleanly when preHead is empty (no starting commit)', async () => {
    const result = await service.squashTo('', squashOptions('would have squashed'));

    expect(result.squashed).toBe(false);
    expect(result.commitHash).toBeNull();
  });
});
// Exercises addWorktree / removeWorktree / deleteBranch against real sibling directories.
describe('worktree lifecycle', () => {
  // macOS canonicalizes tmp paths (/var/folders → /private/var/folders) when git
  // returns them from `worktree list`. Resolve through realpath() before comparing.
  const canonicalSiblingPath = async (suffix: string): Promise<string> => {
    const parent = await realpath(join(tempDir, '..'));
    return join(parent, `wt-${Date.now()}-${suffix}`);
  };

  // Best-effort teardown. Worktree directories live NEXT TO tempDir, so they must be
  // removed explicitly; running this from `finally` keeps a failed assertion from
  // leaking the directory.
  const cleanupWorktree = async (wtDir: string): Promise<void> => {
    await service.removeWorktree(wtDir).catch(() => undefined);
    await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
  };

  it('addWorktree creates a branch + directory at the given startSha', async () => {
    const { commitHash } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await canonicalSiblingPath('add');
    try {
      await service.addWorktree(wtDir, 'session/alpha', commitHash);
      const list = await service.listWorktrees();
      expect(list.find((e) => e.path === wtDir && e.branch === 'refs/heads/session/alpha')).toBeTruthy();
    } finally {
      await cleanupWorktree(wtDir);
    }
  });

  it('removeWorktree detaches the worktree entry', async () => {
    const { commitHash } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await canonicalSiblingPath('rm');
    try {
      await service.addWorktree(wtDir, 'session/beta', commitHash);
      await service.removeWorktree(wtDir);
      const list = await service.listWorktrees();
      expect(list.find((e) => e.path === wtDir)).toBeFalsy();
    } finally {
      await cleanupWorktree(wtDir);
    }
  });

  it('deleteBranch removes a branch ref', async () => {
    const { commitHash } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await canonicalSiblingPath('br');
    try {
      await service.addWorktree(wtDir, 'session/gamma', commitHash);
      await service.removeWorktree(wtDir);
      await service.deleteBranch('session/gamma', true);
      // Reach into the private simple-git client to list branches directly.
      const branches = await (service as unknown as { git: import('simple-git').SimpleGit }).git.branchLocal();
      expect(branches.all).not.toContain('session/gamma');
    } finally {
      await cleanupWorktree(wtDir);
    }
  });
});
// forWorktree returns a GitService bound to a linked worktree directory.
describe('forWorktree', () => {
  it('returns a GitService whose operations run inside the given worktree', async () => {
    const { commitHash } = await writeAndCommit('seed.md', 'seed');
    // Canonicalize the parent so the path matches what git reports (macOS /private prefix).
    const parent = await realpath(join(tempDir, '..'));
    const wtDir = join(parent, `wt-${Date.now()}-fw`);
    try {
      await service.addWorktree(wtDir, 'session/delta', commitHash);
      const scoped = service.forWorktree(wtDir);
      expect(await scoped.revParseHead()).toBe(commitHash);
    } finally {
      // Runs even when the assertion fails so the sibling directory is not leaked.
      await service.removeWorktree(wtDir).catch(() => undefined);
      await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
    }
  });
});
// squashMergeIntoMain folds a session branch into the main worktree as a single commit,
// or reports conflicts and leaves main clean.
describe('squashMergeIntoMain', () => {
  const SYSTEM_AUTHOR = 'System User';
  const SYSTEM_EMAIL = 'system@example.com';

  /** Canonical (realpath-resolved) sibling directory for a session worktree. */
  const sessionWorktreePath = async (suffix: string): Promise<string> => {
    const parent = await realpath(join(tempDir, '..'));
    return join(parent, `wt-${Date.now()}-${suffix}`);
  };

  // Best-effort teardown, run from `finally` so a failed assertion does not leak the
  // worktree directory that lives next to tempDir.
  const cleanupWorktree = async (wtDir: string): Promise<void> => {
    await service.removeWorktree(wtDir).catch(() => undefined);
    await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
  };

  it('merges a session branch as one commit on main, returning the new SHA + touched paths', async () => {
    const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await sessionWorktreePath('sm');
    try {
      await service.addWorktree(wtDir, 'session/happy', baseSha);
      const scoped = service.forWorktree(wtDir);
      await writeFile(join(wtDir, 'a.yaml'), 'one: 1\n', 'utf-8');
      await scoped.commitFile('a.yaml', 'wip a', SYSTEM_AUTHOR, SYSTEM_EMAIL);
      await writeFile(join(wtDir, 'b.yaml'), 'two: 2\n', 'utf-8');
      await scoped.commitFile('b.yaml', 'wip b', SYSTEM_AUTHOR, SYSTEM_EMAIL);
      const result = await service.squashMergeIntoMain(
        'session/happy',
        SYSTEM_AUTHOR,
        SYSTEM_EMAIL,
        'Memory capture: 2 files [chat=abcd1234]',
      );
      expect(result.ok).toBe(true);
      if (!result.ok) {
        throw new Error('unreachable');
      }
      expect(result.squashSha).toMatch(/^[0-9a-f]{40}$/);
      // Sort a copy: the order is not part of the contract and the result must not be mutated.
      expect([...result.touchedPaths].sort()).toEqual(['a.yaml', 'b.yaml']);
      const mainHead = await service.revParseHead();
      expect(mainHead).toBe(result.squashSha);
      expect(mainHead).not.toBe(baseSha);
    } finally {
      await cleanupWorktree(wtDir);
    }
  });

  it('returns ok with empty touchedPaths when the session branch has no diff vs main', async () => {
    const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
    const wtDir = await sessionWorktreePath('sm-empty');
    try {
      await service.addWorktree(wtDir, 'session/empty', baseSha);
      const result = await service.squashMergeIntoMain(
        'session/empty',
        SYSTEM_AUTHOR,
        SYSTEM_EMAIL,
        'should be a no-op',
      );
      expect(result.ok).toBe(true);
      if (!result.ok) {
        throw new Error('unreachable');
      }
      expect(result.touchedPaths).toEqual([]);
      expect(result.squashSha).toBe(baseSha);
    } finally {
      await cleanupWorktree(wtDir);
    }
  });

  it('returns conflict=true and leaves main clean when session+main touched same file differently', async () => {
    await writeAndCommit('shared.yaml', 'base\n');
    const base = await service.revParseHead();
    if (!base) {
      throw new Error('no base head');
    }
    const wtDir = await sessionWorktreePath('conf');
    try {
      await service.addWorktree(wtDir, 'session/conf', base);
      const scoped = service.forWorktree(wtDir);
      await writeFile(join(wtDir, 'shared.yaml'), 'session-edit\n', 'utf-8');
      await scoped.commitFile('shared.yaml', 'session edit', SYSTEM_AUTHOR, SYSTEM_EMAIL);
      // Main edits the same file a different way, after the session branched.
      await writeAndCommit('shared.yaml', 'main-edit\n');
      const result = await service.squashMergeIntoMain(
        'session/conf',
        SYSTEM_AUTHOR,
        SYSTEM_EMAIL,
        'Memory capture: 1 file [chat=dead1234]',
      );
      expect(result.ok).toBe(false);
      if (result.ok) {
        throw new Error('unreachable');
      }
      expect(result.conflict).toBe(true);
      expect(result.conflictPaths).toContain('shared.yaml');
      // Reach into the private simple-git client to confirm the abort left nothing behind.
      const status = await (service as unknown as { git: import('simple-git').SimpleGit }).git.status();
      expect(status.isClean()).toBe(true);
    } finally {
      await cleanupWorktree(wtDir);
    }
  });
});
});

View file

@ -0,0 +1,855 @@
import { promises as fs } from 'node:fs';
import { isAbsolute, join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import { noopLogger, resolveConfigDir, type KloCoreConfig, type KloLogger } from './config.js';
import { createSimpleGit } from './git-env.js';
/**
 * Description of a single commit as returned by the GitService write and history methods.
 * All commit fields are copied from the entry that simple-git's `log` reports.
 */
export interface GitCommitInfo {
/** Commit hash as reported by `git log`. */
commitHash: string;
/** First 8 characters of `commitHash`. */
shortHash: string;
/** Commit message as reported by `git log`. */
message: string;
/** Author name recorded on the commit. */
author: string;
/** Author email recorded on the commit. */
authorEmail: string;
/** Commit date exactly as simple-git returns it (not re-formatted). */
timestamp: string;
/** The same date as `timestamp`, normalized through `new Date(...).toISOString()`. */
committedDate: string;
/**
 * True if this call produced a new commit. False when the file was already up-to-date
 * and the returned info describes the pre-existing HEAD commit (no-op write).
 */
created: boolean;
/** Async LLM-generated commit summary attached as a git note. Undefined if no note present. */
enhancedMessage?: string;
}
/** One record parsed from `git worktree list --porcelain`. */
export interface WorktreeEntry {
/** Value of the record's `worktree ` line (the worktree directory). */
path: string;
/** Value of the record's `branch ` line (a full ref such as `refs/heads/...`), or null if the record has none. */
branch: string | null;
/** Value of the record's `HEAD ` line, or null if the record has none. */
head: string | null;
}
/**
 * Outcome of `GitService.squashMergeIntoMain`, discriminated on `ok`:
 * - `ok: true`: `squashSha` is HEAD after the operation and `touchedPaths` lists the paths
 *   the merge was computed to affect (empty when there was nothing to merge).
 * - `ok: false`: the merge was aborted; `conflictPaths` lists the unmerged paths found in the index.
 */
export type SquashMergeResult =
| { ok: true; squashSha: string; touchedPaths: string[] }
| { ok: false; conflict: true; conflictPaths: string[] };
export class GitService {
/** Logger for diagnostics; falls back to `noopLogger` when the constructor receives none. */
private readonly logger: KloLogger;
/** simple-git client. Not set by the constructor: assigned in `onModuleInit()` or by `forWorktree()`. */
private git!: SimpleGit;
/** Directory the git client operates in; resolved from `config`, or replaced with the worktree dir by `forWorktree()`. */
private configDir: string;
/**
 * Store the config, pick the logger, and resolve the config directory.
 * Performs no I/O; call `onModuleInit()` before using any git operation.
 *
 * @param config Core configuration; its `git` section is read during initialization.
 * @param logger Optional logger; `noopLogger` is used when omitted.
 */
constructor(
private readonly config: KloCoreConfig,
logger?: KloLogger,
) {
this.logger = logger ?? noopLogger;
this.configDir = resolveConfigDir(config);
}
/**
 * Prepare the service for use: make sure the config directory exists, bind a simple-git
 * client to it, and run repository initialization.
 */
async onModuleInit(): Promise<void> {
  const dir = this.configDir;

  // `recursive: true` makes this a no-op when the directory already exists.
  await fs.mkdir(dir, { recursive: true });
  this.logger.log(`Config directory ensured at: ${dir}`);

  this.git = createSimpleGit(dir);
  await this.initialize();
}
/**
 * Make the config directory a usable git repository.
 *
 * If the directory is not yet a repo it is `git init`-ed and `user.name` / `user.email`
 * are set from `config.git`. If HEAD does not resolve to a commit, an empty bootstrap
 * commit is written so callers can rely on `revParseHead()` returning a SHA.
 *
 * @throws Error with a fixed message if any step fails (the cause is logged, not attached).
 */
private async initialize(): Promise<void> {
  try {
    if (!(await this.git.checkIsRepo())) {
      await this.git.init();
      const { userName, userEmail } = this.config.git;
      await this.git.addConfig('user.name', userName);
      await this.git.addConfig('user.email', userEmail);
      this.logger.log('Initialized git repository');
    }

    // Idempotent: nothing more to do once HEAD points at a commit.
    const existingHead = await this.revParseHead();
    if (existingHead) {
      return;
    }

    const { bootstrapMessage, bootstrapAuthor, bootstrapAuthorEmail } = this.config.git;
    const authorName = bootstrapAuthor ?? 'klo system';
    const authorEmail = bootstrapAuthorEmail ?? 'system@klo.local';
    await this.git.commit(bootstrapMessage ?? 'Initialize klo project repository', {
      '--allow-empty': null,
      '--author': `${authorName} <${authorEmail}>`,
    });
    this.logger.log('Wrote bootstrap commit to config repo');
  } catch (failure) {
    this.logger.error('Failed to initialize git repository', failure);
    throw new Error('Failed to initialize git repository');
  }
}
/**
 * Stage `filePath` and commit it with the given message and author.
 *
 * When staging leaves nothing in the index, no commit is made and the returned info
 * describes the current HEAD commit with `created: false`.
 *
 * @param filePath Path to stage.
 * @param commitMessage Message for the new commit.
 * @param author Author name written via `--author`.
 * @param authorEmail Author email written via `--author`.
 * @returns Info about the new commit, or about HEAD when nothing changed.
 * @throws Error prefixed with "Failed to commit file:" on any git failure.
 */
async commitFile(
  filePath: string,
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  try {
    await this.git.add(filePath);

    // Anything left in the index after staging means there is something to commit.
    const stagedNames = await this.git.diff(['--cached', '--name-only']);
    const hasStagedChanges = stagedNames.trim().length > 0;

    if (hasStagedChanges) {
      const outcome = await this.git.commit(commitMessage, {
        '--author': `${author} <${authorEmail}>`,
      });
      if (!outcome.commit) {
        throw new Error('No commit hash returned');
      }
    } else {
      this.logger.debug(`No changes to commit for ${filePath}, file already up to date`);
    }

    // Either way, report on whatever HEAD is now.
    const { latest: head } = await this.git.log({ maxCount: 1 });
    if (!head) {
      throw new Error('Failed to retrieve commit details');
    }
    return {
      commitHash: head.hash,
      shortHash: head.hash.substring(0, 8),
      message: head.message,
      author: head.author_name,
      authorEmail: head.author_email,
      timestamp: head.date,
      committedDate: new Date(head.date).toISOString(),
      created: hasStagedChanges,
    };
  } catch (error) {
    this.logger.error(`Failed to commit file ${filePath}`, error);
    throw new Error(`Failed to commit file: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Stage several files and produce a single commit. Mirrors `commitFile`, but batches
 * N paths into one commit; used by the SL capture agent to commit all edits at once.
 *
 * When staging leaves nothing in the index, no commit is made and the returned info
 * describes the current HEAD commit with `created: false`.
 *
 * @throws Error prefixed with "Failed to batch commit:" on any git failure.
 */
async commitFiles(
  filePaths: string[],
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  const fileCount = filePaths.length;
  try {
    // Stage one path at a time, in the order given.
    for (const target of filePaths) {
      await this.git.add(target);
    }

    const stagedNames = await this.git.diff(['--cached', '--name-only']);
    const hasStagedChanges = stagedNames.trim().length > 0;

    if (hasStagedChanges) {
      const outcome = await this.git.commit(commitMessage, {
        '--author': `${author} <${authorEmail}>`,
      });
      if (!outcome.commit) {
        throw new Error('No commit hash returned');
      }
    } else {
      this.logger.debug(`No changes to commit for ${fileCount} file(s), already up to date`);
    }

    const { latest: head } = await this.git.log({ maxCount: 1 });
    if (!head) {
      throw new Error('Failed to retrieve commit details');
    }
    return {
      commitHash: head.hash,
      shortHash: head.hash.substring(0, 8),
      message: head.message,
      author: head.author_name,
      authorEmail: head.author_email,
      timestamp: head.date,
      committedDate: new Date(head.date).toISOString(),
      created: hasStagedChanges,
    };
  } catch (error) {
    this.logger.error(`Failed to batch commit ${fileCount} file(s)`, error);
    throw new Error(`Failed to batch commit: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Discard working-tree changes for the given paths (`git checkout -- <paths>`).
 * Used to roll back dirty files when validation fails.
 *
 * Best-effort: a git failure is logged as a warning and not rethrown. An empty list
 * is a no-op.
 */
async checkoutFiles(filePaths: string[]): Promise<void> {
  if (filePaths.length > 0) {
    try {
      await this.git.checkout(['--', ...filePaths]);
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      this.logger.warn(`Failed to checkout ${filePaths.length} file(s): ${detail}`);
    }
  }
}
/**
 * Return the content of `filePath` as recorded in `commitHash` (`git show <sha>:<path>`).
 * The content comes from git object storage rather than the working tree, so it is safe
 * against concurrent working-tree mutations.
 *
 * @throws Error prefixed with "Failed to read file at commit:" if git cannot resolve the object.
 */
async getFileAtCommit(filePath: string, commitHash: string): Promise<string> {
  const objectSpec = `${commitHash}:${filePath}`;
  try {
    const content = await this.git.show([objectSpec]);
    return content;
  } catch (error) {
    this.logger.error(`Failed to read ${filePath} at ${commitHash}`, error);
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to read file at commit: ${detail}`);
  }
}
/**
 * Return the commits `git log` reports for `filePath`, at most `limit` of them, each
 * with its git note (if any) exposed as `enhancedMessage`.
 *
 * @param filePath Path whose history is requested.
 * @param limit Maximum number of commits to return (passed to git as `maxCount`); defaults to 50.
 * @returns One entry per commit; `created` is always true on these entries.
 * @throws Error prefixed with "Failed to retrieve file history:" on any failure.
 */
async getFileHistory(filePath: string, limit: number = 50): Promise<GitCommitInfo[]> {
  try {
    const log = await this.git.log({
      file: filePath,
      maxCount: limit,
    });
    // N+1 fetch of notes (one `git notes show` per commit), bounded by `limit`.
    // `await` here (instead of returning the bare promise) keeps rejections from the
    // mapping, such as an unparseable commit date, inside this try/catch so they are
    // logged and wrapped like every other failure.
    return await Promise.all(
      log.all.map(async (commit) => ({
        commitHash: commit.hash,
        shortHash: commit.hash.substring(0, 8),
        message: commit.message,
        author: commit.author_name,
        authorEmail: commit.author_email,
        timestamp: commit.date,
        committedDate: new Date(commit.date).toISOString(),
        created: true,
        enhancedMessage: await this.getNote(commit.hash),
      })),
    );
  } catch (error) {
    this.logger.error(`Failed to get history for ${filePath}`, error);
    throw new Error(`Failed to retrieve file history: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Attach an LLM-generated summary note to a commit, replacing any existing note.
 * `-f` makes retries overwrite instead of failing on an existing note (idempotent).
 * A message that is empty after trimming is ignored.
 *
 * Callers are responsible for holding the `config:repo` Redlock: note writes mutate
 * `.git/refs/notes/commits` and must serialize with commits.
 *
 * @throws Error prefixed with "Failed to attach git note:" if git rejects the write.
 */
async addNote(commitHash: string, message: string): Promise<void> {
  const noteBody = message.trim();
  if (noteBody.length === 0) {
    return;
  }
  const noteArgs = ['notes', 'add', '-f', '-m', noteBody, commitHash];
  try {
    await this.git.raw(noteArgs);
  } catch (error) {
    this.logger.error(`Failed to attach note to ${commitHash}`, error);
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to attach git note: ${detail}`);
  }
}
/**
 * Read the note attached to a commit.
 *
 * @returns The trimmed note text, or undefined when the note is empty or when
 *          `git notes show` fails for any reason (including "no note found").
 */
async getNote(commitHash: string): Promise<string | undefined> {
  let rawNote: string;
  try {
    rawNote = await this.git.raw(['notes', 'show', commitHash]);
  } catch {
    // `git notes show` exits non-zero when no note exists — treat as "no note".
    return undefined;
  }
  const noteText = rawNote.trim();
  return noteText.length > 0 ? noteText : undefined;
}
/**
 * Return the patch for a commit, optionally limited to a single path.
 *
 * Everything before the first `diff --git` is dropped so only the patch body remains.
 * The result is clipped to 12,000 characters (string length, not encoded bytes) to bound
 * LLM token cost; a truncation marker is appended when clipping happens. Returns '' when
 * the commit changed nothing on the requested path.
 *
 * @throws Error prefixed with "Failed to read commit diff:" if `git show` fails.
 */
async getCommitDiff(commitHash: string, path?: string): Promise<string> {
  const maxPatchLength = 12_000;
  const pathScope = path ? ['--', path] : [];
  const showArgs = ['show', '--format=', '--no-color', '--patch', commitHash, ...pathScope];
  try {
    const output = await this.git.raw(showArgs);
    const patchOffset = output.indexOf('diff --git');
    // No patch header at all: fall back to the trimmed output (normally empty).
    const patch = patchOffset < 0 ? output.trim() : output.slice(patchOffset);
    if (patch.length <= maxPatchLength) {
      return patch;
    }
    return `${patch.slice(0, maxPatchLength)}\n… [diff truncated]`;
  } catch (error) {
    this.logger.error(`Failed to read diff for ${commitHash}`, error);
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to read commit diff: ${detail}`);
  }
}
/**
 * Remove `filePath` with `git rm` and commit the deletion.
 *
 * @param filePath Path to remove.
 * @param commitMessage Message for the deletion commit.
 * @param author Author name written via `--author`.
 * @param authorEmail Author email written via `--author`.
 * @returns Info about the HEAD commit after the deletion, with `created: true`.
 * @throws Error prefixed with "Failed to delete file:" on any git failure.
 */
async deleteFile(
  filePath: string,
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  try {
    await this.git.rm(filePath);

    const authorFlag = `${author} <${authorEmail}>`;
    const { commit: newCommit } = await this.git.commit(commitMessage, { '--author': authorFlag });
    if (!newCommit) {
      throw new Error('No commit hash returned');
    }

    // Read the details back from the log rather than from the commit summary.
    const head = (await this.git.log({ maxCount: 1 })).latest;
    if (!head) {
      throw new Error('Failed to retrieve commit details');
    }
    const { hash, date } = head;
    return {
      commitHash: hash,
      shortHash: hash.substring(0, 8),
      message: head.message,
      author: head.author_name,
      authorEmail: head.author_email,
      timestamp: date,
      committedDate: new Date(date).toISOString(),
      created: true,
    };
  } catch (error) {
    this.logger.error(`Failed to delete file ${filePath}`, error);
    throw new Error(`Failed to delete file: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Resolve HEAD to a commit SHA.
 *
 * @returns The trimmed output of `git rev-parse HEAD`, or the empty string when that
 *          command fails (e.g. a freshly-init'd repo with no commits yet), so callers
 *          can treat it as "nothing to reconcile".
 */
async revParseHead(): Promise<string> {
  try {
    return (await this.git.revparse(['HEAD'])).trim();
  } catch {
    // HEAD does not resolve; report "no commit" instead of failing.
    return '';
  }
}
/**
 * Check whether `commitHash` names a commit object in the local repo
 * (`git cat-file -e <sha>^{commit}`). Used by the reconciler to detect the
 * "history was rewritten / partial clone" case before attempting `git diff $sha..HEAD`.
 *
 * @returns false for an empty hash or when git reports the object missing; true otherwise.
 */
async commitExists(commitHash: string): Promise<boolean> {
  if (commitHash.length === 0) {
    return false;
  }
  const commitSpec = `${commitHash}^{commit}`;
  try {
    await this.git.raw(['cat-file', '-e', commitSpec]);
  } catch {
    return false;
  }
  return true;
}
/**
 * `git diff --name-status -z $from..$to -- $pathSpec`. Returns one entry per changed path.
 *
 * Status mapping:
 * - `A` / `M` / `D` are passed through.
 * - Renames (`R{score}`, old path then new path) are split into a `D` for the old path
 *   plus an `A` for the new one: the reconciler treats each path independently and the
 *   new path's row will upsert with whatever content the file actually has.
 * - Copies (`C{score}`) yield only an `A` for the new path. The source of a copy still
 *   exists, so it must not be reported as deleted.
 * - Any other status (T type-change, U unmerged, X unknown) is reported as `M`.
 *
 * Records whose path field is missing are skipped rather than emitted with an undefined path.
 *
 * @param from Start revision of the range.
 * @param to End revision of the range.
 * @param pathSpec Optional pathspec limiting the diff.
 * @throws Whatever the underlying git call throws (not wrapped).
 */
async diffNameStatus(
  from: string,
  to: string,
  pathSpec?: string,
): Promise<Array<{ status: 'A' | 'M' | 'D'; path: string }>> {
  const args = ['diff', '--name-status', '-z', `${from}..${to}`];
  if (pathSpec) {
    args.push('--', pathSpec);
  }
  const raw = await this.git.raw(args);
  if (!raw) {
    return [];
  }
  // -z output: NUL-separated fields. For A/M/D: "<status>\0<path>\0". For R/C: "<status>\0<old>\0<new>\0".
  const fields = raw.split('\0').filter((f) => f.length > 0);
  const out: Array<{ status: 'A' | 'M' | 'D'; path: string }> = [];
  let i = 0;
  while (i < fields.length) {
    const code = fields[i]?.[0];
    if (code === 'R' || code === 'C') {
      const oldPath = fields[i + 1];
      const newPath = fields[i + 2];
      // Only a rename removes the old path; a copy leaves its source in place.
      if (code === 'R' && oldPath) {
        out.push({ status: 'D', path: oldPath });
      }
      if (newPath) {
        out.push({ status: 'A', path: newPath });
      }
      i += 3;
    } else {
      const path = fields[i + 1];
      if (path) {
        // Statuses other than A/D (including M itself) are reported as a modify.
        out.push({ status: code === 'A' || code === 'D' ? code : 'M', path });
      }
      i += 2;
    }
  }
  return out;
}
/**
 * List the paths recorded in the HEAD tree that match `pathSpec`
 * (`git ls-tree -r -z --name-only HEAD -- <pathSpec>`).
 * Used for the reconciler's first-ever run when there's no watermark to diff from.
 *
 * @returns The matching paths; an empty array when there is no output or the git call fails.
 */
async listFilesAtHead(pathSpec: string): Promise<string[]> {
  const lsTreeArgs = ['ls-tree', '-r', '-z', '--name-only', 'HEAD', '--', pathSpec];
  try {
    const output = await this.git.raw(lsTreeArgs);
    // -z output is NUL-separated; drop the empty trailing field.
    return output ? output.split('\0').filter((entry) => entry.length > 0) : [];
  } catch {
    return [];
  }
}
/**
 * Collapse all commits between `preHead` and current HEAD into a single commit with the given
 * message. Used by the memory agent to squash N per-tool-call commits into one ingest commit.
 *
 * Author-check guard: if any commit in `preHead..HEAD` has an author name other than
 * `expectedAuthor` (defaults to `author`), the squash is skipped and
 * `{ squashed: false, reason: ... }` is returned. This prevents accidentally collapsing
 * another writer's commits if writes interleaved with ours.
 *
 * Failure safety: the squash works by soft-resetting to `preHead` and committing the
 * staged result. If anything fails after the reset, the branch is soft-reset back to the
 * original HEAD (best-effort) before the error is rethrown, so the commits are not left
 * detached from the branch.
 *
 * Caller is responsible for holding the `config:repo` lock so writes and squash serialize.
 *
 * @param preHead SHA recorded before the writes started; empty string means "no starting commit".
 * @param options Message and author for the squashed commit, plus an optional `expectedAuthor`.
 * @returns `squashed` flag, the resulting HEAD (or null when there is none), an optional
 *          `reason`, and the number of commits in the range when it was inspected.
 * @throws Error prefixed with "Failed to squash commits:" on git failure.
 */
async squashTo(
  preHead: string,
  options: { message: string; author: string; authorEmail: string; expectedAuthor?: string },
): Promise<{ squashed: boolean; commitHash: string | null; reason?: string; squashedCount?: number }> {
  const { message, author, authorEmail } = options;
  const expectedAuthor = options.expectedAuthor ?? author;
  if (!preHead) {
    return { squashed: false, commitHash: null, reason: 'no pre-head recorded (empty repo at start)' };
  }
  let currentHead: string;
  try {
    currentHead = (await this.git.revparse(['HEAD'])).trim();
  } catch {
    return { squashed: false, commitHash: null, reason: 'no HEAD (repo is empty)' };
  }
  if (currentHead === preHead) {
    return { squashed: false, commitHash: preHead, reason: 'no new commits' };
  }
  try {
    // simple-git builds a symmetric `from...to` range unless told otherwise; we want the
    // plain `preHead..HEAD` range, i.e. exactly the commits a reset to preHead would drop.
    const log = await this.git.log({ from: preHead, to: 'HEAD', symmetric: false });
    const commits = log.all;
    if (commits.length === 0) {
      return { squashed: false, commitHash: preHead, reason: 'no new commits' };
    }
    const foreign = commits.find((c) => c.author_name !== expectedAuthor);
    if (foreign) {
      this.logger.warn(
        `Skipping squash: commit ${foreign.hash.substring(0, 8)} authored by "${foreign.author_name}" ` +
          `differs from expected "${expectedAuthor}". Leaving ${commits.length} commit(s) as-is.`,
      );
      return {
        squashed: false,
        commitHash: currentHead,
        reason: `foreign commit by ${foreign.author_name}`,
        squashedCount: commits.length,
      };
    }
    // Soft reset to preHead, then produce a single commit with all the staged changes.
    await this.git.reset(['--soft', preHead]);
    let newHead: string;
    try {
      const staged = await this.git.diff(['--cached', '--name-only']);
      if (!staged.trim()) {
        // All intervening commits cancelled each other out — stay at preHead and commit nothing.
        return { squashed: true, commitHash: preHead, reason: 'no net changes', squashedCount: commits.length };
      }
      await this.git.commit(message, { '--author': `${author} <${authorEmail}>` });
      newHead = (await this.git.revparse(['HEAD'])).trim();
    } catch (error) {
      // Put the branch back where it was; the index still matches that commit's tree
      // because the earlier reset was soft.
      await this.git.raw(['reset', '--soft', currentHead]).catch(() => undefined);
      throw error;
    }
    this.logger.log(
      `squashTo: collapsed ${commits.length} commit(s) into ${newHead.substring(0, 8)} (was ${currentHead.substring(0, 8)})`,
    );
    return { squashed: true, commitHash: newHead, squashedCount: commits.length };
  } catch (error) {
    this.logger.error('Failed to squash commits', error);
    throw new Error(`Failed to squash commits: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Squash-merge `branch` into the currently-checked-out branch of THIS worktree (the
 * main worktree, when called on the root GitService instance), producing a single
 * commit with the given message/author.
 *
 * Outcomes:
 * - Nothing to merge (the branch has no changes since its merge base with HEAD, or the
 *   merge stages nothing): `{ ok: true, squashSha: <current HEAD>, touchedPaths: [] }`.
 * - Clean merge: `{ ok: true, squashSha: <new HEAD>, touchedPaths }`.
 * - The merge command failed or left unmerged paths: the merge is aborted, the worktree
 *   is hard-reset to HEAD, and `{ ok: false, conflict: true, conflictPaths }` is returned.
 *   NOTE(review): a merge failure that is not a textual conflict is also reported this
 *   way, with an empty `conflictPaths`.
 *
 * Caller must hold the `config:repo` lock so interactive writes don't race against the
 * merge window.
 *
 * @param branch Branch to merge into the current branch.
 * @param author Author name for the squash commit.
 * @param authorEmail Author email for the squash commit.
 * @param commitMessage Message for the squash commit.
 */
async squashMergeIntoMain(
  branch: string,
  author: string,
  authorEmail: string,
  commitMessage: string,
): Promise<SquashMergeResult> {
  // For `git diff`, `A..B` is just `A B` (a comparison of the two trees), which would also
  // list files that changed only on main after the branch forked. `A...B` diffs from the
  // merge base of A and B to B — i.e. exactly the branch-side changes the squash applies.
  const diff = await this.git.raw(['diff', '--name-only', `HEAD...${branch}`]);
  const touchedPaths = diff
    .split('\n')
    .map((l) => l.trim())
    .filter(Boolean);
  if (touchedPaths.length === 0) {
    const head = (await this.git.revparse(['HEAD'])).trim();
    return { ok: true, squashSha: head, touchedPaths: [] };
  }
  // `git merge --squash` may NOT throw on a textual conflict — it stages the clean
  // hunks and leaves conflicted paths unmerged in the index. simple-git may also
  // throw if the underlying git exits non-zero. Handle both: try the merge, then
  // independently inspect the index for unmerged paths before committing.
  let mergeError: unknown = null;
  try {
    await this.git.raw(['merge', '--squash', branch]);
  } catch (error) {
    mergeError = error;
  }
  const unmergedOut = await this.git.raw(['diff', '--name-only', '--diff-filter=U']).catch(() => '');
  const conflictPaths = unmergedOut
    .split('\n')
    .map((l) => l.trim())
    .filter(Boolean);
  if (conflictPaths.length > 0 || mergeError !== null) {
    // `merge --abort` only works for an in-progress merge; squash sets MERGE_MSG but not
    // MERGE_HEAD, so fall back to a hard reset which clears the index and worktree.
    await this.git.raw(['merge', '--abort']).catch(() => undefined);
    await this.git.raw(['reset', '--hard', 'HEAD']).catch(() => undefined);
    this.logger.warn(
      `squashMergeIntoMain: conflict merging ${branch} — aborted. conflictPaths=${conflictPaths.join(',')}` +
        (mergeError ? ` error=${mergeError instanceof Error ? mergeError.message : String(mergeError)}` : ''),
    );
    return { ok: false, conflict: true, conflictPaths };
  }
  // The branch's changes may already be present on main (identical edits on both sides);
  // the merge then stages nothing and there is nothing to commit.
  const staged = await this.git.diff(['--cached', '--name-only']);
  if (!staged.trim()) {
    const head = (await this.git.revparse(['HEAD'])).trim();
    return { ok: true, squashSha: head, touchedPaths: [] };
  }
  await this.git.commit(commitMessage, { '--author': `${author} <${authorEmail}>` });
  const squashSha = (await this.git.revparse(['HEAD'])).trim();
  return { ok: true, squashSha, touchedPaths };
}
/**
 * Move the current branch's HEAD to `targetSha` with `git reset --hard`, discarding all
 * later commits and any uncommitted worktree changes. Used by Stage-3 to back out a failed
 * work-unit's commits on the session worktree; simpler and more robust than `git revert`
 * over a multi-commit range, which can pause the sequencer on conflicts.
 *
 * @throws Whatever the underlying git call throws (not wrapped).
 */
async resetHardTo(targetSha: string): Promise<void> {
  const resetArgs = ['reset', '--hard', targetSha];
  await this.git.raw(resetArgs);
}
/**
 * Throws if the worktree is in a state that would make a downstream merge unsafe: an
 * in-progress merge, rebase, cherry-pick, revert, interrupted sequencer operation, or
 * unmerged paths in the index.
 *
 * Each in-progress marker is located with `git rev-parse --git-path`, which may print
 * either an absolute path or one relative to the directory git runs in; relative results
 * are resolved against `configDir`.
 *
 * @throws Error naming the marker file or the unmerged paths (at most five are listed).
 */
async assertWorktreeClean(): Promise<void> {
  const inProgressMarkers: ReadonlyArray<{ relPath: string; label: string }> = [
    { relPath: 'MERGE_HEAD', label: 'MERGE_HEAD' },
    { relPath: 'REBASE_HEAD', label: 'REBASE_HEAD' },
    { relPath: 'CHERRY_PICK_HEAD', label: 'CHERRY_PICK_HEAD' },
    { relPath: 'REVERT_HEAD', label: 'REVERT_HEAD' },
    { relPath: 'sequencer/todo', label: 'sequencer (interrupted multi-commit op)' },
  ];
  for (const { relPath, label } of inProgressMarkers) {
    const gitPath = (await this.git.raw(['rev-parse', '--git-path', relPath])).trim();
    // `isAbsolute` also recognizes drive-letter paths, unlike a leading-slash check.
    const fullPath = isAbsolute(gitPath) ? gitPath : join(this.configDir, gitPath);
    if (await this.fileExists(fullPath)) {
      throw new Error(
        `Worktree has in-progress git operation (${label} present at ${fullPath}); refusing to proceed`,
      );
    }
  }
  // A failing diff is treated as "no unmerged paths" (best-effort check).
  const unmerged = (await this.git.raw(['diff', '--name-only', '--diff-filter=U']).catch(() => ''))
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);
  if (unmerged.length > 0) {
    throw new Error(
      `Worktree has ${unmerged.length} unmerged path(s): ${unmerged.slice(0, 5).join(', ')}; refusing to proceed`,
    );
  }
}
/**
 * Report whether `path` is accessible on disk.
 *
 * @returns true when `fs.access` succeeds, false when it fails for any reason.
 */
private async fileExists(path: string): Promise<boolean> {
  let accessible = true;
  try {
    await fs.access(path);
  } catch {
    accessible = false;
  }
  return accessible;
}
/**
 * Create a worktree at `path` on a new branch `branch` that starts at `startSha`
 * (`git worktree add -b <branch> <path> <startSha>`).
 * Used by the memory agent to isolate per-session writes from interactive saves on main.
 *
 * @throws Error prefixed with "Failed to add worktree at <path>:" if git rejects the request.
 */
async addWorktree(path: string, branch: string, startSha: string): Promise<void> {
  const worktreeArgs = ['worktree', 'add', '-b', branch, path, startSha];
  try {
    await this.git.raw(worktreeArgs);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to add worktree at ${path}: ${detail}`);
  }
}
/**
 * Remove the worktree at `path` (`git worktree remove --force`). `--force` is used because
 * session worktrees are klo-internal: a clean working tree is not required.
 *
 * Never throws: if the removal fails, a warning is logged and `git worktree prune` is
 * attempted instead, with its own failure ignored.
 */
async removeWorktree(path: string): Promise<void> {
  try {
    await this.git.raw(['worktree', 'remove', '--force', path]);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    this.logger.warn(`removeWorktree failed for ${path}: ${reason} — attempting prune`);
    await this.git.raw(['worktree', 'prune']).catch(() => undefined);
  }
}
/**
 * List all worktrees attached to this repo, parsed from `git worktree list --porcelain`.
 * The main worktree is included.
 *
 * A record starts at each `worktree ` line; its `HEAD ` and `branch ` lines fill in the
 * remaining fields, and any other line is ignored. Missing fields are reported as null.
 *
 * @throws Whatever the underlying git call throws (not wrapped).
 */
async listWorktrees(): Promise<WorktreeEntry[]> {
  const porcelain = await this.git.raw(['worktree', 'list', '--porcelain']);
  const entries: WorktreeEntry[] = [];
  let pending: Partial<WorktreeEntry> = {};

  // Emit the record collected so far, if it has a path.
  const flushPending = (): void => {
    const { path, branch, head } = pending;
    if (path) {
      entries.push({ path, branch: branch ?? null, head: head ?? null });
    }
  };

  const WORKTREE_PREFIX = 'worktree ';
  const HEAD_PREFIX = 'HEAD ';
  const BRANCH_PREFIX = 'branch ';
  for (const line of porcelain.split('\n')) {
    if (line.startsWith(WORKTREE_PREFIX)) {
      flushPending();
      pending = { path: line.slice(WORKTREE_PREFIX.length) };
    } else if (line.startsWith(HEAD_PREFIX)) {
      pending.head = line.slice(HEAD_PREFIX.length);
    } else if (line.startsWith(BRANCH_PREFIX)) {
      pending.branch = line.slice(BRANCH_PREFIX.length);
    }
  }
  // The last record has no following `worktree ` line to trigger its flush.
  flushPending();
  return entries;
}
/**
 * Delete a local branch with `git branch -d`, or `git branch -D` when `force` is true.
 *
 * @param branch - Branch name passed to git unchanged.
 * @param force - Selects `-D` instead of `-d`; defaults to `false`.
 */
async deleteBranch(branch: string, force = false): Promise<void> {
  const deleteFlag = force ? '-D' : '-d';
  await this.git.raw(['branch', deleteFlag, branch]);
}
/**
 * Lightweight factory: build a GitService from this instance's config and logger, then
 * point its simple-git client and `configDir` at `workdir`. Used by memory-agent session
 * worktrees. `onModuleInit` is NOT run on the returned instance (the main instance has
 * already initialized the repo).
 *
 * @param workdir - Directory of the worktree the returned service should operate in.
 */
forWorktree(workdir: string): GitService {
  const worktreeService = new GitService(this.config, this.logger);
  worktreeService.git = createSimpleGit(workdir);
  worktreeService.configDir = workdir;
  return worktreeService;
}
/**
 * Recursively `git rm` a directory and commit the removal.
 *
 * @param directoryPath - Path handed to `git rm -r`.
 * @param commitMessage - Message for the deletion commit.
 * @param author - Author name used in the `--author` flag.
 * @param authorEmail - Author email used in the `--author` flag.
 * @returns Details of the newest log entry after the commit, with `created: true`.
 * @throws Error prefixed with "Failed to delete directory:" for any failure in the
 *   sequence; the failure is logged before being rethrown.
 */
async deleteDirectory(
  directoryPath: string,
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  try {
    // Stage the recursive removal, then commit it under the requested author.
    await this.git.rm(['-r', directoryPath]);
    const authorFlag = `${author} <${authorEmail}>`;
    const commitResult = await this.git.commit(commitMessage, { '--author': authorFlag });
    if (!commitResult.commit) {
      throw new Error('No commit hash returned');
    }

    // Read the commit back from the log to obtain its full metadata.
    const { latest } = await this.git.log({ maxCount: 1 });
    if (!latest) {
      throw new Error('Failed to retrieve commit details');
    }
    const { hash, date } = latest;
    return {
      commitHash: hash,
      shortHash: hash.substring(0, 8),
      message: latest.message,
      author: latest.author_name,
      authorEmail: latest.author_email,
      timestamp: date,
      committedDate: new Date(date).toISOString(),
      created: true,
    };
  } catch (error) {
    this.logger.error(`Failed to delete directory ${directoryPath}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to delete directory: ${reason}`);
  }
}
/**
 * Remove multiple directories recursively and commit them as one change.
 * A path whose `git rm -r` fails is logged as a warning and skipped (useful for GC
 * where the DB-known path has already been evicted by a previous run).
 *
 * @returns Commit details with `created: true` when at least one path was removed.
 *   When the input is empty or every removal was skipped, returns `created: false`
 *   with empty `commitHash` / `shortHash`, the caller-supplied message and author, and
 *   the current time as both timestamps.
 * @throws Error when the commit yields no hash or the log has no latest entry.
 */
async deleteDirectories(
  directoryPaths: string[],
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  // Shared shape for both "nothing to do" exits.
  const nothingCommitted = (): GitCommitInfo => ({
    commitHash: '',
    shortHash: '',
    message: commitMessage,
    author,
    authorEmail,
    timestamp: new Date().toISOString(),
    committedDate: new Date().toISOString(),
    created: false,
  });

  if (directoryPaths.length === 0) {
    return nothingCommitted();
  }

  let removedCount = 0;
  for (const target of directoryPaths) {
    try {
      await this.git.rm(['-r', target]);
      removedCount += 1;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.warn(`deleteDirectories: skipping ${target}: ${reason}`);
    }
  }
  if (removedCount === 0) {
    return nothingCommitted();
  }

  const commitResult = await this.git.commit(commitMessage, { '--author': `${author} <${authorEmail}>` });
  if (!commitResult.commit) {
    throw new Error('No commit hash returned from deleteDirectories');
  }
  const { latest } = await this.git.log({ maxCount: 1 });
  if (!latest) {
    throw new Error('Failed to retrieve commit details after deleteDirectories');
  }
  return {
    commitHash: latest.hash,
    shortHash: latest.hash.substring(0, 8),
    message: latest.message,
    author: latest.author_name,
    authorEmail: latest.author_email,
    timestamp: latest.date,
    committedDate: new Date(latest.date).toISOString(),
    created: true,
  };
}
}

View file

@ -0,0 +1,27 @@
export type { KloCoreConfig, KloGitConfig, KloLogger, KloStorageConfig } from './config.js';
export { noopLogger, resolveConfigDir, resolveWorktreesDir } from './config.js';
export { resolveKloConfigReference, resolveKloHomePath } from './config-reference.js';
export type { KloEmbeddingPort } from './embedding.js';
export {
REDACTED_KLO_CREDENTIAL_VALUE,
redactKloSensitiveMetadata,
redactKloSensitiveText,
redactKloSensitiveValue,
} from './redaction.js';
export type {
KloFileHistoryEntry,
KloFileListResult,
KloFileReadResult,
KloFileStorePort,
KloFileWriteResult,
} from './file-store.js';
export type { GitCommitInfo, SquashMergeResult, WorktreeEntry } from './git.service.js';
export { GitService } from './git.service.js';
export type {
SentinelPayload,
SessionOutcome,
SessionWorktree,
SessionWorktreeServiceDeps,
WorktreeConfigPort,
} from './session-worktree.service.js';
export { SessionWorktreeService } from './session-worktree.service.js';

View file

@ -0,0 +1,47 @@
/** Placeholder written in place of any redacted field value or URL password. */
export const REDACTED_KLO_CREDENTIAL_VALUE = '<redacted>';
// Case-insensitive and unanchored: a key is treated as sensitive when any of these
// fragments appears anywhere in it (so e.g. any key containing "url" or "token" matches).
const SENSITIVE_FIELD_NAME = /(password|secret|token|api[_-]?key|private[_-]?key|passphrase|credential|authorization|url)/i;
// Matches `scheme://user:password@`. Group 1 is `scheme://user:`, group 2 the password
// (no `@`, `/` or whitespace), group 3 the `@`. Global, so every occurrence in a text is found.
const URL_CREDENTIAL_PATTERN = /([a-z][a-z0-9+.-]*:\/\/[^:\s/@]+:)([^@\s/]+)(@)/gi;
/** Type guard: `true` for any non-null, non-array value whose `typeof` is `'object'`. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
/** `true` when `key` contains one of the fragments listed in `SENSITIVE_FIELD_NAME`. */
function isSensitiveField(key: string): boolean {
  const looksSensitive = SENSITIVE_FIELD_NAME.test(key);
  return looksSensitive;
}
/**
 * Redact a single value according to the name of the field that holds it.
 *
 * - Sensitive key: the whole value is replaced by `REDACTED_KLO_CREDENTIAL_VALUE`.
 * - Array under a non-sensitive key: each element is processed with the same key.
 * - Record under a non-sensitive key: processed by `redactKloSensitiveMetadata`.
 * - Anything else is returned unchanged.
 */
export function redactKloSensitiveValue(key: string, value: unknown): unknown {
  if (isSensitiveField(key)) {
    return REDACTED_KLO_CREDENTIAL_VALUE;
  }
  if (Array.isArray(value)) {
    return value.map((element) => redactKloSensitiveValue(key, element));
  }
  return isRecord(value) ? redactKloSensitiveMetadata(value) : value;
}
/**
 * Return a redacted copy of `metadata`; the input object is not modified.
 *
 * Array values keep their array shape even under a sensitive key: record elements are
 * redacted field by field, every other element goes through `redactKloSensitiveValue`
 * with the owning key. Non-array values are handed to `redactKloSensitiveValue` directly.
 */
export function redactKloSensitiveMetadata(metadata: Record<string, unknown>): Record<string, unknown> {
  const result: Record<string, unknown> = {};
  for (const [field, fieldValue] of Object.entries(metadata)) {
    if (!Array.isArray(fieldValue)) {
      // Records and scalars share one path; the helper decides how to treat each.
      result[field] = redactKloSensitiveValue(field, fieldValue);
      continue;
    }
    result[field] = fieldValue.map((element) => {
      if (isRecord(element)) {
        return redactKloSensitiveMetadata(element);
      }
      return redactKloSensitiveValue(field, element);
    });
  }
  return result;
}
/**
 * Replace the password part of every `scheme://user:password@` occurrence in `value`
 * with `REDACTED_KLO_CREDENTIAL_VALUE`, keeping the scheme, user and `@`.
 */
export function redactKloSensitiveText(value: string): string {
  const replacement = `$1${REDACTED_KLO_CREDENTIAL_VALUE}$3`;
  return value.replace(URL_CREDENTIAL_PATTERN, replacement);
}

View file

@ -0,0 +1,124 @@
import { mkdtemp, realpath, rm, stat } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import type { KloCoreConfig } from './config.js';
import { GitService } from './git.service.js';
import { SessionWorktreeService, type WorktreeConfigPort } from './session-worktree.service.js';
/**
 * Minimal config stand-in for these tests: satisfies `WorktreeConfigPort` and carries
 * the `workdir` a scoped copy was created for (unset on the root instance).
 */
interface TestWorktreeConfig extends WorktreeConfigPort<TestWorktreeConfig> {
workdir?: string;
}
// SessionWorktreeService glues a real GitService to a scoped config adapter.
// Every test runs against a freshly initialised repo in its own temp directory.
describe('SessionWorktreeService', () => {
  let sessionService: SessionWorktreeService<TestWorktreeConfig>;
  let gitService: GitService;
  let homeDir: string;

  /** HEAD of the main repo; fails the test when the repo has no commit yet. */
  const requireBaseSha = async (): Promise<string> => {
    const sha = await gitService.revParseHead();
    if (!sha) {
      throw new Error('no base sha');
    }
    return sha;
  };

  beforeEach(async () => {
    // realpath so the paths we compare against match what git reports.
    homeDir = await realpath(await mkdtemp(join(tmpdir(), 'sws-spec-')));

    const coreConfig: KloCoreConfig = {
      storage: { configDir: homeDir, homeDir },
      git: {
        userName: 'System User',
        userEmail: 'system@example.com',
        bootstrapMessage: 'Initialize test config repo',
        bootstrapAuthor: 'test-system',
        bootstrapAuthorEmail: 'system@example.com',
      },
    };

    gitService = new GitService(coreConfig);
    await gitService.onModuleInit();

    const configService: TestWorktreeConfig = {
      forWorktree: vi.fn(
        (workdir: string): TestWorktreeConfig => ({ workdir, forWorktree: configService.forWorktree }),
      ),
    };

    sessionService = new SessionWorktreeService({ coreConfig, gitService, configService });
  });

  afterEach(async () => {
    await rm(homeDir, { recursive: true, force: true });
  });

  describe('create', () => {
    it('creates a worktree + branch and returns scoped services', async () => {
      const baseSha = await requireBaseSha();

      const session = await sessionService.create('chat-abc', baseSha);

      expect(session.workdir).toBe(join(homeDir, '.worktrees', 'session-chat-abc'));
      expect(session.branch).toBe('session/chat-abc');
      expect(session.baseSha).toBe(baseSha);

      const workdirStats = await stat(session.workdir);
      expect(workdirStats.isDirectory()).toBe(true);

      // Scoped git instance reports the worktree's HEAD (= baseSha at creation time).
      expect(await session.git.revParseHead()).toBe(baseSha);

      const worktrees = await gitService.listWorktrees();
      expect(worktrees.find((entry) => entry.path === session.workdir)).toBeTruthy();
    });

    it('appends a timestamp suffix when the primary dir already exists', async () => {
      const baseSha = await requireBaseSha();

      const first = await sessionService.create('chat-dup', baseSha);
      const second = await sessionService.create('chat-dup', baseSha);

      expect(first.workdir).not.toBe(second.workdir);
      expect(second.branch).toMatch(/^session\/chat-dup-\d+$/);
    });
  });

  describe('cleanup', () => {
    it('success removes the worktree dir and deletes the branch', async () => {
      const baseSha = await requireBaseSha();
      const session = await sessionService.create('chat-cleanup-ok', baseSha);

      await sessionService.cleanup(session, 'success');

      const worktrees = await gitService.listWorktrees();
      expect(worktrees.find((entry) => entry.path === session.workdir)).toBeFalsy();
      await expect(stat(session.workdir)).rejects.toThrow();
    });

    it('conflict keeps the worktree and writes a sentinel file', async () => {
      const baseSha = await requireBaseSha();
      const session = await sessionService.create('chat-cleanup-conflict', baseSha);

      await sessionService.cleanup(session, 'conflict', { conflictPaths: ['shared.yaml'] });

      // Dir still exists.
      await expect(stat(session.workdir)).resolves.toBeTruthy();

      const { readFile } = await import('node:fs/promises');
      const sentinelText = await readFile(join(session.workdir, '.klo-outcome'), 'utf-8');
      const sentinel = JSON.parse(sentinelText);
      expect(sentinel.outcome).toBe('conflict');
      expect(sentinel.chatId).toBe('chat-cleanup-conflict');
      expect(sentinel.conflictPaths).toEqual(['shared.yaml']);
      expect(typeof sentinel.at).toBe('string');
    });
  });
});

View file

@ -0,0 +1,113 @@
import { mkdir, stat, writeFile } from 'node:fs/promises';
import { join } from 'node:path';
import { noopLogger, resolveWorktreesDir, type KloCoreConfig, type KloLogger } from './config.js';
import { GitService } from './git.service.js';
/**
 * How a session ended. `cleanup` removes the worktree and branch for `'success'` and
 * `'empty'`; for the other outcomes it leaves the worktree in place and writes a sentinel file.
 */
export type SessionOutcome = 'success' | 'empty' | 'conflict' | 'crash';
/** JSON document written to `.klo-outcome` inside a worktree that `cleanup` keeps. */
export interface SentinelPayload {
outcome: SessionOutcome;
// ISO-8601 timestamp taken when the sentinel is written.
at: string;
chatId: string;
baseSha: string;
// Only present when the caller of `cleanup` supplied conflict paths.
conflictPaths?: string[];
}
/** A config object that can produce a copy of itself scoped to a worktree directory. */
export interface WorktreeConfigPort<TConfig> {
forWorktree(workdir: string): TConfig;
}
/** Handle returned by `SessionWorktreeService.create`. */
export interface SessionWorktree<TConfig> {
// The session key that was passed to `create`.
chatId: string;
// Absolute path of the worktree directory.
workdir: string;
// Branch created for the worktree (`session/<key>`, possibly with a numeric suffix).
branch: string;
// Commit the branch was created from.
baseSha: string;
createdAt: Date;
// Git service scoped to `workdir`.
git: GitService;
// Config scoped to `workdir`.
config: TConfig;
}
/** Constructor dependencies of `SessionWorktreeService`. */
export interface SessionWorktreeServiceDeps<TConfig extends WorktreeConfigPort<TConfig>> {
coreConfig: KloCoreConfig;
gitService: GitService;
configService: TConfig;
// Falls back to `noopLogger` when omitted.
logger?: KloLogger;
}
/**
 * Creates and disposes of per-session git worktrees under the directory returned by
 * `resolveWorktreesDir(coreConfig)`.
 */
export class SessionWorktreeService<TConfig extends WorktreeConfigPort<TConfig> = WorktreeConfigPort<never>> {
  private readonly logger: KloLogger;
  private readonly worktreesRoot: string;

  constructor(private readonly deps: SessionWorktreeServiceDeps<TConfig>) {
    this.logger = deps.logger ?? noopLogger;
    this.worktreesRoot = resolveWorktreesDir(deps.coreConfig);
  }

  /**
   * Create a worktree named `session-<sessionKey>` on a new branch `session/<sessionKey>`
   * starting at `baseSha`. When that directory already exists, the current epoch
   * milliseconds are appended to both names and a warning is logged.
   *
   * @returns The session handle, including git and config services scoped to the worktree.
   */
  async create(sessionKey: string, baseSha: string): Promise<SessionWorktree<TConfig>> {
    await mkdir(this.worktreesRoot, { recursive: true });

    let uniqueKey = sessionKey;
    try {
      // stat resolving means the primary directory is taken.
      await stat(join(this.worktreesRoot, `session-${sessionKey}`));
      const suffix = Date.now().toString();
      uniqueKey = `${sessionKey}-${suffix}`;
      this.logger.warn(`session worktree collision for key=${sessionKey}; using suffix ${suffix}`);
    } catch {
      // no collision: primary name is free
    }

    const workdir = join(this.worktreesRoot, `session-${uniqueKey}`);
    const branch = `session/${uniqueKey}`;
    await this.deps.gitService.addWorktree(workdir, branch, baseSha);

    return {
      chatId: sessionKey,
      workdir,
      branch,
      baseSha,
      createdAt: new Date(),
      git: this.deps.gitService.forWorktree(workdir),
      config: this.deps.configService.forWorktree(workdir),
    };
  }

  /**
   * Dispose of a session according to its outcome. Never rejects because of a git or
   * filesystem failure: those are logged as warnings.
   *
   * - `'success'` / `'empty'`: remove the worktree, then force-delete its branch.
   * - any other outcome: keep the worktree and write a `.klo-outcome` JSON sentinel into it.
   */
  async cleanup(
    session: SessionWorktree<TConfig>,
    outcome: SessionOutcome,
    extra?: { conflictPaths?: string[] },
  ): Promise<void> {
    const keepForInspection = outcome !== 'success' && outcome !== 'empty';

    if (keepForInspection) {
      const payload: SentinelPayload = {
        outcome,
        at: new Date().toISOString(),
        chatId: session.chatId,
        baseSha: session.baseSha,
        ...(extra?.conflictPaths ? { conflictPaths: extra.conflictPaths } : {}),
      };
      const sentinelPath = join(session.workdir, '.klo-outcome');
      try {
        await writeFile(sentinelPath, JSON.stringify(payload, null, 2), 'utf-8');
      } catch (error) {
        this.logger.warn(
          `cleanup(${outcome}) failed to write sentinel for ${session.chatId}: ${
            error instanceof Error ? error.message : String(error)
          }`,
        );
      }
      return;
    }

    try {
      await this.deps.gitService.removeWorktree(session.workdir);
      await this.deps.gitService.deleteBranch(session.branch, true);
    } catch (error) {
      this.logger.warn(
        `cleanup(${outcome}) failed for ${session.chatId}: ${
          error instanceof Error ? error.message : String(error)
        }`,
      );
    }
  }
}

View file

@ -0,0 +1 @@
export * from './semantic-layer-compute.js';

View file

@ -0,0 +1,339 @@
import { once } from 'node:events';
import { createServer } from 'node:http';
import { describe, expect, it, vi } from 'vitest';
import { createHttpSemanticLayerComputePort, createPythonSemanticLayerComputePort } from './semantic-layer-compute.js';
// Minimal semantic-layer source definition passed to the query and validate calls below.
const source = {
name: 'orders',
table: 'public.orders',
grain: ['id'],
columns: [{ name: 'id', type: 'number' }],
joins: [],
measures: [{ name: 'order_count', expr: 'count(*)' }],
};
// Source-generation input in the camelCase shape accepted by `generateSources`.
const sourceGenerationInput = {
tables: [
{
name: 'orders',
db: 'public',
comment: 'Orders table',
columns: [
{ name: 'id', type: 'integer', primaryKey: true, nullable: false, comment: 'Order ID' },
{ name: 'customer_id', type: 'integer' },
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
],
},
{
name: 'customers',
db: 'public',
columns: [
{ name: 'id', type: 'integer', primaryKey: true },
{ name: 'email', type: 'varchar' },
],
},
],
links: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
relationshipType: 'MANY_TO_ONE',
},
],
dialect: 'postgres',
};
// The same input as the port is expected to send it to the daemon: snake_case keys,
// with optional fields that were absent from the input left out.
const sourceGenerationDaemonPayload = {
tables: [
{
name: 'orders',
db: 'public',
comment: 'Orders table',
columns: [
{ name: 'id', type: 'integer', primary_key: true, nullable: false, comment: 'Order ID' },
{ name: 'customer_id', type: 'integer' },
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
],
},
{
name: 'customers',
db: 'public',
columns: [
{ name: 'id', type: 'integer', primary_key: true },
{ name: 'email', type: 'varchar' },
],
},
],
links: [
{
from_table: 'orders',
from_column: 'customer_id',
to_table: 'customers',
to_column: 'id',
relationship_type: 'MANY_TO_ONE',
},
],
dialect: 'postgres',
};
// Canned daemon reply. `source_count` (2) differs from the number of listed sources (1),
// so the tests below show that the reported count is passed through rather than recomputed.
const sourceGenerationDaemonResponse = {
source_count: 2,
sources: [
{
name: 'orders',
table: 'public.orders',
grain: ['id'],
columns: [{ name: 'id', type: 'number' }],
joins: [
{
to: 'customers',
on: 'customer_id = customers.id',
relationship: 'many_to_one',
},
],
measures: [{ name: 'record_count', expr: 'count(id)' }],
},
],
};
// The stdio port is exercised through an injected `runJson`, so no process is spawned.
describe('createPythonSemanticLayerComputePort', () => {
  it('calls the semantic-query stdio command', async () => {
    const daemonReply = {
      sql: 'select count(*) from public.orders',
      dialect: 'postgres',
      columns: [{ name: 'orders.order_count' }],
      plan: { sources_used: ['orders'] },
    };
    const queryInput = {
      sources: [source],
      dialect: 'postgres',
      query: { measures: ['orders.order_count'], dimensions: [] },
    };
    const runJson = vi.fn(async () => ({
      sql: 'select count(*) from public.orders',
      dialect: 'postgres',
      columns: [{ name: 'orders.order_count' }],
      plan: { sources_used: ['orders'] },
    }));
    const port = createPythonSemanticLayerComputePort({ runJson });

    await expect(port.query(queryInput)).resolves.toEqual(daemonReply);

    expect(runJson).toHaveBeenCalledWith('semantic-query', {
      sources: [source],
      dialect: 'postgres',
      query: { measures: ['orders.order_count'], dimensions: [] },
    });
  });

  it('calls the semantic-validate stdio command', async () => {
    const runJson = vi.fn(async () => ({
      valid: true,
      errors: [],
      warnings: [],
      per_source_warnings: {},
    }));
    const port = createPythonSemanticLayerComputePort({ runJson });

    const validation = port.validateSources({
      sources: [source],
      dialect: 'postgres',
      recentlyTouched: ['orders'],
    });

    // The snake_case daemon field comes back camelCased.
    await expect(validation).resolves.toEqual({
      valid: true,
      errors: [],
      warnings: [],
      perSourceWarnings: {},
    });
    expect(runJson).toHaveBeenCalledWith('semantic-validate', {
      sources: [source],
      dialect: 'postgres',
      recently_touched: ['orders'],
    });
  });

  it('calls the semantic-generate-sources stdio command', async () => {
    const runJson = vi.fn(async () => sourceGenerationDaemonResponse);
    const port = createPythonSemanticLayerComputePort({ runJson });

    const generated = port.generateSources(sourceGenerationInput);

    await expect(generated).resolves.toEqual({
      sourceCount: 2,
      sources: sourceGenerationDaemonResponse.sources,
    });
    expect(runJson).toHaveBeenCalledWith('semantic-generate-sources', sourceGenerationDaemonPayload);
  });
});
// Covers the HTTP port twice: through an injected `requestJson`, and end to end against
// a throwaway local HTTP server.
describe('createHttpSemanticLayerComputePort', () => {
  type RecordedRequest = { url: string | undefined; body: unknown };

  /**
   * Start a local server that records every request (URL + parsed JSON body) and answers
   * each one with `reply` as JSON, run `exercise` against it, then close the server.
   */
  async function withRecordingDaemon(
    reply: unknown,
    exercise: (baseUrl: string, requests: RecordedRequest[]) => Promise<void>,
  ): Promise<void> {
    const requests: RecordedRequest[] = [];
    const server = createServer((request, response) => {
      const chunks: Buffer[] = [];
      request.on('data', (chunk: Buffer) => chunks.push(chunk));
      request.on('end', () => {
        requests.push({
          url: request.url,
          body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
        });
        response.writeHead(200, { 'content-type': 'application/json' });
        response.end(JSON.stringify(reply));
      });
    });
    // Port 0 lets the OS pick a free port.
    server.listen(0, '127.0.0.1');
    await once(server, 'listening');
    try {
      const address = server.address();
      if (!address || typeof address === 'string') {
        throw new Error('expected TCP server address');
      }
      await exercise(`http://127.0.0.1:${address.port}`, requests);
    } finally {
      server.close();
    }
  }

  it('calls semantic query and validate HTTP endpoints through an injected runner', async () => {
    const requestJson = vi.fn(async (path: string) => {
      if (path !== '/semantic-layer/query') {
        return {
          valid: true,
          errors: [],
          warnings: [],
          per_source_warnings: {},
        };
      }
      return {
        sql: 'select count(*) from public.orders',
        dialect: 'postgres',
        columns: [{ name: 'orders.order_count' }],
        plan: { sources_used: ['orders'] },
      };
    });
    const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });

    const queried = port.query({
      sources: [source],
      dialect: 'postgres',
      query: { measures: ['orders.order_count'], dimensions: [] },
    });
    await expect(queried).resolves.toEqual({
      sql: 'select count(*) from public.orders',
      dialect: 'postgres',
      columns: [{ name: 'orders.order_count' }],
      plan: { sources_used: ['orders'] },
    });

    const validated = port.validateSources({
      sources: [source],
      dialect: 'postgres',
      recentlyTouched: ['orders'],
    });
    await expect(validated).resolves.toEqual({
      valid: true,
      errors: [],
      warnings: [],
      perSourceWarnings: {},
    });

    expect(requestJson).toHaveBeenNthCalledWith(1, '/semantic-layer/query', {
      sources: [source],
      dialect: 'postgres',
      query: { measures: ['orders.order_count'], dimensions: [] },
    });
    expect(requestJson).toHaveBeenNthCalledWith(2, '/semantic-layer/validate', {
      sources: [source],
      dialect: 'postgres',
      recently_touched: ['orders'],
    });
  });

  it('calls the semantic source-generation HTTP endpoint through an injected runner', async () => {
    const requestJson = vi.fn(async () => sourceGenerationDaemonResponse);
    const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });

    const generated = port.generateSources(sourceGenerationInput);

    await expect(generated).resolves.toEqual({
      sourceCount: 2,
      sources: sourceGenerationDaemonResponse.sources,
    });
    expect(requestJson).toHaveBeenCalledWith('/semantic-layer/generate-sources', sourceGenerationDaemonPayload);
  });

  it('posts JSON to a running HTTP daemon endpoint', async () => {
    const queryReply = {
      sql: 'select count(*) from public.orders',
      dialect: 'postgres',
      columns: [{ name: 'orders.order_count' }],
      plan: { sources_used: ['orders'] },
    };

    await withRecordingDaemon(queryReply, async (baseUrl, requests) => {
      const port = createHttpSemanticLayerComputePort({ baseUrl });

      const queried = port.query({
        sources: [source],
        dialect: 'postgres',
        query: { measures: ['orders.order_count'], dimensions: [] },
      });

      await expect(queried).resolves.toMatchObject({
        sql: 'select count(*) from public.orders',
        dialect: 'postgres',
      });
      expect(requests).toEqual([
        {
          url: '/semantic-layer/query',
          body: {
            sources: [source],
            dialect: 'postgres',
            query: { measures: ['orders.order_count'], dimensions: [] },
          },
        },
      ]);
    });
  });

  it('posts source-generation JSON to a running HTTP daemon endpoint', async () => {
    await withRecordingDaemon(sourceGenerationDaemonResponse, async (baseUrl, requests) => {
      const port = createHttpSemanticLayerComputePort({ baseUrl });

      await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
        sourceCount: 2,
        sources: sourceGenerationDaemonResponse.sources,
      });
      expect(requests).toEqual([
        {
          url: '/semantic-layer/generate-sources',
          body: sourceGenerationDaemonPayload,
        },
      ]);
    });
  });
});

View file

@ -0,0 +1,304 @@
import { request as httpRequest } from 'node:http';
import { request as httpsRequest } from 'node:https';
import { URL } from 'node:url';
import { spawn } from 'node:child_process';
import type { SemanticLayerQueryInput, SemanticLayerSource } from '../sl/index.js';
/** Result of `KloSemanticLayerComputePort.query`, built from the daemon's `sql`, `dialect`, `columns` and `plan` fields. */
export interface KloSemanticLayerComputeQueryResult {
sql: string;
dialect: string;
columns: Array<Record<string, unknown>>;
plan: Record<string, unknown>;
}
/** Result of `KloSemanticLayerComputePort.validateSources`. */
export interface KloSemanticLayerComputeValidationResult {
valid: boolean;
errors: string[];
warnings: string[];
// Read from the daemon's `per_source_warnings` field.
perSourceWarnings: Record<string, string[]>;
}
/** One column of a table handed to source generation. */
export interface KloSemanticLayerSourceGenerationColumnInput {
name: string;
type: string;
// Sent to the daemon as `primary_key`.
primaryKey?: boolean;
nullable?: boolean;
comment?: string | null;
}
/** One table handed to source generation. */
export interface KloSemanticLayerSourceGenerationTableInput {
name: string;
catalog?: string | null;
db?: string | null;
comment?: string | null;
columns: KloSemanticLayerSourceGenerationColumnInput[];
}
/** A relationship between two table columns; sent to the daemon with snake_case keys. */
export interface KloSemanticLayerSourceGenerationLinkInput {
fromTable: string;
fromColumn: string;
toTable: string;
toColumn: string;
relationshipType: string;
}
/** Complete input of `KloSemanticLayerComputePort.generateSources`. */
export interface KloSemanticLayerSourceGenerationInput {
tables: KloSemanticLayerSourceGenerationTableInput[];
links: KloSemanticLayerSourceGenerationLinkInput[];
// Sent as 'postgres' when omitted.
dialect?: string;
}
/** Result of `KloSemanticLayerComputePort.generateSources`. */
export interface KloSemanticLayerSourceGenerationResult {
sources: Array<Record<string, unknown>>;
// The daemon's `source_count` when it is a number, otherwise the number of returned sources.
sourceCount: number;
}
/** Operations offered by the semantic-layer daemon, independent of the transport used to reach it. */
export interface KloSemanticLayerComputePort {
query(input: {
sources: Array<Record<string, unknown> | SemanticLayerSource>;
query: SemanticLayerQueryInput;
dialect: string;
}): Promise<KloSemanticLayerComputeQueryResult>;
validateSources(input: {
sources: Array<Record<string, unknown> | SemanticLayerSource>;
dialect: string;
recentlyTouched?: string[];
}): Promise<KloSemanticLayerComputeValidationResult>;
generateSources(input: KloSemanticLayerSourceGenerationInput): Promise<KloSemanticLayerSourceGenerationResult>;
}
/** Subcommands understood by the stdio daemon; appended as the last process argument. */
export type KloDaemonCommand = 'semantic-query' | 'semantic-validate' | 'semantic-generate-sources';
/** Runs one daemon subcommand with a JSON payload and resolves with the JSON object it returned. */
export type KloDaemonJsonRunner = (
subcommand: KloDaemonCommand,
payload: Record<string, unknown>,
) => Promise<Record<string, unknown>>;
/** Sends a JSON payload to a daemon HTTP path and resolves with the JSON object it returned. */
export type KloDaemonHttpJsonRunner = (path: string, payload: Record<string, unknown>) => Promise<Record<string, unknown>>;
/** Options for `createPythonSemanticLayerComputePort`. */
export interface PythonSemanticLayerComputeOptions {
// Executable to spawn; defaults to 'python'.
command?: string;
// Arguments placed before the subcommand; defaults to ['-m', 'klo_daemon'].
args?: string[];
cwd?: string;
// Merged over `process.env` for the spawned process.
env?: NodeJS.ProcessEnv;
// Replaces process spawning entirely when provided (the tests inject one).
runJson?: KloDaemonJsonRunner;
}
/** Options for `createHttpSemanticLayerComputePort`. */
export interface HttpSemanticLayerComputeOptions {
baseUrl: string;
// Replaces the built-in HTTP client when provided (the tests inject one).
requestJson?: KloDaemonHttpJsonRunner;
}
/**
 * Parse `raw` as JSON and require the top-level value to be a plain object.
 *
 * @param raw - Text produced by the daemon (process stdout or an HTTP response body).
 * @param subcommand - Daemon subcommand or HTTP path; used only to label error messages.
 * @returns The parsed object.
 * @throws SyntaxError when `raw` is not valid JSON. The message names `subcommand` so
 *   an empty or garbled daemon reply can be traced to the call that produced it.
 * @throws Error when the JSON value is `null`, a primitive, or an array.
 */
function parseJsonObject(raw: string, subcommand: string): Record<string, unknown> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    // Stay a SyntaxError so callers that single out parse failures keep working.
    throw new SyntaxError(`klo-daemon ${subcommand} returned invalid JSON: ${reason}`);
  }
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error(`klo-daemon ${subcommand} returned non-object JSON`);
  }
  return parsed as Record<string, unknown>;
}
/**
 * Build a runner that executes one daemon subcommand per call as a child process.
 *
 * Each call spawns `command` with `[...args, subcommand]`, writes the payload as a single
 * JSON line to the child's stdin, and parses the child's trimmed stdout as a JSON object
 * once the process has closed.
 *
 * The returned promise rejects when the process cannot be spawned, when it closes with
 * a non-zero exit code or because of a signal (stderr is used as the message when there
 * is any), or when stdout is not a JSON object.
 */
function runProcessJson(
  options: Required<Pick<PythonSemanticLayerComputeOptions, 'command' | 'args'>> &
    Pick<PythonSemanticLayerComputeOptions, 'cwd' | 'env'>,
): KloDaemonJsonRunner {
  return async (subcommand: KloDaemonCommand, payload: Record<string, unknown>): Promise<Record<string, unknown>> =>
    new Promise((resolve, reject) => {
      const child = spawn(options.command, [...options.args, subcommand], {
        cwd: options.cwd,
        env: { ...process.env, ...options.env },
        stdio: ['pipe', 'pipe', 'pipe'],
      });
      const stdout: Buffer[] = [];
      const stderr: Buffer[] = [];
      let stdinError: Error | undefined;

      child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk));
      child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk));
      child.on('error', reject);
      // A child that exits before draining stdin makes the pipe emit 'error' (EPIPE).
      // Without a listener that event is thrown as an uncaught exception; remember it
      // instead and let the 'close' handler report the failure.
      child.stdin.on('error', (error: Error) => {
        stdinError = error;
      });
      child.on('close', (code, signal) => {
        const stdoutText = Buffer.concat(stdout).toString('utf8').trim();
        const stderrText = Buffer.concat(stderr).toString('utf8').trim();
        if (code !== 0) {
          // `code` is null when the process was terminated by a signal.
          const ending = signal ? `terminated by signal ${signal}` : `exit code ${code}`;
          const fallback = stdinError ? `${ending}; stdin write failed: ${stdinError.message}` : ending;
          reject(new Error(`klo-daemon ${subcommand} failed: ${stderrText || fallback}`));
          return;
        }
        try {
          resolve(parseJsonObject(stdoutText, subcommand));
        } catch (error) {
          reject(error);
        }
      });
      child.stdin.end(`${JSON.stringify(payload)}\n`);
    });
}
/** Return `baseUrl` with exactly one trailing slash appended when it does not already end in `/`. */
function normalizedBaseUrl(baseUrl: string): string {
  if (baseUrl.endsWith('/')) {
    return baseUrl;
  }
  return `${baseUrl}/`;
}
/**
 * Build a runner that POSTs a JSON payload to `path` relative to `baseUrl` and resolves
 * with the JSON object in the response body.
 *
 * The leading slash of `path` is dropped and the base URL is given a trailing slash, so
 * the path is resolved underneath any path prefix the base URL carries. The `https`
 * client is used for `https:` URLs, the `http` client otherwise.
 *
 * The returned promise rejects on a request error, on a response stream error, on a
 * status outside 200-299 (the message includes the status and body), or when the body is
 * not a JSON object.
 */
function postJson(baseUrl: string): KloDaemonHttpJsonRunner {
  return async (path, payload) =>
    new Promise((resolve, reject) => {
      const target = new URL(path.replace(/^\//, ''), normalizedBaseUrl(baseUrl));
      const body = JSON.stringify(payload);
      const client = target.protocol === 'https:' ? httpsRequest : httpRequest;
      const request = client(
        target,
        {
          method: 'POST',
          headers: {
            accept: 'application/json',
            'content-type': 'application/json',
            'content-length': Buffer.byteLength(body),
          },
        },
        (response) => {
          const chunks: Buffer[] = [];
          response.on('data', (chunk: Buffer) => chunks.push(chunk));
          // A response that fails mid-body never emits 'end'; reject so the caller is
          // not left waiting on a promise that can no longer settle.
          response.on('error', reject);
          response.on('end', () => {
            const text = Buffer.concat(chunks).toString('utf8');
            const statusCode = response.statusCode ?? 0;
            if (statusCode < 200 || statusCode >= 300) {
              reject(new Error(`klo-daemon HTTP ${path} failed with ${statusCode}: ${text}`));
              return;
            }
            try {
              resolve(parseJsonObject(text, path));
            } catch (error) {
              reject(error);
            }
          });
        },
      );
      request.on('error', reject);
      request.end(body);
    });
}
/** Keep only the string elements of `value`; anything that is not an array yields `[]`. */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const strings: string[] = [];
  for (const element of value) {
    if (typeof element === 'string') {
      strings.push(element);
    }
  }
  return strings;
}
/** Return `value` itself when it is a non-null, non-array object; otherwise a new empty object. */
function recordValue(value: unknown): Record<string, unknown> {
  if (typeof value !== 'object' || value === null || Array.isArray(value)) {
    return {};
  }
  return value as Record<string, unknown>;
}
/**
 * Keep only the elements of `value` that are non-null, non-array objects; anything that
 * is not an array yields `[]`. Kept elements are the original object references.
 */
function recordArray(value: unknown): Array<Record<string, unknown>> {
  if (!Array.isArray(value)) {
    return [];
  }
  const records: Array<Record<string, unknown>> = [];
  for (const element of value) {
    const isPlainObject = element !== null && typeof element === 'object' && !Array.isArray(element);
    if (isPlainObject) {
      records.push(element as Record<string, unknown>);
    }
  }
  return records;
}
/**
 * Convert the camelCase source-generation input into the snake_case payload sent to the
 * daemon. Optional table and column fields are included only when they are not
 * `undefined` (so an explicit `null` is forwarded); `dialect` falls back to 'postgres'.
 */
function sourceGenerationPayload(input: KloSemanticLayerSourceGenerationInput): Record<string, unknown> {
  const tables = input.tables.map((table) => {
    const tablePayload: Record<string, unknown> = { name: table.name };
    if (table.catalog !== undefined) {
      tablePayload.catalog = table.catalog;
    }
    if (table.db !== undefined) {
      tablePayload.db = table.db;
    }
    if (table.comment !== undefined) {
      tablePayload.comment = table.comment;
    }
    tablePayload.columns = table.columns.map((column) => {
      const columnPayload: Record<string, unknown> = { name: column.name, type: column.type };
      if (column.primaryKey !== undefined) {
        columnPayload.primary_key = column.primaryKey;
      }
      if (column.nullable !== undefined) {
        columnPayload.nullable = column.nullable;
      }
      if (column.comment !== undefined) {
        columnPayload.comment = column.comment;
      }
      return columnPayload;
    });
    return tablePayload;
  });

  const links = input.links.map(({ fromTable, fromColumn, toTable, toColumn, relationshipType }) => ({
    from_table: fromTable,
    from_column: fromColumn,
    to_table: toTable,
    to_column: toColumn,
    relationship_type: relationshipType,
  }));

  return { tables, links, dialect: input.dialect ?? 'postgres' };
}
/**
 * Map the daemon's source-generation reply to the port result. `sourceCount` is the
 * daemon's `source_count` when that is a number, otherwise the number of object entries
 * found in `sources`.
 */
function sourceGenerationResult(raw: Record<string, unknown>): KloSemanticLayerSourceGenerationResult {
  const sources = recordArray(raw.sources);
  const reportedCount = raw.source_count;
  const sourceCount = typeof reportedCount === 'number' ? reportedCount : sources.length;
  return { sources, sourceCount };
}
/**
 * Create a compute port that talks to the daemon over stdio, one process per call.
 *
 * @param options - `command` defaults to 'python' and `args` to ['-m', 'klo_daemon'].
 *   When `options.runJson` is supplied it is used for every call in place of the
 *   built-in process runner.
 * @returns A port whose methods send the `semantic-query`, `semantic-validate` and
 *   `semantic-generate-sources` subcommands and normalise the replies: fields that are
 *   missing or of the wrong type fall back to '' / the requested dialect / empty
 *   collections, and `valid` is true only when the daemon returned exactly `true`.
 */
export function createPythonSemanticLayerComputePort(
  options: PythonSemanticLayerComputeOptions = {},
): KloSemanticLayerComputePort {
  const command = options.command ?? 'python';
  const args = options.args ?? ['-m', 'klo_daemon'];
  const runJson = options.runJson ?? runProcessJson({ command, args, cwd: options.cwd, env: options.env });
  return {
    async query(input) {
      const raw = await runJson('semantic-query', {
        sources: input.sources,
        dialect: input.dialect,
        query: input.query,
      });
      return {
        sql: typeof raw.sql === 'string' ? raw.sql : '',
        dialect: typeof raw.dialect === 'string' ? raw.dialect : input.dialect,
        columns: recordArray(raw.columns),
        plan: recordValue(raw.plan),
      };
    },
    async validateSources(input) {
      const raw = await runJson('semantic-validate', {
        sources: input.sources,
        dialect: input.dialect,
        recently_touched: input.recentlyTouched,
      });
      // Check each per-source list the same way as the top-level lists, so the declared
      // `Record<string, string[]>` type actually holds for whatever the daemon sent.
      const perSourceWarnings: Record<string, string[]> = Object.fromEntries(
        Object.entries(recordValue(raw.per_source_warnings)).map(
          ([sourceName, sourceWarnings]): [string, string[]] => [sourceName, stringArray(sourceWarnings)],
        ),
      );
      return {
        valid: raw.valid === true,
        errors: stringArray(raw.errors),
        warnings: stringArray(raw.warnings),
        perSourceWarnings,
      };
    },
    async generateSources(input) {
      const raw = await runJson('semantic-generate-sources', sourceGenerationPayload(input));
      return sourceGenerationResult(raw);
    },
  };
}
/**
 * Create a compute port that talks to a running daemon over HTTP.
 *
 * @param options - `baseUrl` of the daemon. When `options.requestJson` is supplied it is
 *   used for every call in place of the built-in HTTP client.
 * @returns A port whose methods POST to `/semantic-layer/query`,
 *   `/semantic-layer/validate` and `/semantic-layer/generate-sources` and normalise the
 *   replies: fields that are missing or of the wrong type fall back to '' / the requested
 *   dialect / empty collections, and `valid` is true only when the daemon returned
 *   exactly `true`.
 */
export function createHttpSemanticLayerComputePort(
  options: HttpSemanticLayerComputeOptions,
): KloSemanticLayerComputePort {
  const requestJson = options.requestJson ?? postJson(options.baseUrl);
  return {
    async query(input) {
      const raw = await requestJson('/semantic-layer/query', {
        sources: input.sources,
        dialect: input.dialect,
        query: input.query,
      });
      return {
        sql: typeof raw.sql === 'string' ? raw.sql : '',
        dialect: typeof raw.dialect === 'string' ? raw.dialect : input.dialect,
        columns: recordArray(raw.columns),
        plan: recordValue(raw.plan),
      };
    },
    async validateSources(input) {
      const raw = await requestJson('/semantic-layer/validate', {
        sources: input.sources,
        dialect: input.dialect,
        recently_touched: input.recentlyTouched,
      });
      // Check each per-source list the same way as the top-level lists, so the declared
      // `Record<string, string[]>` type actually holds for whatever the daemon sent.
      const perSourceWarnings: Record<string, string[]> = Object.fromEntries(
        Object.entries(recordValue(raw.per_source_warnings)).map(
          ([sourceName, sourceWarnings]): [string, string[]] => [sourceName, stringArray(sourceWarnings)],
        ),
      );
      return {
        valid: raw.valid === true,
        errors: stringArray(raw.errors),
        warnings: stringArray(raw.warnings),
        perSourceWarnings,
      };
    },
    async generateSources(input) {
      const raw = await requestJson('/semantic-layer/generate-sources', sourceGenerationPayload(input));
      return sourceGenerationResult(raw);
    },
  };
}

View file

@ -0,0 +1,12 @@
import { describe, expect, it } from 'vitest';
import { kloContextPackageInfo } from './index.js';
// Guards the exported package identity against accidental edits.
describe('kloContextPackageInfo', () => {
  it('identifies the context package', () => {
    const expectedInfo = {
      name: '@klo/context',
      version: '0.0.0-private',
    };

    expect(kloContextPackageInfo).toEqual(expectedInfo);
  });
});

View file

@ -0,0 +1,144 @@
/** Identity of this package. Both fields are literal types, so only these exact values type-check. */
export interface KloContextPackageInfo {
name: '@klo/context';
version: '0.0.0-private';
}
/** Runtime constant carrying the package name and version. */
export const kloContextPackageInfo: KloContextPackageInfo = {
name: '@klo/context',
version: '0.0.0-private',
};
export * from './agent/index.js';
export * from './core/index.js';
export * from './daemon/index.js';
export * from './ingest/index.js';
export * from './llm/index.js';
export type {
CaptureSession,
CaptureSignals,
MemoryAgentInput,
MemoryAgentResult,
MemoryAgentServiceDeps,
MemoryAgentSettings,
MemoryAgentSourceType,
MemoryCommitMessagePort,
MemoryConnectionPort,
MemoryFileStorePort,
MemoryKnowledgeSlRefsPort,
MemoryLockPort,
MemorySlSourceReconcilerPort,
MemoryTelemetryPort,
MemoryToolSetLike,
MemoryToolsetFactoryPort,
} from './memory/index.js';
export * from './project/index.js';
export * from './prompts/index.js';
export * from './search/index.js';
export * from './sql-analysis/index.js';
export type {
KloColumnAnalysisResult,
KloColumnDescriptionPromptInput,
KloColumnEmbeddingForeignKeys,
KloColumnEmbeddingTextInput,
KloColumnSampleInput,
KloColumnSampleResult,
KloColumnSampleUpdate,
KloColumnStatsInput,
KloColumnStatsResult,
KloConnectionDriver,
KloConnectorCapabilities,
KloCredentialEnvelope,
KloCredentialEnvReference,
KloCredentialFileReference,
KloDataDictionaryColumnState,
KloDataDictionarySampleDecision,
KloDataDictionarySettings,
KloDataDictionarySkipReason,
KloDataSourceDescriptionPromptInput,
KloDescriptionCachePort,
KloDescriptionColumn,
KloDescriptionColumnTable,
KloDescriptionGenerationSettings,
KloDescriptionGeneratorOptions,
KloDescriptionSource,
KloDescriptionTableInput,
KloDescriptionUpdate,
KloEmbeddingPort as KloScanEmbeddingPort,
KloEmbeddingUpdate,
KloEnrichedColumn,
KloEnrichedRelationship,
KloEnrichedSchema,
KloEnrichedTable,
KloEnrichmentScanPhaseResult,
KloGenerateColumnDescriptionsInput,
KloGenerateDataSourceDescriptionInput,
KloGenerateTableDescriptionInput,
KloOptionalConnectorCapabilities,
KloProgressPort,
KloQueryResult as KloScanQueryResult,
KloReadOnlyQueryInput,
KloRelationshipEndpoint,
KloRelationshipSource,
KloRelationshipType,
KloRelationshipUpdate,
KloResolvedCredentialEnvelope,
KloScanArtifactPaths,
KloScanConnector,
KloScanContext,
KloScanDiffSummary,
KloScanEnrichmentSummary,
KloScanInput,
KloScanLoggerPort,
KloScanMetadataStore,
KloScanMode,
KloScanOrchestratorOptions,
KloScanOrchestratorRunInput,
KloScanOrchestratorRunResult,
KloScanRelationshipSummary,
KloScanReport,
KloScanTrigger,
KloScanWarning,
KloScanWarningCode,
KloSchemaColumn,
KloSchemaDimensionType,
KloSchemaForeignKey,
KloSchemaScope,
KloSchemaSnapshot,
KloSchemaTable,
KloSchemaTableKind,
KloSkippedRelationship,
KloStructuralScanPhaseResult,
KloStructuralSyncPlan,
KloStructuralSyncStats,
KloTableDescriptionPromptInput,
KloTableRef,
KloTableSampleInput,
KloTableSampleResult,
KloColumnTypeMapping,
} from './scan/index.js';
export {
appendKloWordLimitInstruction,
buildKloColumnDescriptionPrompt,
buildKloColumnEmbeddingText,
buildKloDataSourceDescriptionPrompt,
buildKloTableDescriptionPrompt,
createKloConnectorCapabilities,
defaultKloDataDictionarySettings,
inferKloDimensionType,
isKloDataDictionaryCandidate,
kloColumnTypeMappingFromNative,
KloDescriptionGenerator,
KloScanOrchestrator,
normalizeKloNativeType,
REDACTED_KLO_CREDENTIAL_VALUE,
redactKloCredentialEnvelope,
redactKloCredentialValue,
redactKloScanMetadata,
redactKloScanReport,
redactKloScanWarning,
shouldKloSampleColumnForDictionary,
} from './scan/index.js';
export * from './skills/index.js';
export * from './sl/index.js';
export * from './tools/index.js';
export * from './wiki/index.js';

View file

@ -0,0 +1,42 @@
import { describe, expect, it } from 'vitest';
import { actionTargetConnectionId, memoryActionIdentity } from './action-identity.js';
// Covers both helpers: identities have the shape `<target>:<connection>:<key>`,
// and only `sl` actions may override the run connection via `targetConnectionId`.
describe('memory action target identity', () => {
it('keys SL actions by target connection and wiki actions by run connection', () => {
// SL action with an explicit target connection: the override wins.
expect(
memoryActionIdentity(
{ target: 'sl', type: 'created', key: 'orders', detail: '', targetConnectionId: 'warehouse-b' },
'looker-run',
),
).toBe('sl:warehouse-b:orders');
// SL action without an override falls back to the run connection.
expect(memoryActionIdentity({ target: 'sl', type: 'created', key: 'orders', detail: '' }, 'warehouse-a')).toBe(
'sl:warehouse-a:orders',
);
// Wiki actions ignore `targetConnectionId` entirely.
expect(
memoryActionIdentity(
{
target: 'wiki',
type: 'created',
key: 'knowledge/global/orders.md',
detail: '',
targetConnectionId: 'ignored',
},
'looker-run',
),
).toBe('wiki:looker-run:knowledge/global/orders.md');
});
it('resolves action target connection only for SL actions', () => {
expect(
actionTargetConnectionId(
{ target: 'sl', type: 'updated', key: 'orders', detail: '', targetConnectionId: 'warehouse-b' },
'looker-run',
),
).toBe('warehouse-b');
expect(actionTargetConnectionId({ target: 'wiki', type: 'updated', key: 'orders', detail: '' }, 'looker-run')).toBe(
'looker-run',
);
});
});

View file

@ -0,0 +1,9 @@
import type { MemoryAction } from '../memory/index.js';
/**
 * Resolves which connection an action applies to.
 *
 * Only `sl` actions may carry their own `targetConnectionId`; every other
 * target, and any `sl` action without an override, uses the run's connection.
 *
 * @param action - The memory action being resolved.
 * @param runConnectionId - Connection id of the run that produced the action.
 * @returns The effective connection id.
 */
export function actionTargetConnectionId(action: MemoryAction, runConnectionId: string): string {
  if (action.target !== 'sl') {
    return runConnectionId;
  }
  return action.targetConnectionId ?? runConnectionId;
}
/**
 * Builds the identity string `<target>:<connectionId>:<key>` for an action,
 * where the connection id is resolved by `actionTargetConnectionId`.
 *
 * @param action - The memory action to identify.
 * @param runConnectionId - Connection id of the run that produced the action.
 */
export function memoryActionIdentity(action: MemoryAction, runConnectionId: string): string {
  const { target, key } = action;
  const connectionId = actionTargetConnectionId(action, runConnectionId);
  return `${target}:${connectionId}:${key}`;
}

View file

@ -0,0 +1,75 @@
import { describe, expect, it } from 'vitest';
import type { DbtParsedTable } from './parse-schema.js';
import { findMatchingKloTable, matchDbtTables, type DbtHostTableLite } from './match-tables.js';
// Host-side fixture: `orders` exists in two schemas (`analytics`, `staging`) so
// name-only lookups are ambiguous, while `customers` is unique and has no
// catalog/schema.
const hostTables: DbtHostTableLite[] = [
{ id: '1', name: 'orders', catalog: 'warehouse', db: 'analytics', columns: [{ id: 'c1', name: 'id' }] },
{ id: '2', name: 'orders', catalog: 'warehouse', db: 'staging', columns: [{ id: 'c2', name: 'id' }] },
{ id: '3', name: 'customers', catalog: null, db: null, columns: [{ id: 'c3', name: 'id' }] },
];
/**
 * Test factory: returns a dbt `orders` model with no schema, database,
 * description or columns, with any fields in `input` overriding the defaults.
 */
function table(input: Partial<DbtParsedTable>): DbtParsedTable {
  const defaults: DbtParsedTable = {
    name: 'orders',
    description: null,
    database: null,
    schema: null,
    columns: [],
    resourceType: 'model',
  };
  return { ...defaults, ...input };
}
// Exercises the lookup order of findMatchingKloTable (schema+name, then a
// unique name-only fallback for models) and the summary built by matchDbtTables.
describe('dbt descriptions table matching', () => {
it('uses schema plus name first and checks catalog when dbt database is present', () => {
expect(
findMatchingKloTable(table({ database: 'warehouse', schema: 'analytics' }), hostTables, null)?.id,
).toBe('1');
});
it('does not fall back to name-only for source tables', () => {
expect(findMatchingKloTable(table({ resourceType: 'source' }), hostTables, null)).toBeUndefined();
});
it('uses targetSchema for models and name-only only when unique', () => {
expect(findMatchingKloTable(table({ resourceType: 'model' }), hostTables, 'staging')?.id).toBe('2');
expect(findMatchingKloTable(table({ name: 'customers', resourceType: 'model' }), hostTables, null)?.id).toBe(
'3',
);
// `orders` exists twice in the fixture, so a name-only lookup is ambiguous.
expect(findMatchingKloTable(table({ resourceType: 'model' }), hostTables, null)).toBeUndefined();
});
it('summarizes matched columns and descriptions', () => {
const matches = matchDbtTables(
[
table({
name: 'customers',
description: 'Customers',
columns: [
{ name: 'id', description: 'Primary key', dataType: null },
{ name: 'missing', description: 'Missing', dataType: null },
],
}),
],
hostTables,
null,
);
// For a matched table, description counts only cover columns that exist on
// the host table: `missing` has a description but is not counted.
expect(matches).toEqual([
{
dbtTable: 'customers',
dbtSchema: null,
dbtDatabase: null,
hostTableId: '3',
hostTableName: 'customers',
matched: true,
tableDescriptionAction: 'import',
tableDescriptionFound: true,
columnsToImport: 1,
columnsMatched: 1,
columnsTotal: 2,
columnDescriptionsFound: 1,
},
]);
});
});

View file

@ -0,0 +1,127 @@
import type { DbtParsedTable } from './parse-schema.js';
/**
 * Minimal view of a host table used for matching against dbt tables.
 * Matching compares `name`, `db` and `catalog` case-insensitively.
 */
export interface DbtHostTableLite {
id: string;
name: string;
/** Compared against the dbt table's `database` when both are present. */
catalog: string | null;
/** Compared against the dbt table's schema (or the target schema). */
db: string | null;
columns: Array<{ id: string; name: string }>;
}
/** Per-table summary produced by `matchDbtTables`. */
export interface DbtTableMatch {
dbtTable: string;
dbtSchema: string | null;
dbtDatabase: string | null;
/** Null when no host table matched. */
hostTableId: string | null;
/** Null when no host table matched. */
hostTableName: string | null;
matched: boolean;
/** 'import' only for a matched table that has a dbt description. */
tableDescriptionAction: 'skip' | 'import';
tableDescriptionFound: boolean;
/** Matched columns that carry a dbt description; 0 when unmatched. */
columnsToImport: number;
/** dbt columns whose name exists on the host table (case-insensitive). */
columnsMatched: number;
/** Number of columns declared on the dbt table. */
columnsTotal: number;
/**
 * Columns with a dbt description. For a matched table only host-matched
 * columns are counted; for an unmatched table all dbt columns are counted.
 */
columnDescriptionsFound: number;
}
/**
 * Pairs every dbt table with a host table (via `findMatchingKloTable`) and
 * reports what could be imported from it.
 *
 * @param dbtTables - Tables parsed from dbt schema files.
 * @param hostTables - Candidate host tables.
 * @param targetSchema - Schema to assume for dbt tables that declare none.
 * @returns One summary per dbt table, in input order.
 */
export function matchDbtTables(
  dbtTables: DbtParsedTable[],
  hostTables: DbtHostTableLite[],
  targetSchema?: string | null,
): DbtTableMatch[] {
  return dbtTables.map((dbtTable): DbtTableMatch => {
    const hasTableDescription = Boolean(dbtTable.description);
    const hostTable = findMatchingKloTable(dbtTable, hostTables, targetSchema);

    if (hostTable) {
      return {
        dbtTable: dbtTable.name,
        dbtSchema: dbtTable.schema,
        dbtDatabase: dbtTable.database,
        hostTableId: hostTable.id,
        hostTableName: hostTable.name,
        matched: true,
        tableDescriptionAction: hasTableDescription ? 'import' : 'skip',
        tableDescriptionFound: hasTableDescription,
        ...analyzeColumns(dbtTable, hostTable),
      };
    }

    // Nothing can be imported without a host table; still report what the dbt
    // side declares so callers can show it.
    const describedColumns = dbtTable.columns.filter((column) => Boolean(column.description));
    return {
      dbtTable: dbtTable.name,
      dbtSchema: dbtTable.schema,
      dbtDatabase: dbtTable.database,
      hostTableId: null,
      hostTableName: null,
      matched: false,
      tableDescriptionAction: 'skip',
      tableDescriptionFound: hasTableDescription,
      columnsToImport: 0,
      columnsMatched: 0,
      columnsTotal: dbtTable.columns.length,
      columnDescriptionsFound: describedColumns.length,
    };
  });
}
/**
 * Finds the host table that corresponds to a dbt table. All comparisons are
 * case-insensitive.
 *
 * 1. When a schema is known (the dbt table's own, else `targetSchema`), the
 *    first host table with the same name and schema wins; if both sides also
 *    declare a database/catalog, those must agree too.
 * 2. Otherwise sources never match. Other tables match by name alone, but only
 *    when exactly one host table has that name.
 *
 * @returns The matching host table, or `undefined` when none qualifies.
 */
export function findMatchingKloTable(
  dbtTable: DbtParsedTable,
  hostTables: DbtHostTableLite[],
  targetSchema?: string | null,
): DbtHostTableLite | undefined {
  const wantedName = dbtTable.name.toLowerCase();
  const schemaToMatch = dbtTable.schema ?? targetSchema ?? null;

  if (schemaToMatch) {
    const schemaQualified = hostTables.find((candidate) => {
      const sameName = candidate.name.toLowerCase() === wantedName;
      const sameSchema = candidate.db?.toLowerCase() === schemaToMatch.toLowerCase();
      if (!(sameName && sameSchema)) {
        return false;
      }
      // The catalog is only a tie-breaker when both sides declare one.
      const catalogComparable = Boolean(dbtTable.database && candidate.catalog);
      return catalogComparable
        ? candidate.catalog!.toLowerCase() === dbtTable.database!.toLowerCase()
        : true;
    });
    if (schemaQualified) {
      return schemaQualified;
    }
  }

  if (dbtTable.resourceType === 'source') {
    return undefined;
  }

  const sameNameTables = hostTables.filter((candidate) => candidate.name.toLowerCase() === wantedName);
  if (sameNameTables.length !== 1) {
    return undefined;
  }
  return sameNameTables[0];
}
/**
 * Counts how the dbt table's columns line up with the host table's columns
 * (names compared case-insensitively).
 *
 * Only dbt columns that exist on the host table are considered when counting
 * descriptions, so `columnsToImport` and `columnDescriptionsFound` are equal.
 */
function analyzeColumns(
  dbtTable: DbtParsedTable,
  hostTable: DbtHostTableLite,
): Pick<DbtTableMatch, 'columnsToImport' | 'columnsMatched' | 'columnsTotal' | 'columnDescriptionsFound'> {
  const matchedColumns = dbtTable.columns.filter((dbtColumn) =>
    hostTable.columns.some((hostColumn) => hostColumn.name.toLowerCase() === dbtColumn.name.toLowerCase()),
  );
  const describedCount = matchedColumns.filter((dbtColumn) => Boolean(dbtColumn.description)).length;

  return {
    columnsToImport: describedCount,
    columnsMatched: matchedColumns.length,
    columnsTotal: dbtTable.columns.length,
    columnDescriptionsFound: describedCount,
  };
}

View file

@ -0,0 +1,62 @@
import { describe, expect, it } from 'vitest';
import type { ParsedSemanticModel } from '../metricflow/deep-parse.js';
import { mergeSemanticModelTables } from './merge-semantic-model-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
// Fixture: a semantic model pointing at `fct_orders` with one categorical and
// one time dimension (the time dimension has no description).
const semanticModel: ParsedSemanticModel = {
name: 'orders_semantic',
description: 'Order facts',
modelRef: 'fct_orders',
dimensions: [
{ name: 'status', column: 'status', type: 'categorical', description: 'Order status' },
{ name: 'ordered_at', column: 'ordered_at', type: 'time' },
],
measures: [],
entities: [],
defaultTimeDimension: null,
};
// Verifies that model refs missing from the schema parse are synthesized as
// dbt model tables and that existing tables are matched case-insensitively.
describe('mergeSemanticModelTables', () => {
it('adds missing MetricFlow model refs as dbt model tables', () => {
const input: DbtSchemaParseResult = { projectName: null, dbtVersion: null, tables: [], relationships: [] };
// Time dimensions become TIMESTAMP columns; other dimension types get no data type.
expect(mergeSemanticModelTables(input, [semanticModel])).toEqual({
projectName: null,
dbtVersion: null,
relationships: [],
tables: [
{
name: 'fct_orders',
description: 'Order facts',
database: null,
schema: null,
resourceType: 'model',
columns: [
{ name: 'status', description: 'Order status', dataType: null },
{ name: 'ordered_at', description: null, dataType: 'TIMESTAMP' },
],
},
],
});
});
it('does not add a duplicate table when schema parsing already found the model ref', () => {
// `FCT_ORDERS` differs from the model ref only by case and must count as existing.
const input: DbtSchemaParseResult = {
projectName: null,
dbtVersion: null,
relationships: [],
tables: [
{
name: 'FCT_ORDERS',
description: 'Existing',
database: null,
schema: null,
resourceType: 'model',
columns: [],
},
],
};
expect(mergeSemanticModelTables(input, [semanticModel]).tables).toHaveLength(1);
});
});

View file

@ -0,0 +1,37 @@
import type { ParsedSemanticModel } from '../metricflow/deep-parse.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/**
 * Returns a copy of `parseResult` with one extra model table for every
 * semantic model whose `modelRef` is not already present (names compared
 * case-insensitively). The input is not mutated.
 *
 * Synthesized tables take their description from the semantic model and one
 * column per dimension; `time` dimensions are typed `TIMESTAMP`, all others
 * get no data type.
 *
 * @param parseResult - Result of parsing the dbt schema files.
 * @param semanticModels - Parsed MetricFlow semantic models.
 */
export function mergeSemanticModelTables(
  parseResult: DbtSchemaParseResult,
  semanticModels: ParsedSemanticModel[],
): DbtSchemaParseResult {
  const tables = [...parseResult.tables];
  const relationships = [...parseResult.relationships];
  const knownNames = new Set(tables.map((existing) => existing.name.toLowerCase()));

  for (const { modelRef, description, dimensions } of semanticModels) {
    const lookupName = modelRef.toLowerCase();
    if (knownNames.has(lookupName)) {
      continue;
    }
    knownNames.add(lookupName);
    tables.push({
      name: modelRef,
      description,
      database: null,
      schema: null,
      columns: dimensions.map(({ column, description: dimensionDescription, type }) => ({
        name: column,
        description: dimensionDescription ?? null,
        dataType: type === 'time' ? 'TIMESTAMP' : null,
      })),
      resourceType: 'model',
    });
  }

  return { ...parseResult, tables, relationships };
}

View file

@ -0,0 +1,214 @@
import { describe, expect, it } from 'vitest';
import { parseDbtSchemaFile, parseDbtSchemaFiles } from './parse-schema.js';
// End-to-end specs for the dbt schema parser: variable resolution, cross-file
// de-duplication, malformed input, test/constraint extraction and relationships.
describe('dbt descriptions schema parser', () => {
it('resolves shared dbt vars and defaults before parsing schema YAML', () => {
const result = parseDbtSchemaFile(
`
version: 2
sources:
- name: raw
database: "{{ var('database') }}"
schema: "{{ var('schema', 'fallback_schema') }}"
tables:
- name: orders
identifier: fct_orders
description: "Orders from {{ var('database') }}"
columns:
- name: customer_id
description: "Customer id"
tests:
- relationships:
to: ref('customers')
field: id
models:
- name: "{{ var('model_name', 'orders_model') }}"
schema: "{{ var('model_schema') }}"
columns:
- name: id
description: "Order id"
`,
{ path: 'models/schema.yml', variables: new Map([['database', 'analytics'], ['model_schema', 'mart']]) },
);
// The source table is reported under its `identifier`, not its `name`.
expect(result.tables).toEqual([
{
name: 'fct_orders',
description: 'Orders from analytics',
database: 'analytics',
schema: 'fallback_schema',
columns: [
{
name: 'customer_id',
description: 'Customer id',
dataType: null,
dataTests: [{ name: 'relationships', package: 'dbt', kwargs: { to: "ref('customers')", field: 'id' } }],
},
],
resourceType: 'source',
},
{
name: 'orders_model',
description: null,
database: null,
schema: 'mart',
columns: [{ name: 'id', description: 'Order id', dataType: null }],
resourceType: 'model',
},
]);
expect(result.relationships).toEqual([
{
fromTable: 'fct_orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
fromSchema: 'fallback_schema',
},
]);
});
it('deduplicates tables by database schema and name while merging columns', () => {
const result = parseDbtSchemaFiles([
{
path: 'models/a.yml',
content: `
version: 2
models:
- name: orders
description: Orders
columns:
- name: id
description: Primary key
`,
},
{
path: 'models/b.yml',
content: `
version: 2
models:
- name: orders
columns:
- name: status
description: Status
- name: id
data_type: integer
`,
},
]);
// The first file's values win; the second file only fills in missing fields.
expect(result.tables).toEqual([
{
name: 'orders',
description: 'Orders',
database: null,
schema: null,
resourceType: 'model',
columns: [
{ name: 'id', description: 'Primary key', dataType: 'integer' },
{ name: 'status', description: 'Status', dataType: null },
],
},
]);
});
it('returns an empty result for malformed YAML and preserves unresolved Jinja text', () => {
expect(parseDbtSchemaFile('{{{{ invalid yaml', { path: 'broken.yml' })).toEqual({
projectName: null,
dbtVersion: null,
tables: [],
relationships: [],
});
const unresolved = parseDbtSchemaFile(
`
version: 2
models:
- name: "{{ var('missing_model') }}"
`,
{ variables: new Map() },
);
expect(unresolved.tables[0]?.name).toBe("{{ var('missing_model') }}");
});
it('extracts data tests, constraints, enum values, tags, and freshness', () => {
const result = parseDbtSchemaFile(`
version: 2
sources:
- name: raw
schema: jaffle
tags: ["raw"]
tables:
- name: customers
tags: ["core"]
loaded_at_field: updated_at
freshness:
warn_after: { count: 12, period: hour }
columns:
- name: id
tests:
- not_null
- unique
- name: status
data_tests:
- accepted_values:
values: ['active', 'inactive']
models:
- name: orders
tags: ["finance"]
loaded_at_field: run_at
columns:
- name: status
data_tests:
- dbt_utils.expression_is_true:
expression: "status is not null"
- accepted_values: ['placed', 'shipped']
`);
const customers = result.tables.find((table) => table.name === 'customers');
// Source-level tags come first, followed by the table's own tags.
expect(customers?.tagsDbt).toEqual(['raw', 'core']);
expect(customers?.freshnessDbt?.loadedAtField).toBe('updated_at');
expect(customers?.freshnessDbt?.raw).toBeDefined();
const id = customers?.columns.find((column) => column.name === 'id');
expect(id?.constraints?.dbt).toEqual({ not_null: true, unique: true });
const status = customers?.columns.find((column) => column.name === 'status');
expect(status?.enumValuesDbt).toEqual(['active', 'inactive']);
const orders = result.tables.find((table) => table.name === 'orders');
expect(orders?.tagsDbt).toEqual(['finance']);
expect(orders?.freshnessDbt?.loadedAtField).toBe('run_at');
const ordersStatus = orders?.columns.find((column) => column.name === 'status');
// `accepted_values` given directly as a list is supported as well.
expect(ordersStatus?.enumValuesDbt).toEqual(['placed', 'shipped']);
expect(ordersStatus?.dataTests).toEqual(
expect.arrayContaining([
expect.objectContaining({ package: 'dbt_utils', name: 'expression_is_true' }),
expect.objectContaining({ package: 'dbt', name: 'accepted_values' }),
]),
);
});
it('parses relationships from model column data tests', () => {
// Uses the nested `arguments:` form of the relationships test.
const result = parseDbtSchemaFile(`
version: 2
models:
- name: orders
schema: public
columns:
- name: customer_id
data_tests:
- relationships:
arguments:
to: "ref('customers')"
field: id
`);
expect(result.relationships).toEqual([
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
fromSchema: 'public',
},
]);
});
});

View file

@ -0,0 +1,655 @@
import { createHash } from 'node:crypto';
import { parse as parseYaml } from 'yaml';
import { type KloLogger, noopLogger } from '../../../core/index.js';
import { resolveJinjaVariables } from '../../dbt-shared/project-vars.js';
/** One column as declared in dbt schema YAML, after normalization by the parser. */
export interface DbtParsedColumn {
name: string;
/** Trimmed description, or null when absent or blank. */
description: string | null;
/** The column's `data_type` value, or null when not declared. */
dataType: string | null;
/** Tests declared on the column; not set by the parser when there are none. */
dataTests?: DbtDataTestRef[];
/** Present when a `not_null` and/or `unique` test was found on the column. */
constraints?: DbtColumnConstraints;
/** Stringified values gathered from `accepted_values` tests. */
enumValuesDbt?: string[];
}
/** Reference to a data test attached to a column. */
export interface DbtDataTestRef {
/** Test name without the package prefix. */
name: string;
/** Text before the first `.` of the test key, or 'dbt' when the key has no dot. */
package: string;
/** The mapping supplied under the test key, when one was given. */
kwargs?: Record<string, unknown>;
}
/** Column constraints inferred from dbt's `not_null` and `unique` tests. */
export interface DbtColumnConstraints {
dbt: {
not_null?: boolean;
unique?: boolean;
};
}
/** A foreign-key style link derived from a dbt `relationships` test. */
export interface DbtParsedRelationship {
/** Table that declares the test. */
fromTable: string;
/** Column that declares the test. */
fromColumn: string;
/** Table name extracted from the test's `ref(...)` / `source(...)` target. */
toTable: string;
/** The test's `field` value. */
toColumn: string;
/** Schema of the declaring table, when known. */
fromSchema?: string;
// NOTE(review): `toSchema` and `description` are not populated by the parser
// code in this file — confirm whether other producers set them.
toSchema?: string;
description?: string;
}
/** A dbt source table or model after parsing. */
export interface DbtParsedTable {
/** For source tables this is the `identifier` when given, else the `name`. */
name: string;
/** Trimmed description, or null when absent or blank. */
description: string | null;
database: string | null;
/** For source tables this falls back to the source's name when no schema is set. */
schema: string | null;
columns: DbtParsedColumn[];
resourceType?: 'source' | 'model';
/** De-duplicated tags; for source tables the source's tags come first. */
tagsDbt?: string[];
freshnessDbt?: {
/** The `freshness` value exactly as it appeared in the YAML. */
raw?: unknown;
/** Trimmed `loaded_at_field`. */
loadedAtField?: string | null;
};
}
/** Everything extracted from one or more dbt schema files. */
export interface DbtSchemaParseResult {
/** Passed through from the caller's options; null when not supplied. */
projectName: string | null;
/** Always null in the results built by the parser in this file. */
dbtVersion: string | null;
tables: DbtParsedTable[];
relationships: DbtParsedRelationship[];
}
/** A schema YAML file: its text and the path used for logging and hashing. */
export interface DbtSchemaFile {
content: string;
path: string;
}
/** Options for parsing a single schema file. */
interface ParseDbtSchemaOptions {
/** Used in log messages only. */
path?: string;
/** dbt variables substituted into the YAML before parsing; omitted means no substitution. */
variables?: Map<string, string>;
projectName?: string | null;
/** Defaults to a no-op logger. */
logger?: KloLogger;
}
/** Expected top-level shape of a parsed schema YAML document (not validated at runtime). */
interface DbtSchemaYaml {
version?: number;
sources?: DbtSchemaSource[];
models?: DbtSchemaModel[];
}
/** Raw `sources:` entry as written in YAML. */
interface DbtSchemaSource {
/** Also used as the schema when `schema` is not set. */
name: string;
description?: string;
database?: string;
schema?: string;
/** Prepended to the tags of every table in the source. */
tags?: string[];
tables?: DbtSchemaTable[];
}
/** Raw table entry nested under a source. */
interface DbtSchemaTable {
name: string;
description?: string;
/** When set, used instead of `name` as the parsed table name. */
identifier?: string;
tags?: string[];
loaded_at_field?: string;
freshness?: unknown;
columns?: DbtSchemaColumn[];
}
/** Raw `models:` entry as written in YAML. */
interface DbtSchemaModel {
/** Entries without a name are skipped by the parser. */
name: string;
description?: string;
database?: string;
schema?: string;
tags?: string[];
loaded_at_field?: string;
freshness?: unknown;
columns?: DbtSchemaColumn[];
}
/** Raw column entry. `data_tests` takes precedence over `tests` when both are present. */
interface DbtSchemaColumn {
name: string;
description?: string;
data_type?: string;
data_tests?: DbtSchemaDataTest[];
tests?: DbtSchemaDataTest[];
}
/**
 * Raw test entry: either a bare test name (e.g. `not_null`,
 * `package.test_name`) or a mapping keyed by test name. `relationships`
 * arguments may sit directly on the mapping or under `arguments`.
 */
type DbtSchemaDataTest =
| string
| {
relationships?: {
to?: string;
field?: string;
arguments?: { to?: string; field?: string };
};
not_null?: unknown;
unique?: unknown;
accepted_values?: { values?: unknown } | unknown;
[key: string]: unknown;
};
/**
 * Parses the text of a single dbt schema YAML file.
 *
 * @param content - Raw YAML text.
 * @param options - Optional path (for logs), dbt variables, project name and logger.
 * @returns Parsed tables and relationships; empty when the YAML cannot be parsed.
 */
export function parseDbtSchemaFile(content: string, options: ParseDbtSchemaOptions = {}): DbtSchemaParseResult {
  const logger = options.logger ?? noopLogger;
  const parser = new DbtSchemaParser(logger);
  return parser.parseFile(content, options);
}
/**
 * Parses several dbt schema files and merges the results, de-duplicating
 * tables and relationships across files.
 *
 * @param files - Files to parse, in order.
 * @param variables - dbt variables substituted into every file before parsing.
 * @param options - Optional project name (defaults to null) and logger.
 */
export function parseDbtSchemaFiles(
  files: DbtSchemaFile[],
  variables?: Map<string, string>,
  options: { projectName?: string | null; logger?: KloLogger } = {},
): DbtSchemaParseResult {
  const logger = options.logger ?? noopLogger;
  const projectName = options.projectName ?? null;
  const parser = new DbtSchemaParser(logger);
  return parser.parseFiles(files, variables, projectName);
}
/**
 * Computes a short fingerprint (first 16 hex chars of SHA-256) over a set of
 * schema files, independent of the order in which the files are passed.
 *
 * Files are ordered by plain code-unit comparison of their paths. A
 * locale-aware comparison would make the order, and therefore the hash,
 * depend on the default locale of the runtime.
 *
 * @param files - Files to fingerprint; the array is not mutated.
 * @returns A 16-character lowercase hex string.
 */
export function computeDbtSchemaHash(files: DbtSchemaFile[]): string {
  const combined = [...files]
    .sort((a, b) => (a.path < b.path ? -1 : a.path > b.path ? 1 : 0))
    .map((file) => `${file.path}:${file.content}`)
    .join('\n');
  return createHash('sha256').update(combined).digest('hex').substring(0, 16);
}
class DbtSchemaParser {
/** Keeps the logger that receives the parser's debug and warning messages. */
constructor(private readonly logger: KloLogger) {}
/**
 * Parses one schema YAML document.
 *
 * When `options.variables` is given, Jinja variables are resolved first and
 * any unresolved ones are logged as a warning. YAML that fails to parse, or
 * that does not parse to an object, yields an empty result instead of
 * throwing.
 *
 * @param yamlContent - Raw YAML text.
 * @param options - Path (for log messages), variables and project name.
 */
parseFile(yamlContent: string, options: ParseDbtSchemaOptions = {}): DbtSchemaParseResult {
  const projectName = options.projectName ?? null;
  this.logger.debug(`Parsing schema file: ${options.path ?? 'unknown'}`);

  const resolved = options.variables
    ? resolveJinjaVariables(yamlContent, options.variables)
    : { content: yamlContent, unresolvedVars: [] };
  if (resolved.unresolvedVars.length > 0) {
    this.logger.warn(
      `Unresolved dbt variables in ${options.path ?? 'schema file'}: ${resolved.unresolvedVars.join(', ')}`,
    );
  }

  let parsed: unknown;
  try {
    parsed = parseYaml(resolved.content);
  } catch (error) {
    this.logger.warn(`Failed to parse YAML${options.path ? ` at ${options.path}` : ''}: ${error}`);
    return this.emptyResult(projectName);
  }
  if (!parsed || typeof parsed !== 'object') {
    return this.emptyResult(projectName);
  }

  const schema = parsed as DbtSchemaYaml;
  const tables = this.parseSources(schema.sources).concat(this.parseModels(schema.models));
  const relationships = this.parseSourceRelationships(schema.sources).concat(
    this.parseModelRelationships(schema.models),
  );
  return { projectName, dbtVersion: null, tables, relationships };
}
/**
 * Parses every file in order and merges the results, de-duplicating tables
 * and relationships across files.
 *
 * @param files - Schema files to parse.
 * @param variables - dbt variables applied to each file.
 * @param projectName - Project name copied into the result.
 */
parseFiles(
  files: DbtSchemaFile[],
  variables?: Map<string, string>,
  projectName: string | null = null,
): DbtSchemaParseResult {
  const perFile = files.map((file) => this.parseFile(file.content, { path: file.path, variables, projectName }));
  const collectedTables = perFile.flatMap((result) => result.tables);
  const collectedRelationships = perFile.flatMap((result) => result.relationships);

  return {
    projectName,
    dbtVersion: null,
    tables: this.deduplicateTables(collectedTables),
    relationships: this.deduplicateRelationships(collectedRelationships),
  };
}
/**
 * Flattens `sources:` into one parsed table per source table.
 *
 * Each table inherits the source's database, schema (falling back to the
 * source name) and tags. The table name is the `identifier` when given,
 * otherwise the `name`. Tables without a usable name are skipped, mirroring
 * how nameless models are skipped, so that no table with an undefined `name`
 * reaches consumers. Empty (null) YAML entries are ignored.
 *
 * @param sources - The raw `sources` value from the YAML document.
 * @returns Parsed tables with `resourceType: 'source'`.
 */
private parseSources(sources: DbtSchemaSource[] | undefined): DbtParsedTable[] {
  if (!sources || !Array.isArray(sources)) {
    return [];
  }
  const tables: DbtParsedTable[] = [];
  for (const source of sources) {
    // A bare `-` list item parses to null.
    if (!source || !Array.isArray(source.tables)) {
      continue;
    }
    const sourceSchema = source.schema ?? source.name ?? null;
    const sourceDatabase = source.database ?? null;
    const sourceTags = this.normalizeTagList(source.tags);
    for (const table of source.tables) {
      const tableName = table?.identifier ?? table?.name;
      if (!tableName) {
        continue;
      }
      const tagsDbt = this.mergeTagsDbt(sourceTags, this.normalizeTagList(table.tags));
      const freshnessDbt = this.buildFreshnessDbt(table.freshness, table.loaded_at_field);
      tables.push({
        name: tableName,
        description: this.normalizeDescription(table.description),
        database: sourceDatabase,
        schema: sourceSchema,
        columns: this.parseColumns(table.columns),
        resourceType: 'source',
        ...(tagsDbt ? { tagsDbt } : {}),
        ...(freshnessDbt ? { freshnessDbt } : {}),
      });
    }
  }
  return tables;
}
/**
 * Converts `models:` entries into parsed tables with `resourceType: 'model'`.
 * Entries without a name are skipped; tags and freshness keys are only added
 * when there is something to report.
 *
 * @param models - The raw `models` value from the YAML document.
 */
private parseModels(models: DbtSchemaModel[] | undefined): DbtParsedTable[] {
  if (!Array.isArray(models)) {
    return [];
  }
  return models
    .filter((model) => Boolean(model.name))
    .map((model): DbtParsedTable => {
      const tagsDbt = this.mergeTagsDbt(this.normalizeTagList(model.tags));
      const freshnessDbt = this.buildFreshnessDbt(model.freshness, model.loaded_at_field);
      const parsed: DbtParsedTable = {
        name: model.name,
        description: this.normalizeDescription(model.description),
        database: model.database ?? null,
        schema: model.schema ?? null,
        columns: this.parseColumns(model.columns),
        resourceType: 'model',
      };
      if (tagsDbt) {
        parsed.tagsDbt = tagsDbt;
      }
      if (freshnessDbt) {
        parsed.freshnessDbt = freshnessDbt;
      }
      return parsed;
    });
}
/**
 * Converts raw column entries into parsed columns. Tests are read from
 * `data_tests`, falling back to `tests`; the test-derived keys are only set
 * when non-empty.
 *
 * @param columns - The raw `columns` value of a table or model.
 */
private parseColumns(columns: DbtSchemaColumn[] | undefined): DbtParsedColumn[] {
  if (!Array.isArray(columns)) {
    return [];
  }
  return columns.map((column) => {
    const testInfo = this.parseDataTests(column.data_tests ?? column.tests);
    const parsed: DbtParsedColumn = {
      name: column.name,
      description: this.normalizeDescription(column.description),
      dataType: column.data_type ?? null,
    };
    if (testInfo.refs.length > 0) {
      parsed.dataTests = testInfo.refs;
    }
    if (testInfo.constraints) {
      parsed.constraints = testInfo.constraints;
    }
    if (testInfo.enumValues.length > 0) {
      parsed.enumValuesDbt = testInfo.enumValues;
    }
    return parsed;
  });
}
/**
 * Interprets a column's test list.
 *
 * Produces three things:
 * - `refs`: one reference per declared test (package defaults to 'dbt');
 * - `constraints`: set when a `not_null` and/or `unique` test is present;
 * - `enumValues`: stringified values of `accepted_values` tests.
 *
 * `accepted_values` is understood in three spellings: a bare list, a mapping
 * with `values`, and a mapping with `arguments.values` (the nested form that
 * the relationships parsing in this class already accepts).
 *
 * Malformed input is tolerated: a non-array `tests` value is treated as "no
 * tests", and entries that are neither a string nor a mapping are ignored.
 *
 * @param tests - Raw `data_tests` / `tests` value of a column.
 */
private parseDataTests(tests: DbtSchemaDataTest[] | undefined): {
  refs: DbtDataTestRef[];
  constraints: DbtColumnConstraints | undefined;
  enumValues: string[];
} {
  const refs: DbtDataTestRef[] = [];
  const dbt: { not_null?: boolean; unique?: boolean } = {};
  const enumValues: string[] = [];
  if (!Array.isArray(tests) || tests.length === 0) {
    return { refs, constraints: undefined, enumValues };
  }

  const asMapping = (value: unknown): Record<string, unknown> | undefined =>
    value && typeof value === 'object' && !Array.isArray(value) ? (value as Record<string, unknown>) : undefined;

  for (const test of tests) {
    if (typeof test === 'string') {
      const parsed = this.parseTestNameString(test);
      refs.push(parsed);
      if (parsed.package === 'dbt' && parsed.name === 'not_null') {
        dbt.not_null = true;
      }
      if (parsed.package === 'dbt' && parsed.name === 'unique') {
        dbt.unique = true;
      }
      continue;
    }
    // A bare `-` list item parses to null; there is nothing to read from it.
    if (!test || typeof test !== 'object') {
      continue;
    }
    for (const [key, value] of Object.entries(test)) {
      const kwargs = asMapping(value);
      if (key === 'relationships') {
        refs.push({ name: 'relationships', package: 'dbt', ...(kwargs ? { kwargs } : {}) });
        continue;
      }
      if (key === 'not_null') {
        refs.push({ name: 'not_null', package: 'dbt' });
        dbt.not_null = true;
        continue;
      }
      if (key === 'unique') {
        refs.push({ name: 'unique', package: 'dbt' });
        dbt.unique = true;
        continue;
      }
      if (key === 'accepted_values') {
        if (Array.isArray(value)) {
          enumValues.push(...value.map((item) => String(item)));
          refs.push({ name: 'accepted_values', package: 'dbt', kwargs: { values: value } });
          continue;
        }
        if (kwargs) {
          // Prefer the top-level `values`; otherwise look under `arguments`.
          const values = Array.isArray(kwargs.values) ? kwargs.values : asMapping(kwargs.arguments)?.values;
          if (Array.isArray(values)) {
            enumValues.push(...values.map((item) => String(item)));
          }
          refs.push({ name: 'accepted_values', package: 'dbt', kwargs });
          continue;
        }
        // Any other shape falls through and is recorded as a generic test below.
      }
      refs.push({
        ...this.parseTestNameString(key),
        ...(kwargs ? { kwargs } : {}),
      });
    }
  }
  const constraints = dbt.not_null || dbt.unique ? { dbt } : undefined;
  return { refs, constraints, enumValues };
}
/**
 * Splits a test key into package and name at the first `.`.
 * A key without a dot belongs to the built-in 'dbt' package.
 */
private parseTestNameString(value: string): { name: string; package: string } {
  const firstDot = value.indexOf('.');
  if (firstDot === -1) {
    return { package: 'dbt', name: value };
  }
  return { package: value.slice(0, firstDot), name: value.slice(firstDot + 1) };
}
/**
 * Collects relationships declared by `relationships` tests on source table
 * columns. The originating table is named by `identifier` (else `name`) and
 * its schema is the source's schema (else the source name).
 *
 * @param sources - The raw `sources` value from the YAML document.
 */
private parseSourceRelationships(sources: DbtSchemaSource[] | undefined): DbtParsedRelationship[] {
  const found: DbtParsedRelationship[] = [];
  if (!Array.isArray(sources)) {
    return found;
  }
  for (const source of sources) {
    const sourceSchema = source.schema ?? source.name;
    if (!Array.isArray(source.tables)) {
      continue;
    }
    for (const table of source.tables) {
      const tableName = table.identifier ?? table.name;
      if (!Array.isArray(table.columns)) {
        continue;
      }
      for (const column of table.columns) {
        const declaredTests = column.data_tests ?? column.tests ?? [];
        for (const test of declaredTests) {
          const relationship = this.parseRelationshipTest(test, tableName, column.name, sourceSchema);
          if (relationship) {
            found.push(relationship);
          }
        }
      }
    }
  }
  return found;
}
/**
 * Collects relationships declared by `relationships` tests on model columns.
 * Models without a name or without a column list are skipped.
 *
 * @param models - The raw `models` value from the YAML document.
 */
private parseModelRelationships(models: DbtSchemaModel[] | undefined): DbtParsedRelationship[] {
  const found: DbtParsedRelationship[] = [];
  if (!Array.isArray(models)) {
    return found;
  }
  for (const model of models) {
    if (!model.name || !Array.isArray(model.columns)) {
      continue;
    }
    for (const column of model.columns) {
      const declaredTests = column.data_tests ?? column.tests ?? [];
      for (const test of declaredTests) {
        const relationship = this.parseRelationshipTest(test, model.name, column.name, model.schema ?? undefined);
        if (relationship) {
          found.push(relationship);
        }
      }
    }
  }
  return found;
}
/**
 * Turns one test entry into a relationship, or returns null when the entry is
 * not a `relationships` test, lacks a target or field, or has a target that
 * cannot be parsed. `to` / `field` are read from the test mapping first and
 * from its `arguments` mapping second.
 *
 * @param test - Raw test entry.
 * @param fromTable - Table declaring the test.
 * @param fromColumn - Column declaring the test.
 * @param fromSchema - Schema of the declaring table; omitted from the result when falsy.
 */
private parseRelationshipTest(
  test: DbtSchemaDataTest,
  fromTable: string,
  fromColumn: string,
  fromSchema?: string,
): DbtParsedRelationship | null {
  if (typeof test === 'string') {
    return null;
  }
  const declared = test.relationships;
  if (!declared) {
    return null;
  }

  const toRef = declared.to ?? declared.arguments?.to;
  const toColumn = declared.field ?? declared.arguments?.field;
  if (!toRef || !toColumn) {
    this.logger.debug(`Skipping incomplete relationship test for ${fromTable}.${fromColumn}`);
    return null;
  }

  const toTable = this.parseRef(toRef);
  if (!toTable) {
    this.logger.debug(`Could not parse ref: ${toRef}`);
    return null;
  }

  const relationship: DbtParsedRelationship = { fromTable, fromColumn, toTable, toColumn };
  if (fromSchema) {
    relationship.fromSchema = fromSchema;
  }
  return relationship;
}
/**
 * Extracts the referenced table name from a dbt `ref(...)` or `source(...)`
 * expression.
 *
 * Supported forms:
 * - `ref('model')`
 * - `ref('package', 'model')` — the model name (second argument) is returned
 * - `ref('model', v=2)` / `ref('package', 'model', version=2)` — trailing
 *   keyword arguments are ignored
 * - `source('source_name', 'table')` — the table name is returned
 *
 * @param refString - The `to` value of a relationships test.
 * @returns The table name, or null when the text is not a string or matches
 *   none of the forms above.
 */
private parseRef(refString: string): string | null {
  // YAML may hand over a non-string scalar here.
  if (typeof refString !== 'string') {
    return null;
  }
  // Optional leading "'package'," then the captured model name, then optional
  // trailing arguments up to the closing parenthesis.
  const refMatch = refString.match(
    /ref\s*\(\s*(?:['"][^'"]+['"]\s*,\s*)?['"]([^'"]+)['"]\s*(?:,[^)]*)?\)/,
  );
  if (refMatch?.[1]) {
    return refMatch[1];
  }
  const sourceMatch = refString.match(/source\s*\(\s*['"][^'"]+['"]\s*,\s*['"]([^'"]+)['"]\s*\)/);
  if (sourceMatch?.[1]) {
    return sourceMatch[1];
  }
  return null;
}
/** Trims a description; missing, empty and whitespace-only values become null. */
private normalizeDescription(description: string | undefined): string | null {
  const cleaned = description ? description.trim() : '';
  return cleaned.length > 0 ? cleaned : null;
}
/**
 * Normalizes a raw `tags` value to a list of strings.
 *
 * dbt accepts either a list of tags or a single tag written as a plain
 * string. A non-empty string is returned as a one-element list instead of
 * being discarded. Any other non-array value yields an empty list.
 *
 * @param tags - Raw `tags` value from YAML.
 */
private normalizeTagList(tags: string | string[] | undefined): string[] {
  if (typeof tags === 'string') {
    return tags.length > 0 ? [tags] : [];
  }
  if (!tags || !Array.isArray(tags)) {
    return [];
  }
  return tags.map((tag) => String(tag));
}
/**
 * Concatenates the given lists, keeping only the first occurrence of each
 * value and preserving order. Returns undefined when nothing remains.
 */
private mergeTagsDbt(...lists: Array<string[] | undefined>): string[] | undefined {
  const unique = new Set<string>();
  for (const list of lists) {
    for (const item of list ?? []) {
      unique.add(item);
    }
  }
  return unique.size > 0 ? Array.from(unique) : undefined;
}
/**
 * Builds the freshness block of a parsed table.
 *
 * - Neither freshness nor a non-blank `loaded_at_field`: undefined.
 * - Only `loaded_at_field`: `{ loadedAtField }`.
 * - Freshness present: `{ raw, loadedAtField }`, where `loadedAtField` is
 *   null when the field is missing or blank. A blank field is normalized to
 *   null rather than kept as an empty string.
 *
 * @param freshness - Raw `freshness` value; null and undefined mean "not set".
 * @param loadedAtField - Raw `loaded_at_field` value.
 */
private buildFreshnessDbt(freshness: unknown, loadedAtField: string | undefined): DbtParsedTable['freshnessDbt'] {
  // Collapse '' / whitespace-only to undefined so both branches treat it as absent.
  const loadedTrim = loadedAtField?.trim() || undefined;
  const hasFreshness = freshness !== undefined && freshness !== null;
  if (hasFreshness) {
    return { raw: freshness, loadedAtField: loadedTrim ?? null };
  }
  return loadedTrim ? { loadedAtField: loadedTrim } : undefined;
}
/**
 * Collapses tables that share the same database, schema and name
 * (case-insensitive). The first occurrence wins for scalar fields; later
 * occurrences only fill a missing description and contribute columns, tags
 * and freshness through the respective merge helpers.
 */
private deduplicateTables(tables: DbtParsedTable[]): DbtParsedTable[] {
  const byKey = new Map<string, DbtParsedTable>();
  for (const table of tables) {
    const key = `${table.database ?? ''}.${table.schema ?? ''}.${table.name}`.toLowerCase();
    const previous = byKey.get(key);
    const next: DbtParsedTable = previous
      ? {
          ...previous,
          description: previous.description ?? table.description,
          columns: this.mergeColumns(previous.columns, table.columns),
          tagsDbt: this.mergeTagsDbt(previous.tagsDbt, table.tagsDbt),
          freshnessDbt: this.mergeFreshnessDbt(previous.freshnessDbt, table.freshnessDbt),
        }
      : table;
    byKey.set(key, next);
  }
  return [...byKey.values()];
}
/**
 * Merges two column lists by case-insensitive name. Existing columns keep
 * their position and their description/data type when set; incoming columns
 * fill the gaps, contribute tests, constraints and enum values, and unknown
 * ones are appended.
 */
private mergeColumns(existing: DbtParsedColumn[], incoming: DbtParsedColumn[]): DbtParsedColumn[] {
  const byName = new Map<string, DbtParsedColumn>(
    existing.map((column): [string, DbtParsedColumn] => [column.name.toLowerCase(), column]),
  );
  for (const column of incoming) {
    const key = column.name.toLowerCase();
    const current = byName.get(key);
    const next: DbtParsedColumn = current
      ? {
          ...current,
          description: current.description ?? column.description,
          dataType: current.dataType ?? column.dataType,
          dataTests: this.mergeDbtDataTests(current.dataTests, column.dataTests),
          constraints: this.mergeDbtConstraints(current.constraints, column.constraints),
          enumValuesDbt: this.mergeStringList(current.enumValuesDbt, column.enumValuesDbt),
        }
      : column;
    byName.set(key, next);
  }
  return [...byName.values()];
}
/**
 * Drops repeated relationships, keeping the first occurrence of each
 * case-insensitive `fromTable.fromColumn->toTable.toColumn` signature.
 */
private deduplicateRelationships(relationships: DbtParsedRelationship[]): DbtParsedRelationship[] {
  const emitted = new Set<string>();
  return relationships.filter((candidate) => {
    const signature =
      `${candidate.fromTable}.${candidate.fromColumn}->${candidate.toTable}.${candidate.toColumn}`.toLowerCase();
    if (emitted.has(signature)) {
      return false;
    }
    emitted.add(signature);
    return true;
  });
}
/**
 * Merges two optional freshness payloads, preferring values from `existing`.
 *
 * `raw` falls back to `incoming` only when `existing` has no `raw` at all (`undefined`);
 * `loadedAtField` falls back when the existing value is `null` or `undefined`.
 * Returns `undefined` only when both inputs are missing.
 */
private mergeFreshnessDbt(
  existing?: DbtParsedTable['freshnessDbt'],
  incoming?: DbtParsedTable['freshnessDbt'],
): DbtParsedTable['freshnessDbt'] {
  if (!(existing || incoming)) {
    return undefined;
  }
  const raw = existing?.raw === undefined ? incoming?.raw : existing.raw;
  const loadedAtField = existing?.loadedAtField ?? incoming?.loadedAtField;
  const rawPart = raw === undefined ? {} : { raw };
  const loadedPart = loadedAtField === undefined ? {} : { loadedAtField };
  return { ...rawPart, ...loadedPart };
}
/**
 * Unions the `not_null` / `unique` flags of two optional constraint sets.
 *
 * A flag is set when either side has it truthy. Returns `undefined` when neither
 * flag ends up set, so callers never store an empty constraints object.
 */
private mergeDbtConstraints(
  existing?: DbtColumnConstraints,
  incoming?: DbtColumnConstraints,
): DbtColumnConstraints | undefined {
  const notNull = Boolean(existing?.dbt.not_null) || Boolean(incoming?.dbt.not_null);
  const unique = Boolean(existing?.dbt.unique) || Boolean(incoming?.dbt.unique);
  if (notNull && unique) {
    return { dbt: { not_null: true, unique: true } };
  }
  if (notNull) {
    return { dbt: { not_null: true } };
  }
  if (unique) {
    return { dbt: { unique: true } };
  }
  return undefined;
}
/**
 * Merges two optional string lists by delegating to `mergeTagsDbt`, so that other string
 * lists (such as a column's `enumValuesDbt`, see `mergeColumns`) are combined with exactly
 * the same rules as dbt tags.
 */
private mergeStringList(existing?: string[], incoming?: string[]): string[] | undefined {
return this.mergeTagsDbt(existing, incoming);
}
/**
 * Merges two optional lists of dbt data-test references.
 *
 * When only one side has entries a shallow copy of that side is returned (or `undefined`
 * when both are empty). Otherwise tests are keyed by `package:name` plus, when kwargs are
 * non-empty, a 16-hex-character SHA-256 prefix of the JSON-serialised kwargs; a later test
 * with the same key replaces the earlier value while keeping the earlier position.
 */
private mergeDbtDataTests(existing?: DbtDataTestRef[], incoming?: DbtDataTestRef[]): DbtDataTestRef[] | undefined {
  const left = existing ?? [];
  const right = incoming ?? [];
  if (left.length === 0) {
    return right.length > 0 ? [...right] : undefined;
  }
  if (right.length === 0) {
    return [...left];
  }
  const identityOf = (test: DbtDataTestRef): string => {
    const hasKwargs = test.kwargs && Object.keys(test.kwargs).length > 0;
    const suffix = hasKwargs
      ? `:${createHash('sha256').update(JSON.stringify(test.kwargs)).digest('hex').slice(0, 16)}`
      : '';
    return `${test.package}:${test.name}${suffix}`;
  };
  const merged = new Map<string, DbtDataTestRef>();
  for (const test of left) {
    merged.set(identityOf(test), test);
  }
  for (const test of right) {
    merged.set(identityOf(test), test);
  }
  return Array.from(merged.values());
}
/**
 * Produces a parse result with no tables and no relationships, carrying only the
 * given project name; `dbtVersion` is always `null`.
 */
private emptyResult(projectName: string | null): DbtSchemaParseResult {
  const blank: DbtSchemaParseResult = { projectName, dbtVersion: null, tables: [], relationships: [] };
  return blank;
}
}

View file

@ -0,0 +1,102 @@
import { describe, expect, it } from 'vitest';
import type { DbtSchemaParseResult } from './parse-schema.js';
import { toDescriptionUpdates } from './to-description-updates.js';
import type { DbtHostTableLite } from './match-tables.js';
// Host-side fixture: one `orders` table (catalog `warehouse`, db `analytics`) exposing the
// columns `id` and `amount`. The dbt column `missing` used below has no counterpart here.
const hostTables: DbtHostTableLite[] = [
{
id: '1',
name: 'orders',
catalog: 'warehouse',
db: 'analytics',
columns: [
{ id: 'c1', name: 'id' },
{ id: 'c2', name: 'amount' },
],
},
];
/**
 * Builds a minimal parse result holding a single `orders` model.
 *
 * @param description - Table-level description to place on `orders`.
 * @param columnDescription - Description to place on the `id` column; the second column,
 *   `missing`, always carries the fixed description `'not imported'`.
 */
function parseResult(description: string | null, columnDescription: string | null): DbtSchemaParseResult {
  const ordersTable: DbtSchemaParseResult['tables'][number] = {
    name: 'orders',
    description,
    database: 'warehouse',
    schema: 'analytics',
    resourceType: 'model',
    columns: [
      { name: 'id', description: columnDescription, dataType: null },
      { name: 'missing', description: 'not imported', dataType: null },
    ],
  };
  return { projectName: null, dbtVersion: null, relationships: [], tables: [ordersTable] };
}
// Covers the three outcomes of toDescriptionUpdates for a matched table:
// descriptions present, nothing present, and structural metadata only.
describe('dbt descriptions update payloads', () => {
it('emits dbt writes and matching ai invalidations when descriptions exist', () => {
expect(
toDescriptionUpdates({
connectionId: 'conn-1',
parseResult: parseResult('Orders table', 'Primary key'),
hostTables,
targetSchema: null,
}),
).toEqual({
dbt: [
{
connectionId: 'conn-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'dbt',
tableDescription: 'Orders table',
// Only `id` appears: the dbt column `missing` has no host column to attach to.
columnDescriptions: { id: 'Primary key' },
},
],
aiInvalidations: [
{
connectionId: 'conn-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'ai',
},
],
});
});
it('does not emit spurious dbt writes or ai invalidations when no descriptions exist', () => {
expect(
toDescriptionUpdates({
connectionId: 'conn-1',
parseResult: parseResult(null, null),
hostTables,
targetSchema: null,
}),
).toEqual({ dbt: [], aiInvalidations: [] });
});
it('emits ai invalidation without a dbt description write when only structural metadata exists', () => {
const result = parseResult(null, null);
// A tag counts as a metadata change even though no description is present.
result.tables[0]!.tagsDbt = ['finance'];
expect(
toDescriptionUpdates({
connectionId: 'conn-1',
parseResult: result,
hostTables,
targetSchema: null,
}),
).toEqual({
dbt: [],
aiInvalidations: [
{
connectionId: 'conn-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'ai',
},
],
});
});
});

View file

@ -0,0 +1,70 @@
import type { KloDescriptionUpdate } from '../../../scan/enrichment-types.js';
import { findMatchingKloTable, type DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/** Result of `toDescriptionUpdates`: two parallel lists of description updates. */
export interface DbtDescriptionUpdates {
// Updates with `source: 'dbt'` carrying the table and/or column descriptions to write.
dbt: KloDescriptionUpdate[];
// Updates with `source: 'ai'` and no description fields, one per affected table.
// NOTE(review): presumably consumed to clear AI-generated descriptions — confirm with the caller.
aiInvalidations: KloDescriptionUpdate[];
}
/**
 * Converts parsed dbt tables into description updates for the matching host tables.
 *
 * For each dbt table that `findMatchingKloTable` resolves to a host table:
 * - a `source: 'dbt'` update is emitted when the table has a non-nullish description or at
 *   least one described column that matches a host column (case-insensitive name match);
 * - a `source: 'ai'` update is emitted when there is such a description change or any
 *   structural metadata (tags, freshness, column constraints, enum values, data tests).
 * Tables with neither are skipped entirely.
 *
 * @param input.connectionId - Copied onto every emitted update.
 * @param input.parseResult - Parsed dbt schema whose `tables` are converted.
 * @param input.hostTables - Candidate host tables to match against.
 * @param input.targetSchema - Passed through to `findMatchingKloTable`.
 * @returns The dbt writes and the ai invalidations, in dbt table order.
 */
export function toDescriptionUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
  targetSchema: string | null;
}): DbtDescriptionUpdates {
  const { connectionId, parseResult, hostTables, targetSchema } = input;
  const collected: DbtDescriptionUpdates = { dbt: [], aiInvalidations: [] };

  for (const dbtTable of parseResult.tables) {
    const hostTable = findMatchingKloTable(dbtTable, hostTables, targetSchema);
    if (!hostTable) {
      continue;
    }

    // Column descriptions are keyed by the host column's own spelling of the name.
    const columnDescriptions: Record<string, string | null> = {};
    dbtTable.columns
      .filter((dbtColumn) => Boolean(dbtColumn.description))
      .forEach((dbtColumn) => {
        const hostColumn = hostTable.columns.find(
          (candidate) => candidate.name.toLowerCase() === dbtColumn.name.toLowerCase(),
        );
        if (hostColumn) {
          columnDescriptions[hostColumn.name] = dbtColumn.description;
        }
      });

    const tableDescription = dbtTable.description ?? undefined;
    const hasColumnDescriptions = Object.keys(columnDescriptions).length > 0;
    const hasDescriptionChange = tableDescription !== undefined || hasColumnDescriptions;
    const hasMetadataChange =
      !!dbtTable.tagsDbt?.length ||
      dbtTable.freshnessDbt !== undefined ||
      dbtTable.columns.some(
        (column) => column.constraints !== undefined || !!column.enumValuesDbt?.length || !!column.dataTests?.length,
      );
    if (!(hasDescriptionChange || hasMetadataChange)) {
      continue;
    }

    const tableRef = { catalog: hostTable.catalog, db: hostTable.db, name: hostTable.name };
    if (hasDescriptionChange) {
      collected.dbt.push({
        connectionId,
        table: tableRef,
        source: 'dbt',
        ...(tableDescription !== undefined ? { tableDescription } : {}),
        ...(hasColumnDescriptions ? { columnDescriptions } : {}),
      });
    }
    collected.aiInvalidations.push({ connectionId, table: tableRef, source: 'ai' });
  }

  return collected;
}

View file

@ -0,0 +1,70 @@
import { describe, expect, it } from 'vitest';
import { toMetadataUpdates } from './to-metadata-updates.js';
// Verifies the mapping from parsed dbt metadata (camelCase, `Dbt`-suffixed fields) to the
// emitted update shape (snake_case keys under `tableFields` / `columnFields`).
describe('toMetadataUpdates', () => {
it('emits source-keyed dbt metadata updates for matched tables and columns', () => {
const updates = toMetadataUpdates({
connectionId: 'conn_1',
targetSchema: 'analytics',
hostTables: [
{
id: 'orders-id',
name: 'orders',
catalog: 'warehouse',
db: 'analytics',
columns: [
{ id: 'status-id', name: 'status' },
{ id: 'created-id', name: 'created_at' },
],
},
],
parseResult: {
projectName: null,
dbtVersion: null,
relationships: [],
tables: [
{
name: 'orders',
description: null,
database: 'warehouse',
schema: 'analytics',
resourceType: 'model',
tagsDbt: ['finance'],
freshnessDbt: { loadedAtField: 'created_at' },
columns: [
{
name: 'status',
description: null,
dataType: null,
enumValuesDbt: ['placed', 'shipped'],
constraints: { dbt: { not_null: true } },
dataTests: [{ name: 'accepted_values', package: 'dbt', kwargs: { values: ['placed', 'shipped'] } }],
},
],
},
],
},
});
expect(updates).toEqual([
{
connectionId: 'conn_1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'dbt',
tableFields: {
tags: ['finance'],
// No `raw` key: the fixture's freshness only carries `loadedAtField`.
freshness: { loaded_at_field: 'created_at' },
},
columnFields: {
status: {
// The `dbt` wrapper of the parsed constraints is unwrapped in the output.
constraints: { not_null: true },
enum_values: ['placed', 'shipped'],
tests: [
{ name: 'accepted_values', package: 'dbt', kwargs: { values: ['placed', 'shipped'] } },
],
},
},
},
]);
});
});

View file

@ -0,0 +1,74 @@
import type { KloMetadataUpdate } from '../../../scan/enrichment-types.js';
import { findMatchingKloTable, type DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/**
 * Converts parsed dbt tables into structural-metadata updates for the matching host tables.
 *
 * Table-level fields: `tags` (from non-empty `tagsDbt`) and `freshness` (with `raw` and/or
 * `loaded_at_field`, each only when defined). Column-level fields, for dbt columns that match
 * a host column by case-insensitive name: `constraints`, `enum_values` and `tests`.
 * A table yields an update only when at least one table or column field was produced.
 *
 * @param input.connectionId - Copied onto every emitted update.
 * @param input.parseResult - Parsed dbt schema whose `tables` are converted.
 * @param input.hostTables - Candidate host tables to match against.
 * @param input.targetSchema - Passed through to `findMatchingKloTable`.
 * @returns One `source: 'dbt'` update per matched table that carries metadata.
 */
export function toMetadataUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
  targetSchema: string | null;
}): KloMetadataUpdate[] {
  return input.parseResult.tables.flatMap((dbtTable): KloMetadataUpdate[] => {
    const hostTable = findMatchingKloTable(dbtTable, input.hostTables, input.targetSchema);
    if (!hostTable) {
      return [];
    }

    const tableFields: Record<string, unknown> = {};
    const { tagsDbt, freshnessDbt } = dbtTable;
    if (tagsDbt?.length) {
      tableFields.tags = tagsDbt;
    }
    if (freshnessDbt) {
      const freshness: Record<string, unknown> = {};
      if (freshnessDbt.raw !== undefined) {
        freshness.raw = freshnessDbt.raw;
      }
      if (freshnessDbt.loadedAtField !== undefined) {
        freshness.loaded_at_field = freshnessDbt.loadedAtField;
      }
      tableFields.freshness = freshness;
    }

    const columnFields: Record<string, Record<string, unknown>> = {};
    for (const dbtColumn of dbtTable.columns) {
      const hostColumn = hostTable.columns.find(
        (candidate) => candidate.name.toLowerCase() === dbtColumn.name.toLowerCase(),
      );
      if (!hostColumn) {
        continue;
      }
      const fields: Record<string, unknown> = {};
      if (dbtColumn.constraints) {
        fields.constraints = dbtColumn.constraints.dbt;
      }
      if (dbtColumn.enumValuesDbt?.length) {
        fields.enum_values = dbtColumn.enumValuesDbt;
      }
      if (dbtColumn.dataTests?.length) {
        fields.tests = dbtColumn.dataTests.map((test) =>
          test.kwargs
            ? { name: test.name, package: test.package, kwargs: test.kwargs }
            : { name: test.name, package: test.package },
        );
      }
      if (Object.keys(fields).length > 0) {
        columnFields[hostColumn.name] = fields;
      }
    }

    const hasTableFields = Object.keys(tableFields).length > 0;
    const hasColumnFields = Object.keys(columnFields).length > 0;
    if (!hasTableFields && !hasColumnFields) {
      return [];
    }
    return [
      {
        connectionId: input.connectionId,
        table: { catalog: hostTable.catalog, db: hostTable.db, name: hostTable.name },
        source: 'dbt',
        ...(hasTableFields ? { tableFields } : {}),
        ...(hasColumnFields ? { columnFields } : {}),
      },
    ];
  });
}

View file

@ -0,0 +1,62 @@
import { describe, expect, it } from 'vitest';
import type { DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
import { toRelationshipUpdates } from './to-relationship-updates.js';
// Expected author e-mail; built with the same expression as the constant in the module under test.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');
// Host fixture: `orders` lives in db `analytics`, `customers` in db `staging`.
const hostTables: DbtHostTableLite[] = [
{
id: '1',
name: 'orders',
catalog: 'warehouse',
db: 'analytics',
columns: [{ id: 'c1', name: 'customer_id' }],
},
{
id: '2',
name: 'customers',
catalog: 'warehouse',
db: 'staging',
columns: [{ id: 'c2', name: 'id' }],
},
];
// Three relationships: one resolvable, one with an unknown source column, and one with an
// unknown target table.
const parseResult: DbtSchemaParseResult = {
projectName: null,
dbtVersion: null,
tables: [],
relationships: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
fromSchema: 'analytics',
toSchema: 'analytics',
description: 'schema intentionally differs from the host customers table',
},
{ fromTable: 'orders', fromColumn: 'missing', toTable: 'customers', toColumn: 'id' },
{ fromTable: 'orders', fromColumn: 'customer_id', toTable: 'missing_table', toColumn: 'id' },
],
};
// Pins the name-only matching: the first relationship resolves even though its `toSchema`
// (`analytics`) differs from the host `customers` table's db (`staging`).
describe('dbt relationship update payloads', () => {
it('validates relationships using the current name-only matching behavior and dbt provenance', () => {
expect(toRelationshipUpdates({ connectionId: 'conn-1', parseResult, hostTables })).toEqual({
joins: [
{
connectionId: 'conn-1',
fromTable: 'orders',
fromColumns: ['customer_id'],
toTable: 'customers',
toColumns: ['id'],
relationship: 'many_to_one',
author: 'dbt',
authorEmail: DBT_SYSTEM_EMAIL,
},
],
// One relationship has an unknown column, one an unknown table.
skippedNoMatch: 2,
});
});
});

View file

@ -0,0 +1,57 @@
import type { KloJoinUpdate } from '../../../scan/enrichment-types.js';
import type { DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/** Result of `toRelationshipUpdates`. */
export interface DbtRelationshipUpdates {
// Join updates for relationships whose tables and columns were all found on the host side.
joins: KloJoinUpdate[];
// Number of relationships dropped because a table or a column could not be found.
skippedNoMatch: number;
}
// Author e-mail stamped on every join produced from dbt relationships.
// NOTE(review): presumably assembled from two parts so the full address never appears as a
// single literal in source — confirm the intent before simplifying.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');
/**
 * Converts parsed dbt relationships into join updates against the host catalog.
 *
 * Tables are matched by lower-cased name only (when several host tables share a name, the
 * last one in `hostTables` is used) and columns by case-insensitive name within the matched
 * table. Relationships with an unresolved table or column are counted in `skippedNoMatch`.
 * Every emitted join is `many_to_one`, authored by `dbt`, and uses the host spelling of
 * table and column names.
 *
 * @param input.connectionId - Copied onto every emitted join.
 * @param input.parseResult - Parsed dbt schema whose `relationships` are converted.
 * @param input.hostTables - Host tables available for matching.
 */
export function toRelationshipUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
}): DbtRelationshipUpdates {
  const hostByLowerName = new Map<string, DbtHostTableLite>(
    input.hostTables.map((table): [string, DbtHostTableLite] => [table.name.toLowerCase(), table]),
  );
  const pickColumn = (table: DbtHostTableLite, wanted: string) =>
    table.columns.find((column) => column.name.toLowerCase() === wanted.toLowerCase());

  // Resolves one relationship to a join, or `undefined` when any endpoint is unknown.
  const resolveJoin = (
    relationship: DbtSchemaParseResult['relationships'][number],
  ): KloJoinUpdate | undefined => {
    const source = hostByLowerName.get(relationship.fromTable.toLowerCase());
    const target = hostByLowerName.get(relationship.toTable.toLowerCase());
    if (!source || !target) {
      return undefined;
    }
    const sourceColumn = pickColumn(source, relationship.fromColumn);
    const targetColumn = pickColumn(target, relationship.toColumn);
    if (!sourceColumn || !targetColumn) {
      return undefined;
    }
    return {
      connectionId: input.connectionId,
      fromTable: source.name,
      fromColumns: [sourceColumn.name],
      toTable: target.name,
      toColumns: [targetColumn.name],
      relationship: 'many_to_one',
      author: 'dbt',
      authorEmail: DBT_SYSTEM_EMAIL,
    };
  };

  const joins: KloJoinUpdate[] = [];
  let skippedNoMatch = 0;
  for (const relationship of input.parseResult.relationships) {
    const join = resolveJoin(relationship);
    if (join) {
      joins.push(join);
    } else {
      skippedNoMatch += 1;
    }
  }
  return { joins, skippedNoMatch };
}

View file

@ -0,0 +1,410 @@
import { describe, expect, it } from 'vitest';
import { type DbtHostTableLite, matchDbtTables } from './dbt-descriptions/match-tables.js';
import { mergeSemanticModelTables } from './dbt-descriptions/merge-semantic-model-tables.js';
import { parseDbtSchemaFiles } from './dbt-descriptions/parse-schema.js';
import { toDescriptionUpdates } from './dbt-descriptions/to-description-updates.js';
import { toRelationshipUpdates } from './dbt-descriptions/to-relationship-updates.js';
import { parseMetricflowFiles } from './metricflow/deep-parse.js';
import { mapCrossModelMetricToSource, mapSemanticModelToSource } from './metricflow/semantic-models.js';
// Expected author e-mail on joins derived from dbt relationships.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');
// MetricFlow input: two semantic models (`orders_semantic`, `customers_semantic`), two simple
// metrics and one derived metric (`revenue_per_customer`) that references both.
const metricflowYaml = `
semantic_models:
- name: orders_semantic
description: MetricFlow order facts
model: ref('fct_orders')
defaults:
agg_time_dimension: ordered_at
entities:
- name: customer
type: foreign
expr: customer_id
description: Customer relationship
dimensions:
- name: status
type: categorical
expr: status
description: Order status
- name: ordered_at
type: time
expr: ordered_at
measures:
- name: total_revenue
agg: sum
expr: amount
description: Revenue
- name: customers_semantic
description: Customer dimension
model: ref('dim_customers')
entities:
- name: customer
type: primary
expr: id
dimensions:
- name: country
type: categorical
expr: country
description: Customer country
measures:
- name: customer_count
agg: count
expr: id
description: Customer count
metrics:
- name: total_revenue
type: simple
type_params:
measure: total_revenue
- name: customer_count
type: simple
type_params:
measure: customer_count
- name: revenue_per_customer
description: Revenue per customer
type: derived
type_params:
expr: total_revenue / NULLIF(customer_count, 0)
metrics:
- name: total_revenue
alias: total_revenue
- name: customer_count
alias: customer_count
`;
// dbt schema input: one source table (identifier `dim_customers`) and one model whose name
// and schema are Jinja `var(...)` expressions resolved from the variables passed by the test.
const schemaYaml = `
version: 2
sources:
- name: raw
database: warehouse
schema: landing
tables:
- name: customers
identifier: dim_customers
description: Raw customer dimension
columns:
- name: id
description: Customer primary key
- name: country
description: Country name
models:
- name: "{{ var('orders_model', 'fct_orders') }}"
schema: "{{ var('mart_schema', 'analytics') }}"
description: Modeled orders
columns:
- name: customer_id
description: Linked customer id
tests:
- relationships:
to: ref('dim_customers')
field: id
- name: status
description: Order status
- name: amount
description: Gross amount
`;
// Host catalog fixture: `fct_orders` in db `analytics` and `dim_customers` in db `landing`.
const hostTables: DbtHostTableLite[] = [
{
id: 'orders-table',
name: 'fct_orders',
catalog: 'warehouse',
db: 'analytics',
columns: [
{ id: 'orders-customer-id', name: 'customer_id' },
{ id: 'orders-status', name: 'status' },
{ id: 'orders-amount', name: 'amount' },
{ id: 'orders-ordered-at', name: 'ordered_at' },
],
},
{
id: 'customers-table',
name: 'dim_customers',
catalog: 'warehouse',
db: 'landing',
columns: [
{ id: 'customers-id', name: 'id' },
{ id: 'customers-country', name: 'country' },
],
},
];
// End-to-end golden test: every stage's exact output is frozen so that a change in any stage
// (MetricFlow parsing, source mapping, schema parsing, merging, matching, update building)
// shows up as a diff here.
describe('dbt extraction golden parity fixture', () => {
it('freezes the relocated MetricFlow and dbt-description contract together', () => {
// Stage 1: MetricFlow YAML -> semantic models, cross-model metrics and relationships.
const metricflow = parseMetricflowFiles([{ path: 'semantic_models/orders.yml', content: metricflowYaml }]);
expect(metricflow).toEqual({
semanticModels: [
{
name: 'orders_semantic',
description: 'MetricFlow order facts',
modelRef: 'fct_orders',
dimensions: [
{
name: 'status',
column: 'status',
type: 'string',
label: 'Status',
description: 'Order status',
},
{
name: 'ordered_at',
column: 'ordered_at',
type: 'time',
label: 'Ordered At',
description: undefined,
},
],
measures: [
{
type: 'simple',
name: 'total_revenue',
column: 'amount',
aggregation: 'sum',
label: 'Total Revenue',
description: 'Revenue',
},
],
entities: [{ name: 'customer', type: 'foreign', expr: 'customer_id', description: 'Customer relationship' }],
defaultTimeDimension: 'ordered_at',
},
{
name: 'customers_semantic',
description: 'Customer dimension',
modelRef: 'dim_customers',
dimensions: [
{
name: 'country',
column: 'country',
type: 'string',
label: 'Country',
description: 'Customer country',
},
],
measures: [
{
type: 'simple',
name: 'customer_count',
column: 'id',
aggregation: 'count',
label: 'Customer Count',
description: 'Customer count',
},
],
entities: [{ name: 'customer', type: 'primary', expr: 'id' }],
defaultTimeDimension: null,
},
],
crossModelMetrics: [
{
name: 'revenue_per_customer',
label: null,
description: 'Revenue per customer',
type: 'derived',
expr: 'total_revenue / NULLIF(customer_count, 0)',
dependsOn: [
{ metricName: 'orders_semantic', alias: 'total_revenue' },
{ metricName: 'customers_semantic', alias: 'customer_count' },
],
filter: null,
},
],
relationships: [
{
fromTable: 'fct_orders',
fromColumn: 'customer_id',
toTable: 'dim_customers',
toColumn: 'id',
description: 'Customer relationship',
},
],
warnings: [],
});
// Stage 2: mapping a semantic model and a cross-model metric to source definitions.
expect(mapSemanticModelToSource(metricflow.semanticModels[0], 'analytics.fct_orders')).toEqual({
name: 'fct-orders',
table: 'analytics.fct_orders',
grain: ['status', 'ordered_at'],
columns: [
{ name: 'status', type: 'string', description: 'Order status' },
{ name: 'ordered_at', type: 'time' },
],
measures: [
{
name: 'total_revenue',
expr: 'sum(amount)',
description: 'Revenue',
},
],
joins: [],
descriptions: { dbt: 'MetricFlow order facts' },
});
expect(mapCrossModelMetricToSource(metricflow.crossModelMetrics[0])).toEqual({
name: 'revenue-per-customer',
sql: 'total_revenue / NULLIF(customer_count, 0)',
descriptions: { dbt: 'Revenue per customer' },
grain: [],
columns: [],
measures: [
{
name: 'revenue_per_customer',
expr: 'total_revenue / NULLIF(customer_count, 0)',
description: 'Revenue per customer',
},
],
joins: [],
});
// Stage 3: dbt schema YAML parsed with the variables that resolve the Jinja `var(...)` names.
const schema = parseDbtSchemaFiles(
[{ path: 'models/schema.yml', content: schemaYaml }],
new Map([
['orders_model', 'fct_orders'],
['mart_schema', 'analytics'],
]),
);
// Stage 4: schema tables merged with the semantic models.
const merged = mergeSemanticModelTables(schema, metricflow.semanticModels);
expect(merged).toEqual({
projectName: null,
dbtVersion: null,
tables: [
{
name: 'dim_customers',
description: 'Raw customer dimension',
database: 'warehouse',
schema: 'landing',
columns: [
{ name: 'id', description: 'Customer primary key', dataType: null },
{ name: 'country', description: 'Country name', dataType: null },
],
resourceType: 'source',
},
{
name: 'fct_orders',
description: 'Modeled orders',
database: null,
schema: 'analytics',
columns: [
{
name: 'customer_id',
description: 'Linked customer id',
dataType: null,
dataTests: [
{
name: 'relationships',
package: 'dbt',
kwargs: { to: "ref('dim_customers')", field: 'id' },
},
],
},
{ name: 'status', description: 'Order status', dataType: null },
{ name: 'amount', description: 'Gross amount', dataType: null },
],
resourceType: 'model',
},
],
relationships: [
{
fromTable: 'fct_orders',
fromColumn: 'customer_id',
toTable: 'dim_customers',
toColumn: 'id',
fromSchema: 'analytics',
},
],
});
// Stage 5: matching merged dbt tables against the host catalog.
expect(matchDbtTables(merged.tables, hostTables, 'analytics')).toEqual([
{
dbtTable: 'dim_customers',
dbtSchema: 'landing',
dbtDatabase: 'warehouse',
hostTableId: 'customers-table',
hostTableName: 'dim_customers',
matched: true,
tableDescriptionAction: 'import',
tableDescriptionFound: true,
columnsToImport: 2,
columnsMatched: 2,
columnsTotal: 2,
columnDescriptionsFound: 2,
},
{
dbtTable: 'fct_orders',
dbtSchema: 'analytics',
dbtDatabase: null,
hostTableId: 'orders-table',
hostTableName: 'fct_orders',
matched: true,
tableDescriptionAction: 'import',
tableDescriptionFound: true,
columnsToImport: 3,
columnsMatched: 3,
columnsTotal: 3,
columnDescriptionsFound: 3,
},
]);
// Stage 6: description updates (dbt writes plus one ai invalidation per table).
expect(
toDescriptionUpdates({
connectionId: 'warehouse-1',
parseResult: merged,
hostTables,
targetSchema: 'analytics',
}),
).toEqual({
dbt: [
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'landing', name: 'dim_customers' },
source: 'dbt',
tableDescription: 'Raw customer dimension',
columnDescriptions: {
id: 'Customer primary key',
country: 'Country name',
},
},
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'fct_orders' },
source: 'dbt',
tableDescription: 'Modeled orders',
columnDescriptions: {
customer_id: 'Linked customer id',
status: 'Order status',
amount: 'Gross amount',
},
},
],
aiInvalidations: [
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'landing', name: 'dim_customers' },
source: 'ai',
},
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'fct_orders' },
source: 'ai',
},
],
});
// Stage 7: relationship updates; the single relationship resolves on both ends.
expect(toRelationshipUpdates({ connectionId: 'warehouse-1', parseResult: merged, hostTables })).toEqual({
joins: [
{
connectionId: 'warehouse-1',
fromTable: 'fct_orders',
fromColumns: ['customer_id'],
toTable: 'dim_customers',
toColumns: ['id'],
relationship: 'many_to_one',
author: 'dbt',
authorEmail: DBT_SYSTEM_EMAIL,
},
],
skippedNoMatch: 0,
});
});
});

View file

@ -0,0 +1,36 @@
import { describe, expect, it } from 'vitest';
import { chunkDbtProject } from './chunk.js';
// Exercises the large-project (more than 25 paths) branch of chunkDbtProject, with and
// without a diff set.
describe('chunkDbtProject', () => {
// Builds a diff set in which only `modified` is populated.
const diffSet = (modified: string[]) => ({ added: [], modified, deleted: [], unchanged: [] });
it('caps peerFileIndex when the project has very many yaml files', () => {
// 201 model files + dbt_project.yml => each unit has 201 peers, one more than the cap.
const modelPaths = Array.from({ length: 201 }, (_, i) => `models/m${i}.yml`);
const allPaths = ['dbt_project.yml', ...modelPaths].sort();
const { workUnits } = chunkDbtProject({ allPaths });
const [first] = workUnits;
expect(first).toBeDefined();
expect(first?.peerFileIndex).toHaveLength(200);
expect(first?.notes).toMatch(/capped at 200/);
});
it('keeps large-project model work units when dbt_project.yml changes', () => {
const modelPaths = Array.from({ length: 30 }, (_, i) => `models/m${i}.yml`);
const allPaths = ['dbt_project.yml', ...modelPaths].sort();
const { workUnits } = chunkDbtProject({ allPaths }, { diffSet: diffSet(['dbt_project.yml']) });
// Every model unit depends on dbt_project.yml, so all 30 are kept with their raw files intact.
expect(workUnits).toHaveLength(30);
expect(workUnits[0]?.rawFiles).toEqual(['models/m0.yml']);
expect(workUnits[0]?.dependencyPaths).toContain('dbt_project.yml');
});
it('keeps large-project model work units when non-model yaml peers change', () => {
const modelPaths = Array.from({ length: 30 }, (_, i) => `models/m${i}.yml`);
const allPaths = ['dbt_project.yml', 'seeds/seed_properties.yml', ...modelPaths].sort();
const { workUnits } = chunkDbtProject({ allPaths }, { diffSet: diffSet(['seeds/seed_properties.yml']) });
// A touched peer file is promoted into the unit's dependency paths.
expect(workUnits).toHaveLength(30);
expect(workUnits[0]?.rawFiles).toEqual(['models/m0.yml']);
expect(workUnits[0]?.dependencyPaths).toContain('seeds/seed_properties.yml');
});
});

View file

@ -0,0 +1,130 @@
import type { ChunkResult, DiffSet, WorkUnit } from '../../types.js';
import type { ParsedDbtProject } from './parse.js';
/** Options accepted by `chunkDbtProject`. */
interface ChunkOptions {
// When provided, first-run work units are narrowed to those affected by this diff.
diffSet?: DiffSet;
}
/**
 * Per-model work units (when the project has more than 25 YAML files) only name `rawFiles` under
 * `models/**`. Other `.yml` files (e.g. some `seeds/` or custom layouts) still appear in
 * `peerFileIndex` or in the small-project / no-models fallbacks; v1 does not emit one work unit
 * per non-models file.
 */
const MODELS_PREFIX = 'models/';
/** `peerFileIndex` is a hint only (agents may not read those paths). Cap to limit prompt size. */
const MAX_PEER_FILE_INDEX = 200;
/**
 * Returns the project file path present in `allPaths`, preferring `dbt_project.yml`
 * over `dbt_project.yaml`, or `undefined` when neither is listed.
 */
function projectYamlPath(allPaths: string[]): string | undefined {
  return ['dbt_project.yml', 'dbt_project.yaml'].find((candidate) => allPaths.includes(candidate));
}
/**
 * Returns, sorted, the paths that live under the models prefix. Backslashes are treated as
 * forward slashes for the prefix check only; the returned paths keep their original spelling.
 */
function modelRelativePaths(allPaths: string[]): string[] {
  const underModels: string[] = [];
  for (const candidate of allPaths) {
    const posixForm = candidate.split('\\').join('/');
    if (posixForm.startsWith(MODELS_PREFIX)) {
      underModels.push(candidate);
    }
  }
  underModels.sort();
  return underModels;
}
/**
 * Derives a work-unit key from a model file path: the `.yml`/`.yaml` extension is dropped,
 * every run of non-alphanumeric characters becomes a single `-`, leading and trailing dashes
 * are trimmed, and the lower-cased result is prefixed with `dbt-`.
 */
function unitKeyForModelFile(mf: string): string {
  const withoutExtension = mf.replace(/\.(ya?ml)$/i, '');
  const forwardSlashed = withoutExtension.replace(/\\/g, '/');
  const dashed = forwardSlashed.replace(/[^a-zA-Z0-9]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return 'dbt-' + trimmed.toLowerCase();
}
/**
 * Produces the work units for a full (first-run) pass over a dbt project.
 *
 * - No paths: no work units.
 * - At most 25 paths: a single `dbt-all` unit containing every path.
 * - More than 25 paths but none under the models prefix: a single `dbt-all` unit that lists
 *   `dbtDep` as its dependency when one is given.
 * - Otherwise: one unit per model file, with all other paths as (capped) peer hints and
 *   `dbtDep` as dependency when it is part of `allPaths`.
 *
 * @param allPaths - Every YAML path of the project.
 * @param dbtDep - Path of the project file, if any.
 */
function emitFirstRunWorkUnits(allPaths: string[], dbtDep: string | undefined): WorkUnit[] {
  // Shared shape of the two "whole project in one unit" fallbacks.
  const wholeProjectUnit = (displayLabel: string, dependencyPaths: string[], notes: string): WorkUnit => ({
    unitKey: 'dbt-all',
    displayLabel,
    rawFiles: [...allPaths],
    peerFileIndex: [],
    dependencyPaths,
    notes,
  });

  if (allPaths.length === 0) {
    return [];
  }
  if (allPaths.length <= 25) {
    return [wholeProjectUnit('dbt project (all yaml)', [], 'dbt project — all YAML in one WorkUnit (≤25 files)')];
  }

  const modelFiles = modelRelativePaths(allPaths);
  if (modelFiles.length === 0) {
    return [
      wholeProjectUnit(
        'dbt project (all yaml, no models/**)',
        dbtDep ? [dbtDep] : [],
        'dbt: no models/**/*.yml — single slice with dbt_project as dependency if present',
      ),
    ];
  }

  const units: WorkUnit[] = [];
  for (const mf of modelFiles) {
    const allPeers = allPaths.filter((p) => p !== mf).sort();
    const dependencyPaths: string[] = [];
    if (dbtDep && mf !== dbtDep && allPaths.includes(dbtDep)) {
      dependencyPaths.push(dbtDep);
    }
    let notes = 'dbt model schema slice';
    if (allPeers.length > MAX_PEER_FILE_INDEX) {
      notes = `dbt model schema slice (peer index capped at ${MAX_PEER_FILE_INDEX} of ${allPeers.length} paths)`;
    }
    units.push({
      unitKey: unitKeyForModelFile(mf),
      displayLabel: `dbt ${mf}`,
      rawFiles: [mf],
      peerFileIndex: allPeers.slice(0, MAX_PEER_FILE_INDEX),
      dependencyPaths,
      notes,
    });
  }
  return units;
}
/**
 * Narrows first-run work units to those affected by an incremental diff.
 *
 * A unit is kept when any of its raw files, dependency paths or peer-index entries was
 * added or modified. For a kept unit:
 * - `rawFiles` becomes the touched raw files, or stays the full list when none was touched;
 * - `dependencyPaths` becomes the sorted union of the original dependencies, the raw files
 *   that were left out of `rawFiles`, and the touched peer files.
 *
 * The input work units are never mutated: `rawFiles` is copied before sorting. Previously,
 * when no raw file was touched, the unit's own `rawFiles` array was sorted in place.
 *
 * @param firstRunUnits - Work units produced for a full run.
 * @param diffSet - Paths added, modified and deleted since the previous run.
 * @returns The kept work units plus, when paths were deleted, an `eviction` entry listing
 *   them sorted; `eviction` is `undefined` otherwise.
 */
function applyDiffSet(firstRunUnits: WorkUnit[], diffSet: DiffSet): ChunkResult {
  const touched = new Set([...diffSet.added, ...diffSet.modified]);
  const kept: WorkUnit[] = [];
  for (const wu of firstRunUnits) {
    const touchedRawFiles = wu.rawFiles.filter((p) => touched.has(p));
    const touchedPeerFiles = wu.peerFileIndex.filter((p) => touched.has(p));
    const dependencyTouched = wu.dependencyPaths.some((p) => touched.has(p));
    if (touchedRawFiles.length === 0 && !dependencyTouched && touchedPeerFiles.length === 0) {
      continue;
    }
    const anyRawTouched = touchedRawFiles.length > 0;
    const rawFiles = anyRawTouched ? touchedRawFiles : wu.rawFiles;
    // Untouched raw files are demoted to dependencies only when the unit was narrowed.
    const unchangedRaw = anyRawTouched ? wu.rawFiles.filter((p) => !touched.has(p)) : [];
    const combinedDeps = new Set<string>([...wu.dependencyPaths, ...unchangedRaw, ...touchedPeerFiles]);
    kept.push({
      ...wu,
      // Copy before sorting: `rawFiles` may alias the caller's `wu.rawFiles`.
      rawFiles: [...rawFiles].sort(),
      dependencyPaths: [...combinedDeps].sort(),
    });
  }
  const eviction = diffSet.deleted.length > 0 ? { deletedRawPaths: [...diffSet.deleted].sort() } : undefined;
  return { workUnits: kept, eviction };
}
/**
 * Splits a parsed dbt project into work units.
 *
 * Without a diff set the full first-run units are returned; with one, the first-run units
 * are narrowed through `applyDiffSet`.
 *
 * @param project - Parsed project; only `allPaths` is read here.
 * @param opts - Optional diff set for incremental runs.
 */
export function chunkDbtProject(project: ParsedDbtProject, opts: ChunkOptions = {}): ChunkResult {
  const { diffSet } = opts;
  const firstRun = emitFirstRunWorkUnits(project.allPaths, projectYamlPath(project.allPaths));
  return diffSet ? applyDiffSet(firstRun, diffSet) : { workUnits: firstRun };
}

View file

@ -0,0 +1,51 @@
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import type { SourceAdapter } from '../../types.js';
import { DbtSourceAdapter } from './dbt.adapter.js';
// Adapter-level tests that run against a real temporary directory standing in for the staged dir.
describe('DbtSourceAdapter', () => {
let stagedDir: string;
let adapter: SourceAdapter;
beforeEach(async () => {
// Fresh staged directory and adapter per test.
stagedDir = await mkdtemp(join(tmpdir(), 'dbt-adapter-'));
adapter = new DbtSourceAdapter();
});
afterEach(async () => {
await rm(stagedDir, { recursive: true, force: true });
});
it('declares the expected source key and skill list', () => {
expect(adapter.source).toBe('dbt');
expect(adapter.skillNames).toEqual(['dbt_ingest']);
});
it('detects a staged dbt project root (dbt_project.yml)', async () => {
await writeFile(join(stagedDir, 'dbt_project.yml'), "name: 'jaffle'\nversion: '1.0.0'\n", 'utf-8');
expect(await adapter.detect(stagedDir)).toBe(true);
});
it('chunk: dbt_project.yml + models/a.yml yields one WU (≤25 files)', async () => {
await writeFile(join(stagedDir, 'dbt_project.yml'), "name: 'jaffle'\n", 'utf-8');
await mkdir(join(stagedDir, 'models'), { recursive: true });
await writeFile(
join(stagedDir, 'models/a.yml'),
'version: 2\nmodels:\n - name: orders\n description: Orders\n',
'utf-8',
);
const result = await adapter.chunk(stagedDir);
expect(result.workUnits).toHaveLength(1);
expect(result.workUnits[0].unitKey).toBe('dbt-all');
// chunk() also returns the parsed schema as `parseArtifacts`.
expect(result.parseArtifacts).toMatchObject({
projectName: 'jaffle',
tables: [{ name: 'orders', description: 'Orders' }],
});
});
it('implements fetch() for git-backed dbt source setup', () => {
expect(adapter.fetch).toBeTypeOf('function');
});
});

View file

@ -0,0 +1,48 @@
import { join } from 'node:path';
import type { ChunkResult, DiffSet, SourceAdapter } from '../../types.js';
import type { FetchContext } from '../../types.js';
import { loadProjectInfo } from '../../dbt-shared/project-vars.js';
import { loadDbtSchemaFiles } from '../../dbt-shared/schema-files.js';
import { parseDbtSchemaFiles } from '../dbt-descriptions/parse-schema.js';
import { chunkDbtProject } from './chunk.js';
import { detectDbtStagedDir } from './detect.js';
import { fetchDbtRepo, type DbtPullConfig } from './fetch.js';
import { parseDbtStagedDir } from './parse.js';
/** Construction options for `DbtSourceAdapter`. */
interface DbtSourceAdapterOptions {
// Base directory under which `fetch()` places its `dbt/<connectionId>` cache;
// when omitted, `fetch()` falls back to the relative path `.klo/cache`.
homeDir?: string;
}
/**
 * Source adapter for dbt projects: detects a staged dbt project, fetches one from a
 * repository, and chunks it into work units together with the parsed schema artifacts.
 */
export class DbtSourceAdapter implements SourceAdapter {
  readonly source = 'dbt' as const;

  /** Runner merges: ingest_triage, sl_capture, knowledge_capture (see ingest-bundle.runner.ts) */
  readonly skillNames: string[] = ['dbt_ingest'];

  constructor(private readonly options: DbtSourceAdapterOptions = {}) {}

  /** Resolves to whether `stagedDir` looks like a dbt project root. */
  detect(stagedDir: string): Promise<boolean> {
    return detectDbtStagedDir(stagedDir);
  }

  /**
   * Fetches the configured repository into `stagedDir`, caching it under
   * `<homeDir or .klo/cache>/dbt/<connectionId>`.
   *
   * @throws Error when `pullConfig` is missing or has no `repoUrl`.
   */
  async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
    const config = pullConfig as DbtPullConfig | undefined;
    if (config?.repoUrl) {
      const cacheRoot = this.options.homeDir ?? '.klo/cache';
      await fetchDbtRepo({
        config,
        cacheDir: join(cacheRoot, 'dbt', ctx.connectionId),
        stagedDir,
      });
      return;
    }
    throw new Error('dbt fetch requires repoUrl');
  }

  /**
   * Parses the staged directory and returns its work units (narrowed by `diffSet` when
   * given) along with the parsed dbt schema as `parseArtifacts`.
   */
  async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    const project = await parseDbtStagedDir(stagedDir);
    const { variables, projectName } = await loadProjectInfo(stagedDir);
    const schemaFiles = await loadDbtSchemaFiles(stagedDir);
    const parseArtifacts = parseDbtSchemaFiles(schemaFiles, variables, { projectName });
    const chunked = chunkDbtProject(project, { diffSet });
    return { ...chunked, parseArtifacts };
  }
}

View file

@ -0,0 +1,12 @@
import { access } from 'node:fs/promises';
import { join } from 'node:path';
/**
 * Reports whether `stagedDir` directly contains a dbt project file.
 *
 * `dbt_project.yml` is probed first and `dbt_project.yaml` only if the first
 * probe fails. Any access failure is treated as "file not present".
 *
 * @param stagedDir Directory to inspect.
 * @returns `true` when either project file is accessible, otherwise `false`.
 */
export async function detectDbtStagedDir(stagedDir: string): Promise<boolean> {
  const isAccessible = async (fileName: 'dbt_project.yml' | 'dbt_project.yaml'): Promise<boolean> => {
    try {
      await access(join(stagedDir, fileName));
    } catch {
      return false;
    }
    return true;
  };

  return (await isAccessible('dbt_project.yml')) || (await isAccessible('dbt_project.yaml'));
}

View file

@ -0,0 +1,38 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { fetchDbtRepo } from './fetch.js';
// Exercises fetchDbtRepo against a scratch directory with the git step replaced by a stub.
describe('fetchDbtRepo', () => {
let tempDir: string;
// Every test gets its own temp directory, removed again in afterEach.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'klo-dbt-fetch-'));
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('copies dbt yaml files from a fetched repo subpath into staged dir', async () => {
const cacheDir = join(tempDir, 'cache');
const stagedDir = join(tempDir, 'staged');
// Pre-populate the cache dir so it looks like a checkout with the project under `analytics/`.
await mkdir(join(cacheDir, 'analytics', 'models'), { recursive: true });
await writeFile(join(cacheDir, 'analytics', 'dbt_project.yml'), 'name: analytics\n', 'utf-8');
await writeFile(join(cacheDir, 'analytics', 'models', 'orders.yml'), 'models: []\n', 'utf-8');
// Stubbed clone step: does no git work and only reports a commit hash.
const cloneOrPull = vi.fn(async () => ({ commitHash: 'abc123' }));
await expect(
fetchDbtRepo({
config: { repoUrl: 'https://github.com/acme/dbt.git', path: 'analytics' },
cacheDir,
stagedDir,
deps: { cloneOrPull },
}),
).resolves.toEqual({ commitHash: 'abc123', filesCopied: 2 });
// Staged paths are relative to the `analytics` sub-path, not to the cache root.
await expect(readFile(join(stagedDir, 'dbt_project.yml'), 'utf-8')).resolves.toContain('analytics');
await expect(readFile(join(stagedDir, 'models', 'orders.yml'), 'utf-8')).resolves.toContain('models');
});
});

View file

@ -0,0 +1,60 @@
import { access, copyFile, mkdir, readdir } from 'node:fs/promises';
import { dirname, join, relative } from 'node:path';
import { cloneOrPull, sanitizeRepoError } from '../../repo-fetch.js';
/** Pull configuration naming the git repository (and the part of it) that holds the dbt project. */
export interface DbtPullConfig {
/** Repository URL handed to the clone/pull step. */
repoUrl: string;
/** Branch to fetch; `fetchDbtRepo` falls back to `'main'` when omitted. */
branch?: string;
/** Optional sub-path below the cache dir that is used as the root for YAML copying. */
path?: string;
/** Optional token passed to the clone/pull step and to error sanitization. */
authToken?: string | null;
}
/** Arguments accepted by `fetchDbtRepo`. */
export interface FetchDbtRepoParams {
/** Repository location, branch, sub-path and credentials. */
config: DbtPullConfig;
/** Directory passed to the clone/pull step as its cache dir; YAML files are copied from here. */
cacheDir: string;
/** Destination directory that receives the copied YAML files. */
stagedDir: string;
/** Optional collaborator overrides; when `cloneOrPull` is given it replaces the imported implementation. */
deps?: {
cloneOrPull?: typeof cloneOrPull;
};
}
/**
 * Clones or updates the configured repository into `cacheDir`, then copies every
 * YAML file found below the configured sub-path into `stagedDir`.
 *
 * @param params Repository config, cache/staging directories and optional
 *   collaborator overrides (`deps.cloneOrPull` replaces the real git step).
 * @returns The commit hash reported by the clone/pull step and the number of
 *   YAML files copied.
 * @throws Error Every failure (clone, path validation, copy) is rethrown as a
 *   new `Error` whose message comes from `sanitizeRepoError`, which also
 *   receives the auth token.
 */
export async function fetchDbtRepo(params: FetchDbtRepoParams): Promise<{ commitHash: string; filesCopied: number }> {
  try {
    const { config, cacheDir, stagedDir } = params;
    const sourceRoot = config.path ? join(cacheDir, config.path) : cacheDir;

    // `path` comes from pull configuration. A value such as `../..` would resolve
    // outside the checkout and make the copy step read unrelated directories, so
    // reject it before any git work. Both separator styles are accepted because
    // `relative` returns platform-specific separators.
    const [leadingSegment] = relative(cacheDir, sourceRoot).split(/[\\/]/);
    if (leadingSegment === '..') {
      throw new Error(`dbt path must stay inside the repository checkout: ${config.path}`);
    }

    const runCloneOrPull = params.deps?.cloneOrPull ?? cloneOrPull;
    const { commitHash } = await runCloneOrPull({
      repoUrl: config.repoUrl,
      authToken: config.authToken,
      cacheDir,
      branch: config.branch ?? 'main',
    });

    const filesCopied = await copyYamlFilesRecursive(sourceRoot, stagedDir);
    return { commitHash, filesCopied };
  } catch (error) {
    // Only the sanitized message is rethrown; the original error is not attached.
    throw new Error(sanitizeRepoError(error, params.config.authToken));
  }
}
/**
 * Copies every `.yml` / `.yaml` file (extension matched case-insensitively)
 * found anywhere below `sourceRoot` to the same relative location below `destRoot`.
 *
 * @param sourceRoot Directory to scan recursively.
 * @param destRoot Destination root; created (with parents) when missing.
 * @returns Number of files copied, or `0` without touching `destRoot` when
 *   `sourceRoot` is not accessible.
 */
async function copyYamlFilesRecursive(sourceRoot: string, destRoot: string): Promise<number> {
  try {
    await access(sourceRoot);
  } catch {
    return 0;
  }

  await mkdir(destRoot, { recursive: true });

  const yamlPattern = /\.ya?ml$/i;
  const listing = await readdir(sourceRoot, { withFileTypes: true, recursive: true });
  const yamlFiles = listing.filter((item) => item.isFile() && yamlPattern.test(item.name));

  for (const file of yamlFiles) {
    const from = join(file.parentPath, file.name);
    const to = join(destRoot, relative(sourceRoot, from));
    // Nested source directories are recreated on demand below the destination.
    await mkdir(dirname(to), { recursive: true });
    await copyFile(from, to);
  }

  return yamlFiles.length;
}

View file

@ -0,0 +1,8 @@
import { describe, expect, it } from 'vitest';
import { normalizeDbtPath } from './parse.js';
// Pins the separator normalization applied to staged-file paths.
describe('normalizeDbtPath', () => {
it('normalizes Windows separators to POSIX separators', () => {
expect(normalizeDbtPath('models\\marts\\orders.yml')).toBe('models/marts/orders.yml');
});
});

View file

@ -0,0 +1,32 @@
import { readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
/** Matches file names ending in `.yml` or `.yaml`, case-insensitively. */
const YAML_EXT_RE = /\.(ya?ml)$/i;
/**
 * Converts every backslash in `path` to a forward slash.
 *
 * @param path Path that may use Windows separators.
 * @returns The same path with POSIX separators only.
 */
export function normalizeDbtPath(path: string): string {
  const segments = path.split('\\');
  return segments.join('/');
}
/**
 * Lists every YAML file below `stagedDir`.
 *
 * @param stagedDir Directory to scan recursively.
 * @returns Paths relative to `stagedDir`, with forward slashes, in default
 *   `Array.prototype.sort` order.
 */
async function collectYamlFiles(stagedDir: string): Promise<string[]> {
  const listing = await readdir(stagedDir, { withFileTypes: true, recursive: true });
  const yamlPaths = listing
    .filter((item) => item.isFile() && YAML_EXT_RE.test(item.name))
    .map((item) => normalizeDbtPath(relative(stagedDir, join(item.parentPath, item.name))));
  yamlPaths.sort();
  return yamlPaths;
}
/** Result of scanning a staged dbt directory; currently only the list of YAML paths. */
export interface ParsedDbtProject {
/** All `.yml` / `.yaml` paths under stagedDir, relative + sorted. */
allPaths: string[];
}
/**
 * Scans `stagedDir` and wraps the discovered YAML paths in a `ParsedDbtProject`.
 *
 * @param stagedDir Directory holding the staged dbt project.
 * @returns Object whose `allPaths` is the output of `collectYamlFiles`.
 */
export async function parseDbtStagedDir(stagedDir: string): Promise<ParsedDbtProject> {
  return { allPaths: await collectYamlFiles(stagedDir) };
}

View file

@ -0,0 +1,48 @@
import { readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, SourceAdapter, WorkUnit } from '../../types.js';
/**
 * Minimal `SourceAdapter` that turns each top-level sub-directory of the staged
 * directory into one work unit. It always detects successfully.
 */
export class FakeSourceAdapter implements SourceAdapter {
  readonly source = 'fake';
  readonly skillNames: string[] = [];

  /** Always resolves to `true`. */
  detect(): Promise<boolean> {
    return Promise.resolve(true);
  }

  /**
   * Builds one work unit per non-empty top-level sub-directory of `stagedDir`.
   *
   * @param stagedDir Directory whose immediate sub-directories are chunked.
   * @param diffSet When given, a sub-directory is kept only if at least one of
   *   its files is listed in `added` or `modified`; a non-empty `deleted` list
   *   is returned as `eviction.deletedRawPaths`.
   * @returns Work units in sub-directory name order, plus the optional eviction.
   */
  async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    const topLevel = await readdir(stagedDir, { withFileTypes: true });
    const subDirs = topLevel
      .filter((e) => e.isDirectory())
      .map((e) => e.name)
      .sort();

    // The set of touched paths does not depend on the sub-directory, so build it
    // once instead of once per loop iteration.
    const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : undefined;

    const workUnits: WorkUnit[] = [];
    for (const subDir of subDirs) {
      const entries = await readdir(join(stagedDir, subDir), { withFileTypes: true, recursive: true });
      // File paths are relative to stagedDir, so they include the sub-directory name.
      const rawFiles = entries
        .filter((e) => e.isFile())
        .map((e) => relative(stagedDir, join(e.parentPath, e.name)))
        .sort();
      if (rawFiles.length === 0) {
        continue;
      }
      if (touched && !rawFiles.some((p) => touched.has(p))) {
        continue;
      }
      workUnits.push({
        unitKey: `fake-${subDir}`,
        displayLabel: subDir,
        rawFiles,
        peerFileIndex: [],
        dependencyPaths: [],
      });
    }

    const eviction = diffSet && diffSet.deleted.length > 0 ? { deletedRawPaths: [...diffSet.deleted] } : undefined;
    return { workUnits, eviction };
  }
}

View file

@ -0,0 +1,146 @@
{
"name": "eviction-churn",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": [
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn"
]
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 3,
"rows": [
{
"queryid": "501",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 20,
"totalExecTime": 500,
"meanExecTime": 25,
"totalRows": 40
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": null,
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q501": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 20,
"totalExecTime": 500,
"totalRows": 40
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T08:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn",
"pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn",
"baseline_first_run:no_previous_pgss_baseline"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 3,
"templates": [
{
"id": "db5_q501",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q501/page.md"
}
]
}
},
"templates/db5_q501/metadata.json": {
"json": {
"id": "db5_q501",
"title": "postgres · analytics.orders [db5_q501]",
"path": "templates/db5_q501/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q501/page.md": {
"text": "# db5_q501\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q501/usage.json": {
"json": {
"stats": {
"executions": 20,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 25,
"error_rate": 0,
"rows_produced": 40
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,144 @@
{
"name": "first-run",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "101",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 10,
"totalExecTime": 250,
"meanExecTime": 25,
"totalRows": 20
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [
"^svc_"
],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": null,
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q101": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 10,
"totalExecTime": 250,
"totalRows": 20
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T08:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_first_run:no_previous_pgss_baseline"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q101",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q101/page.md"
}
]
}
},
"templates/db5_q101/metadata.json": {
"json": {
"id": "db5_q101",
"title": "postgres · analytics.orders [db5_q101]",
"path": "templates/db5_q101/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q101/page.md": {
"text": "# db5_q101\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q101/usage.json": {
"json": {
"stats": {
"executions": 10,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 25,
"error_rate": 0,
"rows_produced": 20
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,181 @@
{
"name": "normal-delta",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "201",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 12,
"totalExecTime": 160,
"meanExecTime": 13.333333333333334,
"totalRows": 58
},
{
"queryid": "201",
"userid": "12",
"username": "svc_loader",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 5,
"totalExecTime": 50,
"meanExecTime": 10,
"totalRows": 25
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [
"^svc_"
],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q201": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 10,
"totalExecTime": 100,
"totalRows": 50
},
"12": {
"calls": 5,
"totalExecTime": 50,
"totalRows": 25
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q201": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 12,
"totalExecTime": 160,
"totalRows": 58
},
"12": {
"calls": 5,
"totalExecTime": 50,
"totalRows": 25
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": false,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q201",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q201/page.md"
}
]
}
},
"templates/db5_q201/metadata.json": {
"json": {
"id": "db5_q201",
"title": "postgres · analytics.orders [db5_q201]",
"path": "templates/db5_q201/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "low",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q201/page.md": {
"text": "# db5_q201\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q201/usage.json": {
"json": {
"stats": {
"executions": 2,
"distinct_users": 1,
"first_seen": "2026-05-08T09:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 30,
"error_rate": 0,
"rows_produced": 8
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,159 @@
{
"name": "reset-detected",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T11:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "301",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 3,
"totalExecTime": 90,
"meanExecTime": 30,
"totalRows": 9
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q301": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 100,
"totalExecTime": 1000,
"totalRows": 500
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T11:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q301": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 3,
"totalExecTime": 90,
"totalRows": 9
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z"
],
"degraded": true,
"statsResetAt": "2026-05-08T11:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q301",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q301/page.md"
}
]
}
},
"templates/db5_q301/metadata.json": {
"json": {
"id": "db5_q301",
"title": "postgres · analytics.orders [db5_q301]",
"path": "templates/db5_q301/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q301/page.md": {
"text": "# db5_q301\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q301/usage.json": {
"json": {
"stats": {
"executions": 3,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 30,
"error_rate": 0,
"rows_produced": 9
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,159 @@
{
"name": "version-change",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "401",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 4,
"totalExecTime": 80,
"meanExecTime": 20,
"totalRows": 8
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 15.7",
"templates": {
"db5_q401": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 100,
"totalExecTime": 1000,
"totalRows": 500
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q401": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 4,
"totalExecTime": 80,
"totalRows": 8
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_reset:pg_server_major changed from 15 to 16"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q401",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q401/page.md"
}
]
}
},
"templates/db5_q401/metadata.json": {
"json": {
"id": "db5_q401",
"title": "postgres · analytics.orders [db5_q401]",
"path": "templates/db5_q401/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q401/page.md": {
"text": "# db5_q401\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q401/usage.json": {
"json": {
"stats": {
"executions": 4,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 20,
"error_rate": 0,
"rows_produced": 8
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,200 @@
import { describe, expect, it, vi } from 'vitest';
import { BigQueryHistoricSqlQueryHistoryReader } from './bigquery-query-history-reader.js';
import { HistoricSqlGrantsMissingError } from './errors.js';
/** Canned query result returned by the fake query client in these tests. */
interface FakeQueryResult {
headers: string[];
rows: unknown[][];
totalRows: number;
/** Set to simulate a result-level failure reported by the client. */
error?: string;
}
/**
 * Builds a fake query client whose `executeQuery` mock hands out the queued
 * results one per call, in order.
 *
 * The `results` array is consumed in place. A call made after the queue is
 * exhausted rejects with `unexpected query`.
 */
function queryClient(results: FakeQueryResult[]) {
  return {
    executeQuery: vi.fn(async (_query: string) => {
      const queued = results.shift();
      if (queued) {
        return queued;
      }
      throw new Error('unexpected query');
    }),
  };
}
/**
 * Returns the SQL text passed to the first `executeQuery` call on the fake client.
 *
 * @throws Error when the mock has not been called yet.
 */
function firstQuery(client: ReturnType<typeof queryClient>): string {
  const recordedCalls = client.executeQuery.mock.calls;
  const firstCall = recordedCalls[0];
  if (firstCall) {
    return firstCall[0];
  }
  throw new Error('expected query client to be called');
}
// Behavioural spec for the BigQuery query-history reader, driven through a fake query client.
describe('BigQueryHistoricSqlQueryHistoryReader', () => {
// The region is lower-cased and prefixed with `region-` inside the qualified view name.
it('probes region-qualified INFORMATION_SCHEMA.JOBS_BY_PROJECT', async () => {
const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(reader.probe(client)).resolves.toBeUndefined();
expect(client.executeQuery).toHaveBeenCalledWith(
'SELECT 1 FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` LIMIT 1',
);
});
// A result that carries `error` (rather than a thrown exception) must also surface as a grants error.
it('turns probe result errors into HistoricSqlGrantsMissingError', async () => {
const client = queryClient([{ headers: [], rows: [], totalRows: 0, error: 'Access Denied: jobs.listAll' }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'us-central1' });
await expect(reader.probe(client)).rejects.toMatchObject({
name: 'HistoricSqlGrantsMissingError',
dialect: 'bigquery',
remediation:
'Grant roles/bigquery.resourceViewer on the BigQuery project, or grant a custom role containing bigquery.jobs.listAll.',
});
});
it('turns thrown probe failures into HistoricSqlGrantsMissingError', async () => {
const client = {
executeQuery: vi.fn(async () => {
throw new Error('permission denied');
}),
};
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
it('fetches BigQuery jobs with cursor and maps them into RawQueryRow shape without rowsProduced', async () => {
const client = queryClient([
{
headers: [
'job_id',
'query',
'user_email',
'creation_time',
'end_time',
'runtime_ms',
'total_slot_ms',
'total_bytes_processed',
'state',
'error_reason',
'error_message',
'statement_type',
],
rows: [
// Successful job with timestamps supplied as ISO strings.
[
'bquxjob_1',
"SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
'analyst-a@example.test',
'2026-05-04T10:00:00.000Z',
'2026-05-04T10:00:01.250Z',
1250,
3106,
161164718,
'DONE',
null,
null,
'SELECT',
],
// Failed job: creation_time is a Date instance, end_time and runtime_ms are null.
[
'bquxjob_2',
'SELECT * FROM `project-1.analytics.missing_table`',
'analyst-b@example.test',
new Date('2026-05-04T10:05:00.000Z'),
null,
null,
0,
0,
'DONE',
'notFound',
'Not found: Table project-1.analytics.missing_table',
'SELECT',
],
],
totalRows: 2,
},
]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
const rows = [];
// The cursor (third argument) is later than the window start, so it becomes the lower bound.
for await (const row of reader.fetch(
client,
{
start: new Date('2026-05-01T00:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
},
'2026-05-03T00:00:00.000Z',
)) {
rows.push(row);
}
expect(client.executeQuery).toHaveBeenCalledTimes(1);
const sql = firstQuery(client);
expect(sql).toContain('FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`');
expect(sql).toContain("creation_time >= TIMESTAMP('2026-05-03T00:00:00.000Z')");
expect(sql).toContain("creation_time < TIMESTAMP('2026-05-04T12:00:00.000Z')");
expect(sql).toContain("job_type = 'QUERY'");
expect(sql).toContain("(statement_type IS NULL OR statement_type != 'SCRIPT')");
expect(sql).toContain('ORDER BY creation_time ASC, job_id ASC');
expect(sql).toContain('total_slot_ms');
expect(sql).toContain('total_bytes_processed');
// No row-count column is requested; the mapped rows carry no rowsProduced field.
expect(sql).not.toMatch(/total_rows/i);
expect(rows).toEqual([
{
id: 'bquxjob_1',
sql: "SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
user: 'analyst-a@example.test',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: '2026-05-04T10:00:01.250Z',
runtimeMs: 1250,
success: true,
errorMessage: null,
},
{
id: 'bquxjob_2',
sql: 'SELECT * FROM `project-1.analytics.missing_table`',
user: 'analyst-b@example.test',
startedAt: '2026-05-04T10:05:00.000Z',
endedAt: null,
runtimeMs: null,
success: false,
errorMessage: 'notFound: Not found: Table project-1.analytics.missing_table',
},
]);
});
it('uses the window start when no cursor is available', async () => {
const client = queryClient([{ headers: ['job_id'], rows: [], totalRows: 0 }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'EU' });
for await (const _row of reader.fetch(client, {
start: new Date('2026-02-03T12:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
})) {
throw new Error('empty result should not yield rows');
}
const sql = firstQuery(client);
expect(sql).toContain('FROM `project-1.region-eu.INFORMATION_SCHEMA.JOBS_BY_PROJECT`');
expect(sql).toContain("creation_time >= TIMESTAMP('2026-02-03T12:00:00.000Z')");
});
// An object without executeQuery is rejected before any SQL is issued.
it('throws a clear error when the query client cannot execute SQL', async () => {
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(async () => {
for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) {
throw new Error('unreachable');
}
}).rejects.toThrow('Historic SQL BigQuery reader requires a query client with executeQuery(query)');
});
// Identifiers are interpolated into backtick-quoted SQL, so unsafe characters must fail in the constructor.
it('rejects unsafe project and region identifiers before building SQL', () => {
expect(() => new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project`1', region: 'US' })).toThrow(
'Invalid BigQuery project id for historic-SQL ingest: project`1',
);
expect(() => new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US;DROP' })).toThrow(
'Invalid BigQuery region for historic-SQL ingest: US;DROP',
);
});
});

View file

@ -0,0 +1,219 @@
import { HistoricSqlGrantsMissingError } from './errors.js';
import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js';
/** Minimal result shape this reader expects back from a query client. */
interface QueryResultLike {
/** Column names; looked up case-insensitively via `indexByHeader`. */
headers: string[];
/** One array per result row, positionally aligned with `headers`. */
rows: unknown[][];
totalRows: number;
/** Optional error text reported by the client for the query. */
error?: string;
}
/** Structural contract a client must satisfy to be accepted by `queryClient`. */
interface QueryClientLike {
executeQuery(query: string): Promise<QueryResultLike>;
}
/** Constructor options for the BigQuery historic-SQL query-history reader. */
export interface BigQueryHistoricSqlQueryHistoryReaderOptions {
/** BigQuery project id. */
projectId: string;
/** BigQuery region, e.g. `US` or `us-central1`. */
region: string;
}
/** Remediation hint attached to every grants error built by `grantsError`. */
const BIGQUERY_GRANTS_REMEDIATION =
'Grant roles/bigquery.resourceViewer on the BigQuery project, or grant a custom role containing bigquery.jobs.listAll.';
/**
 * Validates that `client` is an object exposing an `executeQuery` function and
 * returns it typed as `QueryClientLike`.
 *
 * @param client Untyped client supplied by the caller.
 * @returns The same object, narrowed.
 * @throws Error when `client` is not an object or has no callable `executeQuery`.
 */
function queryClient(client: unknown): QueryClientLike {
  const hasExecuteQuery =
    typeof client === 'object' &&
    client !== null &&
    'executeQuery' in client &&
    typeof (client as { executeQuery?: unknown }).executeQuery === 'function';

  if (!hasExecuteQuery) {
    throw new Error('Historic SQL BigQuery reader requires a query client with executeQuery(query)');
  }
  return client as QueryClientLike;
}
/**
 * Wraps a probe failure in a `HistoricSqlGrantsMissingError` for the BigQuery dialect.
 *
 * The detail text is the `Error` message, or the string itself when `cause` is
 * a string, or a generic sentence otherwise. `cause` is forwarded unchanged.
 */
function grantsError(cause: unknown): HistoricSqlGrantsMissingError {
  let detail = 'BigQuery principal cannot query INFORMATION_SCHEMA.JOBS_BY_PROJECT.';
  if (cause instanceof Error) {
    detail = cause.message;
  } else if (typeof cause === 'string') {
    detail = cause;
  }

  return new HistoricSqlGrantsMissingError({
    dialect: 'bigquery',
    message: `Missing BigQuery audit grants for historic-SQL ingest: ${detail}`,
    remediation: BIGQUERY_GRANTS_REMEDIATION,
    cause,
  });
}
/**
 * Accepts a project id made only of ASCII letters, digits, `_` and `-`.
 *
 * @returns `value` unchanged when it passes the check.
 * @throws Error when `value` is empty or contains any other character.
 */
function normalizeProjectId(value: string): string {
  const isSafe = /^[A-Za-z0-9_-]+$/.test(value);
  if (isSafe) {
    return value;
  }
  throw new Error(`Invalid BigQuery project id for historic-SQL ingest: ${value}`);
}
/**
 * Canonicalizes a region name: trims it, lower-cases it and drops one leading
 * `region-` prefix.
 *
 * @returns The canonical region, made only of `a-z`, digits and `-`.
 * @throws Error (quoting the original `value`) when the result is empty or
 *   contains any other character.
 */
function normalizeRegion(value: string): string {
  const lowered = value.trim().toLowerCase();
  const region = lowered.replace(/^region-/, '');
  if (/^[a-z0-9-]+$/.test(region)) {
    return region;
  }
  throw new Error(`Invalid BigQuery region for historic-SQL ingest: ${value}`);
}
/**
 * Renders a `TIMESTAMP('<ISO-8601>')` SQL literal for the given instant.
 *
 * @param value A `Date`, or a string parseable by the `Date` constructor.
 * @throws Error when the value does not represent a valid time.
 */
function timestampExpression(value: Date | string): string {
  const date = value instanceof Date ? value : new Date(value);
  const epochMs = date.getTime();
  if (!Number.isNaN(epochMs)) {
    // Single quotes are escaped before the text is embedded in the quoted literal.
    const literal = date.toISOString().replace(/'/g, "\\'");
    return `TIMESTAMP('${literal}')`;
  }
  throw new Error(`Invalid BigQuery query-history timestamp: ${String(value)}`);
}
/**
 * Builds a lookup from upper-cased header name to column position.
 *
 * When two headers differ only by case, the later position wins.
 */
function indexByHeader(headers: string[]): Map<string, number> {
  return headers.reduce(
    (byName, header, position) => byName.set(header.toUpperCase(), position),
    new Map<string, number>(),
  );
}
/**
 * Reads the cell for column `name` (case-insensitive) from `row`.
 *
 * @param indexes Lookup whose keys are upper-cased header names.
 * @returns The cell at the mapped position, or `null` when the column is unknown.
 */
function value(row: unknown[], indexes: Map<string, number>, name: string): unknown {
  const position = indexes.get(name.toUpperCase());
  if (position === undefined) {
    return null;
  }
  return row[position];
}
/**
 * Converts a raw cell to a string.
 *
 * @returns `null` for `null`, `undefined` and values whose string form is
 *   empty; otherwise `String(raw)`.
 */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text === '' ? null : text;
}
/**
 * Converts a raw cell to a non-empty string.
 *
 * @param field Column name used in the error message.
 * @throws Error when `nullableString` yields `null` for the cell.
 */
function requiredString(raw: unknown, field: string): string {
  const text = nullableString(raw);
  if (text) {
    return text;
  }
  throw new Error(`BigQuery JOBS_BY_PROJECT row is missing ${field}`);
}
/**
 * Converts a raw cell to a non-negative finite number.
 *
 * @param raw - Any cell value; non-numbers go through `Number()`.
 * @returns `null` for null, undefined, `''`, or any non-finite result; otherwise the number with negatives clamped to 0.
 */
function nullableNumber(raw: unknown): number | null {
  if (raw == null || raw === '') {
    return null;
  }
  const parsed = typeof raw === 'number' ? raw : Number(raw);
  return Number.isFinite(parsed) ? Math.max(0, parsed) : null;
}
/**
 * Normalizes a required timestamp cell to an ISO-8601 string.
 *
 * @param raw - A `Date`, or a value whose string form is parsed as a date.
 * @param field - Column name used in error messages.
 * @returns The timestamp in `toISOString()` form.
 * @throws Error when the cell is missing or its text does not parse as a date.
 */
function isoTimestamp(raw: unknown, field: string): string {
  // Date instances are serialized directly without the missing/invalid checks below.
  if (raw instanceof Date) {
    return raw.toISOString();
  }
  const text = requiredString(raw, field);
  const epochMs = Date.parse(text);
  if (Number.isNaN(epochMs)) {
    throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${text}`);
  }
  return new Date(epochMs).toISOString();
}
/**
 * Optional counterpart of `isoTimestamp`.
 *
 * Error messages for unparseable values always name the field `end_time`.
 *
 * @param raw - Any cell value.
 * @returns `null` for null, undefined, or `''`; otherwise the ISO-8601 timestamp.
 */
function nullableIsoTimestamp(raw: unknown): string | null {
  const isBlank = raw == null || raw === '';
  return isBlank ? null : isoTimestamp(raw, 'end_time');
}
/**
 * Decides whether a job row represents a successful execution.
 *
 * @param state - Job state, or `null` when absent.
 * @param errorReason - Error reason, or `null`.
 * @param errorMessage - Error message, or `null`.
 * @returns `false` when either error field is non-empty; otherwise `true` when the
 *          state is absent or equals `DONE` (case-insensitive).
 */
function executionSucceeded(state: string | null, errorReason: string | null, errorMessage: string | null): boolean {
  const hasError = Boolean(errorReason) || Boolean(errorMessage);
  if (hasError) {
    return false;
  }
  if (state === null) {
    return true;
  }
  return state.toUpperCase() === 'DONE';
}
/**
 * Merges the error reason and message of a job into one display string.
 *
 * @param errorReason - Error reason, or `null`.
 * @param errorMessage - Error message, or `null`.
 * @returns `"<reason>: <message>"` when both are non-empty; otherwise the message
 *          when it is not null, falling back to the reason.
 */
function combinedErrorMessage(errorReason: string | null, errorMessage: string | null): string | null {
  if (errorReason && errorMessage) {
    return `${errorReason}: ${errorMessage}`;
  }
  return errorMessage != null ? errorMessage : errorReason;
}
/**
 * Converts one JOBS_BY_PROJECT result row into a `HistoricSqlRawQueryRow`.
 *
 * `job_id`, `query` and `creation_time` are required; the other columns are optional.
 *
 * @param row - Raw cells of a single result row.
 * @param indexes - Lookup from upper-cased column name to position.
 * @returns The mapped row.
 * @throws Error when a required column is missing or a timestamp cannot be parsed.
 */
function mapRow(row: unknown[], indexes: Map<string, number>): HistoricSqlRawQueryRow {
  const cell = (column: string): unknown => value(row, indexes, column);

  const errorReason = nullableString(cell('error_reason'));
  const errorMessage = nullableString(cell('error_message'));
  const id = requiredString(cell('job_id'), 'job_id');
  const sql = requiredString(cell('query'), 'query');
  const user = nullableString(cell('user_email'));
  const startedAt = isoTimestamp(cell('creation_time'), 'creation_time');
  const endedAt = nullableIsoTimestamp(cell('end_time'));
  const runtimeMs = nullableNumber(cell('runtime_ms'));
  const success = executionSucceeded(nullableString(cell('state')), errorReason, errorMessage);

  return {
    id,
    sql,
    user,
    startedAt,
    endedAt,
    runtimeMs,
    success,
    errorMessage: combinedErrorMessage(errorReason, errorMessage),
  };
}
/**
 * Reads query history from BigQuery's `INFORMATION_SCHEMA.JOBS_BY_PROJECT` view.
 *
 * The project id and region are validated once in the constructor and baked into a
 * back-quoted view path that every query targets.
 */
export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader {
  private readonly viewPath: string;

  /**
   * @param options - Supplies `projectId` and `region`; both are validated before use.
   * @throws Error when the project id or region contains disallowed characters.
   */
  constructor(options: BigQueryHistoricSqlQueryHistoryReaderOptions) {
    const projectId = normalizeProjectId(options.projectId);
    const region = normalizeRegion(options.region);
    const qualifiedView = `${projectId}.region-${region}.INFORMATION_SCHEMA.JOBS_BY_PROJECT`;
    this.viewPath = '`' + qualifiedView + '`';
  }

  /**
   * Runs a one-row query against the view to check that it is readable.
   *
   * @param client - Query client; must expose `executeQuery(query)`.
   * @throws HistoricSqlGrantsMissingError when the client is unusable, the query throws,
   *         or the result carries an `error`.
   */
  async probe(client: unknown): Promise<void> {
    const probeSql = `SELECT 1 FROM ${this.viewPath} LIMIT 1`;
    let result: QueryResultLike;
    try {
      result = await queryClient(client).executeQuery(probeSql);
    } catch (error) {
      throw grantsError(error);
    }
    if (result.error) {
      throw grantsError(result.error);
    }
  }

  /**
   * Yields query jobs created in `[start, window.end)`, ordered by creation time then job id.
   *
   * `start` is `cursor` when one is provided, otherwise `window.start`. Script parent
   * jobs (`statement_type = 'SCRIPT'`) and jobs without query text are excluded.
   *
   * @param client - Query client; must expose `executeQuery(query)`.
   * @param window - Time window to read.
   * @param cursor - Optional lower bound that overrides `window.start`.
   * @throws HistoricSqlGrantsMissingError when the result carries an `error`.
   */
  async *fetch(
    client: unknown,
    window: HistoricSqlTimeWindow,
    cursor?: string | null,
  ): AsyncIterable<HistoricSqlRawQueryRow> {
    const start = timestampExpression(cursor ?? window.start);
    const end = timestampExpression(window.end);
    const sql = [
      'SELECT',
      'job_id,',
      'query,',
      'user_email,',
      'creation_time,',
      'end_time,',
      'TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND) AS runtime_ms,',
      'total_slot_ms,',
      'total_bytes_processed,',
      'state,',
      'error_result.reason AS error_reason,',
      'error_result.message AS error_message,',
      'statement_type',
      `FROM ${this.viewPath}`,
      `WHERE creation_time >= ${start}`,
      `AND creation_time < ${end}`,
      "AND job_type = 'QUERY'",
      'AND query IS NOT NULL',
      "AND (statement_type IS NULL OR statement_type != 'SCRIPT')",
      'ORDER BY creation_time ASC, job_id ASC',
    ].join('\n');

    const result = await queryClient(client).executeQuery(sql);
    if (result.error) {
      throw grantsError(result.error);
    }

    const columnIndexes = indexByHeader(result.headers);
    for (const rawRow of result.rows) {
      yield mapRow(rawRow, columnIndexes);
    }
  }
}

View file

@ -0,0 +1,251 @@
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
/** Creates a fresh temporary directory prefixed `historic-sql-chunk-` and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-chunk-');
  return mkdtemp(prefix);
}
/**
 * Writes `value` as two-space-indented JSON (with a trailing newline) to `root/relPath`,
 * creating the parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const filePath = join(root, relPath);
  const parentDir = join(filePath, '..');
  await mkdir(parentDir, { recursive: true });
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(filePath, payload, 'utf-8');
}
/**
 * Writes a single-template staging fixture under `root`: `manifest.json` plus
 * `metadata.json`, `page.md` and `usage.json` for template `fp_1`.
 */
async function writeTemplate(root: string): Promise<void> {
// Manifest listing exactly one template and carrying one warning.
await writeJson(root, 'manifest.json', {
source: 'historic-sql',
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 1,
capped: false,
warnings: ['source warning'],
templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }],
});
// Template metadata envelope; `title` is what the tests expect as the unit's displayLabel.
await writeJson(root, 'templates/fp_1/metadata.json', {
id: 'fp_1',
title: 'snowflake · analytics.orders [fp_1]',
path: 'templates/fp_1/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_1',
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
triage_signals: {
executions_bucket: 'high',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 0 runtime',
},
},
});
// Minimal page body.
await writeFile(join(root, 'templates/fp_1/page.md'), '# fp_1\n', 'utf-8');
// Usage statistics for the template.
await writeJson(root, 'templates/fp_1/usage.json', {
stats: {
executions: 20,
distinct_users: 3,
first_seen: '2026-05-01T00:00:00.000Z',
last_seen: '2026-05-04T11:55:00.000Z',
p50_runtime_ms: 100,
p95_runtime_ms: 200,
error_rate: 0,
rows_produced: 20,
},
literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
samples: [],
});
}
/**
 * Writes a staging fixture under `root` with two templates that share the fingerprint
 * `fp_order_status` but have different sub-cluster ids. Each template gets its own
 * `metadata.json`, `page.md` and `usage.json`.
 */
async function writeSubclusterTemplates(root: string): Promise<void> {
// Manifest listing both sub-cluster templates.
await writeJson(root, 'manifest.json', {
source: 'historic-sql',
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 2,
capped: false,
warnings: [],
templates: [
{
id: 'fp_order_status__cat_2b2ff2318877',
fingerprint: 'fp_order_status',
subClusterId: 'cat_2b2ff2318877',
path: 'templates/fp_order_status__cat_2b2ff2318877/page.md',
},
{
id: 'fp_order_status__cat_34f037ddcbfa',
fingerprint: 'fp_order_status',
subClusterId: 'cat_34f037ddcbfa',
path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
},
],
});
// The per-template files are identical apart from the id and sub-cluster id.
for (const template of [
{ id: 'fp_order_status__cat_2b2ff2318877', subClusterId: 'cat_2b2ff2318877' },
{ id: 'fp_order_status__cat_34f037ddcbfa', subClusterId: 'cat_34f037ddcbfa' },
]) {
await writeJson(root, `templates/${template.id}/metadata.json`, {
id: template.id,
// The title suffix is the last six characters of the sub-cluster id.
title: `snowflake · analytics.orders [fp_ord:${template.subClusterId.slice(-6)}]`,
path: `templates/${template.id}/page.md`,
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_order_status',
sub_cluster_id: template.subClusterId,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
triage_signals: {
executions_bucket: 'mid',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '0 constant, 0 runtime',
},
},
});
await writeFile(join(root, `templates/${template.id}/page.md`), `# ${template.id}\n`, 'utf-8');
await writeJson(root, `templates/${template.id}/usage.json`, {
stats: {
executions: 3,
distinct_users: 3,
first_seen: '2026-05-04T10:00:00.000Z',
last_seen: '2026-05-04T10:05:00.000Z',
p50_runtime_ms: 120,
p95_runtime_ms: 150,
error_rate: 0,
rows_produced: 36,
},
literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }],
samples: [],
});
}
}
// Covers chunkHistoricSqlStagedDir (work units, eviction, context report) and
// describeHistoricSqlScope against fixtures written to a fresh temp directory.
describe('chunkHistoricSqlStagedDir', () => {
// Only metadata.json is marked as added: it becomes the sole raw file, manifest.json and
// usage.json are dependencies, and page.md is listed in the peer file index.
it('emits one WorkUnit per changed template and keeps usage as dependency', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: ['templates/fp_1/metadata.json'],
modified: [],
deleted: [],
unchanged: ['templates/fp_1/page.md', 'templates/fp_1/usage.json', 'manifest.json'],
});
expect(result.workUnits).toEqual([
{
unitKey: 'historic-sql-fp-1',
displayLabel: 'snowflake · analytics.orders [fp_1]',
rawFiles: ['templates/fp_1/metadata.json'],
dependencyPaths: ['manifest.json', 'templates/fp_1/usage.json'],
peerFileIndex: ['templates/fp_1/page.md'],
notes:
'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
},
]);
// The manifest's capped flag and warnings are surfaced in the context report.
expect(result.contextReport).toEqual({ capped: false, warnings: ['source warning'] });
});
// Two templates with the same fingerprint but different sub-cluster ids yield two units.
it('emits one WorkUnit per changed categorical sub-cluster', async () => {
const stagedDir = await tempDir();
await writeSubclusterTemplates(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: [
'templates/fp_order_status__cat_2b2ff2318877/metadata.json',
'templates/fp_order_status__cat_34f037ddcbfa/metadata.json',
],
modified: [],
deleted: [],
unchanged: [
'manifest.json',
'templates/fp_order_status__cat_2b2ff2318877/page.md',
'templates/fp_order_status__cat_2b2ff2318877/usage.json',
'templates/fp_order_status__cat_34f037ddcbfa/page.md',
'templates/fp_order_status__cat_34f037ddcbfa/usage.json',
],
});
// Compare only the identifying fields; peerFileIndex and notes are not checked here.
expect(
result.workUnits.map((unit) => ({
unitKey: unit.unitKey,
displayLabel: unit.displayLabel,
rawFiles: unit.rawFiles,
dependencyPaths: unit.dependencyPaths,
})),
).toEqual([
{
unitKey: 'historic-sql-fp-order-status-cat-2b2ff2318877',
displayLabel: 'snowflake · analytics.orders [fp_ord:318877]',
rawFiles: ['templates/fp_order_status__cat_2b2ff2318877/metadata.json'],
dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_2b2ff2318877/usage.json'],
},
{
unitKey: 'historic-sql-fp-order-status-cat-34f037ddcbfa',
displayLabel: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
rawFiles: ['templates/fp_order_status__cat_34f037ddcbfa/metadata.json'],
dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'],
},
]);
});
// A diff that touches only usage.json produces no work units and no eviction.
it('emits zero WorkUnits for usage-only diffs', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: [],
modified: ['templates/fp_1/usage.json'],
deleted: [],
unchanged: ['templates/fp_1/metadata.json', 'templates/fp_1/page.md', 'manifest.json'],
});
expect(result.workUnits).toEqual([]);
expect(result.eviction).toBeUndefined();
});
// The deleted usage.json is ignored; only the deleted page.md is reported for eviction.
it('emits eviction only for deleted metadata or page files', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: [],
modified: [],
deleted: ['templates/fp_1/usage.json', 'templates/fp_2/page.md'],
unchanged: [],
});
expect(result.eviction).toEqual({ deletedRawPaths: ['templates/fp_2/page.md'] });
});
// The scope accepts manifest.json and paths under templates/, and rejects other paths.
it('describes historic-sql scope without including unrelated paths', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const scope = await describeHistoricSqlScope(stagedDir);
expect(scope.fingerprint).toHaveLength(64);
expect(scope.isPathInScope('manifest.json')).toBe(true);
expect(scope.isPathInScope('templates/fp_1/usage.json')).toBe(true);
expect(scope.isPathInScope('pages/notion/page.md')).toBe(false);
});
});

View file

@ -0,0 +1,86 @@
import { createHash } from 'node:crypto';
import { readFile, readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
import { historicSqlManifestSchema, historicSqlMetadataSchema } from './types.js';
/**
 * Lists every regular file below `root`, recursively.
 *
 * @param root - Directory to scan.
 * @returns Paths relative to `root`, using `/` as separator, in sorted order.
 */
async function walk(root: string): Promise<string[]> {
  const dirents = await readdir(root, { withFileTypes: true, recursive: true });
  const relativePaths: string[] = [];
  for (const dirent of dirents) {
    if (!dirent.isFile()) {
      continue;
    }
    const absolutePath = join(dirent.parentPath, dirent.name);
    // Normalize Windows separators so paths compare equal across platforms.
    relativePaths.push(relative(root, absolutePath).replace(/\\/g, '/'));
  }
  return relativePaths.sort();
}
/**
 * Derives a work-unit key from a template id.
 *
 * Every run of non-alphanumeric characters becomes a single `-`, with none left at
 * either end, and the result is prefixed with `historic-sql-`.
 */
function safeUnitKey(id: string): string {
  const alphanumericRuns = id.split(/[^a-zA-Z0-9]+/).filter((run) => run.length > 0);
  return 'historic-sql-' + alphanumericRuns.join('-');
}
/**
 * Reads and validates `manifest.json` from a staged directory.
 *
 * @param stagedDir - Directory that contains `manifest.json`.
 * @returns The manifest as parsed by `historicSqlManifestSchema`.
 * @throws Error prefixed `Invalid historic-SQL manifest:` when the file cannot be
 *         read, is not valid JSON, or fails schema validation.
 */
async function readManifest(stagedDir: string) {
  try {
    const manifestJson = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    const parsedJson: unknown = JSON.parse(manifestJson);
    return historicSqlManifestSchema.parse(parsedJson);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid historic-SQL manifest: ${reason}`);
  }
}
/**
 * Splits a staged historic-SQL directory into one work unit per template.
 *
 * A template is any `templates/<id>/page.md`. With a `diffSet`, only templates whose
 * `metadata.json` or `page.md` was added or modified produce a unit, and only those
 * touched files are listed as `rawFiles`. Without one, every template produces a unit.
 * `manifest.json` and the template's `usage.json` (when staged) are dependencies, and
 * every other staged file goes into `peerFileIndex`.
 *
 * `metadata.json` is read for every emitted unit, so a selected template without one
 * rejects with the underlying read error.
 *
 * @param stagedDir - Root of the staged directory.
 * @param diffSet - Optional change set relative to the previous staging.
 * @returns Work units, an eviction listing deleted `metadata.json`/`page.md` paths
 *          (or `undefined` when there are none), reconcile notes and a context report.
 */
export async function chunkHistoricSqlStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const stagedFiles = await walk(stagedDir);
  const manifest = await readManifest(stagedDir);
  const changedPaths = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;
  const isStaged = (path: string): boolean => stagedFiles.includes(path);
  const workUnits: WorkUnit[] = [];

  for (const pagePath of stagedFiles) {
    if (!/^templates\/[^/]+\/page\.md$/.test(pagePath)) {
      continue;
    }
    const metadataPath = pagePath.replace(/\/page\.md$/, '/metadata.json');
    const usagePath = pagePath.replace(/\/page\.md$/, '/usage.json');
    const primaryFiles = [metadataPath, pagePath].filter(isStaged);

    // With a diff, skip templates whose primary files were not touched.
    if (changedPaths !== null && !primaryFiles.some((path) => changedPaths.has(path))) {
      continue;
    }

    const metadataJson = await readFile(join(stagedDir, metadataPath), 'utf-8');
    const metadata = historicSqlMetadataSchema.parse(JSON.parse(metadataJson));

    const selectedPrimary =
      changedPaths !== null ? primaryFiles.filter((path) => changedPaths.has(path)) : primaryFiles;
    const rawFiles = selectedPrimary.sort();

    const dependencyPaths: string[] = [];
    for (const candidate of ['manifest.json', isStaged(usagePath) ? usagePath : null]) {
      if (typeof candidate === 'string' && !rawFiles.includes(candidate)) {
        dependencyPaths.push(candidate);
      }
    }
    dependencyPaths.sort();

    // Everything not already claimed as a raw file or dependency is a peer.
    const claimed = new Set([...rawFiles, ...dependencyPaths]);
    const peerFileIndex = stagedFiles.filter((path) => !claimed.has(path)).sort();

    workUnits.push({
      unitKey: safeUnitKey(metadata.id),
      displayLabel: metadata.title,
      rawFiles,
      dependencyPaths,
      peerFileIndex,
      notes:
        'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
    });
  }

  // Deleting usage.json alone does not evict a template.
  const deletedPrimary = diffSet?.deleted.filter((path) => /^templates\/[^/]+\/(metadata\.json|page\.md)$/.test(path));
  const eviction =
    deletedPrimary && deletedPrimary.length > 0 ? { deletedRawPaths: deletedPrimary.sort() } : undefined;

  return {
    workUnits,
    eviction,
    reconcileNotes: [`Historic-SQL staged templates=${manifest.templateCount}`],
    contextReport: {
      capped: manifest.capped,
      warnings: manifest.warnings,
    },
  };
}
/**
 * Describes the ingest scope of a staged historic-SQL directory.
 *
 * The fingerprint is the hex SHA-256 of the manifest's connection id, dialect and
 * window bounds serialized as JSON. A path is in scope when it is `manifest.json` or
 * starts with `templates/`.
 *
 * @param stagedDir - Root of the staged directory.
 * @throws Error when the manifest cannot be read or validated.
 */
export async function describeHistoricSqlScope(stagedDir: string): Promise<ScopeDescriptor> {
  const { connectionId, dialect, windowStart, windowEnd } = await readManifest(stagedDir);
  const hasher = createHash('sha256');
  hasher.update(JSON.stringify({ connectionId, dialect, windowStart, windowEnd }));
  const fingerprint = hasher.digest('hex');
  return {
    fingerprint,
    isPathInScope: (rawPath) => rawPath === 'manifest.json' || rawPath.startsWith('templates/'),
  };
}

View file

@ -0,0 +1,197 @@
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { detectHistoricSqlStagedDir } from './detect.js';
import {
HISTORIC_SQL_SOURCE_KEY,
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlPullConfigSchema,
historicSqlUsageSchema,
} from './types.js';
/** Creates a fresh temporary directory prefixed `historic-sql-detect-` and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-detect-');
  return mkdtemp(prefix);
}
/**
 * Writes `value` as two-space-indented JSON (with a trailing newline) to `root/relPath`,
 * creating the parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const filePath = join(root, relPath);
  const parentDir = join(filePath, '..');
  await mkdir(parentDir, { recursive: true });
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(filePath, payload, 'utf-8');
}
// Covers detectHistoricSqlStagedDir: detection by manifest source, detection by
// directory structure, and rejection of a manifest that names another source.
describe('historic-sql staged dir detection', () => {
// A manifest whose source equals HISTORIC_SQL_SOURCE_KEY is detected, even with no templates.
it('detects manifest source', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', {
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 0,
capped: false,
warnings: [],
templates: [],
});
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
// No manifest at all: a template directory holding both metadata.json and page.md is enough.
it('detects document-shaped template structure without manifest', async () => {
const stagedDir = await tempDir();
await writeFile(join(stagedDir, 'not-a-match.txt'), 'x', 'utf-8');
await mkdir(join(stagedDir, 'templates', 'fp_1'), { recursive: true });
await writeFile(join(stagedDir, 'templates', 'fp_1', 'metadata.json'), '{}', 'utf-8');
await writeFile(join(stagedDir, 'templates', 'fp_1', 'page.md'), '# fp_1\n', 'utf-8');
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
// A manifest that declares a different source is rejected.
it('does not detect unrelated directories', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', { source: 'notion' });
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(false);
});
});
// Pins defaults and accepted shapes of the pull-config, manifest, usage and metadata schemas.
describe('historic-sql schemas', () => {
// Only `dialect` is supplied; every other field is expected to come from schema defaults.
it('defaults disabled optional pull-config fields through the parser', () => {
expect(
historicSqlPullConfigSchema.parse({
dialect: 'bigquery',
}),
).toEqual({
dialect: 'bigquery',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
});
});
// An explicit minCalls overrides the default of 5.
it('accepts postgres pull config with a minCalls floor', () => {
expect(
historicSqlPullConfigSchema.parse({
dialect: 'postgres',
minCalls: 12,
}),
).toEqual({
dialect: 'postgres',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 12,
});
});
it('accepts postgres manifest fields with defaults for older dialects', () => {
// Postgres manifest: the postgres-specific fields are given and must be preserved.
expect(
historicSqlManifestSchema.parse({
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: 'conn_pg',
dialect: 'postgres',
fetchedAt: '2026-05-08T12:00:00.000Z',
windowStart: '2026-05-08T11:00:00.000Z',
windowEnd: '2026-05-08T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
templateCount: 0,
capped: false,
warnings: [],
templates: [],
degraded: true,
statsResetAt: '2026-05-01T00:00:00.000Z',
baselineFirstRun: true,
pgServerVersion: 'PostgreSQL 16.4',
deallocCount: 3,
}),
).toMatchObject({
dialect: 'postgres',
degraded: true,
statsResetAt: '2026-05-01T00:00:00.000Z',
baselineFirstRun: true,
pgServerVersion: 'PostgreSQL 16.4',
deallocCount: 3,
});
// Snowflake manifest without those fields: they must default to false/null.
expect(
historicSqlManifestSchema.parse({
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: 'conn_sf',
dialect: 'snowflake',
fetchedAt: '2026-05-08T12:00:00.000Z',
windowStart: '2026-05-01T12:00:00.000Z',
windowEnd: '2026-05-08T12:00:00.000Z',
nextSuccessfulCursor: null,
templateCount: 0,
capped: false,
warnings: [],
templates: [],
}),
).toMatchObject({
degraded: false,
statsResetAt: null,
baselineFirstRun: false,
pgServerVersion: null,
deallocCount: null,
});
});
// Percentile runtimes may be null while mean_runtime_ms is set; samples may be empty.
it('accepts postgres usage stats with mean_runtime_ms and empty samples', () => {
const parsed = historicSqlUsageSchema.parse({
stats: {
executions: 25,
distinct_users: 2,
first_seen: '2026-05-08T10:00:00.000Z',
last_seen: '2026-05-08T12:00:00.000Z',
p50_runtime_ms: null,
p95_runtime_ms: null,
mean_runtime_ms: 32.5,
error_rate: 0,
rows_produced: 1042,
},
literal_slots: [],
samples: [],
});
expect(parsed.stats.mean_runtime_ms).toBe(32.5);
expect(parsed.samples).toEqual([]);
});
// The metadata envelope keeps objectType, a null lastEditedAt, and string-valued triage signals.
it('pins the Notion-compatible metadata envelope', () => {
const parsed = historicSqlMetadataSchema.parse({
id: 'fp_1',
title: 'snowflake · analytics.orders [fp_1]',
path: 'templates/fp_1/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_1',
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
triage_signals: {
executions_bucket: 'high',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 0 runtime',
},
},
});
expect(parsed.objectType).toBe('historic_sql_template');
expect(parsed.lastEditedAt).toBeNull();
expect(parsed.properties.triage_signals.service_account_only).toBe('false');
});
});

View file

@ -0,0 +1,37 @@
import { readFile, readdir } from 'node:fs/promises';
import { join } from 'node:path';
import { HISTORIC_SQL_SOURCE_KEY } from './types.js';
/**
 * Decides whether a staged directory was produced by the historic-SQL source.
 *
 * The manifest is checked first: a `source` equal to `HISTORIC_SQL_SOURCE_KEY` is a
 * match and any other defined `source` is a non-match. When the manifest is missing,
 * unreadable, or has no `source`, the directory matches if some folder under
 * `templates/` holds both `metadata.json` and `page.md`.
 *
 * @param stagedDir - Root of the staged directory.
 * @returns `true` when the directory looks like historic-SQL staging output; never rejects
 *          on filesystem or JSON errors.
 */
export async function detectHistoricSqlStagedDir(stagedDir: string): Promise<boolean> {
  try {
    const manifestJson = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    const { source } = JSON.parse(manifestJson) as { source?: unknown };
    if (source === HISTORIC_SQL_SOURCE_KEY) {
      return true;
    }
    if (source !== undefined) {
      return false;
    }
  } catch {
    // Fall through to structural detection for stage-only fixtures.
  }

  try {
    const entries = await readdir(join(stagedDir, 'templates'), { withFileTypes: true, recursive: true });
    const dirsWithMetadata = new Set<string>();
    const dirsWithPage = new Set<string>();
    for (const entry of entries) {
      if (!entry.isFile()) {
        continue;
      }
      if (entry.name === 'metadata.json') {
        dirsWithMetadata.add(entry.parentPath);
      }
      if (entry.name === 'page.md') {
        dirsWithPage.add(entry.parentPath);
      }
    }
    for (const dir of dirsWithMetadata) {
      if (dirsWithPage.has(dir)) {
        return true;
      }
    }
    return false;
  } catch {
    return false;
  }
}

View file

@ -0,0 +1,61 @@
import type { HistoricSqlDialect } from './types.js';
/** Constructor options for {@link HistoricSqlGrantsMissingError}. */
interface HistoricSqlGrantsMissingErrorOptions {
  /** Dialect whose grants are missing. */
  dialect: HistoricSqlDialect;
  /** Full error message. */
  message: string;
  /** Remediation text exposed on the error. */
  remediation: string;
  /** Underlying failure, if any; attached as the error `cause`. */
  cause?: unknown;
}

/**
 * Raised when the connected principal lacks the grants needed for historic-SQL ingest.
 *
 * `cause` is only set on the error when one is supplied.
 */
export class HistoricSqlGrantsMissingError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly remediation: string;

  constructor(options: HistoricSqlGrantsMissingErrorOptions) {
    super(options.message, options.cause !== undefined ? { cause: options.cause } : undefined);
    const { dialect, remediation } = options;
    this.name = 'HistoricSqlGrantsMissingError';
    this.dialect = dialect;
    this.remediation = remediation;
  }
}
/** Constructor options for {@link HistoricSqlExtensionMissingError}. */
interface HistoricSqlExtensionMissingErrorOptions {
  /** Dialect the error applies to. */
  dialect: HistoricSqlDialect;
  /** Full error message. */
  message: string;
  /** Remediation text exposed on the error. */
  remediation: string;
  /** Underlying failure, if any; attached as the error `cause`. */
  cause?: unknown;
}

/**
 * Error carrying a dialect and remediation text for a missing-extension failure.
 *
 * `cause` is only set on the error when one is supplied.
 */
export class HistoricSqlExtensionMissingError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly remediation: string;

  constructor(options: HistoricSqlExtensionMissingErrorOptions) {
    super(options.message, options.cause !== undefined ? { cause: options.cause } : undefined);
    const { dialect, remediation } = options;
    this.name = 'HistoricSqlExtensionMissingError';
    this.dialect = dialect;
    this.remediation = remediation;
  }
}
/** Constructor options for {@link HistoricSqlVersionUnsupportedError}. */
interface HistoricSqlVersionUnsupportedErrorOptions {
  /** Dialect of the server that was checked. */
  dialect: HistoricSqlDialect;
  /** Version reported for the server. */
  detectedVersion: string;
  /** Oldest version that is supported. */
  minimumVersion: string;
}

/**
 * Raised when the server version is too old for historic-SQL ingest.
 *
 * The message is generated from the dialect and the detected and minimum versions.
 */
export class HistoricSqlVersionUnsupportedError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly detectedVersion: string;
  readonly minimumVersion: string;

  constructor(options: HistoricSqlVersionUnsupportedErrorOptions) {
    super(
      'Unsupported ' +
        options.dialect +
        ' version for historic-SQL ingest: detected ' +
        options.detectedVersion +
        '; requires ' +
        options.minimumVersion +
        ' or newer.',
    );
    const { dialect, detectedVersion, minimumVersion } = options;
    this.name = 'HistoricSqlVersionUnsupportedError';
    this.dialect = dialect;
    this.detectedVersion = detectedVersion;
    this.minimumVersion = minimumVersion;
  }
}

View file

@ -0,0 +1,304 @@
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it, vi } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js';
import { pgssBaselinePath } from './stage-pgss.js';
import type { HistoricSqlQueryHistoryReader, PostgresPgssReader } from './types.js';
/** Creates a fresh temporary directory prefixed `historic-sql-adapter-` and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-adapter-');
  return mkdtemp(prefix);
}
/**
 * Writes `value` as two-space-indented JSON (with a trailing newline) to `root/relPath`,
 * creating the parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const filePath = join(root, relPath);
  const parentDir = join(filePath, '..');
  await mkdir(parentDir, { recursive: true });
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(filePath, payload, 'utf-8');
}
/** Stub SQL analysis port: every call resolves to the same fixed fingerprint result. */
const sqlAnalysis: SqlAnalysisPort = {
  analyzeForFingerprint: async () => ({
    fingerprint: 'fp_1',
    normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
    tablesTouched: ['analytics.orders'],
    literalSlots: [{ position: 1, type: 'string', exampleValue: 'paid' }],
  }),
};
/** Stub query-history reader: `probe` does nothing and `fetch` yields one successful row. */
const reader: HistoricSqlQueryHistoryReader = {
  probe: async () => {},
  async *fetch() {
    const onlyRow = {
      id: 'q1',
      sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'",
      user: 'analyst',
      startedAt: '2026-05-04T11:00:00.000Z',
      endedAt: null,
      runtimeMs: 10,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    };
    yield onlyRow;
  },
};
// Covers HistoricSqlSourceAdapter: static metadata, the per-execution fetch path,
// triage signals, the Postgres PGSS path with its deferred baseline write, and
// cursor forwarding through onPullSucceeded.
describe('HistoricSqlSourceAdapter', () => {
// Pins the adapter's public constants.
it('declares canonical adapter metadata', () => {
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
expect(adapter.source).toBe('historic-sql');
expect(adapter.skillNames).toEqual(['historic_sql_ingest']);
expect(adapter.reconcileSkillNames).toEqual(['historic_sql_curator']);
expect(adapter.evidenceIndexing).toBe('documents');
expect(adapter.triageSupported).toBe(true);
});
// Fetching with the stub reader and analysis port must leave a directory that detect() accepts.
it('fetches staged templates through injected reader and SqlAnalysisPort', async () => {
const stagedDir = await tempDir();
const adapter = new HistoricSqlSourceAdapter({
sqlAnalysis,
reader,
queryClient: {},
now: () => new Date('2026-05-04T12:00:00.000Z'),
});
await adapter.fetch(
{
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
},
stagedDir,
{ connectionId: 'conn_1', sourceKey: 'historic-sql' },
);
await expect(adapter.detect(stagedDir)).resolves.toBe(true);
});
// Expected result: lastEditedAt equals usage.json's stats.last_seen and propertyHints
// equals metadata.json's triage_signals.
it('reads triage signals from usage.json and metadata properties', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', {
source: 'historic-sql',
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 1,
capped: false,
warnings: [],
templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }],
});
await writeJson(stagedDir, 'templates/fp_1/metadata.json', {
id: 'fp_1',
title: 'snowflake · analytics.orders [fp_1]',
path: 'templates/fp_1/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_1',
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
triage_signals: {
executions_bucket: 'high',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 0 runtime',
},
},
});
await writeFile(join(stagedDir, 'templates/fp_1/page.md'), '# fp_1\n', 'utf-8');
await writeJson(stagedDir, 'templates/fp_1/usage.json', {
stats: {
executions: 20,
distinct_users: 3,
first_seen: '2026-05-01T00:00:00.000Z',
last_seen: '2026-05-04T11:55:00.000Z',
p50_runtime_ms: 100,
p95_runtime_ms: 200,
error_rate: 0,
},
literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
samples: [],
});
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
await expect(adapter.getTriageSignals(stagedDir, 'fp_1')).resolves.toEqual({
objectType: 'historic_sql_template',
lastEditedAt: '2026-05-04T11:55:00.000Z',
propertyHints: {
executions_bucket: 'high',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 0 runtime',
},
});
});
it('dispatches postgres fetches through PGSS staging and writes the baseline only after pull success', async () => {
const stagedDir = await tempDir();
const baselineRootDir = await tempDir();
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
// This reader throws on any use, so the test fails if the postgres path touches it.
const unusedPerExecutionReader: HistoricSqlQueryHistoryReader = {
async probe() {
throw new Error('per-execution reader must not be used for postgres');
},
async *fetch() {
throw new Error('per-execution reader must not be used for postgres');
},
};
// Stub PGSS reader returning a single-row snapshot.
const postgresReader: PostgresPgssReader = {
async probe() {
return { pgServerVersion: 'PostgreSQL 16.4', warnings: [] };
},
async readSnapshot() {
return {
statsResetAt: '2026-05-08T08:00:00.000Z',
deallocCount: 0,
rows: [
{
queryid: '901',
userid: '11',
username: 'analyst',
dbid: '5',
database: 'warehouse',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 9,
totalExecTime: 90,
meanExecTime: 10,
totalRows: 18,
},
],
};
},
};
const adapter = new HistoricSqlSourceAdapter({
sqlAnalysis,
reader: unusedPerExecutionReader,
queryClient: {},
postgresReader,
postgresQueryClient: {
async executeQuery() {
return { headers: [], rows: [] };
},
},
postgresBaselineRootDir: baselineRootDir,
now: () => new Date('2026-05-08T12:00:00.000Z'),
});
await adapter.fetch(
{
dialect: 'postgres',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
stagedDir,
{ connectionId: 'conn_pg', sourceKey: 'historic-sql' },
);
const manifest = JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8')) as {
dialect: string;
baselineFirstRun: boolean;
templates: Array<{ id: string }>;
};
expect(manifest.dialect).toBe('postgres');
expect(manifest.baselineFirstRun).toBe(true);
expect(manifest.templates).toEqual([
{ id: 'db5_q901', fingerprint: 'fp_1', subClusterId: null, path: 'templates/db5_q901/page.md' },
]);
// After fetch alone the baseline file must not exist yet.
await expect(readFile(baselinePath, 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' });
await adapter.onPullSucceeded({
connectionId: 'conn_pg',
sourceKey: 'historic-sql',
syncId: 'sync_pg',
trigger: 'scheduled_pull',
completedAt: new Date('2026-05-08T12:01:00.000Z'),
stagedDir,
});
// After onPullSucceeded the baseline exists; its fetchedAt is the injected `now`, not completedAt.
const baseline = JSON.parse(await readFile(baselinePath, 'utf-8')) as {
fetchedAt: string;
templates: Record<string, { perUser: Record<string, { calls: number }> }>;
};
expect(baseline.fetchedAt).toBe('2026-05-08T12:00:00.000Z');
expect(baseline.templates.db5_q901.perUser['11'].calls).toBe(9);
});
// Without deps.postgresReader a postgres fetch must reject with an explicit message.
it('fails postgres fetches clearly when no PGSS reader is configured', async () => {
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
await expect(
adapter.fetch(
{
dialect: 'postgres',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
await tempDir(),
{ connectionId: 'conn_pg', sourceKey: 'historic-sql' },
),
).rejects.toThrow('Historic SQL Postgres fetch requires deps.postgresReader');
});
// The injected callback receives the original arguments plus nextSuccessfulCursor from the manifest.
it('forwards manifest cursor through onPullSucceeded without changing the SourceAdapter signature', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', {
source: 'historic-sql',
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 0,
capped: false,
warnings: [],
templates: [],
});
const onPullSucceeded = vi.fn(async () => {});
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {}, onPullSucceeded });
const completedAt = new Date('2026-05-04T12:01:00.000Z');
await adapter.onPullSucceeded({
connectionId: 'conn_1',
sourceKey: 'historic-sql',
syncId: 'sync_1',
trigger: 'scheduled_pull',
completedAt,
stagedDir,
});
expect(onPullSucceeded).toHaveBeenCalledWith({
connectionId: 'conn_1',
sourceKey: 'historic-sql',
syncId: 'sync_1',
trigger: 'scheduled_pull',
completedAt,
stagedDir,
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
});
});
});

View file

@ -0,0 +1,135 @@
import { readFile } from 'node:fs/promises';
import { join } from 'node:path';
import type {
ChunkResult,
DiffSet,
FetchContext,
IngestTrigger,
ScopeDescriptor,
SourceAdapter,
TriageSignals,
} from '../../types.js';
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
import { detectHistoricSqlStagedDir } from './detect.js';
import { stageHistoricSqlTemplates } from './stage.js';
import {
pgssBaselinePath,
stagePgStatStatementsTemplates,
writePgssBaselineAtomic,
type StagePgStatStatementsTemplatesResult,
} from './stage-pgss.js';
import {
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlPullConfigSchema,
historicSqlUsageSchema,
type HistoricSqlSourceAdapterDeps,
} from './types.js';
/**
 * Source adapter for the `historic-sql` source.
 *
 * Pull configs with dialect `postgres` are staged through the pg_stat_statements
 * path; every other dialect goes through the generic query-history path. For
 * Postgres the staging result is held in memory and its baseline is only
 * written once `onPullSucceeded` runs for the same staged directory.
 */
export class HistoricSqlSourceAdapter implements SourceAdapter {
  readonly source = 'historic-sql';
  readonly skillNames = ['historic_sql_ingest'];
  readonly reconcileSkillNames = ['historic_sql_curator'];
  readonly evidenceIndexing = 'documents' as const;
  readonly triageSupported = true;

  /** PGSS staging results keyed by staged directory, kept until the pull is reported successful. */
  private readonly pendingPgssBaselines = new Map<string, StagePgStatStatementsTemplatesResult>();

  constructor(private readonly deps: HistoricSqlSourceAdapterDeps) {}

  /** Delegates staged-directory detection to `detectHistoricSqlStagedDir`. */
  detect(stagedDir: string): Promise<boolean> {
    return detectHistoricSqlStagedDir(stagedDir);
  }

  /**
   * Validates `pullConfig` against the pull-config schema and stages templates into `stagedDir`.
   *
   * @throws Error when the dialect is `postgres` and `deps.postgresReader` is missing, or when
   *   neither `deps.postgresQueryClient` nor `deps.queryClient` exposes an `executeQuery` function.
   */
  async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
    const config = historicSqlPullConfigSchema.parse(pullConfig);

    if (config.dialect !== 'postgres') {
      await stageHistoricSqlTemplates({
        stagedDir,
        connectionId: ctx.connectionId,
        queryClient: this.deps.queryClient,
        reader: this.deps.reader,
        sqlAnalysis: this.deps.sqlAnalysis,
        pullConfig: config,
        now: this.deps.now?.(),
      });
      return;
    }

    const pgssReader = this.deps.postgresReader;
    if (!pgssReader) {
      throw new Error('Historic SQL Postgres fetch requires deps.postgresReader');
    }

    // Prefer the dedicated Postgres client; fall back to the shared query client.
    const candidate = this.deps.postgresQueryClient ?? this.deps.queryClient;
    const canExecute =
      typeof candidate === 'object' &&
      candidate !== null &&
      'executeQuery' in candidate &&
      typeof (candidate as { executeQuery?: unknown }).executeQuery === 'function';
    if (!canExecute) {
      throw new Error('Historic SQL Postgres fetch requires deps.postgresQueryClient with executeQuery(sql, params?)');
    }

    const staged = await stagePgStatStatementsTemplates({
      stagedDir,
      connectionId: ctx.connectionId,
      queryClient: candidate as NonNullable<HistoricSqlSourceAdapterDeps['postgresQueryClient']>,
      reader: pgssReader,
      sqlAnalysis: this.deps.sqlAnalysis,
      pullConfig: config,
      baselinePath: pgssBaselinePath(this.deps.postgresBaselineRootDir, ctx.connectionId),
      now: this.deps.now?.(),
    });
    // Remember the result so onPullSucceeded can persist the baseline later.
    this.pendingPgssBaselines.set(stagedDir, staged);
  }

  /** Delegates chunking to `chunkHistoricSqlStagedDir`. */
  chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    return chunkHistoricSqlStagedDir(stagedDir, diffSet);
  }

  /** Delegates scope description to `describeHistoricSqlScope`. */
  describeScope(stagedDir: string): Promise<ScopeDescriptor> {
    return describeHistoricSqlScope(stagedDir);
  }

  /**
   * Builds triage signals for one template from its `metadata.json` and `usage.json`.
   * Returns an empty object when the manifest has no template with id `externalId`.
   */
  async getTriageSignals(stagedDir: string, externalId: string): Promise<TriageSignals> {
    const manifest = await this.loadManifest(stagedDir);
    const template = manifest.templates.find((entry) => entry.id === externalId);
    if (!template) {
      return {};
    }

    // The manifest stores the page path; sibling JSON files live in the same directory.
    const templateDir = template.path.replace(/\/page\.md$/, '');
    const readTemplateJson = async (fileName: string): Promise<unknown> =>
      JSON.parse(await readFile(join(stagedDir, templateDir, fileName), 'utf-8'));

    const metadata = historicSqlMetadataSchema.parse(await readTemplateJson('metadata.json'));
    const usage = historicSqlUsageSchema.parse(await readTemplateJson('usage.json'));

    return {
      objectType: metadata.objectType,
      lastEditedAt: usage.stats.last_seen,
      propertyHints: metadata.properties.triage_signals,
    };
  }

  /**
   * Post-pull hook: for a Postgres manifest, writes the pending PGSS baseline (if one was
   * recorded for this staged directory) and forgets it; then forwards the context plus the
   * manifest's `nextSuccessfulCursor` to the optional `deps.onPullSucceeded` callback.
   */
  async onPullSucceeded(ctx: {
    connectionId: string;
    sourceKey: string;
    syncId: string;
    trigger: IngestTrigger;
    completedAt: Date;
    stagedDir: string;
  }): Promise<void> {
    const manifest = await this.loadManifest(ctx.stagedDir);

    const pending =
      manifest.dialect === 'postgres' ? this.pendingPgssBaselines.get(ctx.stagedDir) : undefined;
    if (pending) {
      await writePgssBaselineAtomic(pending.baselinePath, pending.baseline);
      this.pendingPgssBaselines.delete(ctx.stagedDir);
    }

    await this.deps.onPullSucceeded?.({ ...ctx, nextSuccessfulCursor: manifest.nextSuccessfulCursor });
  }

  /** Reads `manifest.json` from the staged directory and validates it against the manifest schema. */
  private async loadManifest(stagedDir: string) {
    const raw = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    return historicSqlManifestSchema.parse(JSON.parse(raw));
  }
}

View file

@ -0,0 +1,281 @@
import { describe, expect, it, vi } from 'vitest';
import {
HistoricSqlExtensionMissingError,
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import { PostgresPgssQueryHistoryReader } from './postgres-pgss-query-history-reader.js';
/** Shape of a scripted result returned by the fake query client in these tests. */
interface FakeQueryResult {
headers: string[];
rows: unknown[][];
totalRows?: number;
error?: string;
}
/**
 * Builds a fake query client whose `executeQuery` mock replays `results` in order.
 * An `Error` entry is thrown instead of returned; running out of entries throws
 * `unexpected query`. The `results` array is consumed (mutated) as calls are made.
 */
function queryClient(results: Array<FakeQueryResult | Error>) {
  return {
    executeQuery: vi.fn(async (_query: string, _params?: unknown[]) => {
      const scripted = results.shift();
      if (scripted instanceof Error) {
        throw scripted;
      }
      if (!scripted) {
        throw new Error('unexpected query');
      }
      return scripted;
    }),
  };
}
/**
 * Returns the SQL text (first argument) of the `index`-th recorded `executeQuery` call.
 * Throws when the fake client was not called that many times.
 */
function executedSql(client: ReturnType<typeof queryClient>, index: number): string {
  const recorded = client.executeQuery.mock.calls[index];
  if (recorded) {
    return recorded[0];
  }
  throw new Error(`expected query client call ${index}`);
}
describe('PostgresPgssQueryHistoryReader', () => {
// Happy path: five scripted results (version, extension, grants, track, max) are
// consumed in order; the test pins both the probe result and the SQL of each call.
it('probes version, extension presence, grants, and tracking state', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4 on x86_64-apple-darwin']],
},
{ headers: ['?column?'], rows: [[1]] },
{ headers: ['has_role'], rows: [[true]] },
{ headers: ['track'], rows: [['top']] },
{ headers: ['max'], rows: [['5000']] },
]);
const reader = new PostgresPgssQueryHistoryReader();
await expect(reader.probe(client)).resolves.toEqual({
pgServerVersion: 'PostgreSQL 16.4 on x86_64-apple-darwin',
warnings: [],
});
expect(executedSql(client, 0)).toContain("current_setting('server_version_num')::int");
expect(executedSql(client, 1)).toBe('SELECT 1 FROM pg_stat_statements LIMIT 1');
expect(executedSql(client, 2)).toBe(
"SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role",
);
expect(executedSql(client, 3)).toBe("SELECT current_setting('pg_stat_statements.track') AS track");
expect(executedSql(client, 4)).toBe("SELECT current_setting('pg_stat_statements.max') AS max");
});
// A server_version_num below 140000 must be rejected right after the version
// query, before any pg_stat_statements probe is issued.
it('rejects PostgreSQL versions older than 14 without probing the extension', async () => {
  // Only the version result is scripted: any further query would hit the fake
  // client's 'unexpected query' error, which backs up the single-call assertion.
  const client = queryClient([
    {
      headers: ['server_version_num', 'server_version'],
      rows: [[130012, 'PostgreSQL 13.12']],
    },
  ]);
  const reader = new PostgresPgssQueryHistoryReader();
  const promise = reader.probe(client);
  await expect(promise).rejects.toMatchObject({
    name: 'HistoricSqlVersionUnsupportedError',
    dialect: 'postgres',
    detectedVersion: 'PostgreSQL 13.12',
    minimumVersion: 'PostgreSQL 14',
  });
  await expect(promise).rejects.toBeInstanceOf(HistoricSqlVersionUnsupportedError);
  expect(client.executeQuery).toHaveBeenCalledTimes(1);
});
// The extension probe (second query) throws a "relation does not exist" error,
// which the reader must surface as HistoricSqlExtensionMissingError.
it('maps a missing pg_stat_statements relation to HistoricSqlExtensionMissingError', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
new Error('relation "pg_stat_statements" does not exist'),
]);
const reader = new PostgresPgssQueryHistoryReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
name: 'HistoricSqlExtensionMissingError',
dialect: 'postgres',
});
await expect(promise).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
});
// The extension probe throws the "must be loaded via shared_preload_libraries"
// error; the reader must report it with the preload-specific message and a
// remediation that mentions shared_preload_libraries.
it('maps pg_stat_statements preload failures to HistoricSqlExtensionMissingError with preload remediation', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
new Error('pg_stat_statements must be loaded via shared_preload_libraries'),
]);
const reader = new PostgresPgssQueryHistoryReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
name: 'HistoricSqlExtensionMissingError',
dialect: 'postgres',
message: 'pg_stat_statements is installed but not loaded via shared_preload_libraries.',
remediation: expect.stringContaining("shared_preload_libraries includes 'pg_stat_statements'"),
});
await expect(promise).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
});
// The grants probe (third query) returns has_role = false, which must become a
// HistoricSqlGrantsMissingError carrying the GRANT remediation text.
it('maps missing pg_read_all_stats membership to HistoricSqlGrantsMissingError', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
{ headers: ['?column?'], rows: [[1]] },
{ headers: ['has_role'], rows: [[false]] },
]);
const reader = new PostgresPgssQueryHistoryReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
name: 'HistoricSqlGrantsMissingError',
dialect: 'postgres',
remediation: 'GRANT pg_read_all_stats TO <connection role>;',
});
await expect(promise).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
// pg_stat_statements.track = 'none' is not fatal: probe() resolves and reports
// the condition as a warning string.
it('returns a warning instead of failing when pg_stat_statements.track is none', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
{ headers: ['?column?'], rows: [[1]] },
{ headers: ['has_role'], rows: [[true]] },
{ headers: ['track'], rows: [['none']] },
{ headers: ['max'], rows: [['5000']] },
]);
const reader = new PostgresPgssQueryHistoryReader();
await expect(reader.probe(client)).resolves.toEqual({
pgServerVersion: 'PostgreSQL 16.4',
warnings: [
"pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config",
],
});
});
// pg_stat_statements.max = '1000' (returned as a string) is below the 5000
// threshold named in the expected warning, so probe() resolves with that warning.
it('warns when pg_stat_statements.max is below the recommended floor', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
rows: [[160004, 'PostgreSQL 16.4']],
},
{ headers: ['?column?'], rows: [[1]] },
{ headers: ['has_role'], rows: [[true]] },
{ headers: ['track'], rows: [['top']] },
{ headers: ['max'], rows: [['1000']] },
]);
const reader = new PostgresPgssQueryHistoryReader();
await expect(reader.probe(client)).resolves.toEqual({
pgServerVersion: 'PostgreSQL 16.4',
warnings: [
'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn',
],
});
});
// readSnapshot() issues two queries: the parameterized snapshot query and the
// stats-info query. The first scripted row uses string-typed numerics and the
// second uses real numbers, so the test covers both input forms.
it('reads a parameterized pg_stat_statements snapshot and stats info', async () => {
const client = queryClient([
{
headers: [
'queryid',
'userid',
'username',
'dbid',
'database',
'query',
'calls',
'total_exec_time',
'mean_exec_time',
'total_rows',
],
rows: [
[
'922337203685477580',
'16384',
'analyst',
'16385',
'warehouse',
'SELECT count(*) FROM public.orders WHERE status = $1',
'42',
'2100.5',
'50.0119',
'9001',
],
[
'922337203685477581',
'16386',
'unknown',
'16385',
'warehouse',
'SELECT * FROM public.customers WHERE id = $1',
5,
30,
6,
5,
],
],
},
{
headers: ['stats_reset', 'dealloc'],
rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]],
},
]);
const reader = new PostgresPgssQueryHistoryReader();
await expect(reader.readSnapshot(client, { minCalls: 5, maxTemplates: 500 })).resolves.toEqual({
statsResetAt: '2026-05-01T00:00:00.000Z',
deallocCount: 7,
rows: [
{
queryid: '922337203685477580',
userid: '16384',
username: 'analyst',
dbid: '16385',
database: 'warehouse',
query: 'SELECT count(*) FROM public.orders WHERE status = $1',
calls: 42,
totalExecTime: 2100.5,
meanExecTime: 50.0119,
totalRows: 9001,
},
{
queryid: '922337203685477581',
userid: '16386',
username: 'unknown',
dbid: '16385',
database: 'warehouse',
query: 'SELECT * FROM public.customers WHERE id = $1',
calls: 5,
totalExecTime: 30,
meanExecTime: 6,
totalRows: 5,
},
],
});
// Structural checks on the snapshot SQL plus the bound parameters [minCalls, maxTemplates].
const snapshotSql = executedSql(client, 0);
expect(snapshotSql).toContain('FROM pg_stat_statements s');
expect(snapshotSql).toContain('LEFT JOIN pg_roles');
expect(snapshotSql).toContain('LEFT JOIN pg_database');
expect(snapshotSql).toContain('WHERE s.toplevel = true');
expect(snapshotSql).toContain('AND s.calls >= $1');
expect(snapshotSql).toContain('ORDER BY s.total_exec_time DESC');
expect(snapshotSql).toContain('LIMIT $2');
expect(client.executeQuery.mock.calls[0]?.[1]).toEqual([5, 500]);
expect(executedSql(client, 1)).toBe('SELECT stats_reset, dealloc FROM pg_stat_statements_info');
});
});

View file

@ -0,0 +1,262 @@
import {
HistoricSqlExtensionMissingError,
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import type {
KloPostgresQueryClient,
PostgresPgssProbeResult,
PostgresPgssReader,
PostgresPgssRow,
PostgresPgssSnapshot,
} from './types.js';
/**
 * Minimal result shape this reader relies on: column headers, positional rows,
 * and an optional `error` string that `execute` turns into a thrown Error.
 */
interface QueryResultLike {
headers: string[];
rows: unknown[][];
totalRows?: number;
error?: string;
}
// Reads the numeric server version (compared against 140000 in probe) and the
// human-readable version string.
const VERSION_SQL = `
SELECT current_setting('server_version_num')::int AS server_version_num,
version() AS server_version
`.trim();
// Cheap existence/usability check for the pg_stat_statements view.
const EXTENSION_PROBE_SQL = 'SELECT 1 FROM pg_stat_statements LIMIT 1';
// Checks whether the current role has pg_read_all_stats.
const GRANTS_PROBE_SQL = "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role";
// Reads the pg_stat_statements.track setting; 'none' produces a probe warning.
const TRACKING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.track') AS track";
// Reads the pg_stat_statements.max setting.
const MAX_SETTING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.max') AS max";
// probe() warns when pg_stat_statements.max is below this value.
const RECOMMENDED_PGSS_MAX = 5000;
// Reset timestamp and deallocation counter for the statistics.
const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info';
// Snapshot query; readSnapshot binds $1 = minCalls and $2 = maxTemplates.
const SNAPSHOT_SQL = `
SELECT
s.queryid::text AS queryid,
s.userid::text AS userid,
COALESCE(r.rolname, 'unknown') AS username,
s.dbid::text AS dbid,
d.datname AS database,
s.query,
s.calls,
s.total_exec_time,
s.mean_exec_time,
s.rows AS total_rows
FROM pg_stat_statements s
LEFT JOIN pg_roles r ON s.userid = r.oid
LEFT JOIN pg_database d ON s.dbid = d.oid
WHERE s.toplevel = true
AND s.calls >= $1
ORDER BY s.total_exec_time DESC
LIMIT $2
`.trim();
// Remediation text attached to HistoricSqlExtensionMissingError.
const POSTGRES_EXTENSION_REMEDIATION = [
'Run CREATE EXTENSION pg_stat_statements; against the connection database.',
"Ensure shared_preload_libraries includes 'pg_stat_statements' in the Postgres parameter group or config.",
].join(' ');
// Remediation text attached to HistoricSqlGrantsMissingError.
const POSTGRES_GRANTS_REMEDIATION = 'GRANT pg_read_all_stats TO <connection role>;';
/**
 * Narrows an untyped client to `KloPostgresQueryClient`.
 *
 * @throws Error when `client` is not a non-null object with a callable `executeQuery`.
 */
function queryClient(client: unknown): KloPostgresQueryClient {
  const hasExecuteQuery =
    typeof client === 'object' &&
    client !== null &&
    'executeQuery' in client &&
    typeof (client as { executeQuery?: unknown }).executeQuery === 'function';
  if (!hasExecuteQuery) {
    throw new Error('Historic SQL Postgres PGSS reader requires a query client with executeQuery(sql, params?)');
  }
  return client as KloPostgresQueryClient;
}
/**
 * Runs `sql` through the client and returns its result.
 *
 * @throws Error with the reported text when the result carries a non-empty string `error`.
 */
async function execute(client: KloPostgresQueryClient, sql: string, params?: unknown[]): Promise<QueryResultLike> {
  const outcome = await client.executeQuery(sql, params);
  const reported = 'error' in outcome ? outcome.error : undefined;
  if (typeof reported === 'string' && reported.length > 0) {
    throw new Error(reported);
  }
  return outcome;
}
/** Maps each lower-cased header name to its column position; a later duplicate overrides an earlier one. */
function indexes(headers: string[]): Map<string, number> {
  return new Map(headers.map((header, position): [string, number] => [header.toLowerCase(), position]));
}
/** Looks up a cell by case-insensitive header name; yields `null` when the header is unknown. */
function value(row: unknown[], headerIndexes: Map<string, number>, header: string): unknown {
  const position = headerIndexes.get(header.toLowerCase());
  if (position === undefined) {
    return null;
  }
  return row[position];
}
/** Stringifies `raw`; `null`, `undefined` and values that stringify to '' all become `null`. */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text === '' ? null : text;
}
/**
 * Like `nullableString`, but a missing or empty value is an error.
 *
 * @throws Error naming `field` when no non-empty string can be produced.
 */
function requiredString(raw: unknown, field: string): string {
  const text = nullableString(raw);
  if (text === null) {
    throw new Error(`Postgres pg_stat_statements row is missing ${field}`);
  }
  return text;
}
/**
 * Converts a result cell to a finite number.
 *
 * Numbers pass through; other values go through `Number(...)`, so numeric
 * strings such as '2100.5' are accepted. `null`, `undefined`, booleans and
 * blank strings are rejected explicitly: `Number()` would coerce them to 0 or 1,
 * letting a NULL or absent column pass as a valid measurement.
 *
 * @param raw Cell value as returned by the query client.
 * @param field Column name used in the error message.
 * @returns The finite numeric value.
 * @throws Error when `raw` is absent, blank, boolean, or not a finite number.
 */
function requiredFiniteNumber(raw: unknown, field: string): number {
  const isAbsent =
    raw === null ||
    raw === undefined ||
    typeof raw === 'boolean' ||
    (typeof raw === 'string' && raw.trim() === '');
  const parsed = isAbsent ? Number.NaN : typeof raw === 'number' ? raw : Number(raw);
  if (!Number.isFinite(parsed)) {
    throw new Error(`Postgres pg_stat_statements row has invalid ${field}: ${String(raw)}`);
  }
  return parsed;
}
/**
 * Converts a cell to an integer by truncating toward zero.
 * `null`, `undefined`, '' and anything that is not a finite number yield `null`.
 */
function nullableInteger(raw: unknown): number | null {
  if (raw == null || raw === '') {
    return null;
  }
  const parsed = typeof raw === 'number' ? raw : Number(raw);
  if (!Number.isFinite(parsed)) {
    return null;
  }
  return Math.trunc(parsed);
}
/**
 * Normalises a timestamp cell to an ISO-8601 string.
 * `Date` instances are serialised directly; other values are stringified and parsed.
 * `null`, `undefined`, '' and unparseable text yield `null`.
 */
function nullableIsoTimestamp(raw: unknown): string | null {
  if (raw == null || raw === '') {
    return null;
  }
  if (raw instanceof Date) {
    return raw.toISOString();
  }
  const millis = Date.parse(String(raw));
  return Number.isNaN(millis) ? null : new Date(millis).toISOString();
}
/**
 * Returns the first row of a result together with its header index.
 *
 * @throws Error mentioning `context` when the result has no first row.
 */
function firstRow(result: QueryResultLike, context: string): { row: unknown[]; headers: Map<string, number> } {
  const [row] = result.rows;
  if (!row) {
    throw new Error(`Postgres historic-SQL ${context} query returned no rows`);
  }
  const headers = indexes(result.headers);
  return { row, headers };
}
/** True when the error text says the pg_stat_statements relation does not exist. */
function isMissingPgssRelation(error: unknown): boolean {
  const text = error instanceof Error ? error.message : String(error);
  const pattern = /relation ["']?pg_stat_statements["']? does not exist/i;
  return pattern.test(text);
}
/** True when the error text mentions pg_stat_statements followed by shared_preload_libraries. */
function isPgssPreloadRequired(error: unknown): boolean {
  const text = error instanceof Error ? error.message : String(error);
  const pattern = /pg_stat_statements.*shared_preload_libraries/i;
  return pattern.test(text);
}
/**
 * Wraps `cause` in a Postgres `HistoricSqlExtensionMissingError` with the shared
 * remediation text; `message` overrides the default "not installed" wording.
 */
function extensionMissingError(cause: unknown, message?: string): HistoricSqlExtensionMissingError {
  const text = message ?? 'pg_stat_statements extension is not installed in the connection database.';
  return new HistoricSqlExtensionMissingError({
    dialect: 'postgres',
    message: text,
    remediation: POSTGRES_EXTENSION_REMEDIATION,
    cause,
  });
}
/** Creates the Postgres `HistoricSqlGrantsMissingError` raised when the grants probe is not `true`. */
function grantsMissingError(): HistoricSqlGrantsMissingError {
  const details = {
    dialect: 'postgres' as const,
    message: 'Postgres connection role lacks pg_read_all_stats for historic-SQL ingest.',
    remediation: POSTGRES_GRANTS_REMEDIATION,
  };
  return new HistoricSqlGrantsMissingError(details);
}
/**
 * Converts one positional snapshot row into a `PostgresPgssRow`.
 * Identifier and query columns are required strings, `username`/`database` may be
 * null, and `calls`/`total_rows` are truncated to integers.
 */
function mapSnapshotRow(row: unknown[], headerIndexes: Map<string, number>): PostgresPgssRow {
  const cell = (header: string): unknown => value(row, headerIndexes, header);
  const text = (header: string): string => requiredString(cell(header), header);
  const finite = (header: string): number => requiredFiniteNumber(cell(header), header);

  return {
    queryid: text('queryid'),
    userid: text('userid'),
    username: nullableString(cell('username')),
    dbid: text('dbid'),
    database: nullableString(cell('database')),
    query: text('query'),
    calls: Math.trunc(finite('calls')),
    totalExecTime: finite('total_exec_time'),
    meanExecTime: finite('mean_exec_time'),
    totalRows: Math.trunc(finite('total_rows')),
  };
}
/**
 * Reads query history from pg_stat_statements.
 *
 * `probe` checks prerequisites in a fixed order (server version, extension,
 * grants, tracking, max setting); `readSnapshot` returns the current statistics rows
 * together with the reset timestamp and deallocation counter.
 */
export class PostgresPgssQueryHistoryReader implements PostgresPgssReader {
  /**
   * Verifies that pg_stat_statements can be used with the given client.
   *
   * @returns The server version string plus non-fatal configuration warnings.
   * @throws HistoricSqlVersionUnsupportedError when server_version_num is below 140000.
   * @throws HistoricSqlExtensionMissingError when the extension relation is missing or not preloaded.
   * @throws HistoricSqlGrantsMissingError when the grants probe does not return `true`.
   */
  async probe(client: unknown): Promise<PostgresPgssProbeResult> {
    const pgClient = queryClient(client);

    const version = firstRow(await execute(pgClient, VERSION_SQL), 'version probe');
    const serverVersionNum = requiredFiniteNumber(
      value(version.row, version.headers, 'server_version_num'),
      'server_version_num',
    );
    const pgServerVersion = requiredString(value(version.row, version.headers, 'server_version'), 'server_version');
    if (serverVersionNum < 140000) {
      throw new HistoricSqlVersionUnsupportedError({
        dialect: 'postgres',
        detectedVersion: pgServerVersion,
        minimumVersion: 'PostgreSQL 14',
      });
    }

    try {
      await execute(pgClient, EXTENSION_PROBE_SQL);
    } catch (cause) {
      if (isMissingPgssRelation(cause)) {
        throw extensionMissingError(cause);
      }
      // Anything that is neither "missing relation" nor "preload required" is rethrown untouched.
      throw isPgssPreloadRequired(cause)
        ? extensionMissingError(cause, 'pg_stat_statements is installed but not loaded via shared_preload_libraries.')
        : cause;
    }

    const grants = firstRow(await execute(pgClient, GRANTS_PROBE_SQL), 'grant probe');
    const hasRole = value(grants.row, grants.headers, 'has_role') === true;
    if (!hasRole) {
      throw grantsMissingError();
    }

    const tracking = firstRow(await execute(pgClient, TRACKING_PROBE_SQL), 'tracking probe');
    const track = nullableString(value(tracking.row, tracking.headers, 'track'));

    const maxSetting = firstRow(await execute(pgClient, MAX_SETTING_PROBE_SQL), 'max-setting probe');
    const pgssMax = nullableInteger(value(maxSetting.row, maxSetting.headers, 'max'));

    const warnings: string[] = [];
    if (track === 'none') {
      warnings.push('pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config');
    }
    const maxTooLow = pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX;
    if (maxTooLow) {
      warnings.push(
        `pg_stat_statements.max is ${pgssMax}; set it to at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`,
      );
    }

    return { pgServerVersion, warnings };
  }

  /**
   * Runs the snapshot query with `[minCalls, maxTemplates]` as parameters, then the stats-info query.
   *
   * @returns Mapped rows plus `statsResetAt` / `deallocCount` from the first stats-info row.
   */
  async readSnapshot(
    client: unknown,
    options: { minCalls: number; maxTemplates: number },
  ): Promise<PostgresPgssSnapshot> {
    const pgClient = queryClient(client);

    const snapshot = await execute(pgClient, SNAPSHOT_SQL, [options.minCalls, options.maxTemplates]);
    const snapshotHeaders = indexes(snapshot.headers);

    const stats = firstRow(await execute(pgClient, STATS_INFO_SQL), 'stats-info');
    const statsResetAt = nullableIsoTimestamp(value(stats.row, stats.headers, 'stats_reset'));
    const deallocCount = nullableInteger(value(stats.row, stats.headers, 'dealloc'));
    const rows = snapshot.rows.map((row) => mapSnapshotRow(row, snapshotHeaders));

    return { statsResetAt, deallocCount, rows };
  }
}

View file

@ -0,0 +1,193 @@
import { describe, expect, it, vi } from 'vitest';
import { HistoricSqlGrantsMissingError } from './errors.js';
import { SnowflakeHistoricSqlQueryHistoryReader } from './snowflake-query-history-reader.js';
/** Shape of a scripted result returned by the fake Snowflake query client in these tests. */
interface FakeQueryResult {
headers: string[];
rows: unknown[][];
totalRows: number;
error?: string;
}
/**
 * Builds a fake query client whose `executeQuery` mock replays `results` in order
 * and throws `unexpected query` once they are exhausted. The array is consumed as calls are made.
 */
function queryClient(results: FakeQueryResult[]) {
  return {
    executeQuery: vi.fn(async (_query: string) => {
      const scripted = results.shift();
      if (scripted) {
        return scripted;
      }
      throw new Error('unexpected query');
    }),
  };
}
/** Returns the SQL text of the first recorded `executeQuery` call; throws if there was none. */
function firstQuery(client: ReturnType<typeof queryClient>): string {
  const recorded = client.executeQuery.mock.calls[0];
  if (recorded) {
    return recorded[0];
  }
  throw new Error('expected query client to be called');
}
describe('SnowflakeHistoricSqlQueryHistoryReader', () => {
// A successful probe resolves to undefined and issues exactly the probe SQL.
it('probes SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY', async () => {
const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(reader.probe(client)).resolves.toBeUndefined();
expect(client.executeQuery).toHaveBeenCalledWith(
'SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY LIMIT 1',
);
});
// An `error` string on the returned result (no exception thrown) must also be
// reported as HistoricSqlGrantsMissingError with the Snowflake remediation.
it('turns probe result errors into HistoricSqlGrantsMissingError', async () => {
const client = queryClient([{ headers: [], rows: [], totalRows: 0, error: 'Object does not exist or not authorized' }]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(reader.probe(client)).rejects.toMatchObject({
name: 'HistoricSqlGrantsMissingError',
dialect: 'snowflake',
remediation: 'GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE <connection role>;',
});
});
// An exception thrown by executeQuery during the probe is wrapped in
// HistoricSqlGrantsMissingError.
it('turns thrown probe failures into HistoricSqlGrantsMissingError', async () => {
const client = {
executeQuery: vi.fn(async () => {
throw new Error('permission denied');
}),
};
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
// Scripts two QUERY_HISTORY rows (one successful, one failed) and checks that
// fetch() issues a single query whose lower bound is the cursor rather than the
// window start, and that rows are mapped into the raw-query-row shape.
it('fetches query-history rows with cursor and maps them into RawQueryRow shape', async () => {
const client = queryClient([
{
headers: [
'QUERY_ID',
'QUERY_TEXT',
'USER_NAME',
'ROLE_NAME',
'WAREHOUSE_NAME',
'DATABASE_NAME',
'SCHEMA_NAME',
'START_TIME',
'END_TIME',
'TOTAL_ELAPSED_TIME',
'ROWS_PRODUCED',
'EXECUTION_STATUS',
'ERROR_CODE',
'ERROR_MESSAGE',
],
rows: [
[
'01a',
"SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'",
'ANALYST_A',
'ANALYST_ROLE',
'WH_XS',
'ANALYTICS',
'PUBLIC',
'2026-05-04T10:00:00.000Z',
'2026-05-04T10:00:01.250Z',
1250,
12,
'SUCCESS',
null,
null,
],
[
'01b',
'SELECT * FROM MISSING_TABLE',
'ANALYST_B',
'ANALYST_ROLE',
'WH_XS',
'ANALYTICS',
'PUBLIC',
new Date('2026-05-04T10:05:00.000Z'),
null,
null,
null,
'FAILED_WITH_ERROR',
'002003',
'SQL compilation error',
],
],
totalRows: 2,
},
]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
const rows = [];
for await (const row of reader.fetch(
client,
{
start: new Date('2026-05-01T00:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
},
'2026-05-03T00:00:00.000Z',
)) {
rows.push(row);
}
expect(client.executeQuery).toHaveBeenCalledTimes(1);
const sql = firstQuery(client);
expect(sql).toContain('FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY');
// Lower bound comes from the cursor argument, upper bound from window.end.
expect(sql).toContain("START_TIME >= '2026-05-03T00:00:00.000Z'::TIMESTAMP_TZ");
expect(sql).toContain("START_TIME < '2026-05-04T12:00:00.000Z'::TIMESTAMP_TZ");
expect(sql).toContain('ORDER BY START_TIME ASC, QUERY_ID ASC');
expect(sql).toContain('ROWS_PRODUCED');
expect(rows).toEqual([
{
id: '01a',
sql: "SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'",
user: 'ANALYST_A',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: '2026-05-04T10:00:01.250Z',
runtimeMs: 1250,
rowsProduced: 12,
success: true,
errorMessage: null,
},
{
id: '01b',
sql: 'SELECT * FROM MISSING_TABLE',
user: 'ANALYST_B',
startedAt: '2026-05-04T10:05:00.000Z',
endedAt: null,
runtimeMs: null,
rowsProduced: null,
success: false,
errorMessage: '002003: SQL compilation error',
},
]);
});
// With no cursor argument the lower bound of the generated SQL is window.start;
// the scripted result is empty, so the loop body must never run.
it('uses the window start when no cursor is available', async () => {
const client = queryClient([{ headers: ['QUERY_ID'], rows: [], totalRows: 0 }]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
for await (const _row of reader.fetch(client, {
start: new Date('2026-02-03T12:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
})) {
throw new Error('empty result should not yield rows');
}
const sql = firstQuery(client);
expect(sql).toContain("START_TIME >= '2026-02-03T12:00:00.000Z'::TIMESTAMP_TZ");
});
// Passing an object without executeQuery makes iteration of fetch() reject with
// the reader's explicit client-validation message.
it('throws a clear error when the query client cannot execute SQL', async () => {
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(async () => {
for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) {
throw new Error('unreachable');
}
}).rejects.toThrow('Historic SQL Snowflake reader requires a query client with executeQuery(query)');
});
});

View file

@ -0,0 +1,203 @@
import { HistoricSqlGrantsMissingError } from './errors.js';
import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js';
/**
 * Minimal result shape this reader relies on: column headers, positional rows,
 * a row count, and an optional `error` string reported by the client.
 */
interface QueryResultLike {
headers: string[];
rows: unknown[][];
totalRows: number;
error?: string;
}
/** Minimal client contract: run one SQL string and resolve with a `QueryResultLike`. */
interface QueryClientLike {
executeQuery(query: string): Promise<QueryResultLike>;
}
// Query issued by probe() to check that QUERY_HISTORY is readable.
const PROBE_SQL = 'SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY LIMIT 1';
// Remediation text attached to every HistoricSqlGrantsMissingError built by grantsError().
const SNOWFLAKE_GRANTS_REMEDIATION =
'GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE <connection role>;';
/**
 * Narrows an untyped client to `QueryClientLike`.
 *
 * @throws Error when `client` is not a non-null object with a callable `executeQuery`.
 */
function queryClient(client: unknown): QueryClientLike {
  const hasExecuteQuery =
    typeof client === 'object' &&
    client !== null &&
    'executeQuery' in client &&
    typeof (client as { executeQuery?: unknown }).executeQuery === 'function';
  if (!hasExecuteQuery) {
    throw new Error('Historic SQL Snowflake reader requires a query client with executeQuery(query)');
  }
  return client as QueryClientLike;
}
/**
 * Wraps a probe/fetch failure in a Snowflake `HistoricSqlGrantsMissingError`.
 * The detail text is the Error message, the string itself, or a generic fallback
 * for any other kind of cause.
 */
function grantsError(cause: unknown): HistoricSqlGrantsMissingError {
  let detail = 'Snowflake role cannot query SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY.';
  if (cause instanceof Error) {
    detail = cause.message;
  } else if (typeof cause === 'string') {
    detail = cause;
  }
  return new HistoricSqlGrantsMissingError({
    dialect: 'snowflake',
    message: `Missing Snowflake audit grants for historic-SQL ingest: ${detail}`,
    remediation: SNOWFLAKE_GRANTS_REMEDIATION,
    cause,
  });
}
/**
 * Renders a timestamp as a quoted ISO-8601 literal cast to TIMESTAMP_TZ.
 * The input always round-trips through `Date`, so only a valid ISO string reaches the SQL text.
 *
 * @throws Error when `value` does not represent a valid date.
 */
function timestampLiteral(value: Date | string): string {
  const instant = value instanceof Date ? value : new Date(value);
  const isValid = !Number.isNaN(instant.getTime());
  if (!isValid) {
    throw new Error(`Invalid Snowflake query-history timestamp: ${String(value)}`);
  }
  const escapedIso = instant.toISOString().replace(/'/g, "''");
  return `'${escapedIso}'::TIMESTAMP_TZ`;
}
/**
 * Builds the QUERY_HISTORY select for a half-open time range.
 * The lower bound (inclusive) is `cursor` when it is not null/undefined, otherwise
 * `window.start`; the upper bound (exclusive) is `window.end`. Both bounds are
 * rendered through `timestampLiteral`.
 */
function queryHistorySql(window: HistoricSqlTimeWindow, cursor?: string | null): string {
  const lowerBound = timestampLiteral(cursor ?? window.start);
  const upperBound = timestampLiteral(window.end);
  const statement = `
SELECT
QUERY_ID,
QUERY_TEXT,
USER_NAME,
ROLE_NAME,
WAREHOUSE_NAME,
DATABASE_NAME,
SCHEMA_NAME,
START_TIME,
END_TIME,
TOTAL_ELAPSED_TIME,
ROWS_PRODUCED,
EXECUTION_STATUS,
ERROR_CODE,
ERROR_MESSAGE
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
WHERE START_TIME >= ${lowerBound}
AND START_TIME < ${upperBound}
AND QUERY_TEXT IS NOT NULL
ORDER BY START_TIME ASC, QUERY_ID ASC`;
  return statement.trim();
}
/** Maps each upper-cased header name to its column position; a later duplicate overrides an earlier one. */
function indexByHeader(headers: string[]): Map<string, number> {
  return new Map(headers.map((header, position): [string, number] => [header.toUpperCase(), position]));
}
/**
 * Looks up a cell by column name.
 *
 * `indexByHeader` stores keys upper-cased, so the requested name is upper-cased
 * too; otherwise a lower- or mixed-case name would silently resolve to `null`.
 *
 * @param row Positional row values.
 * @param indexes Header-to-position map with upper-cased keys.
 * @param name Column name, matched case-insensitively.
 * @returns The cell value, or `null` when the column is not present in `indexes`.
 */
function value(row: unknown[], indexes: Map<string, number>, name: string): unknown {
  const position = indexes.get(name.toUpperCase());
  return position === undefined ? null : row[position];
}
/** Stringifies `raw`; `null`, `undefined` and values that stringify to '' all become `null`. */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text === '' ? null : text;
}
/**
 * Like `nullableString`, but a missing or empty value is an error.
 *
 * @throws Error naming `field` when no non-empty string can be produced.
 */
function requiredString(raw: unknown, field: string): string {
  const text = nullableString(raw);
  if (text === null) {
    throw new Error(`Snowflake QUERY_HISTORY row is missing ${field}`);
  }
  return text;
}
/**
 * Converts a cell to a finite number.
 * `null`, `undefined`, '' and anything that does not convert to a finite number yield `null`.
 */
function nullableNumber(raw: unknown): number | null {
  if (raw == null || raw === '') {
    return null;
  }
  const parsed = typeof raw === 'number' ? raw : Number(raw);
  return Number.isFinite(parsed) ? parsed : null;
}
/** Same as `nullableNumber`, with the result truncated toward zero. */
function nullableInteger(raw: unknown): number | null {
  const parsed = nullableNumber(raw);
  if (parsed === null) {
    return null;
  }
  return Math.trunc(parsed);
}
/**
 * Normalises a required timestamp cell to an ISO-8601 string.
 * `Date` instances are serialised directly; anything else must be a non-empty,
 * parseable value.
 *
 * @throws Error naming `field` when the value is missing or cannot be parsed as a date.
 */
function isoTimestamp(raw: unknown, field: string): string {
  if (raw instanceof Date) {
    return raw.toISOString();
  }
  const text = requiredString(raw, field);
  const millis = Date.parse(text);
  if (Number.isNaN(millis)) {
    throw new Error(`Snowflake QUERY_HISTORY row has invalid ${field}: ${text}`);
  }
  return new Date(millis).toISOString();
}
/**
 * Optional variant of `isoTimestamp`: `null`, `undefined` and '' yield `null`.
 * Any other value is validated as END_TIME, so an unparseable value still throws.
 */
function nullableIsoTimestamp(raw: unknown): string | null {
  const isAbsent = raw == null || raw === '';
  return isAbsent ? null : isoTimestamp(raw, 'END_TIME');
}
/**
 * Decides whether a query-history row counts as successful: there must be no error
 * code or message, and the status must be absent or start with SUCCESS (case-insensitive).
 */
function executionSucceeded(status: string | null, errorCode: string | null, errorMessage: string | null): boolean {
  const hasError = Boolean(errorCode) || Boolean(errorMessage);
  if (hasError) {
    return false;
  }
  if (status === null) {
    return true;
  }
  return status.toUpperCase().startsWith('SUCCESS');
}
/**
 * Produces a single error text: "code: message" when both are non-empty,
 * otherwise the message if it is not null/undefined, otherwise the code.
 */
function combinedErrorMessage(errorCode: string | null, errorMessage: string | null): string | null {
  const hasBoth = Boolean(errorCode) && Boolean(errorMessage);
  return hasBoth ? `${errorCode}: ${errorMessage}` : (errorMessage ?? errorCode);
}
/**
 * Converts one positional QUERY_HISTORY row into a `HistoricSqlRawQueryRow`.
 * QUERY_ID, QUERY_TEXT and START_TIME are required; the remaining fields may be null.
 */
function mapRow(row: unknown[], indexes: Map<string, number>): HistoricSqlRawQueryRow {
  const cell = (column: string): unknown => value(row, indexes, column);

  const errorCode = nullableString(cell('ERROR_CODE'));
  const errorMessage = nullableString(cell('ERROR_MESSAGE'));
  const rowsProduced = nullableInteger(cell('ROWS_PRODUCED'));

  return {
    id: requiredString(cell('QUERY_ID'), 'QUERY_ID'),
    sql: requiredString(cell('QUERY_TEXT'), 'QUERY_TEXT'),
    user: nullableString(cell('USER_NAME')),
    startedAt: isoTimestamp(cell('START_TIME'), 'START_TIME'),
    endedAt: nullableIsoTimestamp(cell('END_TIME')),
    runtimeMs: nullableNumber(cell('TOTAL_ELAPSED_TIME')),
    rowsProduced,
    success: executionSucceeded(nullableString(cell('EXECUTION_STATUS')), errorCode, errorMessage),
    errorMessage: combinedErrorMessage(errorCode, errorMessage),
  };
}
/**
 * `HistoricSqlQueryHistoryReader` implementation for Snowflake query history.
 */
export class SnowflakeHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader {
  /**
   * Runs `PROBE_SQL` against the client. A thrown error or an `error` field on
   * the result is rethrown after being passed through `grantsError`.
   */
  async probe(client: unknown): Promise<void> {
    let probeResult: QueryResultLike;
    try {
      const runner = queryClient(client);
      probeResult = await runner.executeQuery(PROBE_SQL);
    } catch (cause) {
      throw grantsError(cause);
    }
    if (probeResult.error) {
      throw grantsError(probeResult.error);
    }
  }

  /**
   * Executes the query-history SQL for `window` (and optional `cursor`) and
   * yields each result row mapped through `mapRow`.
   *
   * An `error` field on the result is rethrown via `grantsError`; unlike
   * `probe`, exceptions thrown by the client itself propagate unchanged.
   */
  async *fetch(
    client: unknown,
    window: HistoricSqlTimeWindow,
    cursor?: string | null,
  ): AsyncIterable<HistoricSqlRawQueryRow> {
    const runner = queryClient(client);
    const page = await runner.executeQuery(queryHistorySql(window, cursor));
    if (page.error) {
      throw grantsError(page.error);
    }
    const columnIndexes = indexByHeader(page.headers);
    for (const rawRow of page.rows) {
      yield mapRow(rawRow, columnIndexes);
    }
  }
}

View file

@ -0,0 +1,152 @@
import { mkdir, mkdtemp, readdir, readFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { dirname, join, relative, sep } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { stagePgStatStatementsTemplates, writePgssBaselineAtomic, type PgssBaseline } from './stage-pgss.js';
import type { HistoricSqlPullConfig, KloPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
// Directory containing the Postgres golden fixtures, resolved relative to this test file's directory.
const FIXTURE_ROOT = join(__dirname, '__fixtures__/postgres');
// Shape of a golden fixture's `input.json`: the canned reader/analysis inputs for one staging run
// together with the output that run is expected to produce.
interface GoldenFixture {
name: string;
// Timestamp string handed to the stager as `now` (wrapped in `new Date(...)`).
now: string;
connectionId: string;
// Returned as-is by the fixture reader's `probe()`.
probe: {
pgServerVersion: string;
warnings: string[];
};
// Served by the fixture reader's `readSnapshot()`; `rows` is truncated to `options.maxTemplates`.
snapshot: {
statsResetAt: string | null;
deallocCount: number | null;
rows: PostgresPgssRow[];
};
pullConfig: HistoricSqlPullConfig & { dialect: 'postgres' };
// Canned SQL-analysis results keyed by the exact SQL text passed to `analyzeForFingerprint`.
analysisBySql: Record<
string,
{
fingerprint: string;
normalizedSql: string;
tablesTouched: string[];
literalSlots: [];
error?: string;
}
>;
// Baseline written to disk before the run; `null` means no baseline file is created.
baseline: PgssBaseline | null;
// Compared against the `baseline` returned by the stager.
expectedBaseline: PgssBaseline;
// Expected staged files keyed by path relative to the staged directory; an entry with a `json`
// key is compared after parsing, otherwise the raw file text is compared against `text`.
expectedFiles: Record<string, { json?: unknown; text?: string }>;
}
/** Loads and parses `<FIXTURE_ROOT>/<name>/input.json` as a `GoldenFixture` (no runtime validation). */
async function readFixture(name: string): Promise<GoldenFixture> {
  const inputPath = join(FIXTURE_ROOT, name, 'input.json');
  const contents = await readFile(inputPath, 'utf-8');
  return JSON.parse(contents) as GoldenFixture;
}
/** Creates a fresh directory under the OS temp dir whose name starts with `prefix` and returns its path. */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return mkdtemp(template);
}
/** Stub query client whose `executeQuery` always resolves to an empty result (no headers, no rows). */
function fakePgClient(): KloPostgresQueryClient {
  return {
    executeQuery: async () => ({ headers: [], rows: [] }),
  };
}
/**
 * Builds a `PostgresPgssReader` backed entirely by fixture data: `probe()`
 * returns `fixture.probe`, and `readSnapshot()` returns the fixture snapshot
 * with `rows` cut to the first `options.maxTemplates` entries.
 */
function fixtureReader(fixture: GoldenFixture): PostgresPgssReader {
  return {
    probe: async () => fixture.probe,
    readSnapshot: async (_client, options) => {
      const { statsResetAt, deallocCount, rows } = fixture.snapshot;
      return {
        statsResetAt,
        deallocCount,
        rows: rows.slice(0, options.maxTemplates),
      };
    },
  };
}
/**
 * Builds a `SqlAnalysisPort` that answers from `fixture.analysisBySql`.
 *
 * SQL text without a fixture entry yields an empty analysis carrying a
 * `missing fixture analysis for <sql>` error instead of throwing.
 */
function fixtureSqlAnalysis(fixture: GoldenFixture): SqlAnalysisPort {
  return {
    analyzeForFingerprint: async (sql) => {
      const canned = fixture.analysisBySql[sql];
      return canned
        ? canned
        : {
            fingerprint: '',
            normalizedSql: '',
            tablesTouched: [],
            literalSlots: [],
            error: `missing fixture analysis for ${sql}`,
          };
    },
  };
}
/** Persists the fixture's starting baseline at `path`; does nothing when the fixture has no baseline. */
async function writeFixtureBaseline(path: string, baseline: PgssBaseline | null): Promise<void> {
  if (baseline) {
    await writePgssBaselineAtomic(path, baseline);
  }
}
/**
 * Recursively lists every non-directory entry below `current`.
 *
 * @param root - Directory the returned paths are made relative to.
 * @param current - Directory being walked; defaults to `root` and is advanced by the recursion.
 * @returns Paths relative to `root`, always using `/` as the separator so they can be
 *   compared with the `/`-separated keys of a fixture's `expectedFiles` on any platform.
 */
async function listFiles(root: string, current = root): Promise<string[]> {
  const entries = await readdir(current, { withFileTypes: true });
  const files: string[] = [];
  for (const entry of entries) {
    const fullPath = join(current, entry.name);
    if (entry.isDirectory()) {
      files.push(...(await listFiles(root, fullPath)));
    } else {
      // `relative` uses the platform separator (`\` on Windows); normalize to `/`.
      files.push(relative(root, fullPath).split(sep).join('/'));
    }
  }
  return files;
}
/**
 * Asserts that `stagedDir` contains exactly the files named in `expectedFiles`
 * and that each file matches its golden content.
 *
 * Entries with a `json` key are compared structurally after `JSON.parse`;
 * all other entries are compared as exact text against `text`.
 */
async function expectGoldenFiles(stagedDir: string, expectedFiles: GoldenFixture['expectedFiles']): Promise<void> {
  const stagedPaths = await listFiles(stagedDir);
  const goldenPaths = Object.keys(expectedFiles).sort();
  expect(stagedPaths.sort()).toEqual(goldenPaths);

  for (const goldenPath of goldenPaths) {
    const golden = expectedFiles[goldenPath];
    const contents = await readFile(join(stagedDir, goldenPath), 'utf-8');
    if (!('json' in golden)) {
      expect(contents).toBe(golden.text);
      continue;
    }
    expect(JSON.parse(contents)).toEqual(golden.json);
  }
}
// Runs the stager once per committed fixture directory and compares both the staged files and the
// returned baseline with the golden expectations stored in the fixture.
describe('stagePgStatStatementsTemplates golden fixtures', () => {
  it.each(['first-run', 'normal-delta', 'reset-detected', 'version-change', 'eviction-churn'] as const)(
    'matches the committed %s golden output',
    async (fixtureName) => {
      const golden = await readFixture(fixtureName);
      const workDir = await tempDir(`pgss-golden-${fixtureName}-`);
      const stagedDir = join(workDir, 'staged');
      const baselinePath = join(workDir, 'cache', golden.connectionId, 'pgss-baseline.json');

      // Seed the previous baseline (if the fixture has one) before staging.
      await mkdir(dirname(baselinePath), { recursive: true });
      await writeFixtureBaseline(baselinePath, golden.baseline);

      const outcome = await stagePgStatStatementsTemplates({
        stagedDir,
        connectionId: golden.connectionId,
        queryClient: fakePgClient(),
        reader: fixtureReader(golden),
        sqlAnalysis: fixtureSqlAnalysis(golden),
        pullConfig: golden.pullConfig,
        baselinePath,
        now: new Date(golden.now),
      });

      await expectGoldenFiles(stagedDir, golden.expectedFiles);
      expect(outcome.baseline).toEqual(golden.expectedBaseline);
    },
  );
});

View file

@ -0,0 +1,652 @@
import { mkdtemp, readFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it, vi } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import {
pgssBaselinePath,
readPgssBaseline,
stagePgStatStatementsTemplates,
writePgssBaselineAtomic,
type PgssBaseline,
} from './stage-pgss.js';
import { historicSqlManifestSchema, historicSqlMetadataSchema, historicSqlUsageSchema } from './types.js';
import type { KloPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
/** Creates a fresh directory under the OS temp dir whose name starts with `prefix` and returns its path. */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return mkdtemp(template);
}
/** Reads `<root>/<relPath>` as UTF-8 and returns the parsed JSON cast to `T` (no runtime validation). */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const contents = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(contents) as T;
}
/** Stub query client whose `executeQuery` always resolves to an empty result (no headers, no rows). */
function fakePgClient(): KloPostgresQueryClient {
  return {
    executeQuery: async () => ({ headers: [], rows: [] }),
  };
}
/**
 * Builds a `PostgresPgssRow` for tests. `queryid` and `query` must be supplied;
 * every other field defaults to a single-analyst row in database `warehouse`
 * (dbid `5`) and can be overridden.
 */
function row(overrides: Partial<PostgresPgssRow> & Pick<PostgresPgssRow, 'queryid' | 'query'>): PostgresPgssRow {
  const defaults = {
    userid: '11',
    username: 'analyst',
    dbid: '5',
    database: 'warehouse',
    calls: 10,
    totalExecTime: 250,
    meanExecTime: 25,
    totalRows: 20,
  };
  return { ...defaults, ...overrides };
}
// Builds a mock `PostgresPgssReader` whose `probe` and `readSnapshot` are `vi.fn` spies.
// Omitted inputs fall back to: server version 'PostgreSQL 16.4', no probe warnings,
// statsResetAt '2026-05-08T08:00:00.000Z' and deallocCount 0.
// Because the fallbacks use `??`, passing `null` for statsResetAt or deallocCount also yields the default.
function fakeReader(input: {
pgServerVersion?: string;
warnings?: string[];
statsResetAt?: string | null;
deallocCount?: number | null;
rows: PostgresPgssRow[];
}): PostgresPgssReader {
return {
probe: vi.fn(async () => ({
pgServerVersion: input.pgServerVersion ?? 'PostgreSQL 16.4',
warnings: input.warnings ?? [],
})),
readSnapshot: vi.fn(async (_client, options) => ({
statsResetAt: input.statsResetAt ?? '2026-05-08T08:00:00.000Z',
deallocCount: input.deallocCount ?? 0,
// Only `maxTemplates` is honored; other snapshot options are ignored by this fake.
rows: input.rows.slice(0, options.maxTemplates),
})),
};
}
/**
 * Deterministic `SqlAnalysisPort` stub keyed on substrings of the SQL text:
 * `broken` yields a failed analysis, `customers` yields the customers
 * fingerprint, and anything else yields the orders fingerprint. The `broken`
 * check takes precedence over `customers`.
 */
const sqlAnalysis: SqlAnalysisPort = {
  analyzeForFingerprint: async (sql) => {
    if (sql.includes('broken')) {
      return {
        fingerprint: '',
        normalizedSql: '',
        tablesTouched: [],
        literalSlots: [],
        error: 'parse failed',
      };
    }
    const isCustomersQuery = sql.includes('customers');
    return isCustomersQuery
      ? {
          fingerprint: 'fp_customers',
          normalizedSql: 'SELECT count(*) FROM analytics.customers',
          tablesTouched: ['analytics.customers'],
          literalSlots: [],
        }
      : {
          fingerprint: 'fp_orders',
          normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
          tablesTouched: ['analytics.orders'],
          literalSlots: [],
        };
  },
};
/** Returns the Postgres pull config shared by these tests; only `maxTemplatesPerRun` varies (default 5000). */
function postgresPullConfig(maxTemplatesPerRun = 5000) {
  const dialect = 'postgres' as const;
  return {
    dialect,
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns: ['^svc_'],
    redactionPatterns: ['secret'],
    maxTemplatesPerRun,
    minCalls: 5,
  };
}
describe('stagePgStatStatementsTemplates', () => {
// First run with no baseline file on disk. Of the four snapshot rows only q101 is expected in the
// manifest: q102 reads pg_catalog and q103 is a bare BEGIN (neither is staged), and q104 contains
// 'broken', for which the stub analysis returns an error, producing the `analysis_failed` warning.
// Also pins the warning order, the staged metadata/usage/page content, and that staging returns
// the next baseline without writing it to `baselinePath` (reading it still resolves to null).
it('stages first-run PGSS templates as degraded aggregate templates and builds a next baseline', async () => {
const stagedDir = await tempDir('pgss-stage-first-');
const baselineRootDir = await tempDir('pgss-baseline-first-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
const result = await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
warnings: ['pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config'],
deallocCount: 2,
rows: [
row({
queryid: '101',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 10,
totalExecTime: 250,
totalRows: 20,
}),
row({
queryid: '102',
query: 'SELECT * FROM pg_catalog.pg_class',
calls: 50,
totalExecTime: 500,
}),
row({
queryid: '103',
query: 'BEGIN',
calls: 75,
totalExecTime: 75,
}),
row({
queryid: '104',
query: 'SELECT broken FROM analytics.orders',
calls: 8,
totalExecTime: 80,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest).toMatchObject({
source: 'historic-sql',
connectionId: 'conn_pg',
dialect: 'postgres',
fetchedAt: '2026-05-08T12:00:00.000Z',
windowEnd: '2026-05-08T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
templateCount: 1,
capped: false,
degraded: true,
statsResetAt: '2026-05-08T08:00:00.000Z',
baselineFirstRun: true,
pgServerVersion: 'PostgreSQL 16.4',
deallocCount: 2,
});
expect(manifest.warnings).toEqual([
'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config',
'pgss_dealloc_count:2; pg_stat_statements.max may be too low, causing template eviction churn',
'baseline_first_run:no_previous_pgss_baseline',
'analysis_failed:db5_q104',
]);
expect(manifest.templates).toEqual([
{
id: 'db5_q101',
fingerprint: 'fp_orders',
subClusterId: null,
path: 'templates/db5_q101/page.md',
},
]);
const metadata = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q101/metadata.json'));
expect(metadata).toMatchObject({
id: 'db5_q101',
title: 'postgres · analytics.orders [db5_q101]',
path: 'templates/db5_q101/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_orders',
sub_cluster_id: null,
dialect: 'postgres',
tables_touched: ['analytics.orders'],
literal_slots: [],
},
});
expect(metadata.properties.triage_signals).toEqual({
executions_bucket: 'mid',
distinct_users_bucket: 'solo',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
runtime_bucket: 'fast',
});
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q101/usage.json'));
expect(usage).toEqual({
stats: {
executions: 10,
distinct_users: 1,
first_seen: '2026-05-08T12:00:00.000Z',
last_seen: '2026-05-08T12:00:00.000Z',
p50_runtime_ms: null,
p95_runtime_ms: null,
mean_runtime_ms: 25,
error_rate: 0,
rows_produced: 20,
},
literal_slots: [],
samples: [],
});
expect(await readFile(join(stagedDir, 'templates/db5_q101/page.md'), 'utf-8')).toContain(
'SELECT count(*) FROM analytics.orders WHERE status = $1',
);
expect(result.baselinePath).toBe(baselinePath);
expect(result.baseline.templates.db5_q101.perUser['11']).toEqual({
calls: 10,
totalExecTime: 250,
totalRows: 20,
});
await expect(readPgssBaseline(baselinePath)).resolves.toBeNull();
});
// A positive `deallocCount` from the snapshot must be copied into the manifest and surfaced as a
// `pgss_dealloc_count:<n>` warning. The manifest is read without schema validation here.
it('warns when pg_stat_statements reports dealloc churn', async () => {
const root = await tempDir('pgss-churn-');
const stagedDir = join(root, 'staged');
const baselinePath = join(root, 'cache', 'warehouse', 'pgss-baseline.json');
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'warehouse',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
row({
queryid: '901',
query: 'SELECT COUNT(*) FROM public.orders WHERE status = $1',
calls: 20,
totalExecTime: 500,
meanExecTime: 25,
}),
],
deallocCount: 3,
}),
sqlAnalysis,
pullConfig: postgresPullConfig(50),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = await readJson<{ warnings: string[]; deallocCount: number }>(stagedDir, 'manifest.json');
expect(manifest.deallocCount).toBe(3);
expect(manifest.warnings).toContain(
'pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn',
);
});
// Second run against a saved baseline. For q201, user 11 advanced from 10 to 12 calls
// (delta: 2 calls, 60 ms, 8 rows) while user 12 is unchanged and contributes nothing, so the
// template reports 2 executions by 1 distinct user with a 30 ms mean. q202 is absent from the
// baseline, so its full counters (7 calls, 210 ms) are staged and it is first seen at `now`.
// q202 is expected ahead of q201 in the manifest order.
it('uses the saved cumulative baseline to stage only positive deltas on later runs', async () => {
const stagedDir = await tempDir('pgss-stage-delta-');
const baselineRootDir = await tempDir('pgss-baseline-delta-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
const baseline: PgssBaseline = {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q201: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 10, totalExecTime: 100, totalRows: 50 },
'12': { calls: 5, totalExecTime: 50, totalRows: 25 },
},
},
},
};
await writePgssBaselineAtomic(baselinePath, baseline);
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
row({
queryid: '201',
userid: '11',
username: 'analyst',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 12,
totalExecTime: 160,
totalRows: 58,
}),
row({
queryid: '201',
userid: '12',
username: 'svc_loader',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 5,
totalExecTime: 50,
totalRows: 25,
}),
row({
queryid: '202',
userid: '13',
username: 'analyst_2',
query: 'SELECT count(*) FROM analytics.customers',
calls: 7,
totalExecTime: 210,
totalRows: 7,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.baselineFirstRun).toBe(false);
expect(manifest.windowStart).toBe('2026-05-08T10:00:00.000Z');
expect(manifest.templateCount).toBe(2);
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q202', 'db5_q201']);
const usage201 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q201/usage.json'));
expect(usage201.stats).toMatchObject({
executions: 2,
distinct_users: 1,
first_seen: '2026-05-08T09:00:00.000Z',
last_seen: '2026-05-08T12:00:00.000Z',
mean_runtime_ms: 30,
rows_produced: 8,
});
const metadata201 = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q201/metadata.json'));
expect(metadata201.properties.triage_signals.service_account_only).toBe('false');
const usage202 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q202/usage.json'));
expect(usage202.stats).toMatchObject({
executions: 7,
distinct_users: 1,
first_seen: '2026-05-08T12:00:00.000Z',
mean_runtime_ms: 30,
rows_produced: 7,
});
});
// The same queryid (701) appears in dbid 5 and dbid 6. Each must be diffed against its own
// baseline entry: db5 goes 10 -> 12 calls (2 executions, 8 rows) and db6 goes 4 -> 9 calls
// (5 executions, 15 rows), each keeping its own `firstObservedAt`. The returned baseline must
// hold the new cumulative counters for both template ids.
it('keeps matching queryid values from different databases as distinct templates and baseline entries', async () => {
const stagedDir = await tempDir('pgss-stage-db-key-');
const baselineRootDir = await tempDir('pgss-baseline-db-key-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(baselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q701: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 10, totalExecTime: 100, totalRows: 50 },
},
},
db6_q701: {
firstObservedAt: '2026-05-08T09:30:00.000Z',
perUser: {
'11': { calls: 4, totalExecTime: 40, totalRows: 20 },
},
},
},
});
const result = await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
row({
queryid: '701',
dbid: '5',
database: 'warehouse',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 12,
totalExecTime: 160,
totalRows: 58,
}),
row({
queryid: '701',
dbid: '6',
database: 'app',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 9,
totalExecTime: 130,
totalRows: 35,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templates.map((template) => template.id).sort()).toEqual(['db5_q701', 'db6_q701']);
const warehouseUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q701/usage.json'));
expect(warehouseUsage.stats).toMatchObject({
executions: 2,
rows_produced: 8,
first_seen: '2026-05-08T09:00:00.000Z',
});
const appUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db6_q701/usage.json'));
expect(appUsage.stats).toMatchObject({
executions: 5,
rows_produced: 15,
first_seen: '2026-05-08T09:30:00.000Z',
});
expect(result.baseline.templates.db5_q701.perUser['11']).toEqual({
calls: 12,
totalExecTime: 160,
totalRows: 58,
});
expect(result.baseline.templates.db6_q701.perUser['11']).toEqual({
calls: 9,
totalExecTime: 130,
totalRows: 35,
});
});
// Two independent scenarios in one case, each with its own staged dir and baseline file:
//  1. The snapshot's stats_reset (11:00) is later than the baseline's (08:00): the run is flagged
//     as a first run, a `baseline_reset:stats_reset advanced ...` warning is emitted, and the
//     current counters are staged undiffed (3 executions, not 3 - 100).
//  2. The baseline was taken on PostgreSQL 15.7 and the probe reports 16.4: the run is flagged as
//     a first run with a `baseline_reset:pg_server_major changed from 15 to 16` warning.
it('treats stats_reset advancement and major-version changes as fresh baselines', async () => {
const resetStagedDir = await tempDir('pgss-stage-reset-');
const resetBaselineRootDir = await tempDir('pgss-baseline-reset-');
const resetBaselinePath = pgssBaselinePath(resetBaselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(resetBaselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q301: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
},
},
},
});
await stagePgStatStatementsTemplates({
stagedDir: resetStagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
statsResetAt: '2026-05-08T11:00:00.000Z',
rows: [
row({
queryid: '301',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 3,
totalExecTime: 90,
totalRows: 9,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath: resetBaselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const resetManifest = historicSqlManifestSchema.parse(await readJson(resetStagedDir, 'manifest.json'));
expect(resetManifest.baselineFirstRun).toBe(true);
expect(resetManifest.warnings).toContain(
'baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z',
);
const resetUsage = historicSqlUsageSchema.parse(await readJson(resetStagedDir, 'templates/db5_q301/usage.json'));
expect(resetUsage.stats.executions).toBe(3);
const versionStagedDir = await tempDir('pgss-stage-version-');
const versionBaselineRootDir = await tempDir('pgss-baseline-version-');
const versionBaselinePath = pgssBaselinePath(versionBaselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(versionBaselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 15.7',
templates: {
db5_q302: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
},
},
},
});
await stagePgStatStatementsTemplates({
stagedDir: versionStagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
pgServerVersion: 'PostgreSQL 16.4',
rows: [
row({
queryid: '302',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 4,
totalExecTime: 80,
totalRows: 8,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath: versionBaselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const versionManifest = historicSqlManifestSchema.parse(await readJson(versionStagedDir, 'manifest.json'));
expect(versionManifest.baselineFirstRun).toBe(true);
expect(versionManifest.warnings).toContain('baseline_reset:pg_server_major changed from 15 to 16');
});
// stats_reset is unchanged, but user 11's counters went backwards (100 -> 2 calls). Only that
// (dbid, queryid, userid) scope is treated as reset: its current counters are taken whole
// (2 calls, 30 ms, 6 rows) and a `scoped_reset` warning is emitted, while user 12 is diffed
// normally (55 - 50 = 5 calls, 150 ms, 25 rows). Totals: 7 executions, 180 ms (mean 180 / 7),
// 31 rows, 2 distinct users, and the run is not flagged as a first run.
it('handles scoped counter regressions without forcing a global first-run baseline', async () => {
const stagedDir = await tempDir('pgss-stage-scoped-');
const baselineRootDir = await tempDir('pgss-baseline-scoped-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(baselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q401: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
'12': { calls: 50, totalExecTime: 500, totalRows: 250 },
},
},
},
});
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
statsResetAt: '2026-05-08T08:00:00.000Z',
rows: [
row({
queryid: '401',
userid: '11',
username: 'analyst',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 2,
totalExecTime: 30,
totalRows: 6,
}),
row({
queryid: '401',
userid: '12',
username: 'svc_loader',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 55,
totalExecTime: 650,
totalRows: 275,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.baselineFirstRun).toBe(false);
expect(manifest.warnings).toContain('scoped_reset:dbid=5 queryid=401 userid=11');
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q401/usage.json'));
expect(usage.stats).toMatchObject({
executions: 7,
distinct_users: 2,
mean_runtime_ms: 25.714285714285715,
rows_produced: 31,
});
});
// With `maxTemplatesPerRun` set to 2 and three analyzable single-user templates, the two with
// the most calls (q502 with 20, q503 with 10) must be kept in that order, q501 (2 calls) dropped,
// the manifest marked `capped`, and a `templates_truncated` warning emitted.
it('ranks and caps selected PGSS templates after skip and analysis filtering', async () => {
const stagedDir = await tempDir('pgss-stage-cap-');
const baselineRootDir = await tempDir('pgss-baseline-cap-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
row({
queryid: '501',
username: 'analyst_a',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 2,
totalExecTime: 20,
}),
row({
queryid: '502',
username: 'analyst_b',
query: 'SELECT count(*) FROM analytics.customers',
calls: 20,
totalExecTime: 200,
}),
row({
queryid: '503',
username: 'analyst_c',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 10,
totalExecTime: 100,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(2),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.capped).toBe(true);
expect(manifest.warnings).toContain('templates_truncated: kept 2 of 3 templates');
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q502', 'db5_q503']);
});
});

View file

@ -0,0 +1,508 @@
import { mkdir, readFile, rename, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import { z } from 'zod';
import type { SqlAnalysisFingerprintResult, SqlAnalysisPort } from '../../../sql-analysis/index.js';
import {
HISTORIC_SQL_OBJECT_TYPE,
HISTORIC_SQL_SOURCE_KEY,
historicSqlPullConfigSchema,
type HistoricSqlManifest,
type HistoricSqlMetadata,
type HistoricSqlPullConfig,
type HistoricSqlUsage,
type KloPostgresQueryClient,
type PostgresPgssAggregateRow,
type PostgresPgssReader,
type PostgresPgssRow,
} from './types.js';
// Version literal that every persisted baseline must carry; the schema below rejects any other value.
const PGSS_BASELINE_VERSION = 1 as const;
// Counters stored for one userid within one template: non-negative, with `calls` and `totalRows` integral.
const pgssCounterSchema = z.object({
calls: z.number().int().nonnegative(),
totalExecTime: z.number().nonnegative(),
totalRows: z.number().int().nonnegative(),
});
// On-disk baseline format, validated on both read and write. `templates` is keyed by template id
// (see `pgssTemplateId`), and each template's `perUser` record is keyed by userid.
const pgssBaselineSchema = z.object({
version: z.literal(PGSS_BASELINE_VERSION),
fetchedAt: z.string().datetime(),
statsResetAt: z.string().datetime().nullable(),
pgServerVersion: z.string(),
templates: z.record(
z.string(),
z.object({
firstObservedAt: z.string().datetime(),
perUser: z.record(z.string(), pgssCounterSchema),
}),
),
});
// Static type of a validated baseline, derived from the schema so the two cannot drift apart.
export type PgssBaseline = z.infer<typeof pgssBaselineSchema>;
// Arguments for `stagePgStatStatementsTemplates`.
export interface StagePgStatStatementsTemplatesInput {
// Directory that receives `manifest.json` and the `templates/<id>/` files; created if missing.
stagedDir: string;
connectionId: string;
// Passed through to `reader.probe` and `reader.readSnapshot`.
queryClient: KloPostgresQueryClient;
reader: PostgresPgssReader;
sqlAnalysis: SqlAnalysisPort;
// Validated with `historicSqlPullConfigSchema`; must have dialect `postgres`.
pullConfig: HistoricSqlPullConfig;
// File the previous baseline is read from; staging itself does not write to it.
baselinePath: string;
// Clock override for the run; defaults to the current time.
now?: Date;
}
// Result of a staging run: the baseline path it was given and the baseline computed for the next run.
export interface StagePgStatStatementsTemplatesResult {
baselinePath: string;
baseline: PgssBaseline;
}
// Counter triple for one userid within one template; same fields as `pgssCounterSchema`.
interface PgssBaselineCounter {
calls: number;
totalExecTime: number;
totalRows: number;
}
// Working accumulator used while folding per-user snapshot rows into one entry per template id.
interface PgssAggregateMutable {
id: string;
queryid: string;
dbid: string;
database: string | null;
query: string;
// Sums of the non-negative per-user deltas relative to the baseline.
deltaCalls: number;
deltaExecTime: number;
deltaRows: number;
// Usernames (or 'unknown') of users whose call count increased.
users: Set<string>;
firstObservedAt: string;
}
// An aggregate paired with its successful SQL analysis.
interface AnalyzedPgssTemplate {
aggregate: PostgresPgssAggregateRow;
analysis: SqlAnalysisFingerprintResult;
}
// Baseline used when there is nothing to subtract (first run, unseen user/template, or scoped reset).
// Shared instance: callers only read from it.
const ZERO_COUNTER: PgssBaselineCounter = {
calls: 0,
totalExecTime: 0,
totalRows: 0,
};
// `maxTemplates` value passed to `reader.readSnapshot`.
const PGSS_SNAPSHOT_READ_LIMIT = 5000;
// Statements whose leading keyword marks them as session/utility commands; never staged.
const PGSS_HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET|BEGIN|COMMIT|ROLLBACK|VACUUM|ANALYZE)\b/i;
// Statements that mention system catalogs or statistics views anywhere in their text; never staged.
const PGSS_HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|pg_catalog\.|pg_toast\.|pg_stat_)/i;
/** Builds the template id `db<dbid>_q<queryid>`, so equal queryids in different databases stay distinct. */
function pgssTemplateId(row: Pick<PostgresPgssRow, 'dbid' | 'queryid'>): string {
  const { dbid, queryid } = row;
  return 'db' + String(dbid) + '_q' + String(queryid);
}
/**
 * Returns the baseline file location for a connection:
 * `<rootDir>/<connectionId>/pgss-baseline.json`.
 *
 * When `rootDir` is `undefined`, `.klo/cache/historic-sql` under the current
 * working directory is used as the root.
 */
export function pgssBaselinePath(rootDir: string | undefined, connectionId: string): string {
  const cacheRoot = rootDir ?? join(process.cwd(), '.klo/cache/historic-sql');
  return join(cacheRoot, connectionId, 'pgss-baseline.json');
}
/**
 * Loads and validates the baseline stored at `path`.
 *
 * @returns The parsed baseline, or `null` when the file does not exist (`ENOENT`).
 * @throws Any other read error, JSON syntax error, or schema validation error.
 */
export async function readPgssBaseline(path: string): Promise<PgssBaseline | null> {
  try {
    const contents = await readFile(path, 'utf-8');
    const decoded: unknown = JSON.parse(contents);
    return pgssBaselineSchema.parse(decoded);
  } catch (error) {
    const isMissingFile =
      typeof error === 'object' && error !== null && 'code' in error && error.code === 'ENOENT';
    if (!isMissingFile) {
      throw error;
    }
    return null;
  }
}
/**
 * Validates `baseline` and writes it to `path` as pretty-printed JSON with a
 * trailing newline.
 *
 * The parent directory is created if needed. The content is first written to
 * `<path>.tmp` and then renamed over `path`, so the destination is replaced in
 * a single rename step rather than being written in place.
 *
 * @throws Schema validation errors (before any file is touched) and file-system errors.
 */
export async function writePgssBaselineAtomic(path: string, baseline: PgssBaseline): Promise<void> {
  const validated = pgssBaselineSchema.parse(baseline);
  const stagingPath = `${path}.tmp`;
  await mkdir(dirname(path), { recursive: true });
  const serialized = `${JSON.stringify(validated, null, 2)}\n`;
  await writeFile(stagingPath, serialized, 'utf-8');
  await rename(stagingPath, path);
}
// Stages one pg_stat_statements pull into `input.stagedDir`.
//
// Steps, in order: validate the pull config (must be dialect `postgres`), probe the server, read
// the previous baseline and the current snapshot, decide whether the baseline is still usable,
// diff the snapshot against it per template, drop hard-skipped SQL, analyze each remaining
// template, rank and cap the survivors, write per-template files plus `manifest.json`, and return
// the baseline for the next run.
//
// The returned baseline is NOT written to `input.baselinePath` here; persisting it is left to the caller.
// Throws when the config is invalid or not postgres, and propagates reader, analysis and file-system errors.
export async function stagePgStatStatementsTemplates(
input: StagePgStatStatementsTemplatesInput,
): Promise<StagePgStatStatementsTemplatesResult> {
const config = historicSqlPullConfigSchema.parse(input.pullConfig);
if (config.dialect !== 'postgres') {
throw new Error(`stagePgStatStatementsTemplates requires dialect postgres, got ${config.dialect}`);
}
const now = input.now ?? new Date();
const fetchedAt = now.toISOString();
const probe = await input.reader.probe(input.queryClient);
// `warnings` is appended to throughout the run; the final order is probe, dealloc, reset, then
// scoped-reset and analysis warnings in processing order, then truncation.
const warnings = [...probe.warnings];
const baseline = await readPgssBaseline(input.baselinePath);
const snapshot = await input.reader.readSnapshot(input.queryClient, {
minCalls: config.minCalls,
maxTemplates: PGSS_SNAPSHOT_READ_LIMIT,
});
if (snapshot.deallocCount !== null && snapshot.deallocCount > 0) {
warnings.push(
`pgss_dealloc_count:${snapshot.deallocCount}; pg_stat_statements.max may be too low, causing template eviction churn`,
);
}
const reset = detectBaselineReset({
baseline,
snapshotStatsResetAt: snapshot.statsResetAt,
currentPgServerVersion: probe.pgServerVersion,
});
warnings.push(...reset.warnings);
// Aggregation may push `scoped_reset` warnings into `warnings` as a side effect.
const aggregates = aggregatePgssRows({
rows: snapshot.rows,
baseline,
baselineFirstRun: reset.baselineFirstRun,
fetchedAt,
warnings,
}).filter((aggregate) => !shouldSkipPgssSql(aggregate.query));
const analyzed: AnalyzedPgssTemplate[] = [];
// Analyzed sequentially so `analysis_failed` warnings keep the aggregate order.
for (const aggregate of aggregates) {
const analysis = await input.sqlAnalysis.analyzeForFingerprint(aggregate.query, 'postgres');
if (analysis.error || !analysis.fingerprint || !analysis.normalizedSql) {
warnings.push(`analysis_failed:${aggregate.id}`);
continue;
}
analyzed.push({ aggregate, analysis });
}
const selected = selectPgssTemplates(analyzed, config.maxTemplatesPerRun);
if (selected.length < analyzed.length) {
warnings.push(`templates_truncated: kept ${selected.length} of ${analyzed.length} templates`);
}
await mkdir(input.stagedDir, { recursive: true });
const templates: HistoricSqlManifest['templates'] = [];
for (const template of selected) {
const staged = buildPgssStagedTemplate(template, config, now);
const basePath = `templates/${staged.metadata.id}`;
await writeJson(input.stagedDir, `${basePath}/metadata.json`, staged.metadata);
await writeText(input.stagedDir, `${basePath}/page.md`, staged.pageMarkdown);
await writeJson(input.stagedDir, `${basePath}/usage.json`, staged.usage);
templates.push({
id: staged.metadata.id,
fingerprint: staged.metadata.properties.fingerprint,
subClusterId: staged.metadata.properties.sub_cluster_id,
path: staged.metadata.path,
});
}
await writeJson(input.stagedDir, 'manifest.json', {
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: input.connectionId,
dialect: 'postgres',
fetchedAt,
// NOTE(review): when a baseline exists but was discarded (`reset.baselineFirstRun` is true), the
// staged counters are undiffed, yet `windowStart` still uses the old baseline's `fetchedAt`
// rather than `snapshot.statsResetAt` — confirm this is intended.
windowStart: baseline?.fetchedAt ?? snapshot.statsResetAt ?? fetchedAt,
windowEnd: fetchedAt,
nextSuccessfulCursor: fetchedAt,
templateCount: selected.length,
capped: selected.length < analyzed.length,
warnings,
// Always true for this source.
degraded: true,
statsResetAt: snapshot.statsResetAt,
baselineFirstRun: reset.baselineFirstRun,
pgServerVersion: probe.pgServerVersion,
deallocCount: snapshot.deallocCount,
templates,
} satisfies HistoricSqlManifest);
return {
baselinePath: input.baselinePath,
baseline: buildNextBaseline({
// Built from the full snapshot rows, not only the staged templates.
rows: snapshot.rows,
fetchedAt,
statsResetAt: snapshot.statsResetAt,
pgServerVersion: probe.pgServerVersion,
// A discarded baseline must not carry over into the next one.
previousBaseline: reset.baselineFirstRun ? null : baseline,
}),
};
}
/**
 * Decides whether the stored baseline can still be diffed against the current snapshot.
 *
 * The baseline is treated as unusable (`baselineFirstRun: true`) when:
 * - there is no baseline at all (`baseline_first_run` warning);
 * - both sides report a `stats_reset` time and the snapshot's is later (`baseline_reset:stats_reset`); or
 * - both server versions yield a major number and the numbers differ (`baseline_reset:pg_server_major`).
 *
 * The `stats_reset` values are compared as instants rather than as strings, so equivalent
 * ISO-8601 spellings (different fractional precision or UTC offset) neither hide a real reset
 * nor fake one. If either value cannot be parsed, the original lexicographic comparison is used.
 *
 * @returns The first-run flag and the warnings explaining it (empty when the baseline is usable).
 */
function detectBaselineReset(input: {
  baseline: PgssBaseline | null;
  snapshotStatsResetAt: string | null;
  currentPgServerVersion: string;
}): { baselineFirstRun: boolean; warnings: string[] } {
  const { baseline, snapshotStatsResetAt, currentPgServerVersion } = input;
  if (!baseline) {
    return { baselineFirstRun: true, warnings: ['baseline_first_run:no_previous_pgss_baseline'] };
  }
  const warnings: string[] = [];
  if (baseline.statsResetAt && snapshotStatsResetAt) {
    const previousMs = Date.parse(baseline.statsResetAt);
    const currentMs = Date.parse(snapshotStatsResetAt);
    const comparable = !Number.isNaN(previousMs) && !Number.isNaN(currentMs);
    const advanced = comparable ? previousMs < currentMs : baseline.statsResetAt < snapshotStatsResetAt;
    if (advanced) {
      warnings.push(
        `baseline_reset:stats_reset advanced from ${baseline.statsResetAt} to ${snapshotStatsResetAt}`,
      );
    }
  }
  const previousMajor = postgresMajor(baseline.pgServerVersion);
  const currentMajor = postgresMajor(currentPgServerVersion);
  if (previousMajor && currentMajor && previousMajor !== currentMajor) {
    warnings.push(`baseline_reset:pg_server_major changed from ${previousMajor} to ${currentMajor}`);
  }
  return { baselineFirstRun: warnings.length > 0, warnings };
}
/**
 * Extracts the leading major version number from a Postgres version string.
 *
 * Accepts the `version()` banner form (`PostgreSQL 16.4 on ...`) as well as a bare version
 * (`16.4`, `16`). Bare pre-release versions such as `17beta1` or `18devel` are accepted too,
 * matching what the banner pattern already allows.
 *
 * @returns The leading digit run as a string, or `null` when no version number is recognised.
 */
function postgresMajor(version: string): string | null {
  const banner = /PostgreSQL\s+(\d+)/i.exec(version);
  if (banner) {
    return banner[1] ?? null;
  }
  // Bare form: digits followed by '.', whitespace, a pre-release suffix letter, or end of string.
  const bare = /^\s*(\d+)(?=[.\sA-Za-z]|$)/.exec(version);
  return bare?.[1] ?? null;
}
/**
 * Folds per-user snapshot rows into one aggregate per template id, expressed as deltas against
 * the previous baseline.
 *
 * - On a first run (`baselineFirstRun`) the baseline is ignored and every row counts in full.
 * - Otherwise rows whose call count did not change are skipped entirely.
 * - A user is counted only when their call delta is positive; a missing username is recorded
 *   as `'unknown'`.
 * - `firstObservedAt` comes from the baseline template when one is in use, else `fetchedAt`.
 * - Templates whose summed call delta is not positive are dropped from the result.
 *
 * `input.warnings` may be appended to by `scopedCounterBaseline` when a counter regressed.
 */
function aggregatePgssRows(input: {
  rows: PostgresPgssRow[];
  baseline: PgssBaseline | null;
  baselineFirstRun: boolean;
  fetchedAt: string;
  warnings: string[];
}): PostgresPgssAggregateRow[] {
  const byTemplate = new Map<string, PgssAggregateMutable>();

  for (const row of input.rows) {
    const templateId = pgssTemplateId(row);
    const priorTemplate = input.baselineFirstRun ? undefined : input.baseline?.templates[templateId];
    const prior = scopedCounterBaseline(
      row,
      priorTemplate?.perUser[row.userid],
      input.baselineFirstRun,
      input.warnings,
    );
    const callsDelta = row.calls - prior.calls;
    const execTimeDelta = row.totalExecTime - prior.totalExecTime;
    const rowsDelta = row.totalRows - prior.totalRows;
    if (callsDelta === 0 && !input.baselineFirstRun) {
      continue;
    }

    let bucket: PgssAggregateMutable | undefined = byTemplate.get(templateId);
    if (bucket === undefined) {
      // The first contributing row for a template supplies its identifying fields and SQL text.
      bucket = {
        id: templateId,
        queryid: row.queryid,
        dbid: row.dbid,
        database: row.database,
        query: row.query,
        deltaCalls: 0,
        deltaExecTime: 0,
        deltaRows: 0,
        users: new Set<string>(),
        firstObservedAt: priorTemplate?.firstObservedAt ?? input.fetchedAt,
      };
      byTemplate.set(templateId, bucket);
    }

    bucket.deltaCalls += Math.max(0, callsDelta);
    bucket.deltaExecTime += Math.max(0, execTimeDelta);
    bucket.deltaRows += Math.max(0, rowsDelta);
    if (callsDelta > 0) {
      bucket.users.add(row.username ?? 'unknown');
    }
  }

  const aggregated: PostgresPgssAggregateRow[] = [];
  for (const bucket of byTemplate.values()) {
    if (!(bucket.deltaCalls > 0)) {
      continue;
    }
    aggregated.push({
      id: bucket.id,
      queryid: bucket.queryid,
      dbid: bucket.dbid,
      database: bucket.database,
      query: bucket.query,
      deltaCalls: bucket.deltaCalls,
      deltaExecTime: bucket.deltaExecTime,
      deltaRows: bucket.deltaRows,
      meanExecTime: bucket.deltaExecTime / Math.max(bucket.deltaCalls, 1),
      distinctUsersDelta: bucket.users.size,
      users: [...bucket.users].sort(),
      firstObservedAt: bucket.firstObservedAt,
    });
  }
  return aggregated;
}
/**
 * Picks the counter values a row's deltas should be measured against.
 *
 * Returns ZERO_COUNTER when there is no baseline counter or on a first run.
 * If any baseline counter exceeds the row's current value, a `scoped_reset`
 * warning is recorded and ZERO_COUNTER is returned as well. Otherwise the
 * baseline counter itself is returned.
 */
function scopedCounterBaseline(
  row: PostgresPgssRow,
  baselineCounter: PgssBaselineCounter | undefined,
  baselineFirstRun: boolean,
  warnings: string[],
): PgssBaselineCounter {
  if (baselineFirstRun || !baselineCounter) {
    return ZERO_COUNTER;
  }

  const wentBackwards =
    row.calls < baselineCounter.calls ||
    row.totalExecTime < baselineCounter.totalExecTime ||
    row.totalRows < baselineCounter.totalRows;
  if (!wentBackwards) {
    return baselineCounter;
  }

  warnings.push(`scoped_reset:dbid=${row.dbid} queryid=${row.queryid} userid=${row.userid}`);
  return ZERO_COUNTER;
}
/** Returns true when the statement matches either PGSS hard-skip pattern (prefix or table). */
function shouldSkipPgssSql(sql: string): boolean {
  if (PGSS_HARD_SKIP_PREFIX_RE.test(sql)) {
    return true;
  }
  return PGSS_HARD_SKIP_TABLE_RE.test(sql);
}
/**
 * Ranks templates and keeps the first `maxTemplatesPerRun`.
 *
 * Score is `users.length * log1p(deltaCalls)`, highest first; ties are broken
 * by aggregate id so the selection is deterministic. The input array is not
 * mutated.
 */
function selectPgssTemplates(templates: AnalyzedPgssTemplate[], maxTemplatesPerRun: number): AnalyzedPgssTemplate[] {
  const scoreOf = (candidate: AnalyzedPgssTemplate): number =>
    candidate.aggregate.users.length * Math.log1p(candidate.aggregate.deltaCalls);

  const ranked = templates.map((template) => ({ template, score: scoreOf(template) }));
  ranked.sort((a, b) => {
    const byScore = b.score - a.score;
    return byScore || a.template.aggregate.id.localeCompare(b.template.aggregate.id);
  });

  const kept: AnalyzedPgssTemplate[] = [];
  for (const entry of ranked.slice(0, maxTemplatesPerRun)) {
    kept.push(entry.template);
  }
  return kept;
}
/**
 * Builds the three staged artifacts (metadata, markdown page, usage stats) for
 * one analyzed pg_stat_statements template.
 *
 * `now` is used as the last-seen timestamp in both the triage signals and the
 * usage stats. Percentile runtimes are emitted as null, literal slots and
 * samples are empty, and error_rate is a fixed 0.
 */
function buildPgssStagedTemplate(
  template: AnalyzedPgssTemplate,
  config: HistoricSqlPullConfig,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const { aggregate, analysis } = template;
  const id = aggregate.id;
  const tablesTouched = Array.from(analysis.tablesTouched).sort();
  const titleTable = tablesTouched[0] ?? 'query';
  const lastSeen = now.toISOString();

  const triageSignals = buildPgssTriageSignals({
    executions: aggregate.deltaCalls,
    distinctUsers: aggregate.distinctUsersDelta,
    firstSeen: aggregate.firstObservedAt,
    lastSeen,
    meanRuntimeMs: aggregate.meanExecTime,
    serviceAccountOnly: isServiceAccountOnly(aggregate.users, config.serviceAccountUserPatterns),
    now,
  });

  const metadata: HistoricSqlMetadata = {
    id,
    title: `postgres · ${titleTable} [${id.slice(0, 12)}]`,
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint: analysis.fingerprint,
      sub_cluster_id: null,
      dialect: 'postgres',
      tables_touched: tablesTouched,
      literal_slots: [],
      triage_signals: triageSignals,
    },
  };

  const pageMarkdown = renderTemplatePage(id, analysis.normalizedSql, tablesTouched);

  const usage: HistoricSqlUsage = {
    stats: {
      executions: aggregate.deltaCalls,
      distinct_users: aggregate.distinctUsersDelta,
      first_seen: aggregate.firstObservedAt,
      last_seen: lastSeen,
      p50_runtime_ms: null,
      p95_runtime_ms: null,
      mean_runtime_ms: aggregate.meanExecTime,
      error_rate: 0,
      rows_produced: aggregate.deltaRows,
    },
    literal_slots: [],
    samples: [],
  };

  return { metadata, pageMarkdown, usage };
}
/**
 * Buckets raw usage numbers into the coarse string signals stored on a
 * template's metadata.
 *
 * Thresholds: executions <3 low, <50 mid, else high; distinct users <=1 solo,
 * <=5 team, else broad. `error_rate_bucket` is always 'ok'. `firstSeen` is
 * accepted but not used.
 */
function buildPgssTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  firstSeen: string;
  lastSeen: string;
  meanRuntimeMs: number;
  serviceAccountOnly: boolean;
  now: Date;
}): Record<string, string> {
  const { executions, distinctUsers, lastSeen, meanRuntimeMs, serviceAccountOnly, now } = input;

  let executionsBucket = 'high';
  if (executions < 3) {
    executionsBucket = 'low';
  } else if (executions < 50) {
    executionsBucket = 'mid';
  }

  let distinctUsersBucket = 'broad';
  if (distinctUsers <= 1) {
    distinctUsersBucket = 'solo';
  } else if (distinctUsers <= 5) {
    distinctUsersBucket = 'team';
  }

  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: distinctUsersBucket,
    error_rate_bucket: 'ok',
    recency_bucket: recencyBucket(lastSeen, now),
    service_account_only: String(serviceAccountOnly),
    runtime_bucket: runtimeBucket(meanRuntimeMs),
  };
}
/**
 * Classifies a mean runtime in milliseconds: below 100 is 'fast', below 1000
 * is 'moderate', anything else (including NaN) is 'slow'.
 */
function runtimeBucket(meanRuntimeMs: number): string {
  const tiers: Array<{ below: number; label: string }> = [
    { below: 100, label: 'fast' },
    { below: 1000, label: 'moderate' },
  ];
  const match = tiers.find((tier) => meanRuntimeMs < tier.below);
  return match ? match.label : 'slow';
}
/**
 * Classifies how long ago `lastSeen` was relative to `now`: up to 14 days is
 * 'active', up to 60 days is 'warm', otherwise 'cold'. Timestamps in the
 * future count as age zero; an unparseable timestamp falls through to 'cold'.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / 86400000);
  return ageDays <= 14 ? 'active' : ageDays <= 60 ? 'warm' : 'cold';
}
/**
 * Reports whether every user in `users` matches at least one of the
 * configured service-account patterns.
 *
 * Patterns are regular-expression sources. A pattern that does not compile is
 * skipped rather than thrown, so one malformed config entry cannot abort the
 * whole run; skipping can only make the result less likely to be true.
 *
 * @param users user names observed for a template
 * @param patterns regular-expression sources identifying service accounts
 * @returns false when `users` is empty, when `patterns` is empty or contains
 *   no valid pattern, or when any user matches none of the valid patterns
 */
function isServiceAccountOnly(users: string[], patterns: string[]): boolean {
  if (users.length === 0 || patterns.length === 0) {
    return false;
  }
  const regexes: RegExp[] = [];
  for (const pattern of patterns) {
    try {
      regexes.push(new RegExp(pattern));
    } catch {
      // Invalid regular-expression source: ignore this pattern and keep the rest.
    }
  }
  if (regexes.length === 0) {
    return false;
  }
  return users.every((user) => regexes.some((regex) => regex.test(user)));
}
/**
 * Renders the markdown page for a template: an H1 with the id, the normalized
 * SQL in a fenced `sql` block, and a bullet list of the touched tables. The
 * result always ends with a newline.
 */
function renderTemplatePage(id: string, normalizedSql: string, tablesTouched: string[]): string {
  const sqlSection = ['## Normalized SQL', '```sql', normalizedSql, '```'].join('\n');
  const tableBullets = tablesTouched.map((table) => `- ${table}\n`).join('');
  return `# ${id}\n\n${sqlSection}\n\n## Tables touched\n${tableBullets}`;
}
/**
 * Builds the baseline to persist for the next run from the current rows.
 *
 * Every row's counters are stored under its template id and userid. A
 * template keeps the `firstObservedAt` it had in the previous baseline, and
 * otherwise gets `fetchedAt`. Templates absent from `rows` are not carried
 * over.
 */
function buildNextBaseline(input: {
  rows: PostgresPgssRow[];
  fetchedAt: string;
  statsResetAt: string | null;
  pgServerVersion: string;
  previousBaseline: PgssBaseline | null;
}): PgssBaseline {
  const { rows, fetchedAt, statsResetAt, pgServerVersion, previousBaseline } = input;
  const templates: PgssBaseline['templates'] = {};

  for (const row of rows) {
    const templateId = pgssTemplateId(row);
    let entry = templates[templateId];
    if (entry == null) {
      const carriedOver = previousBaseline?.templates[templateId];
      entry = {
        firstObservedAt: carriedOver?.firstObservedAt ?? fetchedAt,
        perUser: {},
      };
      templates[templateId] = entry;
    }
    entry.perUser[row.userid] = {
      calls: row.calls,
      totalExecTime: row.totalExecTime,
      totalRows: row.totalRows,
    };
  }

  return {
    version: PGSS_BASELINE_VERSION,
    fetchedAt,
    statsResetAt,
    pgServerVersion,
    templates,
  };
}
/** Serializes `value` as 2-space-indented JSON plus a trailing newline and writes it via writeText. */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const serialized = JSON.stringify(value, null, 2);
  await writeText(root, relPath, serialized + '\n');
}
/** Writes `value` as UTF-8 to `relPath` under `root`, creating parent directories first. */
async function writeText(root: string, relPath: string, value: string): Promise<void> {
  const destination = join(root, relPath);
  const parentDir = dirname(destination);
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, value, 'utf-8');
}

View file

@ -0,0 +1,798 @@
import { mkdtemp, readFile, readdir } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { stageHistoricSqlTemplates } from './stage.js';
import {
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlUsageSchema,
type HistoricSqlQueryHistoryReader,
type HistoricSqlRawQueryRow,
} from './types.js';
/** Creates a fresh temporary directory (prefix `historic-sql-stage-`) and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-stage-');
  return mkdtemp(prefix);
}
/** Reads `relPath` under `root` as UTF-8 and parses it as JSON; the result is cast to T without validation. */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const absolutePath = join(root, relPath);
  const contents = await readFile(absolutePath, 'utf-8');
  return JSON.parse(contents) as T;
}
/** Builds a query-history reader whose probe is a no-op and whose fetch yields the given rows in order. */
function fakeReader(rows: HistoricSqlRawQueryRow[]): HistoricSqlQueryHistoryReader {
  const reader: HistoricSqlQueryHistoryReader = {
    async probe() {
      // Nothing to check for an in-memory reader.
    },
    async *fetch() {
      yield* rows;
    },
  };
  return reader;
}
/**
 * Stub analyzer with two canned results: SQL containing 'paid' maps to the
 * `fp_paid_orders` fingerprint (string + date slots); everything else maps to
 * `fp_refunds` (one string slot).
 */
const fakeSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const isPaidOrdersQuery = sql.includes('paid');
    return isPaidOrdersQuery
      ? {
          fingerprint: 'fp_paid_orders',
          normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?',
          tablesTouched: ['analytics.orders'],
          literalSlots: [
            { position: 1, type: 'string', exampleValue: 'paid' },
            { position: 2, type: 'date', exampleValue: '2026-04-01' },
          ],
        }
      : {
          fingerprint: 'fp_refunds',
          normalizedSql: 'SELECT count(*) FROM analytics.refunds WHERE state = ?',
          tablesTouched: ['analytics.refunds'],
          literalSlots: [{ position: 1, type: 'string', exampleValue: 'complete' }],
        };
  },
};
/**
 * Stub analyzer that maps every query to `fp_order_status` with a single
 * string slot whose value is 'refunded' when the SQL contains the quoted
 * literal 'refunded', and 'paid' otherwise.
 */
const categoricalSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    let status = 'paid';
    if (sql.includes("'refunded'")) {
      status = 'refunded';
    }
    return {
      fingerprint: 'fp_order_status',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue: status }],
    };
  },
};
/**
 * Fixture: six successful executions of the same order-status query, three
 * with status 'paid' followed by three with 'refunded', each trio run by
 * analyst-a/b/c. Start times advance one minute per row from 10:00, runtimes
 * step by 10 ms from 100, and rowsProduced is 11-13 (paid) / 21-23 (refunded).
 */
function categoricalRows(): HistoricSqlRawQueryRow[] {
  const analysts = ['analyst-a', 'analyst-b', 'analyst-c'];
  const statuses = ['paid', 'refunded'];
  const rows: HistoricSqlRawQueryRow[] = [];

  statuses.forEach((status, statusIndex) => {
    analysts.forEach((user, analystIndex) => {
      // Position of this row in the overall sequence (0..5).
      const sequence = statusIndex * analysts.length + analystIndex;
      rows.push({
        id: `${status}-${analystIndex + 1}`,
        sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
        user,
        startedAt: `2026-05-04T10:0${sequence}:00.000Z`,
        endedAt: null,
        runtimeMs: 100 + 10 * sequence,
        rowsProduced: 10 * (statusIndex + 1) + analystIndex + 1,
        success: true,
        errorMessage: null,
      });
    });
  });

  return rows;
}
/**
 * Stub analyzer that maps every query to `fp_diverse_samples` and reports the
 * quoted status literal found in the SQL (or 'unknown') as its single slot.
 */
const diverseSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const statusMatch = /status = '([^']+)'/.exec(sql);
    const value = statusMatch?.[1] ?? 'unknown';
    return {
      fingerprint: 'fp_diverse_samples',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue: value }],
    };
  },
};
/**
 * Stub analyzer for the slot-classification test. Queries mentioning
 * `stale_orders` get the fixed `fp_stale_date` result; all others get
 * `fp_classification_matrix` with five slots (region, plan, status, amount,
 * created_at) whose example values are extracted from the SQL text.
 */
const classificationMatrixSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    if (sql.includes('stale_orders')) {
      return {
        fingerprint: 'fp_stale_date',
        normalizedSql: 'SELECT count(*) FROM analytics.stale_orders WHERE created_at >= ?',
        tablesTouched: ['analytics.stale_orders'],
        literalSlots: [{ position: 1, type: 'date', exampleValue: '2026-04-01' }],
      };
    }

    // First capture group of `pattern` in the SQL, or `fallback` when absent.
    const capture = (pattern: RegExp, fallback: string): string => sql.match(pattern)?.[1] ?? fallback;
    const quotedValueOf = (field: string): string => capture(new RegExp(`${field} = '([^']+)'`), 'unknown');

    const region = quotedValueOf('region');
    const plan = quotedValueOf('plan');
    const status = quotedValueOf('status');
    const amount = capture(/amount >= (\d+)/, '0');
    const asOf = capture(/created_at >= '([^']+)'/, '2026-05-01');

    return {
      fingerprint: 'fp_classification_matrix',
      normalizedSql:
        'SELECT count(*) FROM analytics.orders WHERE region = ? AND plan = ? AND status = ? AND amount >= ? AND created_at >= ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [
        { position: 1, type: 'string', exampleValue: region },
        { position: 2, type: 'string', exampleValue: plan },
        { position: 3, type: 'string', exampleValue: status },
        { position: 4, type: 'number', exampleValue: amount },
        { position: 5, type: 'date', exampleValue: asOf },
      ],
    };
  },
};
/**
 * Fixture for the slot-classification test: twenty matrix queries followed by
 * one stale-date query.
 *
 * Across the twenty rows the region is always 'us', the plan is 'enterprise'
 * except for the last row, the status is 'paid' for the first ten and
 * 'refunded' for the rest, the amount differs on every row, and the
 * created_at literal changes every five rows.
 */
function classificationMatrixRows(): HistoricSqlRawQueryRow[] {
  const twoDigits = (value: number): string => String(value).padStart(2, '0');
  const result: HistoricSqlRawQueryRow[] = [];

  for (let i = 0; i < 20; i += 1) {
    const status = i < 10 ? 'paid' : 'refunded';
    const plan = i === 19 ? 'self_serve' : 'enterprise';
    const amount = 100 + i;
    const asOf = `2026-05-${twoDigits(1 + Math.floor(i / 5))}`;
    result.push({
      id: `matrix-${i + 1}`,
      sql: `SELECT count(*) FROM analytics.orders WHERE region = 'us' AND plan = '${plan}' AND status = '${status}' AND amount >= ${amount} AND created_at >= '${asOf}'`,
      user: `analyst-${(i % 4) + 1}`,
      startedAt: `2026-05-04T10:${twoDigits(i)}:00.000Z`,
      endedAt: null,
      runtimeMs: 100 + i,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    });
  }

  result.push({
    id: 'stale-date-1',
    sql: "SELECT count(*) FROM analytics.stale_orders WHERE created_at >= '2026-04-01'",
    user: 'analyst-1',
    startedAt: '2026-05-04T11:00:00.000Z',
    endedAt: null,
    runtimeMs: 75,
    rowsProduced: 1,
    success: true,
    errorMessage: null,
  });
  return result;
}
describe('stageHistoricSqlTemplates', () => {
// Two executions of the paid-orders query by different users collapse into a
// single `fp_paid_orders` template with metadata.json, page.md and usage.json.
// Both SQL texts embed an email literal and the config carries an email
// redaction pattern, so the persisted sample must not contain either address.
it('compresses rows by fingerprint into document-shaped staged templates', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'q1',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01' AND email = 'analyst@example.com'",
user: 'analyst@example.com',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: '2026-05-04T10:00:01.000Z',
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
{
id: 'q2',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-05-01' AND email = 'analyst-2@example.com'",
user: 'analyst-2@example.com',
startedAt: '2026-05-04T11:00:00.000Z',
endedAt: '2026-05-04T11:00:01.000Z',
runtimeMs: 300,
rowsProduced: 1,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: fakeSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: ['^svc_'],
redactionPatterns: ['[\\w.+-]+@[\\w-]+\\.[\\w.-]+'],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
// The manifest cursor is expected to be the latest startedAt among the rows.
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest).toMatchObject({
source: 'historic-sql',
connectionId: 'conn_1',
dialect: 'snowflake',
nextSuccessfulCursor: '2026-05-04T11:00:00.000Z',
templateCount: 1,
capped: false,
});
const files = (await readdir(join(stagedDir, 'templates', 'fp_paid_orders'))).sort();
expect(files).toEqual(['metadata.json', 'page.md', 'usage.json']);
const metadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, 'templates/fp_paid_orders/metadata.json'),
);
// Exact-match check: any additional metadata field would fail this assertion.
expect(metadata).toEqual({
id: 'fp_paid_orders',
title: 'snowflake · analytics.orders [fp_pai]',
path: 'templates/fp_paid_orders/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_paid_orders',
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [
{ position: 1, type: 'string', classification: 'constant' },
{ position: 2, type: 'date', classification: 'runtime' },
],
triage_signals: {
executions_bucket: 'low',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 1 runtime',
},
},
});
const page = await readFile(join(stagedDir, 'templates/fp_paid_orders/page.md'), 'utf-8');
expect(page).toContain('## Normalized SQL');
expect(page).toContain('SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?');
expect(page).toContain('- analytics.orders');
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_paid_orders/usage.json'));
expect(usage.stats).toMatchObject({
executions: 2,
distinct_users: 2,
first_seen: '2026-05-04T10:00:00.000Z',
last_seen: '2026-05-04T11:00:00.000Z',
p50_runtime_ms: 100,
p95_runtime_ms: 300,
error_rate: 0,
});
// Only one sample is expected for the two rows, and it must be redacted.
expect(usage.samples).toHaveLength(1);
expect(usage.samples[0].bound_sql).toContain('<redacted>');
expect(usage.samples[0].bound_sql).not.toContain('analyst@example.com');
expect(usage.samples[0].bound_sql).not.toContain('analyst-2@example.com');
});
// A SHOW statement plus two real queries with maxTemplatesPerRun = 1: the
// truncation warning reports "1 of 2", so the SHOW row must not have produced
// a template, and the surviving template is expected to be `fp_paid_orders`.
it('skips hard-noise SQL and caps templates deterministically', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'show-1',
sql: 'SHOW TABLES',
user: 'analyst',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: null,
success: true,
errorMessage: null,
},
{
id: 'q3',
sql: "SELECT count(*) FROM analytics.refunds WHERE state = 'complete'",
user: 'analyst',
startedAt: '2026-05-04T11:00:00.000Z',
endedAt: null,
runtimeMs: 50,
success: true,
errorMessage: null,
},
{
id: 'q4',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01'",
user: 'analyst',
startedAt: '2026-05-04T11:30:00.000Z',
endedAt: null,
runtimeMs: 40,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: fakeSqlAnalysis,
pullConfig: {
dialect: 'bigquery',
windowDays: 7,
lastSuccessfulCursor: '2026-05-01T00:00:00.000Z',
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 1,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templateCount).toBe(1);
expect(manifest.capped).toBe(true);
expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
expect(manifest.templates.map((template) => template.id)).toEqual(['fp_paid_orders']);
});
// Six rows share one fingerprint but split evenly between status 'paid' and
// 'refunded'. The expectation is two staged templates, one per status value,
// each with its own `cat_…` sub-cluster id, directory and usage stats.
it('splits categorical fingerprints into one document directory per dominant value', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(categoricalRows()),
sqlAnalysis: categoricalSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
// Sort by id so the comparison does not depend on manifest ordering.
const templates = manifest.templates
.map((template) => ({
id: template.id,
fingerprint: template.fingerprint,
subClusterId: template.subClusterId,
path: template.path,
}))
.sort((left, right) => left.id.localeCompare(right.id));
expect(manifest.templateCount).toBe(2);
expect(templates).toEqual([
{
id: 'fp_order_status__cat_2b2ff2318877',
fingerprint: 'fp_order_status',
subClusterId: 'cat_2b2ff2318877',
path: 'templates/fp_order_status__cat_2b2ff2318877/page.md',
},
{
id: 'fp_order_status__cat_34f037ddcbfa',
fingerprint: 'fp_order_status',
subClusterId: 'cat_34f037ddcbfa',
path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
},
]);
// Per the usage assertions below, cat_34f037ddcbfa is the 'paid' variant and
// cat_2b2ff2318877 is the 'refunded' variant.
const paidMetadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/metadata.json'),
);
expect(paidMetadata).toMatchObject({
id: 'fp_order_status__cat_34f037ddcbfa',
title: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
properties: {
fingerprint: 'fp_order_status',
sub_cluster_id: 'cat_34f037ddcbfa',
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
},
});
const paidUsage = historicSqlUsageSchema.parse(
await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'),
);
// rows_produced 36 = 11 + 12 + 13 from the three 'paid' fixture rows.
expect(paidUsage.stats).toMatchObject({
executions: 3,
distinct_users: 3,
first_seen: '2026-05-04T10:00:00.000Z',
last_seen: '2026-05-04T10:02:00.000Z',
rows_produced: 36,
});
expect(paidUsage.literal_slots).toEqual([{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }]);
const refundedUsage = historicSqlUsageSchema.parse(
await readJson(stagedDir, 'templates/fp_order_status__cat_2b2ff2318877/usage.json'),
);
// rows_produced 66 = 21 + 22 + 23 from the three 'refunded' fixture rows.
expect(refundedUsage.stats).toMatchObject({
executions: 3,
distinct_users: 3,
first_seen: '2026-05-04T10:03:00.000Z',
last_seen: '2026-05-04T10:05:00.000Z',
rows_produced: 66,
});
expect(refundedUsage.literal_slots).toEqual([
{ position: 1, distinct_values: 1, top_values: [['refunded', 3]] },
]);
});
// Runs the classification-matrix fixture and pins the per-slot classification
// for both the five-slot matrix template and the single-slot stale-date
// template via inline snapshots.
it('classifies literal slots across the spec matrix and stale-date demotion', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(classificationMatrixRows()),
sqlAnalysis: classificationMatrixSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
// The matrix fingerprint is expected to split into two `cat_` variants.
const matrixTemplates = manifest.templates.filter((template) => template.fingerprint === 'fp_classification_matrix');
expect(matrixTemplates).toHaveLength(2);
expect(matrixTemplates.every((template) => template.subClusterId?.startsWith('cat_'))).toBe(true);
const matrixTemplate = matrixTemplates[0];
if (!matrixTemplate) {
throw new Error('expected classification matrix template');
}
// metadata.json sits next to page.md, so derive its path from the manifest entry.
const matrixMetadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, matrixTemplate.path.replace('/page.md', '/metadata.json')),
);
expect(matrixMetadata.properties.literal_slots).toMatchInlineSnapshot(`
[
{
"classification": "constant",
"position": 1,
"type": "string",
},
{
"classification": "constant",
"position": 2,
"type": "string",
},
{
"classification": "categorical",
"position": 3,
"type": "string",
},
{
"classification": "runtime",
"position": 4,
"type": "number",
},
{
"classification": "runtime",
"position": 5,
"type": "date",
},
]
`);
// The expected summary counts only constant and runtime slots; the
// categorical slot at position 3 does not appear in it.
expect(matrixMetadata.properties.triage_signals.slot_summary).toBe('2 constant, 2 runtime');
const staleMetadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, 'templates/fp_stale_date/metadata.json'),
);
expect(staleMetadata.properties.literal_slots).toMatchInlineSnapshot(`
[
{
"classification": "runtime",
"position": 1,
"type": "date",
},
]
`);
expect(staleMetadata.properties.triage_signals.slot_summary).toBe('0 constant, 1 runtime');
});
// Same categorical fixture as the split test, but with maxTemplatesPerRun = 1.
// The warning must count the two expanded variants ("1 of 2"), showing the
// cap is applied after categorical expansion rather than per fingerprint.
it('applies the templates-per-run cap after categorical expansion', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(categoricalRows()),
sqlAnalysis: categoricalSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 1,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templateCount).toBe(1);
expect(manifest.capped).toBe(true);
expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
expect(manifest.templates).toHaveLength(1);
// Either variant may survive; only the id prefix is asserted.
expect(manifest.templates[0].id).toMatch(/^fp_order_status__cat_/);
});
// The single input row has no `rowsProduced` field. The staged usage must
// then omit `rows_produced` entirely, both in stats and on the sample,
// rather than emitting a null or zero value.
it('omits rows_produced for BigQuery templates when reader rows have no row counts', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_bq',
queryClient: {},
reader: fakeReader([
{
id: 'bq-1',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'",
user: 'analyst-a@example.com',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: fakeSqlAnalysis,
pullConfig: {
dialect: 'bigquery',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_paid_orders/usage.json'));
expect(usage.stats).not.toHaveProperty('rows_produced');
expect(usage.samples[0]).not.toHaveProperty('rows_produced');
});
// Eleven distinct status values, each executed twice: an older failed run
// (10:MM) and a newer successful run (11:MM). The expectation is five
// samples, all successful, all with different status literals, ordered from
// the most recent start time downwards.
it('keeps at most five diverse samples, preferring recent successful representatives per literal tuple', async () => {
const stagedDir = await tempDir();
const statuses = [
'paid',
'refunded',
'pending',
'failed',
'trial',
'cancelled',
'draft',
'returned',
'review',
'held',
'archived',
];
// The status index doubles as the minute component of both timestamps.
const rows: HistoricSqlRawQueryRow[] = statuses.flatMap((status, index) => [
{
id: `${status}-old`,
sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
user: 'analyst-a',
startedAt: `2026-05-04T10:${String(index).padStart(2, '0')}:00.000Z`,
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: false,
errorMessage: 'old failed sample',
},
{
id: `${status}-new`,
sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
user: 'analyst-a',
startedAt: `2026-05-04T11:${String(index).padStart(2, '0')}:00.000Z`,
endedAt: null,
runtimeMs: 90,
rowsProduced: 2,
success: true,
errorMessage: null,
},
]);
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(rows),
sqlAnalysis: diverseSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_diverse_samples/usage.json'));
expect(usage.samples).toHaveLength(5);
expect(usage.samples.every((sample) => sample.success)).toBe(true);
expect(new Set(usage.samples.map((sample) => sample.bound_sql.match(/status = '([^']+)'/)?.[1])).size).toBe(5);
// Minutes 10..06 correspond to the last five statuses in the list above.
expect(usage.samples.map((sample) => sample.started_at)).toEqual([
'2026-05-04T11:10:00.000Z',
'2026-05-04T11:09:00.000Z',
'2026-05-04T11:08:00.000Z',
'2026-05-04T11:07:00.000Z',
'2026-05-04T11:06:00.000Z',
]);
});
// Two templates with identical usage (one execution, same user) that differ
// only in when they ran: February versus the day of `now`. With a cap of one,
// the more recently executed template is expected to be kept.
it('uses recency as a tie-breaker when the templates-per-run cap overflows', async () => {
const stagedDir = await tempDir();
// Local analyzer: one fingerprint per table name, no literal slots.
const sqlAnalysis: SqlAnalysisPort = {
async analyzeForFingerprint(sql) {
const table = sql.includes('fresh_orders') ? 'fresh_orders' : 'stale_orders';
return {
fingerprint: `fp_${table}`,
normalizedSql: `SELECT count(*) FROM analytics.${table}`,
tablesTouched: [`analytics.${table}`],
literalSlots: [],
};
},
};
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'stale-1',
sql: 'SELECT count(*) FROM analytics.stale_orders',
user: 'analyst-a',
startedAt: '2026-02-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
{
id: 'fresh-1',
sql: 'SELECT count(*) FROM analytics.fresh_orders',
user: 'analyst-a',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
]),
sqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 1,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templates.map((template) => template.id)).toEqual(['fp_fresh_orders']);
});
// The redaction pattern '[' is not a valid regular expression. Staging is
// expected to complete anyway, record a `redaction_skipped:` warning in the
// manifest, and write no samples, so the raw SQL containing the email
// address is never persisted unredacted.
it('does not persist bound SQL samples when redaction patterns are invalid', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'q1',
sql: "SELECT * FROM analytics.orders WHERE email = 'analyst@example.com'",
user: 'analyst@example.com',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: {
async analyzeForFingerprint() {
return {
fingerprint: 'fp_redaction',
normalizedSql: 'SELECT * FROM analytics.orders WHERE email = ?',
tablesTouched: ['analytics.orders'],
literalSlots: [{ position: 1, type: 'string', exampleValue: 'analyst@example.com' }],
};
},
},
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: ['['],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_redaction/usage.json'));
expect(manifest.warnings.some((warning) => warning.startsWith('redaction_skipped:invalid_redaction_pattern'))).toBe(
true,
);
expect(usage.samples).toEqual([]);
});
});

View file

@ -0,0 +1,630 @@
import { createHash } from 'node:crypto';
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import type {
SqlAnalysisFingerprintResult,
SqlAnalysisLiteralSlot,
SqlAnalysisLiteralSlotType,
SqlAnalysisPort,
} from '../../../sql-analysis/index.js';
import {
HISTORIC_SQL_OBJECT_TYPE,
HISTORIC_SQL_SOURCE_KEY,
historicSqlPullConfigSchema,
historicSqlRawQueryRowSchema,
type HistoricSqlLiteralSlotClassification,
type HistoricSqlManifest,
type HistoricSqlMetadata,
type HistoricSqlPullConfig,
type HistoricSqlQueryHistoryReader,
type HistoricSqlRawQueryRow,
type HistoricSqlUsage,
} from './types.js';
/** Arguments accepted by stageHistoricSqlTemplates. */
interface StageHistoricSqlTemplatesInput {
/** Directory that receives manifest.json and the per-template files. */
stagedDir: string;
/** Copied verbatim into the manifest. */
connectionId: string;
/** Opaque handle passed through to `reader.probe` and `reader.fetch`. */
queryClient: unknown;
/** Source of raw query-history rows. */
reader: HistoricSqlQueryHistoryReader;
/** Analyzer that fingerprints and normalizes each SQL statement. */
sqlAnalysis: SqlAnalysisPort;
/** Pull settings; parsed with historicSqlPullConfigSchema before use. */
pullConfig: HistoricSqlPullConfig;
/** Reference time for the fetch window and manifest; defaults to the current time. */
now?: Date;
}
/** A single literal value seen in a slot, together with the start time of the row it came from. */
interface SlotObservation {
/** The slot's example value after redaction (see recordSlot). */
value: string;
/** `startedAt` of the query row that produced this value. */
rowStartedAt: string;
}
/** Everything recorded about one literal slot position within a fingerprint group. */
interface SlotStats {
position: number;
/** Slot type taken from the first row that reported this position. */
type: SqlAnalysisLiteralSlotType;
/** Occurrence count per redacted value. */
values: Map<string, number>;
/** One entry per recorded occurrence, in the order rows were processed. */
observations: SlotObservation[];
}
/** Per-fingerprint group built while streaming rows in stageHistoricSqlTemplates. */
interface TemplateAccumulator {
fingerprint: string;
/** Normalized SQL from the first row that produced this fingerprint. */
normalizedSql: string;
/** Union of the tables reported for every row in the group. */
tablesTouched: Set<string>;
/** Each contributing row paired with its own analysis result. */
rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>;
/** Slot statistics keyed by slot position. */
slotStats: Map<number, SlotStats>;
}
/** A literal slot position together with the classification assigned to it. */
interface ClassifiedLiteralSlot {
position: number;
type: SqlAnalysisLiteralSlotType;
/** The staging tests exercise the values 'constant', 'categorical' and 'runtime'. */
classification: HistoricSqlLiteralSlotClassification;
}
/**
 * A template ready for selection and staging, as returned by
 * expandCategoricalTemplates. It carries the same data as a
 * TemplateAccumulator plus an id, an optional sub-cluster id and the slot
 * classifications.
 */
interface TemplateVariant {
/** NOTE(review): appears to become the staged template id / directory name — confirm in buildStagedTemplate. */
id: string;
fingerprint: string;
/** NOTE(review): presumably null when the fingerprint was not split into categorical variants — confirm. */
subClusterId: string | null;
normalizedSql: string;
tablesTouched: Set<string>;
rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>;
slotStats: Map<number, SlotStats>;
slotClassifications: ClassifiedLiteralSlot[];
}
/**
 * A (slot position, value) pair.
 * NOTE(review): presumably one component of the categorical value tuple used
 * to split a fingerprint into variants — confirm against the expansion code.
 */
interface CategoricalTupleEntry {
position: number;
value: string;
}
/**
 * Compiled redaction settings for a staging run.
 * NOTE(review): presumably the return type of compileRedactors — confirm.
 */
interface RedactionPolicy {
/** Compiled redaction patterns, applied to slot values via redactText. */
redactors: RegExp[];
/** NOTE(review): presumably false when a pattern failed to compile, so that no samples are persisted — confirm. */
samplesAllowed: boolean;
}
// Statements that start (after optional whitespace) with one of these keywords
// are never staged. Case-insensitive.
const HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i;
// Statements that mention one of these catalog/system name prefixes anywhere
// in their text are never staged. Case-insensitive; note that `pg_` and
// `system.` match any identifier that merely begins with that prefix.
const HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i;
/**
 * Reads query history, groups it into SQL templates and writes the staged
 * result to `input.stagedDir`.
 *
 * Steps: validate the pull config, stream rows from the reader, skip noise
 * statements, fingerprint each remaining statement, accumulate rows and slot
 * statistics per fingerprint, expand groups into variants, select at most
 * `maxTemplatesPerRun` of them, then write `metadata.json`, `page.md` and
 * `usage.json` per template followed by `manifest.json`.
 *
 * Rows whose analysis fails are dropped with an `analysis_failed:<id>`
 * warning. Schema validation errors from the config or from a row, and any
 * reader or filesystem error, propagate to the caller.
 */
export async function stageHistoricSqlTemplates(input: StageHistoricSqlTemplatesInput): Promise<void> {
const config = historicSqlPullConfigSchema.parse(input.pullConfig);
const now = input.now ?? new Date();
// Resume from the stored cursor when there is one; otherwise look back windowDays from now.
const windowStart = config.lastSuccessfulCursor
? new Date(config.lastSuccessfulCursor)
: new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000);
const warnings: string[] = [];
const redaction = compileRedactors(config.redactionPatterns, warnings);
const groups = new Map<string, TemplateAccumulator>();
let nextSuccessfulCursor: string | null = null;
await input.reader.probe(input.queryClient);
for await (const rawRow of input.reader.fetch(
input.queryClient,
{ start: windowStart, end: now },
config.lastSuccessfulCursor,
)) {
const row = historicSqlRawQueryRowSchema.parse(rawRow);
// Track the greatest startedAt seen (string comparison). This happens before
// the skip and analysis checks, so skipped rows advance the cursor too.
if (!nextSuccessfulCursor || row.startedAt > nextSuccessfulCursor) {
nextSuccessfulCursor = row.startedAt;
}
if (shouldSkipSql(row.sql)) {
continue;
}
const analysis = await input.sqlAnalysis.analyzeForFingerprint(row.sql, config.dialect);
if (analysis.error || !analysis.fingerprint || !analysis.normalizedSql) {
warnings.push(`analysis_failed:${row.id}`);
continue;
}
// Get or create the accumulator for this fingerprint; a new group takes its
// normalized SQL from the first row that produced the fingerprint.
const group =
groups.get(analysis.fingerprint) ??
{
fingerprint: analysis.fingerprint,
normalizedSql: analysis.normalizedSql,
tablesTouched: new Set<string>(),
rows: [],
slotStats: new Map<number, SlotStats>(),
};
for (const table of analysis.tablesTouched) {
group.tablesTouched.add(table);
}
for (const slot of analysis.literalSlots) {
recordSlot(group.slotStats, slot, redaction.redactors, row.startedAt);
}
group.rows.push({ row, analysis });
groups.set(analysis.fingerprint, group);
}
// The per-run cap is applied to the expanded variants, not to the raw fingerprint groups.
const expandedTemplates = expandCategoricalTemplates([...groups.values()], redaction.redactors);
const selected = selectTemplates(expandedTemplates, config.maxTemplatesPerRun, now);
if (selected.length < expandedTemplates.length) {
warnings.push(`templates_truncated: kept ${selected.length} of ${expandedTemplates.length} templates`);
}
await mkdir(input.stagedDir, { recursive: true });
const templates: HistoricSqlManifest['templates'] = [];
for (const template of selected) {
const staged = buildStagedTemplate(template, config, redaction, now);
const basePath = `templates/${staged.metadata.id}`;
await writeJson(input.stagedDir, `${basePath}/metadata.json`, staged.metadata);
await writeText(input.stagedDir, `${basePath}/page.md`, staged.pageMarkdown);
await writeJson(input.stagedDir, `${basePath}/usage.json`, staged.usage);
templates.push({
id: staged.metadata.id,
fingerprint: staged.metadata.properties.fingerprint,
subClusterId: staged.metadata.properties.sub_cluster_id,
path: staged.metadata.path,
});
}
// The manifest is written last, after every template file. nextSuccessfulCursor
// stays null when the reader yielded no rows. The fields from `degraded` through
// `deallocCount` are fixed values in this code path.
await writeJson(input.stagedDir, 'manifest.json', {
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: input.connectionId,
dialect: config.dialect,
fetchedAt: now.toISOString(),
windowStart: windowStart.toISOString(),
windowEnd: now.toISOString(),
nextSuccessfulCursor,
templateCount: selected.length,
capped: selected.length < expandedTemplates.length,
warnings,
degraded: false,
statsResetAt: null,
baselineFirstRun: false,
pgServerVersion: null,
deallocCount: null,
templates,
} satisfies HistoricSqlManifest);
}
/**
 * Decides whether a raw statement is excluded before fingerprinting.
 *
 * @param sql - Raw SQL text of a single history row.
 * @returns `true` when either `HARD_SKIP_PREFIX_RE` or `HARD_SKIP_TABLE_RE` matches the text.
 */
function shouldSkipSql(sql: string): boolean {
  if (HARD_SKIP_PREFIX_RE.test(sql)) {
    return true;
  }
  return HARD_SKIP_TABLE_RE.test(sql);
}
/**
 * Folds one literal-slot observation into the per-position statistics map.
 *
 * The slot's example value is passed through `redactText` first, so only the redacted
 * text is counted in `values` and stored in `observations`.
 *
 * @param slotStats - Accumulator keyed by slot position; mutated in place.
 * @param slot - Literal slot reported by SQL analysis for one row.
 * @param redactors - Patterns applied to the example value before it is stored.
 * @param rowStartedAt - Start timestamp of the row the slot came from.
 */
function recordSlot(
  slotStats: Map<number, SlotStats>,
  slot: SqlAnalysisLiteralSlot,
  redactors: RegExp[],
  rowStartedAt: string,
): void {
  let stats = slotStats.get(slot.position);
  if (stats == null) {
    // First observation for this position: start an empty record.
    stats = {
      position: slot.position,
      type: slot.type,
      values: new Map<string, number>(),
      observations: [],
    };
  }
  const redactedValue = redactText(slot.exampleValue, redactors);
  const previousCount = stats.values.get(redactedValue) ?? 0;
  stats.values.set(redactedValue, previousCount + 1);
  stats.observations.push({ value: redactedValue, rowStartedAt });
  slotStats.set(slot.position, stats);
}
/**
 * Expands every fingerprint group into its template variants and returns them as one
 * flat list, in group order.
 *
 * @param groups - Accumulated fingerprint groups.
 * @param redactors - Redaction patterns forwarded to `expandTemplateGroup`.
 */
function expandCategoricalTemplates(groups: TemplateAccumulator[], redactors: RegExp[]): TemplateVariant[] {
  const variants: TemplateVariant[] = [];
  for (const group of groups) {
    for (const variant of expandTemplateGroup(group, redactors)) {
      variants.push(variant);
    }
  }
  return variants;
}
/**
 * Turns one fingerprint group into one or more template variants.
 *
 * Slots are classified over the whole group. When no slot is classified as
 * `categorical`, the group becomes a single variant whose id is the fingerprint.
 * Otherwise rows are bucketed by their tuple of (redacted) categorical values and each
 * bucket becomes a variant with id `<fingerprint>__<subClusterId>`; variants are
 * returned sorted by id.
 *
 * @param group - Accumulated rows and slot statistics for one fingerprint.
 * @param redactors - Redaction patterns applied to slot values.
 * @returns The variants for the group, or an empty list when the group has no rows.
 */
function expandTemplateGroup(group: TemplateAccumulator, redactors: RegExp[]): TemplateVariant[] {
  const chronological = group.rows
    .slice()
    .sort((earlier, later) => earlier.row.startedAt.localeCompare(later.row.startedAt));
  const firstSeen = chronological[0]?.row.startedAt;
  if (!firstSeen) {
    return [];
  }

  const slotClassifications = classifySlots(group.slotStats, chronological.length, firstSeen);
  const categoricalPositions: number[] = [];
  for (const classified of slotClassifications) {
    if (classified.classification === 'categorical') {
      categoricalPositions.push(classified.position);
    }
  }
  categoricalPositions.sort((a, b) => a - b);

  if (categoricalPositions.length === 0) {
    const whole: TemplateVariant = {
      id: group.fingerprint,
      fingerprint: group.fingerprint,
      subClusterId: null,
      normalizedSql: group.normalizedSql,
      tablesTouched: group.tablesTouched,
      rows: chronological,
      slotStats: group.slotStats,
      slotClassifications,
    };
    return [whole];
  }

  type TupleBucket = {
    tuple: CategoricalTupleEntry[];
    rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>;
  };
  const buckets = new Map<string, TupleBucket>();
  for (const entry of chronological) {
    const tuple = categoricalTuple(entry.analysis.literalSlots, categoricalPositions, redactors);
    const key = JSON.stringify(tuple);
    const bucket = buckets.get(key);
    if (bucket) {
      bucket.rows.push(entry);
    } else {
      buckets.set(key, { tuple, rows: [entry] });
    }
  }

  const variants: TemplateVariant[] = [];
  for (const { tuple, rows: bucketRows } of buckets.values()) {
    const subClusterId = subClusterIdForTuple(tuple);
    variants.push({
      id: `${group.fingerprint}__${subClusterId}`,
      fingerprint: group.fingerprint,
      subClusterId,
      normalizedSql: group.normalizedSql,
      tablesTouched: group.tablesTouched,
      rows: bucketRows,
      // Usage statistics are recomputed from this bucket's rows only; the
      // classifications above stay group-wide.
      slotStats: collectSlotStats(bucketRows, redactors),
      slotClassifications,
    });
  }
  return variants.sort((a, b) => a.id.localeCompare(b.id));
}
/**
 * Classifies every slot in `slotStats`, returning the results ordered by slot position.
 *
 * @param slotStats - Per-position statistics for one fingerprint group.
 * @param executions - Number of rows in the group; forwarded to `classifySlot`.
 * @param firstSeen - Earliest row start timestamp in the group; forwarded to `classifySlot`.
 */
function classifySlots(
  slotStats: Map<number, SlotStats>,
  executions: number,
  firstSeen: string,
): ClassifiedLiteralSlot[] {
  const byPosition = Array.from(slotStats.values()).sort((a, b) => a.position - b.position);
  const classified: ClassifiedLiteralSlot[] = [];
  for (const slot of byPosition) {
    classified.push({
      position: slot.position,
      type: slot.type,
      classification: classifySlot(slot, executions, firstSeen),
    });
  }
  return classified;
}
/**
 * Rebuilds slot statistics from scratch for a subset of rows by replaying every literal
 * slot of every row through `recordSlot`.
 *
 * @param rows - Rows (with their analysis results) to aggregate.
 * @param redactors - Redaction patterns forwarded to `recordSlot`.
 * @returns A new map of statistics keyed by slot position.
 */
function collectSlotStats(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redactors: RegExp[],
): Map<number, SlotStats> {
  const collected = new Map<number, SlotStats>();
  rows.forEach(({ row, analysis }) => {
    analysis.literalSlots.forEach((slot) => {
      recordSlot(collected, slot, redactors, row.startedAt);
    });
  });
  return collected;
}
/**
 * Extracts the redacted values a row carries at the categorical slot positions.
 *
 * @param literalSlots - Literal slots reported for one row.
 * @param categoricalPositions - Slot positions to read, in output order.
 * @param redactors - Redaction patterns applied to each example value.
 * @returns One `{ position, value }` entry per requested position; positions the row
 *   does not have get the placeholder value `'<missing>'`.
 */
function categoricalTuple(
  literalSlots: SqlAnalysisLiteralSlot[],
  categoricalPositions: number[],
  redactors: RegExp[],
): CategoricalTupleEntry[] {
  // When two slots share a position, the later one wins.
  const redactedByPosition = new Map<number, string>();
  for (const slot of literalSlots) {
    redactedByPosition.set(slot.position, redactText(slot.exampleValue, redactors));
  }

  const tuple: CategoricalTupleEntry[] = [];
  for (const position of categoricalPositions) {
    const value = redactedByPosition.get(position);
    tuple.push({ position, value: value === undefined ? '<missing>' : value });
  }
  return tuple;
}
/**
 * Derives a stable sub-cluster id from a categorical tuple.
 *
 * @param tuple - Categorical `{ position, value }` entries identifying the sub-cluster.
 * @returns `cat_` followed by the first 12 hex characters of the SHA-256 digest of the
 *   tuple's JSON serialization.
 */
function subClusterIdForTuple(tuple: CategoricalTupleEntry[]): string {
  const hasher = createHash('sha256');
  hasher.update(JSON.stringify(tuple));
  const digestHex = hasher.digest('hex');
  return `cat_${digestHex.slice(0, 12)}`;
}
/**
 * Produces the three staged artifacts for one template variant: metadata, the markdown
 * page, and usage statistics.
 *
 * Rows are ordered by `startedAt`; the first and last of them give `first_seen` and
 * `last_seen`. Runtime percentiles use only rows whose `runtimeMs` is a number.
 *
 * @param template - Variant to stage. It is indexed at `rows[0]` without a guard, so it
 *   must contain at least one row.
 * @param config - Pull configuration (dialect, service-account patterns).
 * @param redaction - Redaction policy forwarded to sample selection.
 * @param now - Reference time used for the recency triage signal.
 */
function buildStagedTemplate(
  template: TemplateVariant,
  config: HistoricSqlPullConfig,
  redaction: RedactionPolicy,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const chronological = template.rows.map(({ row }) => row);
  chronological.sort((earlier, later) => earlier.startedAt.localeCompare(later.startedAt));
  const executions = chronological.length;
  const firstSeen = chronological[0].startedAt;
  const lastSeen = chronological[executions - 1].startedAt;

  // Single pass over the rows for user, error, and runtime aggregates.
  const userNames = new Set<string>();
  const runtimes: number[] = [];
  let errorCount = 0;
  for (const row of chronological) {
    if (row.user) {
      userNames.add(row.user);
    }
    if (!row.success) {
      errorCount += 1;
    }
    if (typeof row.runtimeMs === 'number') {
      runtimes.push(row.runtimeMs);
    }
  }
  runtimes.sort((a, b) => a - b);
  const distinctUsers = userNames.size;
  const errorRate = executions === 0 ? 0 : errorCount / executions;

  const triageSignals = buildTriageSignals({
    executions,
    distinctUsers,
    errorRate,
    lastSeen,
    now,
    serviceAccountOnly: isServiceAccountOnly(chronological, config.serviceAccountUserPatterns),
    slotClassifications: template.slotClassifications.map((slot) => slot.classification),
  });

  const tablesTouched = [...template.tablesTouched].sort();
  const firstTable = tablesTouched[0] ?? 'query';
  const id = template.id;
  const rowsProduced = sumRowsProduced(chronological);

  const metadata: HistoricSqlMetadata = {
    id,
    title: buildTemplateTitle(config.dialect, firstTable, template.fingerprint, template.subClusterId),
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint: template.fingerprint,
      sub_cluster_id: template.subClusterId,
      dialect: config.dialect,
      tables_touched: tablesTouched,
      literal_slots: template.slotClassifications,
      triage_signals: triageSignals,
    },
  };

  const orderedSlots = Array.from(template.slotStats.values()).sort((a, b) => a.position - b.position);
  const literalSlotUsage: HistoricSqlUsage['literal_slots'] = [];
  for (const slot of orderedSlots) {
    // Most frequent values first; ties broken by value so the output is deterministic.
    const rankedValues = Array.from(slot.values.entries()).sort(
      (a, b) => b[1] - a[1] || a[0].localeCompare(b[0]),
    );
    literalSlotUsage.push({
      position: slot.position,
      distinct_values: slot.values.size,
      top_values: rankedValues.slice(0, 10),
    });
  }

  const usage: HistoricSqlUsage = {
    stats: {
      executions,
      distinct_users: distinctUsers,
      first_seen: firstSeen,
      last_seen: lastSeen,
      p50_runtime_ms: percentile(runtimes, 0.5),
      p95_runtime_ms: percentile(runtimes, 0.95),
      error_rate: errorRate,
      ...(rowsProduced === null ? {} : { rows_produced: rowsProduced }),
    },
    literal_slots: literalSlotUsage,
    samples: selectSamples(template.rows, redaction),
  };

  return {
    metadata,
    pageMarkdown: renderTemplatePage(id, template.normalizedSql, tablesTouched),
    usage,
  };
}
/** Slot types that are treated as temporal literals. */
const TEMPORAL_SLOT_TYPES = new Set<SqlAnalysisLiteralSlotType>(['date', 'timestamp']);

/**
 * Reports whether a `date` slot's value lies before the day the template was first seen.
 *
 * The comparison is a plain string comparison against the first 10 characters of
 * `firstSeen`, so it is only meaningful for values written as `YYYY-MM-DD...`.
 *
 * @param slot - Slot statistics; only `type` is read.
 * @param value - Candidate literal value.
 * @param firstSeen - Earliest row start timestamp for the template.
 */
function isStaleDateConstant(slot: SlotStats, value: string, firstSeen: string): boolean {
  if (slot.type !== 'date') {
    return false;
  }
  if (parseTemporalSlotValue(value) === null) {
    return false;
  }
  const firstSeenDay = firstSeen.slice(0, 10);
  return value < firstSeenDay;
}
/**
 * Detects a temporal slot whose literal value moves in one direction as rows progress
 * in time.
 *
 * Requires a temporal slot type and at least two distinct values. Every observation
 * must have a parseable row timestamp and a parseable literal; a single unparseable
 * one makes the result `false`.
 *
 * @param slot - Slot statistics including per-row observations.
 * @returns `true` when the literal times, ordered by row start time, are monotonic.
 */
function isMovingTemporalSlot(slot: SlotStats): boolean {
  const isTemporal = TEMPORAL_SLOT_TYPES.has(slot.type);
  if (!isTemporal || slot.values.size < 2) {
    return false;
  }

  const timeline: Array<{ seenAt: number; literalAt: number }> = [];
  for (const { rowStartedAt, value } of slot.observations) {
    const seenAt = Date.parse(rowStartedAt);
    const literalAt = parseTemporalSlotValue(value);
    if (literalAt === null || Number.isNaN(seenAt)) {
      return false;
    }
    timeline.push({ seenAt, literalAt });
  }

  timeline.sort((a, b) => a.seenAt - b.seenAt);
  return isMonotonic(timeline.map((point) => point.literalAt));
}
/**
 * Parses a literal as a date/time using `Date.parse`.
 *
 * @param value - Literal text to parse.
 * @returns Milliseconds since the epoch, or `null` when `Date.parse` yields `NaN`.
 */
function parseTemporalSlotValue(value: string): number | null {
  const epochMs = Date.parse(value);
  if (Number.isNaN(epochMs)) {
    return null;
  }
  return epochMs;
}
/**
 * Checks whether a sequence never reverses direction.
 *
 * @param values - Numbers in sequence order.
 * @returns `false` for fewer than two values; otherwise `true` unless the sequence
 *   contains both a rise and a fall between adjacent elements.
 */
function isMonotonic(values: number[]): boolean {
  if (values.length < 2) {
    return false;
  }
  let sawRise = false;
  let sawFall = false;
  for (let i = 0; i + 1 < values.length; i += 1) {
    const previous = values[i];
    const next = values[i + 1];
    if (next < previous) {
      sawFall = true;
    }
    if (next > previous) {
      sawRise = true;
    }
  }
  return !(sawRise && sawFall);
}
/**
 * Classifies one literal slot as `constant`, `categorical`, or `runtime`.
 *
 * Rules, in order:
 * 1. `constant` when there is a single distinct value, or the most frequent value
 *    covers at least 95% of executions — unless that value is a stale date constant.
 * 2. `runtime` when the slot is a moving temporal slot.
 * 3. `categorical` when there are 2 to 10 distinct values and each covers at least 5%
 *    of executions.
 * 4. `runtime` otherwise.
 *
 * @param slot - Statistics for the slot.
 * @param executions - Number of rows in the template group.
 * @param firstSeen - Earliest row start timestamp in the group.
 */
function classifySlot(
  slot: SlotStats,
  executions: number,
  firstSeen: string,
): HistoricSqlLiteralSlotClassification {
  const ranked = Array.from(slot.values.entries()).sort((a, b) => b[1] - a[1]);
  const distinct = ranked.length;
  const top = ranked[0];
  const topValue = top ? top[0] : '';
  const topCount = top ? top[1] : 0;

  const stale = isStaleDateConstant(slot, topValue, firstSeen);
  if (!stale) {
    if (distinct === 1) {
      return 'constant';
    }
    if (executions > 0 && topCount / executions >= 0.95) {
      return 'constant';
    }
  }

  if (isMovingTemporalSlot(slot)) {
    return 'runtime';
  }

  const isCategorical =
    executions > 0 &&
    distinct >= 2 &&
    distinct <= 10 &&
    ranked.every(([, count]) => count / executions >= 0.05);
  return isCategorical ? 'categorical' : 'runtime';
}
/**
 * Buckets a template's raw statistics into coarse string signals.
 *
 * Thresholds: executions `<3` low, `<50` mid, else high; distinct users `<=1` solo,
 * `<=5` team, else broad; error rate `<=0.01` ok, `<=0.1` noisy, else broken.
 * `slot_summary` reports only the constant and runtime slot counts.
 *
 * @param input - Aggregated statistics for one template.
 * @returns A flat string-to-string record of triage signals.
 */
function buildTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  errorRate: number;
  lastSeen: string;
  now: Date;
  serviceAccountOnly: boolean;
  slotClassifications: HistoricSqlLiteralSlotClassification[];
}): Record<string, string> {
  let constantCount = 0;
  let runtimeCount = 0;
  for (const classification of input.slotClassifications) {
    if (classification === 'runtime') {
      runtimeCount += 1;
    } else if (classification === 'constant') {
      constantCount += 1;
    }
  }

  let executionsBucket = 'high';
  if (input.executions < 3) {
    executionsBucket = 'low';
  } else if (input.executions < 50) {
    executionsBucket = 'mid';
  }

  let distinctUsersBucket = 'broad';
  if (input.distinctUsers <= 1) {
    distinctUsersBucket = 'solo';
  } else if (input.distinctUsers <= 5) {
    distinctUsersBucket = 'team';
  }

  let errorRateBucket = 'broken';
  if (input.errorRate <= 0.01) {
    errorRateBucket = 'ok';
  } else if (input.errorRate <= 0.1) {
    errorRateBucket = 'noisy';
  }

  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: distinctUsersBucket,
    error_rate_bucket: errorRateBucket,
    recency_bucket: recencyBucket(input.lastSeen, input.now),
    service_account_only: String(input.serviceAccountOnly),
    slot_summary: `${constantCount} constant, ${runtimeCount} runtime`,
  };
}
/**
 * Buckets how recently a template was last seen.
 *
 * @param lastSeen - Timestamp string of the most recent execution.
 * @param now - Reference time.
 * @returns `'active'` for an age of at most 14 days (future timestamps clamp to age 0),
 *   `'warm'` for at most 60 days, `'cold'` otherwise — including when `lastSeen`
 *   cannot be parsed.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const msPerDay = 24 * 60 * 60 * 1000;
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / msPerDay);
  return ageDays <= 14 ? 'active' : ageDays <= 60 ? 'warm' : 'cold';
}
/**
 * Reports whether every named user on the rows matches a service-account pattern.
 *
 * Rows without a user are ignored. Patterns are compiled with `new RegExp(pattern)`,
 * so an invalid pattern throws.
 *
 * @param rows - Rows belonging to one template.
 * @param patterns - Regular-expression sources identifying service accounts.
 * @returns `false` when there are no patterns or no named users; otherwise whether all
 *   users match at least one pattern.
 */
function isServiceAccountOnly(rows: HistoricSqlRawQueryRow[], patterns: string[]): boolean {
  if (patterns.length === 0) {
    return false;
  }
  const namedUsers: string[] = [];
  for (const row of rows) {
    if (row.user) {
      namedUsers.push(row.user);
    }
  }
  if (namedUsers.length === 0) {
    return false;
  }
  const matchers = patterns.map((pattern) => new RegExp(pattern));
  for (const user of namedUsers) {
    if (!matchers.some((matcher) => matcher.test(user))) {
      return false;
    }
  }
  return true;
}
/**
 * Builds the human-readable title for a template.
 *
 * @param dialect - SQL dialect of the template.
 * @param firstTable - Table name shown in the title.
 * @param fingerprint - Template fingerprint; its first 6 characters are shown.
 * @param subClusterId - Optional sub-cluster id; when present, its last 6 characters
 *   are appended after a colon.
 */
function buildTemplateTitle(
  dialect: HistoricSqlPullConfig['dialect'],
  firstTable: string,
  fingerprint: string,
  subClusterId: string | null,
): string {
  const shortFingerprint = fingerprint.slice(0, 6);
  const clusterSuffix = subClusterId ? `:${subClusterId.slice(-6)}` : '';
  return `${dialect} · ${firstTable} [${shortFingerprint}${clusterSuffix}]`;
}
/**
 * Renders the markdown page for a template: a heading, the normalized SQL in a fenced
 * `sql` block, and a bullet list of touched tables. The page ends with a newline.
 *
 * @param fingerprint - Text used for the top-level heading.
 * @param normalizedSql - Normalized SQL placed inside the code fence.
 * @param tablesTouched - Table names, one bullet each, in the given order.
 */
function renderTemplatePage(fingerprint: string, normalizedSql: string, tablesTouched: string[]): string {
  const sqlSection = [`# ${fingerprint}`, '', '## Normalized SQL', '```sql', normalizedSql, '```', ''];
  const tableSection = ['## Tables touched'];
  for (const table of tablesTouched) {
    tableSection.push(`- ${table}`);
  }
  // The trailing empty element gives the page its final newline.
  return sqlSection.concat(tableSection, ['']).join('\n');
}
/**
 * Picks up to five sample executions for a template.
 *
 * Rows are deduplicated by their tuple of raw literal values, preferring successful
 * rows and then more recent ones. The survivors are ordered newest first and truncated
 * to five. Each sample's SQL is passed through `redactText`.
 *
 * @param rows - Rows (with analysis) belonging to the template.
 * @param redaction - Redaction policy; when samples are not allowed the result is empty.
 */
function selectSamples(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redaction: RedactionPolicy,
): HistoricSqlUsage['samples'] {
  if (!redaction.samplesAllowed) {
    return [];
  }

  const ranked = rows.slice().sort((a, b) => {
    if (a.row.success === b.row.success) {
      return b.row.startedAt.localeCompare(a.row.startedAt);
    }
    return a.row.success ? -1 : 1;
  });

  // Keep only the best-ranked row for each distinct literal tuple.
  const firstPerTuple = new Map<string, { row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>();
  for (const entry of ranked) {
    const orderedSlots = entry.analysis.literalSlots.slice().sort((a, b) => a.position - b.position);
    const key = orderedSlots.map((slot) => slot.exampleValue).join('\u001f');
    if (firstPerTuple.has(key)) {
      continue;
    }
    firstPerTuple.set(key, entry);
  }

  const newestFirst = Array.from(firstPerTuple.values()).sort((a, b) =>
    b.row.startedAt.localeCompare(a.row.startedAt),
  );

  const samples: HistoricSqlUsage['samples'] = [];
  for (const { row } of newestFirst.slice(0, 5)) {
    samples.push({
      started_at: row.startedAt,
      user: row.user,
      bound_sql: redactText(row.sql, redaction.redactors),
      ...(row.rowsProduced === undefined ? {} : { rows_produced: row.rowsProduced ?? null }),
      runtime_ms: row.runtimeMs,
      success: row.success,
    });
  }
  return samples;
}
/**
 * Keeps the highest-ranked templates, up to `maxTemplatesPerRun`.
 *
 * @param templates - Candidate template variants.
 * @param maxTemplatesPerRun - Maximum number of templates to keep.
 * @param now - Reference time forwarded to `rankTemplate`.
 * @returns Templates ordered by descending score, ties broken by ascending id.
 */
function selectTemplates(templates: TemplateVariant[], maxTemplatesPerRun: number, now: Date): TemplateVariant[] {
  const scored = templates.map((template) => ({ template, score: rankTemplate(template, now) }));
  scored.sort((a, b) => {
    const byScore = b.score - a.score;
    if (byScore) {
      return byScore;
    }
    return a.template.id.localeCompare(b.template.id);
  });

  const kept: TemplateVariant[] = [];
  for (const { template } of scored.slice(0, maxTemplatesPerRun)) {
    kept.push(template);
  }
  return kept;
}
/**
 * Scores a template as `distinct users × log1p(executions) × recency weight`.
 *
 * The recency weight is `1 / (1 + ageDays / 30)`, where `ageDays` is measured from the
 * latest row start time to `now` (clamped at 0, and taken as 365 when there are no rows).
 *
 * @param template - Template variant to score.
 * @param now - Reference time.
 */
function rankTemplate(template: TemplateVariant, now: Date): number {
  const distinctUsers = new Set<string>();
  let latestStartedAt: string | null = null;
  for (const { row } of template.rows) {
    if (row.user) {
      distinctUsers.add(row.user);
    }
    if (latestStartedAt === null || row.startedAt > latestStartedAt) {
      latestStartedAt = row.startedAt;
    }
  }

  let ageDays = 365;
  if (latestStartedAt !== null) {
    ageDays = Math.max(0, (now.getTime() - new Date(latestStartedAt).getTime()) / 86400000);
  }
  const recencyWeight = 1 / (1 + ageDays / 30);
  return distinctUsers.size * Math.log1p(template.rows.length) * recencyWeight;
}
/**
 * Returns a nearest-rank percentile from an ascending-sorted list.
 *
 * @param values - Numbers already sorted ascending.
 * @param percentileValue - Fraction in `[0, 1]`, e.g. `0.95`.
 * @returns The element at rank `ceil(n * p) - 1`, clamped to the valid index range, or
 *   `null` for an empty list.
 */
function percentile(values: number[], percentileValue: number): number | null {
  const count = values.length;
  if (count === 0) {
    return null;
  }
  const rank = Math.ceil(count * percentileValue) - 1;
  const lastIndex = count - 1;
  const index = rank < 0 ? 0 : rank > lastIndex ? lastIndex : rank;
  return values[index];
}
/**
 * Sums `rowsProduced` across rows, ignoring rows where it is not a number.
 *
 * @param rows - Rows belonging to one template.
 * @returns The total, or `null` when no row carries a numeric `rowsProduced`.
 */
function sumRowsProduced(rows: HistoricSqlRawQueryRow[]): number | null {
  let total = 0;
  let sawNumber = false;
  for (const row of rows) {
    const produced = row.rowsProduced;
    if (typeof produced === 'number') {
      total += produced;
      sawNumber = true;
    }
  }
  return sawNumber ? total : null;
}
/**
 * Compiles redaction patterns into global regular expressions.
 *
 * A pattern that fails to compile is dropped, a `redaction_skipped` warning is
 * appended, and samples are disallowed for the whole policy.
 *
 * @param patterns - Regular-expression sources to compile.
 * @param warnings - Warning sink; appended to in place.
 * @returns The compiled redactors and whether samples may be emitted.
 */
function compileRedactors(patterns: string[], warnings: string[]): RedactionPolicy {
  const redactors: RegExp[] = [];
  let samplesAllowed = true;
  for (const pattern of patterns) {
    try {
      redactors.push(new RegExp(pattern, 'g'));
    } catch (error) {
      samplesAllowed = false;
      const reason = error instanceof Error ? error.message : String(error);
      warnings.push(`redaction_skipped:invalid_redaction_pattern:${pattern}:${reason}`);
    }
  }
  return { redactors, samplesAllowed };
}
/**
 * Applies each redactor in order, replacing its matches with `<redacted>`.
 *
 * @param value - Text to redact.
 * @param redactors - Patterns to apply, in order.
 */
function redactText(value: string, redactors: RegExp[]): string {
  let redacted = value;
  for (const redactor of redactors) {
    redacted = redacted.replace(redactor, '<redacted>');
  }
  return redacted;
}
/**
 * Writes a value as 2-space-indented JSON followed by a newline.
 *
 * @param stagedDir - Root directory of the staged output.
 * @param relPath - Destination path relative to `stagedDir`.
 * @param value - Value to serialize with `JSON.stringify`.
 */
async function writeJson(stagedDir: string, relPath: string, value: unknown): Promise<void> {
  const serialized = JSON.stringify(value, null, 2);
  await writeText(stagedDir, relPath, serialized + '\n');
}
/**
 * Writes UTF-8 text to a file under the staged directory, creating parent directories
 * as needed.
 *
 * @param stagedDir - Root directory of the staged output.
 * @param relPath - Destination path relative to `stagedDir`.
 * @param value - Text content to write.
 */
async function writeText(stagedDir: string, relPath: string, value: string): Promise<void> {
  const destination = join(stagedDir, relPath);
  const parentDir = dirname(destination);
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, value, 'utf-8');
}

View file

@ -0,0 +1,201 @@
import { z } from 'zod';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
/** Source key for historic-SQL ingestion; also the required `source` literal of the manifest schema. */
export const HISTORIC_SQL_SOURCE_KEY = 'historic-sql' as const;
/** Object type literal required by `historicSqlMetadataSchema.objectType`. */
export const HISTORIC_SQL_OBJECT_TYPE = 'historic_sql_template' as const;
/** SQL dialects accepted by the pull config, metadata, and manifest schemas. */
const historicSqlDialectSchema = z.enum(['snowflake', 'bigquery', 'postgres']);
/** Union of the dialect strings accepted by `historicSqlDialectSchema`. */
export type HistoricSqlDialect = z.infer<typeof historicSqlDialectSchema>;
/**
 * Configuration for one historic-SQL pull. Every field except `dialect` has a default,
 * so a config containing only a dialect parses successfully.
 */
export const historicSqlPullConfigSchema = z.object({
dialect: historicSqlDialectSchema,
// Integer number of days, 1 to 365; defaults to 90.
windowDays: z.number().int().min(1).max(365).default(90),
// ISO datetime string or null; defaults to null.
lastSuccessfulCursor: z.string().datetime().nullable().default(null),
// Regular-expression sources; defaults to an empty list.
serviceAccountUserPatterns: z.array(z.string()).default([]),
// Regular-expression sources; defaults to an empty list.
redactionPatterns: z.array(z.string()).default([]),
// Integer 1 to 5000; defaults to the maximum, 5000.
maxTemplatesPerRun: z.number().int().min(1).max(5000).default(5000),
// Integer of at least 1; defaults to 5.
minCalls: z.number().int().min(1).default(5),
});
/** Parsed (output) shape of `historicSqlPullConfigSchema`, with defaults applied. */
export type HistoricSqlPullConfig = z.infer<typeof historicSqlPullConfigSchema>;
/** Time range passed to `HistoricSqlQueryHistoryReader.fetch`. */
export interface HistoricSqlTimeWindow {
// Beginning of the window.
start: Date;
// End of the window.
end: Date;
}
/**
 * Shape of one query-history row as yielded by a reader. Only `id`, `sql`, and
 * `startedAt` are required; the remaining fields fall back to defaults when absent.
 */
export const historicSqlRawQueryRowSchema = z.object({
// Non-empty row identifier.
id: z.string().min(1),
// Non-empty SQL text.
sql: z.string().min(1),
// User name or null; defaults to null.
user: z.string().nullable().default(null),
// ISO datetime string.
startedAt: z.string().datetime(),
// ISO datetime string or null; defaults to null.
endedAt: z.string().datetime().nullable().default(null),
// Non-negative number or null; defaults to null.
runtimeMs: z.number().nonnegative().nullable().default(null),
// Non-negative integer, null, or omitted entirely (no default is applied).
rowsProduced: z.number().int().nonnegative().nullable().optional(),
// Defaults to true when absent.
success: z.boolean().default(true),
// Error text or null; defaults to null.
errorMessage: z.string().nullable().default(null),
});
/** Parsed (output) shape of `historicSqlRawQueryRowSchema`, with defaults applied. */
export type HistoricSqlRawQueryRow = z.infer<typeof historicSqlRawQueryRowSchema>;
/**
 * Reader for a warehouse's query history. The client is typed `unknown`, so each
 * implementation is responsible for interpreting it.
 */
export interface HistoricSqlQueryHistoryReader {
// Check run against the client before fetching; resolves with no value.
// NOTE(review): presumably rejects when the history source is unavailable — confirm against implementations.
probe(client: unknown): Promise<void>;
// Streams history rows for the given window; `cursor` is optional and may be null.
fetch(
client: unknown,
window: HistoricSqlTimeWindow,
cursor?: string | null,
): AsyncIterable<HistoricSqlRawQueryRow>;
}
/** Minimal query client required by the Postgres reader interfaces in this file. */
export interface KloPostgresQueryClient {
// Runs a statement with optional parameters and resolves with column headers, row arrays,
// and an optional total row count.
executeQuery(sql: string, params?: unknown[]): Promise<{ headers: string[]; rows: unknown[][]; totalRows?: number }>;
}
/** Result of `PostgresPgssReader.probe`. */
export interface PostgresPgssProbeResult {
// Server version string reported by the probe.
pgServerVersion: string;
// Warnings collected while probing.
warnings: string[];
}
/** One snapshot returned by `PostgresPgssReader.readSnapshot`. */
export interface PostgresPgssSnapshot {
// NOTE(review): presumably the time statistics were last reset, as a timestamp string — confirm.
statsResetAt: string | null;
// NOTE(review): presumably the statistics deallocation counter — confirm.
deallocCount: number | null;
// Statement rows contained in the snapshot.
rows: PostgresPgssRow[];
}
/**
 * Reader for Postgres statement statistics.
 * NOTE(review): the "Pgss" naming suggests pg_stat_statements — confirm against the implementation.
 */
export interface PostgresPgssReader {
// Probes the server and resolves with its version and any warnings.
probe(client: KloPostgresQueryClient): Promise<PostgresPgssProbeResult>;
// Reads a snapshot, given a minimum call count and a maximum number of templates.
readSnapshot(
client: KloPostgresQueryClient,
options: { minCalls: number; maxTemplates: number },
): Promise<PostgresPgssSnapshot>;
}
/**
 * One statement row from a Postgres statistics snapshot.
 * NOTE(review): field names mirror pg_stat_statements columns; units and exact sources
 * are not established in this file — confirm against the reader implementation.
 */
export interface PostgresPgssRow {
// Identifiers are carried as strings.
queryid: string;
userid: string;
// Null when no user name is available for the row.
username: string | null;
dbid: string;
// Null when no database name is available for the row.
database: string | null;
// Statement text.
query: string;
calls: number;
totalExecTime: number;
meanExecTime: number;
totalRows: number;
}
/**
 * A statement aggregated across users, with `delta*` counters.
 * NOTE(review): presumably deltas are relative to a stored baseline snapshot — confirm.
 */
export interface PostgresPgssAggregateRow {
id: string;
queryid: string;
dbid: string;
database: string | null;
query: string;
deltaCalls: number;
deltaExecTime: number;
deltaRows: number;
meanExecTime: number;
distinctUsersDelta: number;
// User names associated with the aggregated statement.
users: string[];
// Timestamp string recording when the row was first observed.
firstObservedAt: string;
}
/**
 * Dependencies injected into the historic-SQL source adapter. The `postgres*` fields,
 * `now`, and `onPullSucceeded` are optional.
 */
export interface HistoricSqlSourceAdapterDeps {
// SQL analysis port.
sqlAnalysis: SqlAnalysisPort;
// Query-history reader and the opaque client handed to it.
reader: HistoricSqlQueryHistoryReader;
queryClient: unknown;
// Optional Postgres-specific reader, client, and baseline directory.
postgresReader?: PostgresPgssReader;
postgresQueryClient?: KloPostgresQueryClient;
postgresBaselineRootDir?: string;
// Optional clock override.
now?: () => Date;
// Optional async hook that receives the pull context, including the next cursor value.
// NOTE(review): the name suggests it runs after a successful pull — confirm at the call site.
onPullSucceeded?: (ctx: {
connectionId: string;
sourceKey: string;
syncId: string;
trigger: import('../../types.js').IngestTrigger;
completedAt: Date;
stagedDir: string;
nextSuccessfulCursor: string | null;
}) => Promise<void>;
}
/** The three classifications a literal slot can receive. */
const historicSqlLiteralSlotClassificationSchema = z.enum(['constant', 'runtime', 'categorical']);
/** Union of the classification strings accepted by the schema above. */
export type HistoricSqlLiteralSlotClassification = z.infer<typeof historicSqlLiteralSlotClassificationSchema>;
/** Shape of a staged template's `metadata.json`. */
export const historicSqlMetadataSchema = z.object({
id: z.string().min(1),
title: z.string().min(1),
path: z.string().min(1),
// Must equal HISTORIC_SQL_OBJECT_TYPE.
objectType: z.literal(HISTORIC_SQL_OBJECT_TYPE),
// Only null is accepted.
lastEditedAt: z.null(),
properties: z.object({
fingerprint: z.string().min(1),
// Null is allowed.
sub_cluster_id: z.string().nullable(),
dialect: historicSqlDialectSchema,
tables_touched: z.array(z.string()),
// One entry per literal slot; positions are integers of at least 1.
literal_slots: z.array(
z.object({
position: z.number().int().min(1),
type: z.enum(['string', 'number', 'timestamp', 'date', 'boolean', 'null', 'unknown']),
classification: historicSqlLiteralSlotClassificationSchema,
}),
),
// Free-form string-to-string signals.
triage_signals: z.record(z.string(), z.string()),
}),
});
/** Parsed shape of `historicSqlMetadataSchema`. */
export type HistoricSqlMetadata = z.infer<typeof historicSqlMetadataSchema>;
/** Shape of a staged template's `usage.json`: aggregate stats, per-slot value counts, and samples. */
export const historicSqlUsageSchema = z.object({
stats: z.object({
executions: z.number().int().nonnegative(),
distinct_users: z.number().int().nonnegative(),
first_seen: z.string().datetime(),
last_seen: z.string().datetime(),
// Percentiles are null when no runtime is available.
p50_runtime_ms: z.number().nonnegative().nullable(),
p95_runtime_ms: z.number().nonnegative().nullable(),
// Optional; may also be null.
mean_runtime_ms: z.number().nonnegative().nullable().optional(),
// Fraction between 0 and 1 inclusive.
error_rate: z.number().min(0).max(1),
// Optional; may also be null.
rows_produced: z.number().int().nonnegative().nullable().optional(),
}),
literal_slots: z.array(
z.object({
position: z.number().int().min(1),
distinct_values: z.number().int().nonnegative(),
// [value, count] pairs.
top_values: z.array(z.tuple([z.string(), z.number().int().nonnegative()])),
}),
),
samples: z.array(
z.object({
started_at: z.string().datetime(),
user: z.string().nullable(),
bound_sql: z.string(),
// Optional; may also be null.
rows_produced: z.number().int().nonnegative().nullable().optional(),
runtime_ms: z.number().nonnegative().nullable(),
success: z.boolean(),
}),
),
});
/** Parsed shape of `historicSqlUsageSchema`. */
export type HistoricSqlUsage = z.infer<typeof historicSqlUsageSchema>;
/** Shape of the staged `manifest.json` describing one pull and the templates it produced. */
export const historicSqlManifestSchema = z.object({
// Must equal HISTORIC_SQL_SOURCE_KEY.
source: z.literal(HISTORIC_SQL_SOURCE_KEY),
connectionId: z.string().min(1),
dialect: historicSqlDialectSchema,
fetchedAt: z.string().datetime(),
windowStart: z.string().datetime(),
windowEnd: z.string().datetime(),
// ISO datetime string, or null.
nextSuccessfulCursor: z.string().datetime().nullable(),
templateCount: z.number().int().nonnegative(),
capped: z.boolean(),
warnings: z.array(z.string()),
// The next five fields all have defaults (false or null) when omitted.
// NOTE(review): the names suggest they are populated by the Postgres statistics path — confirm.
degraded: z.boolean().default(false),
statsResetAt: z.string().datetime().nullable().default(null),
baselineFirstRun: z.boolean().default(false),
pgServerVersion: z.string().nullable().default(null),
deallocCount: z.number().int().nonnegative().nullable().default(null),
// One entry per staged template.
templates: z.array(
z.object({
id: z.string().min(1),
fingerprint: z.string().min(1),
subClusterId: z.string().nullable(),
path: z.string().min(1),
}),
),
});
/** Parsed (output) shape of `historicSqlManifestSchema`, with defaults applied. */
export type HistoricSqlManifest = z.infer<typeof historicSqlManifestSchema>;

View file

@ -0,0 +1,107 @@
import { mkdtemp } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { KloSchemaSnapshot } from '../../../scan/types.js';
import { chunkLiveDatabaseStagedDir } from './chunk.js';
import { liveDatabaseTablePath, writeLiveDatabaseSnapshot } from './stage.js';
/**
 * Builds a fresh Postgres schema snapshot fixture with two tables, `orders` then
 * `customers`. Both are in the `public` schema and have a single non-nullable integer
 * primary-key column `id` and no foreign keys.
 */
function snapshot(): KloSchemaSnapshot {
  // Each call creates new table and column objects, so fixtures never share state.
  const tableNamed = (name: string): KloSchemaSnapshot['tables'][number] => ({
    name,
    catalog: null,
    db: 'public',
    kind: 'table',
    comment: null,
    estimatedRows: null,
    columns: [
      {
        name: 'id',
        nativeType: 'integer',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey: true,
        comment: null,
      },
    ],
    foreignKeys: [],
  });

  return {
    connectionId: 'conn-1',
    driver: 'postgres',
    extractedAt: '2026-04-27T00:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: {},
    tables: [tableNamed('orders'), tableNamed('customers')],
  };
}
// Exercises chunkLiveDatabaseStagedDir against a snapshot staged into a fresh temp directory.
describe('chunkLiveDatabaseStagedDir', () => {
// Without a diff set, every staged table yields a work unit.
it('emits one work unit per table on the first run', async () => {
const dir = await mkdtemp(join(tmpdir(), 'klo-live-db-chunk-'));
await writeLiveDatabaseSnapshot(dir, snapshot());
const result = await chunkLiveDatabaseStagedDir(dir);
// The expected order (customers before orders) differs from the fixture's table order.
expect(result.workUnits.map((wu) => wu.unitKey)).toEqual([
'live-database-public-customers',
'live-database-public-orders',
]);
expect(result.workUnits[0]?.dependencyPaths).toEqual(['connection.json', 'foreign-keys.json']);
// The first unit (customers) lists the orders table file among its peers.
expect(result.workUnits[0]?.peerFileIndex).toContain(
liveDatabaseTablePath({ catalog: null, db: 'public', name: 'orders' }),
);
});
// With a diff set, only added/modified table files produce work units; deleted table files are reported for eviction.
it('keeps only changed tables during incremental syncs and records table evictions', async () => {
const dir = await mkdtemp(join(tmpdir(), 'klo-live-db-diff-'));
await writeLiveDatabaseSnapshot(dir, snapshot());
const ordersPath = liveDatabaseTablePath({ catalog: null, db: 'public', name: 'orders' });
const customersPath = liveDatabaseTablePath({ catalog: null, db: 'public', name: 'customers' });
const result = await chunkLiveDatabaseStagedDir(dir, {
added: [],
modified: [ordersPath],
deleted: [customersPath],
unchanged: ['connection.json', 'foreign-keys.json'],
});
expect(result.workUnits.map((wu) => wu.unitKey)).toEqual(['live-database-public-orders']);
expect(result.eviction?.deletedRawPaths).toEqual([customersPath]);
});
// A change to a global dependency file re-emits every table, even though no table file is in the diff.
it('fans out all table work units when the foreign-key index changes', async () => {
const dir = await mkdtemp(join(tmpdir(), 'klo-live-db-fk-'));
await writeLiveDatabaseSnapshot(dir, snapshot());
const result = await chunkLiveDatabaseStagedDir(dir, {
added: [],
modified: ['foreign-keys.json'],
deleted: [],
unchanged: [],
});
expect(result.workUnits).toHaveLength(2);
});
});

View file

@ -0,0 +1,58 @@
import type { ChunkResult, DiffSet, WorkUnit } from '../../types.js';
import type { KloSchemaTable } from '../../../scan/types.js';
import { LIVE_DATABASE_FOREIGN_KEYS_FILE, LIVE_DATABASE_META_FILE, readLiveDatabaseTableFiles } from './stage.js';
/**
 * Builds the work-unit key for a table: `live-database-` followed by the slugified
 * catalog, db, and name joined with `-`.
 *
 * Each part is lower-cased, runs of characters outside `a-z0-9` become a single `-`,
 * and leading/trailing dashes are trimmed. Missing or empty parts are skipped; when
 * nothing remains the key is `live-database-table`.
 */
function unitKey(table: KloSchemaTable): string {
  const slugs: string[] = [];
  for (const part of [table.catalog, table.db, table.name]) {
    if (typeof part !== 'string' || part.length === 0) {
      continue;
    }
    const slug = part
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-+|-+$/g, '');
    if (slug) {
      slugs.push(slug);
    }
  }
  const suffix = slugs.length > 0 ? slugs.join('-') : 'table';
  return `live-database-${suffix}`;
}
/** Joins the table's non-empty catalog, db, and name with dots, e.g. `public.orders`. */
function displayName(table: KloSchemaTable): string {
  const segments: string[] = [];
  for (const part of [table.catalog, table.db, table.name]) {
    if (part) {
      segments.push(part);
    }
  }
  return segments.join('.');
}
/** Reports whether a staged path is a table file: under `tables/` and ending in `.json`. */
function isTablePath(path: string): boolean {
  if (!path.startsWith('tables/')) {
    return false;
  }
  return path.endsWith('.json');
}
/**
 * Splits a staged live-database snapshot into one work unit per table file.
 *
 * Without a diff set, every table produces a work unit. With a diff set, only tables
 * whose file was added or modified produce one — unless a global dependency file
 * (`LIVE_DATABASE_META_FILE` or `LIVE_DATABASE_FOREIGN_KEYS_FILE`) was added or
 * modified, in which case every table is emitted again.
 *
 * @param stagedDir - Directory containing the staged snapshot.
 * @param diffSet - Optional change set relative to the previous sync.
 * @returns The work units, plus an `eviction` entry listing deleted table files (sorted)
 *   when the diff set contains any.
 */
export async function chunkLiveDatabaseStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const tableFiles = await readLiveDatabaseTableFiles(stagedDir);
  // Sort once up front: filtering a sorted list keeps it sorted, so the per-table peer
  // index below no longer needs its own sort on every iteration.
  const sortedTablePaths = tableFiles.map((file) => file.path).sort();

  const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;
  const globalTouched =
    touched !== null && (touched.has(LIVE_DATABASE_META_FILE) || touched.has(LIVE_DATABASE_FOREIGN_KEYS_FILE));

  const workUnits: WorkUnit[] = [];
  for (const file of tableFiles) {
    if (touched && !globalTouched && !touched.has(file.path)) {
      continue;
    }
    const label = displayName(file.table);
    const columnCount = file.table.columns.length;
    workUnits.push({
      unitKey: unitKey(file.table),
      displayLabel: `Live database table ${label}`,
      rawFiles: [file.path],
      peerFileIndex: sortedTablePaths.filter((path) => path !== file.path),
      // A fresh array per unit, so mutating one unit's dependencies cannot affect the others.
      dependencyPaths: [LIVE_DATABASE_META_FILE, LIVE_DATABASE_FOREIGN_KEYS_FILE],
      notes: `Database catalog snapshot for ${label} with ${columnCount} column${columnCount === 1 ? '' : 's'}.`,
    });
  }

  const deletedRawPaths = diffSet ? diffSet.deleted.filter(isTablePath).sort() : [];
  return {
    workUnits,
    ...(deletedRawPaths.length > 0 ? { eviction: { deletedRawPaths } } : {}),
  };
}

View file

@ -0,0 +1,224 @@
import { once } from 'node:events';
import { createServer } from 'node:http';
import { describe, expect, it, vi } from 'vitest';
import { createDaemonLiveDatabaseIntrospection } from './daemon-introspection.js';
// Canned daemon introspection response (snake_case keys) shared by the tests below:
// a `customers` table and an `orders` table with a foreign key to `customers.id`.
const daemonResponse = {
connection_id: 'warehouse',
extracted_at: '2026-04-28T10:00:00+00:00',
metadata: { driver: 'postgres', schemas: ['public'] },
tables: [
{
catalog: 'warehouse',
db: 'public',
name: 'customers',
comment: null,
columns: [{ name: 'id', type: 'integer', nullable: false, primary_key: true, comment: null }],
foreign_keys: [],
},
{
catalog: 'warehouse',
db: 'public',
name: 'orders',
comment: 'Order facts',
columns: [
{ name: 'id', type: 'integer', nullable: false, primary_key: true, comment: 'Order id' },
{ name: 'customer_id', type: 'integer', nullable: false, primary_key: false, comment: null },
],
foreign_keys: [
{
from_column: 'customer_id',
to_table: 'customers',
to_column: 'id',
constraint_name: 'orders_customer_id_fkey',
},
],
},
],
};
// Covers the subprocess (runJson) and HTTP (baseUrl) transports plus connection validation.
describe('createDaemonLiveDatabaseIntrospection', () => {
// Subprocess transport: the injected runner stands in for the daemon command.
it('calls the database-introspect daemon command and maps the snapshot response', async () => {
const runJson = vi.fn(async () => daemonResponse);
const introspection = createDaemonLiveDatabaseIntrospection({
connections: {
warehouse: {
driver: 'postgres',
url: 'postgres://localhost:5432/warehouse',
readonly: true,
},
},
schemas: ['public'],
runJson,
});
// The snake_case daemon payload is expected to come back as a camelCase snapshot with
// normalized column types; fields absent from the response (kind, estimatedRows,
// toCatalog, toDb) are expected as 'table' / null.
await expect(introspection.extractSchema('warehouse')).resolves.toEqual({
connectionId: 'warehouse',
driver: 'postgres',
extractedAt: '2026-04-28T10:00:00+00:00',
scope: { schemas: ['public'] },
metadata: { driver: 'postgres', schemas: ['public'] },
tables: [
{
catalog: 'warehouse',
db: 'public',
name: 'customers',
kind: 'table',
comment: null,
estimatedRows: null,
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: null,
},
],
foreignKeys: [],
},
{
catalog: 'warehouse',
db: 'public',
name: 'orders',
kind: 'table',
comment: 'Order facts',
estimatedRows: null,
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Order id',
},
{
name: 'customer_id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
],
foreignKeys: [
{
fromColumn: 'customer_id',
toCatalog: null,
toDb: null,
toTable: 'customers',
toColumn: 'id',
constraintName: 'orders_customer_id_fkey',
},
],
},
],
});
// The timeout values are not passed in the options above, so these are the expected defaults.
expect(runJson).toHaveBeenCalledWith('database-introspect', {
connection_id: 'warehouse',
driver: 'postgres',
url: 'postgres://localhost:5432/warehouse',
schemas: ['public'],
statement_timeout_ms: 30_000,
connection_timeout_seconds: 5,
});
});
// HTTP transport: a throwaway local server records each request and replies with the canned response.
it('calls a running daemon HTTP endpoint when baseUrl is configured', async () => {
const requests: Array<{ url: string | undefined; body: unknown }> = [];
const server = createServer((request, response) => {
const chunks: Buffer[] = [];
request.on('data', (chunk: Buffer) => chunks.push(chunk));
request.on('end', () => {
requests.push({
url: request.url,
body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
});
response.writeHead(200, { 'content-type': 'application/json' });
response.end(JSON.stringify(daemonResponse));
});
});
// Port 0 lets the OS pick a free port.
server.listen(0, '127.0.0.1');
await once(server, 'listening');
try {
const address = server.address();
if (!address || typeof address === 'string') {
throw new Error('expected TCP server address');
}
// The driver is configured as 'postgresql' here; the request body below is expected to carry 'postgres'.
const introspection = createDaemonLiveDatabaseIntrospection({
connections: {
warehouse: {
driver: 'postgresql',
url: 'postgres://localhost:5432/warehouse',
readonly: true,
},
},
baseUrl: `http://127.0.0.1:${address.port}`,
});
await expect(introspection.extractSchema('warehouse')).resolves.toMatchObject({
connectionId: 'warehouse',
tables: [{ name: 'customers' }, { name: 'orders' }],
});
// No schemas option was given, so ['public'] is the expected default.
expect(requests).toEqual([
{
url: '/database/introspect',
body: {
connection_id: 'warehouse',
driver: 'postgres',
url: 'postgres://localhost:5432/warehouse',
schemas: ['public'],
statement_timeout_ms: 30_000,
connection_timeout_seconds: 5,
},
},
]);
} finally {
// Close the server even when an assertion fails (the close is not awaited).
server.close();
}
});
// A connection with readonly: false must be rejected.
it('requires a configured read-only postgres connection with a url', async () => {
const introspection = createDaemonLiveDatabaseIntrospection({
connections: {
warehouse: {
driver: 'postgres',
url: 'postgres://localhost:5432/warehouse',
readonly: false,
},
},
runJson: vi.fn(async () => daemonResponse),
});
await expect(introspection.extractSchema('warehouse')).rejects.toThrow(
'Local live-database ingest requires connections.warehouse.readonly: true.',
);
});
// An unsupported driver must fail validation without invoking the runner.
it('rejects unsupported local connection drivers before calling the daemon', async () => {
const runJson = vi.fn(async () => daemonResponse);
const introspection = createDaemonLiveDatabaseIntrospection({
connections: {
warehouse: {
driver: 'snowflake',
url: 'snowflake://example',
readonly: true,
},
},
runJson,
});
await expect(introspection.extractSchema('warehouse')).rejects.toThrow(
'Local live-database ingest cannot run driver "snowflake".',
);
expect(runJson).not.toHaveBeenCalled();
});
});

View file

@ -0,0 +1,256 @@
import { spawn } from 'node:child_process';
import { request as httpRequest } from 'node:http';
import { request as httpsRequest } from 'node:https';
import { URL } from 'node:url';
import type { KloProjectConnectionConfig } from '../../../project/config.js';
import type { KloSchemaColumn, KloSchemaForeignKey, KloSchemaSnapshot, KloSchemaTable } from '../../../scan/types.js';
import { inferKloDimensionType, normalizeKloNativeType } from '../../../scan/type-normalization.js';
import type { LiveDatabaseIntrospectionPort } from './types.js';
/** Subcommand names accepted by the process-based daemon runner. */
export type KloDaemonDatabaseIntrospectionCommand = 'database-introspect';
/**
 * Runs one daemon subcommand with a JSON payload and resolves with the JSON
 * object the daemon produced.
 */
export type KloDaemonDatabaseJsonRunner = (
subcommand: KloDaemonDatabaseIntrospectionCommand,
payload: Record<string, unknown>,
) => Promise<Record<string, unknown>>;
/**
 * Sends a JSON payload to a daemon HTTP path and resolves with the JSON object
 * returned in the response body.
 */
export type KloDaemonDatabaseHttpJsonRunner = (
path: string,
payload: Record<string, unknown>,
) => Promise<Record<string, unknown>>;
/** Configuration for `createDaemonLiveDatabaseIntrospection`. */
export interface DaemonLiveDatabaseIntrospectionOptions {
// Project connections keyed by connection id; looked up on every extractSchema call.
connections: Record<string, KloProjectConnectionConfig>;
// Schemas to introspect; DEFAULT_SCHEMAS is used when omitted.
schemas?: string[];
// Sent as statement_timeout_ms; 30_000 when omitted.
statementTimeoutMs?: number;
// Sent as connection_timeout_seconds; 5 when omitted.
connectionTimeoutSeconds?: number;
// Executable for the process runner; 'python' when omitted.
command?: string;
// Arguments placed before the subcommand; ['-m', 'klo_daemon'] when omitted.
args?: string[];
// Working directory handed to spawn for the process runner.
cwd?: string;
// Extra environment variables layered over process.env for the process runner.
env?: NodeJS.ProcessEnv;
// When set (and requestJson is not), requests are POSTed to this daemon base URL
// instead of spawning a process.
baseUrl?: string;
// Replaces the spawned-process runner (used by tests).
runJson?: KloDaemonDatabaseJsonRunner;
// Replaces the HTTP runner; takes precedence over baseUrl.
requestJson?: KloDaemonDatabaseHttpJsonRunner;
// Clock used for the extractedAt fallback; defaults to the current time.
now?: () => Date;
}
// Schema list used when the caller does not provide `schemas`.
const DEFAULT_SCHEMAS = ['public'];
/**
 * Parses daemon output and guarantees the result is a JSON object.
 *
 * @param raw - Text produced by the daemon (process stdout or HTTP body).
 * @param subcommand - Label of the daemon call, used only in error messages.
 * @returns The parsed value, typed as a string-keyed record.
 * @throws Error when `raw` is not valid JSON (for example empty output), or
 *   when it parses to anything other than a non-array object.
 */
function parseJsonObject(raw: string, subcommand: string): Record<string, unknown> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    // A bare SyntaxError gives no hint about which daemon call produced the
    // malformed output, so re-throw with that context attached.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`klo-daemon ${subcommand} returned invalid JSON: ${reason}`);
  }
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error(`klo-daemon ${subcommand} returned non-object JSON`);
  }
  return parsed as Record<string, unknown>;
}
/**
 * Builds a runner that executes the daemon as a child process.
 *
 * Each call spawns `command [...args, subcommand]`, writes the JSON payload
 * plus a trailing newline to the child's stdin, and parses the child's stdout
 * as a JSON object once the process has closed.
 *
 * @param options - Executable, leading arguments, and optional cwd / extra env.
 * @returns A runner that rejects when the process cannot be spawned, exits
 *   with a non-zero code or is killed by a signal, or prints output that is
 *   not a JSON object.
 */
function runProcessJson(
  options: Required<Pick<DaemonLiveDatabaseIntrospectionOptions, 'command' | 'args'>> &
    Pick<DaemonLiveDatabaseIntrospectionOptions, 'cwd' | 'env'>,
): KloDaemonDatabaseJsonRunner {
  return async (subcommand, payload) =>
    new Promise((resolve, reject) => {
      const child = spawn(options.command, [...options.args, subcommand], {
        cwd: options.cwd,
        env: { ...process.env, ...options.env },
        stdio: ['pipe', 'pipe', 'pipe'],
      });
      const stdout: Buffer[] = [];
      const stderr: Buffer[] = [];
      child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk));
      child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk));
      child.on('error', reject);
      // A child that exits before reading its input makes the stdin write fail
      // (EPIPE). Without a listener that stream error would be thrown as an
      // uncaught exception; the failure itself is reported by the 'error' or
      // 'close' handlers, which carry the more useful stderr / exit status.
      child.stdin.on('error', () => undefined);
      child.on('close', (code, signal) => {
        const stdoutText = Buffer.concat(stdout).toString('utf8').trim();
        const stderrText = Buffer.concat(stderr).toString('utf8').trim();
        if (code !== 0) {
          // `code` is null when the child was terminated by a signal.
          const status = code === null ? `signal ${signal ?? 'unknown'}` : `exit code ${code}`;
          reject(new Error(`klo-daemon ${subcommand} failed: ${stderrText || status}`));
          return;
        }
        try {
          resolve(parseJsonObject(stdoutText, subcommand));
        } catch (error) {
          reject(error);
        }
      });
      child.stdin.end(`${JSON.stringify(payload)}\n`);
    });
}
/**
 * Ensures a base URL ends with a slash so that relative paths resolved against
 * it are appended rather than replacing its last path segment.
 */
function normalizedBaseUrl(baseUrl: string): string {
  if (baseUrl.endsWith('/')) {
    return baseUrl;
  }
  return baseUrl + '/';
}
/**
 * Builds a runner that POSTs JSON payloads to a daemon reachable over HTTP(S).
 *
 * The request path is resolved relative to `baseUrl` (a leading slash on the
 * path is dropped so any path prefix in `baseUrl` is kept). The transport is
 * chosen from the resolved URL's protocol.
 *
 * @param baseUrl - Base URL of the daemon.
 * @returns A runner that rejects on transport errors, on non-2xx status codes
 *   (the response body is included in the message), and on bodies that are not
 *   a JSON object.
 */
function postJson(baseUrl: string): KloDaemonDatabaseHttpJsonRunner {
  return async (path, payload) =>
    new Promise((resolve, reject) => {
      const target = new URL(path.replace(/^\//, ''), normalizedBaseUrl(baseUrl));
      const body = JSON.stringify(payload);
      const client = target.protocol === 'https:' ? httpsRequest : httpRequest;
      const request = client(
        target,
        {
          method: 'POST',
          headers: {
            accept: 'application/json',
            'content-type': 'application/json',
            'content-length': Buffer.byteLength(body),
          },
        },
        (response) => {
          const chunks: Buffer[] = [];
          response.on('data', (chunk: Buffer) => chunks.push(chunk));
          // If the connection drops after the headers arrived, 'end' never
          // fires; listening for the response error settles the promise
          // instead of leaving the caller waiting forever.
          response.on('error', reject);
          response.on('end', () => {
            const text = Buffer.concat(chunks).toString('utf8');
            const statusCode = response.statusCode ?? 0;
            if (statusCode < 200 || statusCode >= 300) {
              reject(new Error(`klo-daemon HTTP ${path} failed with ${statusCode}: ${text}`));
              return;
            }
            try {
              resolve(parseJsonObject(text, path));
            } catch (error) {
              reject(error);
            }
          });
        },
      );
      request.on('error', reject);
      request.end(body);
    });
}
/**
 * Returns `value` unchanged when it is a non-null, non-array object; any other
 * input yields a fresh empty object.
 */
function recordValue(value: unknown): Record<string, unknown> {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return {};
  }
  return value as Record<string, unknown>;
}
/**
 * Keeps only the plain-object elements of an array, in their original order.
 * Non-array input, and elements that are null, primitives, or arrays, are
 * dropped.
 */
function recordArray(value: unknown): Array<Record<string, unknown>> {
  const records: Array<Record<string, unknown>> = [];
  if (!Array.isArray(value)) {
    return records;
  }
  for (const item of value) {
    if (item !== null && typeof item === 'object' && !Array.isArray(item)) {
      records.push(item as Record<string, unknown>);
    }
  }
  return records;
}
/**
 * Returns `value` when it is a non-empty string.
 *
 * @param field - Path of the field inside the daemon response, used in the error.
 * @throws Error when `value` is not a string or is the empty string.
 */
function requiredString(value: unknown, field: string): string {
  const usable = typeof value === 'string' && value.length > 0;
  if (usable) {
    return value;
  }
  throw new Error(`klo-daemon database introspection response is missing string field ${field}`);
}
/** Passes strings through (including the empty string); everything else becomes `null`. */
function nullableString(value: unknown): string | null {
  if (typeof value !== 'string') {
    return null;
  }
  return value;
}
/** Passes strings through (including the empty string); everything else becomes `undefined`. */
function optionalString(value: unknown): string | undefined {
  if (typeof value !== 'string') {
    return undefined;
  }
  return value;
}
/**
 * Canonicalizes a driver name: null/undefined become the empty string, the
 * value is stringified, trimmed and lower-cased, and the alias `postgresql` is
 * folded into `postgres`.
 */
function normalizeDriver(driver: unknown): string {
  const lowered = String(driver ?? '')
    .trim()
    .toLowerCase();
  if (lowered === 'postgresql') {
    return 'postgres';
  }
  return lowered;
}
/**
 * Looks up a connection and verifies that local live-database ingest may use it.
 *
 * Checks run in this order: the driver must normalize to `postgres` (a missing
 * connection is reported with driver "unknown"), `readonly` must be exactly
 * `true`, and `url` must be a string that is not blank.
 *
 * @returns The connection, typed with a guaranteed string `url`.
 * @throws Error describing the first check that failed.
 */
function requirePostgresConnection(
  connections: Record<string, KloProjectConnectionConfig>,
  connectionId: string,
): KloProjectConnectionConfig & { url: string } {
  const candidate = connections[connectionId];
  if (normalizeDriver(candidate?.driver) !== 'postgres') {
    const reportedDriver = candidate?.driver ?? 'unknown';
    throw new Error(`Local live-database ingest cannot run driver "${reportedDriver}".`);
  }
  if (candidate?.readonly !== true) {
    throw new Error(`Local live-database ingest requires connections.${connectionId}.readonly: true.`);
  }
  const hasUrl = typeof candidate.url === 'string' && candidate.url.trim().length > 0;
  if (!hasUrl) {
    throw new Error(`Local live-database ingest requires connections.${connectionId}.url.`);
  }
  return candidate as KloProjectConnectionConfig & { url: string };
}
/**
 * Converts one raw daemon column record into a `KloSchemaColumn`.
 *
 * `type` and `name` are mandatory strings. A column is treated as nullable
 * unless the daemon explicitly reports `nullable: false`, and as a primary key
 * only when `primary_key` is exactly `true`.
 *
 * @throws Error when `type` or `name` is missing or empty (type is checked first).
 */
function mapColumn(raw: Record<string, unknown>): KloSchemaColumn {
  const nativeType = requiredString(raw.type, 'tables[].columns[].type');
  const name = requiredString(raw.name, 'tables[].columns[].name');
  const normalizedType = normalizeKloNativeType(nativeType);
  const dimensionType = inferKloDimensionType(nativeType);
  return {
    name,
    nativeType,
    normalizedType,
    dimensionType,
    nullable: raw.nullable !== false,
    primaryKey: raw.primary_key === true,
    comment: nullableString(raw.comment),
  };
}
/**
 * Converts one raw daemon foreign-key record into a `KloSchemaForeignKey`.
 *
 * `from_column`, `to_table` and `to_column` are mandatory; the target catalog
 * and db are always emitted as `null`.
 *
 * @throws Error when a mandatory field is missing or empty.
 */
function mapForeignKey(raw: Record<string, unknown>): KloSchemaForeignKey {
  const fromColumn = requiredString(raw.from_column, 'tables[].foreign_keys[].from_column');
  const toTable = requiredString(raw.to_table, 'tables[].foreign_keys[].to_table');
  const toColumn = requiredString(raw.to_column, 'tables[].foreign_keys[].to_column');
  const constraintName = nullableString(raw.constraint_name);
  return {
    fromColumn,
    toCatalog: null,
    toDb: null,
    toTable,
    toColumn,
    constraintName,
  };
}
/**
 * Converts one raw daemon table record into a `KloSchemaTable`.
 *
 * Every entry is emitted with kind `'table'` and no row estimate. Non-object
 * items inside `columns` / `foreign_keys` are ignored.
 *
 * @throws Error when the table name, or a mandatory column / foreign-key
 *   field, is missing or empty.
 */
function mapTable(raw: Record<string, unknown>): KloSchemaTable {
  const catalog = nullableString(raw.catalog);
  const db = nullableString(raw.db);
  const name = requiredString(raw.name, 'tables[].name');
  const comment = nullableString(raw.comment);
  const columns = recordArray(raw.columns).map((column) => mapColumn(column));
  const foreignKeys = recordArray(raw.foreign_keys).map((foreignKey) => mapForeignKey(foreignKey));
  return {
    catalog,
    db,
    name,
    kind: 'table',
    comment,
    estimatedRows: null,
    columns,
    foreignKeys,
  };
}
/**
 * Converts a raw daemon introspection response into a `KloSchemaSnapshot`.
 *
 * @param raw - JSON object returned by the daemon.
 * @param input - Request-side context. `extractedAt` is used when the daemon
 *   does not report `extracted_at`; `schemas` becomes the snapshot scope.
 * @returns A snapshot whose driver is always `'postgres'`.
 * @throws Error when `connection_id`, or any mandatory table / column /
 *   foreign-key field, is missing or empty.
 */
function mapDaemonSnapshot(
  raw: Record<string, unknown>,
  input: { connectionId: string; extractedAt: string; schemas: string[] },
): KloSchemaSnapshot {
  // `connection_id` is a mandatory part of the daemon response: requiredString
  // throws on a missing or empty value and otherwise returns a non-empty
  // string, so the former `|| input.connectionId` fallback could never run and
  // has been removed.
  const connectionId = requiredString(raw.connection_id, 'connection_id');
  return {
    connectionId,
    driver: 'postgres',
    extractedAt: optionalString(raw.extracted_at) ?? input.extractedAt,
    scope: { schemas: input.schemas },
    metadata: recordValue(raw.metadata),
    tables: recordArray(raw.tables).map(mapTable),
  };
}
/**
 * Creates a live-database introspection port backed by the klo daemon.
 *
 * Transport selection: `requestJson` if provided, otherwise an HTTP client for
 * `baseUrl` if provided, otherwise `runJson`, otherwise a spawned daemon
 * process (`python -m klo_daemon` unless `command` / `args` override it).
 *
 * `extractSchema` validates the connection (read-only postgres with a url)
 * before contacting the daemon and rejects with a descriptive error when the
 * validation, the transport, or the response mapping fails.
 */
export function createDaemonLiveDatabaseIntrospection(
  options: DaemonLiveDatabaseIntrospectionOptions,
): LiveDatabaseIntrospectionPort {
  const schemas = options.schemas ?? DEFAULT_SCHEMAS;
  const viaProcess =
    options.runJson ??
    runProcessJson({
      command: options.command ?? 'python',
      args: options.args ?? ['-m', 'klo_daemon'],
      cwd: options.cwd,
      env: options.env,
    });
  const viaHttp = options.requestJson ?? (options.baseUrl ? postJson(options.baseUrl) : undefined);
  const clock = options.now ?? (() => new Date());

  async function extractSchema(connectionId: string): Promise<KloSchemaSnapshot> {
    const connection = requirePostgresConnection(options.connections, connectionId);
    const payload = {
      connection_id: connectionId,
      driver: normalizeDriver(connection.driver),
      url: connection.url,
      schemas,
      statement_timeout_ms: options.statementTimeoutMs ?? 30_000,
      connection_timeout_seconds: options.connectionTimeoutSeconds ?? 5,
    };
    // The HTTP transport wins whenever one is configured.
    let raw: Record<string, unknown>;
    if (viaHttp) {
      raw = await viaHttp('/database/introspect', payload);
    } else {
      raw = await viaProcess('database-introspect', payload);
    }
    const extractedAt = clock().toISOString();
    return mapDaemonSnapshot(raw, { connectionId, extractedAt, schemas });
  }

  return { extractSchema };
}

View file

@ -0,0 +1,136 @@
import { describe, expect, it } from 'vitest';
import type { KloSchemaSnapshot } from '../../../scan/types.js';
import { buildLiveDatabaseTableNaturalKey, kloSchemaSnapshotToExtractedSchema } from './extracted-schema.js';
// Fixture: a postgres snapshot for connection 'conn-1' with two tables in the
// 'public' schema. 'orders' carries table/column comments and one named foreign
// key to 'customers'; 'customers' has no comments and no foreign keys.
function snapshot(): KloSchemaSnapshot {
return {
connectionId: 'conn-1',
driver: 'postgres',
extractedAt: '2026-04-27T00:00:00.000Z',
scope: { schemas: ['public'] },
metadata: { driver: 'postgres' },
tables: [
{
name: 'orders',
catalog: null,
db: 'public',
kind: 'table',
comment: 'Orders placed by customers',
estimatedRows: null,
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: 'Primary key',
},
{
name: 'customer_id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
],
foreignKeys: [
{
fromColumn: 'customer_id',
toCatalog: null,
toDb: 'public',
toTable: 'customers',
toColumn: 'id',
constraintName: 'orders_customer_id_fkey',
},
],
},
{
name: 'customers',
catalog: null,
db: 'public',
kind: 'table',
comment: null,
estimatedRows: null,
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: null,
},
],
foreignKeys: [],
},
],
};
}
// Covers the snapshot -> extracted-schema conversion and the natural-key helper.
describe('kloSchemaSnapshotToExtractedSchema', () => {
// The conversion keeps names, native types, nullability, primary keys and
// comments (as dbComment), and adds fromTable to each foreign key.
it('preserves structural table, column, comment, and key metadata', () => {
const extracted = kloSchemaSnapshotToExtractedSchema(snapshot());
expect(extracted.tables).toEqual([
{
name: 'orders',
catalog: null,
db: 'public',
dbComment: 'Orders placed by customers',
columns: [
{
name: 'id',
type: 'integer',
nullable: false,
primaryKey: true,
dbComment: 'Primary key',
},
{
name: 'customer_id',
type: 'integer',
nullable: false,
primaryKey: false,
dbComment: null,
},
],
foreignKeys: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
constraintName: 'orders_customer_id_fkey',
},
],
},
{
name: 'customers',
catalog: null,
db: 'public',
dbComment: null,
columns: [
{
name: 'id',
type: 'integer',
nullable: false,
primaryKey: true,
dbComment: null,
},
],
foreignKeys: [],
},
]);
});
// A null catalog contributes an empty segment, so the key starts with '|'.
it('builds the same natural key shape used by schema sync', () => {
expect(buildLiveDatabaseTableNaturalKey({ catalog: null, db: 'public', name: 'orders' })).toBe('|public|orders');
expect(buildLiveDatabaseTableNaturalKey({ catalog: 'warehouse', db: 'analytics', name: 'events' })).toBe(
'warehouse|analytics|events',
);
});
});

View file

@ -0,0 +1,61 @@
import type { KloSchemaSnapshot, KloSchemaTable } from '../../../scan/types.js';
/** Single-column foreign key, including the table that owns it. */
export interface LiveDatabaseExtractedForeignKey {
fromTable: string;
fromColumn: string;
toTable: string;
toColumn: string;
// Omitted (not null) when the snapshot has no constraint name.
constraintName?: string;
}
/** Column of an extracted table. */
export interface LiveDatabaseExtractedColumn {
name: string;
// Native database type, copied from the snapshot column's nativeType.
type: string;
nullable: boolean;
primaryKey: boolean;
// Comment stored in the database for this column, or null.
dbComment: string | null;
}
/** Table of an extracted schema. */
export interface LiveDatabaseExtractedTable {
name: string;
catalog: string | null;
db: string | null;
// Comment stored in the database for this table, or null.
dbComment: string | null;
columns: LiveDatabaseExtractedColumn[];
foreignKeys: LiveDatabaseExtractedForeignKey[];
}
/** Structural view of a schema snapshot: tables, columns, comments and keys. */
export interface LiveDatabaseExtractedSchema {
connectionId?: string;
tables: LiveDatabaseExtractedTable[];
}
/**
 * Builds the `catalog|db|name` key that identifies a table. A null or
 * undefined catalog / db contributes an empty segment, so the separators are
 * always present.
 */
export function buildLiveDatabaseTableNaturalKey(table: Pick<KloSchemaTable, 'catalog' | 'db' | 'name'>): string {
  const catalogPart = table.catalog ?? '';
  const dbPart = table.db ?? '';
  return catalogPart + '|' + dbPart + '|' + `${table.name}`;
}
/**
 * Reduces a schema snapshot to its structural content.
 *
 * Per table it keeps name, catalog, db and comment; per column the name,
 * native type, nullability, primary-key flag and comment; per foreign key the
 * owning table, both columns and the target table. The constraint name is
 * included only when the snapshot provides a non-empty one.
 */
export function kloSchemaSnapshotToExtractedSchema(snapshot: KloSchemaSnapshot): LiveDatabaseExtractedSchema {
  const tables: LiveDatabaseExtractedTable[] = [];
  for (const table of snapshot.tables) {
    const columns: LiveDatabaseExtractedColumn[] = table.columns.map((column) => ({
      name: column.name,
      type: column.nativeType,
      nullable: column.nullable,
      primaryKey: column.primaryKey,
      dbComment: column.comment ?? null,
    }));
    const foreignKeys: LiveDatabaseExtractedForeignKey[] = table.foreignKeys.map((foreignKey) => {
      const extracted: LiveDatabaseExtractedForeignKey = {
        fromTable: table.name,
        fromColumn: foreignKey.fromColumn,
        toTable: foreignKey.toTable,
        toColumn: foreignKey.toColumn,
      };
      // Leave the property absent rather than null/empty when there is no name.
      if (foreignKey.constraintName) {
        extracted.constraintName = foreignKey.constraintName;
      }
      return extracted;
    });
    tables.push({
      name: table.name,
      catalog: table.catalog ?? null,
      db: table.db ?? null,
      dbComment: table.comment ?? null,
      columns,
      foreignKeys,
    });
  }
  return { connectionId: snapshot.connectionId, tables };
}

View file

@ -0,0 +1,59 @@
import { mkdtemp } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it, vi } from 'vitest';
import { LiveDatabaseSourceAdapter } from './live-database.adapter.js';
// Exercises the adapter end to end against a stubbed introspection port and a
// temporary staging directory.
describe('LiveDatabaseSourceAdapter', () => {
// fetch must call the port with the context's connection id and stage a
// snapshot that detect recognizes and chunk turns into one work unit.
it('fetches a schema snapshot through the introspection port', async () => {
const extractSchema = vi.fn().mockResolvedValue({
connectionId: 'conn-1',
driver: 'postgres',
extractedAt: '2026-04-27T00:00:00.000Z',
scope: { schemas: ['public'] },
metadata: {},
tables: [
{
name: 'orders',
catalog: null,
db: 'public',
kind: 'table',
comment: null,
estimatedRows: null,
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: null,
},
],
foreignKeys: [],
},
],
});
const adapter = new LiveDatabaseSourceAdapter({
introspection: { extractSchema },
now: () => new Date('2026-04-27T00:00:00.000Z'),
});
const dir = await mkdtemp(join(tmpdir(), 'klo-live-db-adapter-'));
await adapter.fetch(undefined, dir, { connectionId: 'conn-1', sourceKey: 'live-database' });
expect(extractSchema).toHaveBeenCalledWith('conn-1');
await expect(adapter.detect(dir)).resolves.toBe(true);
const chunked = await adapter.chunk(dir);
expect(chunked.workUnits.map((wu) => wu.unitKey)).toEqual(['live-database-public-orders']);
});
// The static identifiers are part of the adapter contract.
it('declares the live database source and skill', () => {
const adapter = new LiveDatabaseSourceAdapter({
introspection: { extractSchema: vi.fn() },
});
expect(adapter.source).toBe('live-database');
expect(adapter.skillNames).toEqual(['live_database_ingest']);
});
});

View file

@ -0,0 +1,28 @@
import type { ChunkResult, DiffSet, FetchContext, SourceAdapter } from '../../types.js';
import { chunkLiveDatabaseStagedDir } from './chunk.js';
import { detectLiveDatabaseStagedDir, writeLiveDatabaseSnapshot } from './stage.js';
import type { LiveDatabaseSourceAdapterDeps } from './types.js';
/**
 * Source adapter that stages a live database schema snapshot obtained through
 * the injected introspection port, and delegates detection and chunking of the
 * staged directory to the live-database stage / chunk helpers.
 */
export class LiveDatabaseSourceAdapter implements SourceAdapter {
  readonly source = 'live-database';
  readonly skillNames = ['live_database_ingest'];

  constructor(private readonly deps: LiveDatabaseSourceAdapterDeps) {}

  /** Reports whether `stagedDir` holds a staged live-database snapshot. */
  detect(stagedDir: string): Promise<boolean> {
    return detectLiveDatabaseStagedDir(stagedDir);
  }

  /**
   * Extracts the schema for the context's connection and writes it to
   * `stagedDir`. The staged snapshot always carries the context's connection
   * id; `extractedAt` falls back to the injected clock (or the current time)
   * when the snapshot does not provide one.
   */
  async fetch(_pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
    const { connectionId } = ctx;
    const snapshot = await this.deps.introspection.extractSchema(connectionId);
    let extractedAt = snapshot.extractedAt;
    if (extractedAt == null) {
      const clock = this.deps.now ?? (() => new Date());
      extractedAt = clock().toISOString();
    }
    await writeLiveDatabaseSnapshot(stagedDir, { ...snapshot, connectionId, extractedAt });
  }

  /** Splits the staged snapshot into work units, optionally limited by a diff set. */
  chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    return chunkLiveDatabaseStagedDir(stagedDir, diffSet);
  }
}

View file

@ -0,0 +1,252 @@
import { describe, expect, it } from 'vitest';
import {
buildLiveDatabaseManifestShards,
type LiveDatabaseManifestExistingDescriptions,
type LiveDatabaseManifestJoinEntry,
type LiveDatabaseManifestShard,
} from './manifest.js';
// Test helper: flattens the shard map into a plain object whose keys are
// inserted in localeCompare order, so expectations are easy to write.
function shardObject(shards: Map<string, LiveDatabaseManifestShard>): Record<string, LiveDatabaseManifestShard> {
  const ordered = Array.from(shards.entries());
  ordered.sort((left, right) => left[0].localeCompare(right[0]));
  return Object.fromEntries(ordered);
}
// Covers shard construction: description merging, generated and preserved
// joins, shard-key selection per connection type, and multi-column joins.
describe('buildLiveDatabaseManifestShards', () => {
// Existing 'user' descriptions survive while 'db' values are replaced by the
// fresh ones; the preserved manual join to a table that is not part of the
// input ('missing_accounts') is dropped; each generated join also appears
// reversed on the target table.
it('builds shard objects with generated joins and preserved external descriptions', () => {
const existingDescriptions = new Map<string, LiveDatabaseManifestExistingDescriptions>([
[
'orders',
{
table: { user: 'Pinned analyst description', db: 'Old db description' },
columns: new Map([['id', { user: 'Pinned id description', db: 'Old id description' }]]),
},
],
]);
const preservedJoins = new Map<string, LiveDatabaseManifestJoinEntry[]>([
[
'orders',
[
{
to: 'customers',
on: 'orders.account_id = customers.id',
relationship: 'many_to_one',
source: 'manual',
},
{
to: 'missing_accounts',
on: 'orders.account_id = missing_accounts.id',
relationship: 'many_to_one',
source: 'manual',
},
],
],
]);
const result = buildLiveDatabaseManifestShards({
connectionType: 'POSTGRESQL',
mapColumnType: (nativeType) => nativeType.toLowerCase(),
existingDescriptions,
existingPreservedJoins: preservedJoins,
tables: [
{
name: 'orders',
catalog: null,
db: 'public',
descriptions: { db: 'Fresh db description', ai: 'Generated AI description' },
columns: [
{
name: 'id',
type: 'INTEGER',
pk: true,
nullable: false,
descriptions: { db: 'Fresh id description' },
},
{
name: 'customer_id',
type: 'INTEGER',
},
],
},
{
name: 'customers',
catalog: null,
db: 'public',
columns: [
{
name: 'id',
type: 'INTEGER',
pk: true,
nullable: false,
},
],
},
],
joins: [
{
fromTable: 'orders',
fromColumns: ['customer_id'],
toTable: 'customers',
toColumns: ['id'],
relationship: 'MANY_TO_ONE',
source: 'formal',
},
],
});
expect(result.tablesProcessed).toBe(2);
expect(shardObject(result.shards)).toEqual({
public: {
tables: {
orders: {
table: 'public.orders',
descriptions: {
user: 'Pinned analyst description',
db: 'Fresh db description',
ai: 'Generated AI description',
},
columns: [
{
name: 'id',
type: 'integer',
pk: true,
nullable: false,
descriptions: {
user: 'Pinned id description',
db: 'Fresh id description',
},
},
{
name: 'customer_id',
type: 'integer',
},
],
joins: [
{
to: 'customers',
on: 'orders.customer_id = customers.id',
relationship: 'many_to_one',
source: 'formal',
},
{
to: 'customers',
on: 'orders.account_id = customers.id',
relationship: 'many_to_one',
source: 'manual',
},
],
},
customers: {
table: 'public.customers',
columns: [
{
name: 'id',
type: 'integer',
pk: true,
nullable: false,
},
],
joins: [
{
to: 'orders',
on: 'customers.id = orders.customer_id',
relationship: 'one_to_many',
source: 'formal',
},
],
},
},
},
});
});
// For SNOWFLAKE the shard key is '<catalog>.<db>' and the table reference is
// fully qualified with both parts.
it('uses warehouse and schema shard keys for snowflake-style connections', () => {
const result = buildLiveDatabaseManifestShards({
connectionType: 'SNOWFLAKE',
mapColumnType: (nativeType) => nativeType.toLowerCase(),
tables: [
{
name: 'accounts',
catalog: 'ANALYTICS',
db: 'CORE',
columns: [{ name: 'id', type: 'NUMBER' }],
},
],
joins: [],
});
expect(shardObject(result.shards)).toEqual({
'ANALYTICS.CORE': {
tables: {
accounts: {
table: 'ANALYTICS.CORE.accounts',
columns: [{ name: 'id', type: 'number' }],
},
},
},
});
});
// Column pairs are joined with ' AND ' in the order given, and the reverse
// join swaps the sides of every pair and inverts the relationship.
it('renders ordered multi-column joins in both directions', () => {
const result = buildLiveDatabaseManifestShards({
connectionType: 'POSTGRESQL',
mapColumnType: (nativeType) => nativeType,
tables: [
{
name: 'order_lines',
catalog: null,
db: 'public',
columns: [
{ name: 'order_id', type: 'integer' },
{ name: 'line_number', type: 'integer' },
],
},
{
name: 'order_line_allocations',
catalog: null,
db: 'public',
columns: [
{ name: 'order_id', type: 'integer' },
{ name: 'line_number', type: 'integer' },
],
},
],
joins: [
{
fromTable: 'order_line_allocations',
fromColumns: ['order_id', 'line_number'],
toTable: 'order_lines',
toColumns: ['order_id', 'line_number'],
relationship: 'many_to_one',
source: 'inferred',
},
],
});
expect(shardObject(result.shards)).toMatchObject({
public: {
tables: {
order_line_allocations: {
joins: [
{
to: 'order_lines',
on: 'order_line_allocations.order_id = order_lines.order_id AND order_line_allocations.line_number = order_lines.line_number',
relationship: 'many_to_one',
source: 'inferred',
},
],
},
order_lines: {
joins: [
{
to: 'order_line_allocations',
on: 'order_lines.order_id = order_line_allocations.order_id AND order_lines.line_number = order_line_allocations.line_number',
relationship: 'one_to_many',
source: 'inferred',
},
],
},
},
},
});
});
});

View file

@ -0,0 +1,270 @@
// Upper-case relationship names accepted on input, mapped to the lower-case
// form written to the manifest. Names not listed here are passed through as-is.
const RELATIONSHIP_MAP: Record<string, string> = {
MANY_TO_ONE: 'many_to_one',
ONE_TO_MANY: 'one_to_many',
ONE_TO_ONE: 'one_to_one',
};
// Relationship to record on the target table when a join is mirrored.
const RELATIONSHIP_INVERSE: Record<string, string> = {
many_to_one: 'one_to_many',
one_to_many: 'many_to_one',
one_to_one: 'one_to_one',
};
// Description keys owned by the scan: existing values under these keys are
// discarded on merge, while every other existing key is preserved.
const SCAN_MANAGED_DESCRIPTION_KEYS = new Set(['db', 'ai']);
/** Column entry as written to a manifest shard. */
export interface LiveDatabaseManifestColumn {
name: string;
// Result of mapColumnType applied to the native type.
type: string;
// Present (true) only for primary-key columns.
pk?: boolean;
// Present (false) only for columns explicitly marked non-nullable.
nullable?: boolean;
descriptions?: Record<string, string>;
}
/** Join entry attached to a table in a manifest shard. */
export interface LiveDatabaseManifestJoinEntry {
// Name of the joined table.
to: string;
// Join condition text, e.g. 'orders.customer_id = customers.id'.
on: string;
relationship: string;
source: string;
}
/** Table entry as written to a manifest shard. */
export interface LiveDatabaseManifestTableEntry {
// Dotted reference built from catalog, db and name (null/empty parts omitted).
table: string;
descriptions?: Record<string, string>;
columns: LiveDatabaseManifestColumn[];
// Omitted when the table has no joins.
joins?: LiveDatabaseManifestJoinEntry[];
}
/** One manifest shard: table entries keyed by table name. */
export interface LiveDatabaseManifestShard {
tables: Record<string, LiveDatabaseManifestTableEntry>;
}
/** Input description of a table for shard building. */
export interface LiveDatabaseManifestTableData {
name: string;
catalog: string | null;
db: string | null;
descriptions?: Record<string, string>;
columns: Array<{
name: string;
// Native type; passed through mapColumnType before it is written.
type: string;
pk?: boolean;
nullable?: boolean;
descriptions?: Record<string, string>;
}>;
}
/** Input description of a relationship between two tables. */
export interface LiveDatabaseManifestJoinData {
fromTable: string;
// Ordered source columns; must have the same length as toColumns.
fromColumns: string[];
toTable: string;
// Ordered target columns, paired by index with fromColumns.
toColumns: string[];
// Looked up in RELATIONSHIP_MAP; unknown values are used verbatim.
relationship: string;
source: 'formal' | 'inferred' | 'manual';
}
/** Descriptions already stored for a table and its columns (by column name). */
export interface LiveDatabaseManifestExistingDescriptions {
table?: Record<string, string>;
columns: Map<string, Record<string, string>>;
}
/** Arguments of buildLiveDatabaseManifestShards. */
export interface BuildLiveDatabaseManifestShardsInput {
// Compared case-insensitively when choosing the shard key.
connectionType: string;
tables: LiveDatabaseManifestTableData[];
joins: LiveDatabaseManifestJoinData[];
mapColumnType: (nativeType: string) => string;
// Previously stored joins to keep, keyed by owning table name.
existingPreservedJoins?: Map<string, LiveDatabaseManifestJoinEntry[]>;
// Previously stored descriptions, keyed by table name.
existingDescriptions?: Map<string, LiveDatabaseManifestExistingDescriptions>;
}
/** Result of buildLiveDatabaseManifestShards. */
export interface BuildLiveDatabaseManifestShardsResult {
// Shards keyed by shard key, in first-seen order of the input tables.
shards: Map<string, LiveDatabaseManifestShard>;
// Number of input tables.
tablesProcessed: number;
}
/**
 * Merges stored descriptions with freshly scanned ones.
 *
 * Existing entries are kept unless their key is scan-managed; every incoming
 * entry is then applied on top, overriding an existing entry with the same key.
 *
 * @returns The merged record, or `undefined` when it would be empty.
 */
function mergeDescriptionsPreservingExternal(
  existing: Record<string, string> | undefined,
  incoming: Record<string, string> | undefined,
): Record<string, string> | undefined {
  const merged: Record<string, string> = {};
  for (const [key, text] of Object.entries(existing ?? {})) {
    if (SCAN_MANAGED_DESCRIPTION_KEYS.has(key)) {
      continue;
    }
    merged[key] = text;
  }
  // Object.assign ignores an undefined source, so no guard is needed.
  Object.assign(merged, incoming);
  if (Object.keys(merged).length === 0) {
    return undefined;
  }
  return merged;
}
/**
 * Chooses the shard a table belongs to, based on the connection type
 * (case-insensitive):
 * - SNOWFLAKE / DATABRICKS: `<catalog or 'default'>.<db or 'public'>`
 * - BIGQUERY / MYSQL / CLICKHOUSE: db, else catalog, else `'default'`
 * - anything else: db, else `'public'`
 */
function getShardKey(connectionType: string, catalog: string | null, db: string | null): string {
  const kind = connectionType.toUpperCase();
  if (kind === 'SNOWFLAKE' || kind === 'DATABRICKS') {
    return `${catalog ?? 'default'}.${db ?? 'public'}`;
  }
  if (kind === 'BIGQUERY' || kind === 'MYSQL' || kind === 'CLICKHOUSE') {
    return db ?? catalog ?? 'default';
  }
  return db ?? 'public';
}
/**
 * Builds the dotted table reference `catalog.db.name`, leaving out a catalog
 * or db that is null or empty. The name is always included.
 */
function buildTableRef(name: string, catalog: string | null, db: string | null): string {
  const qualifiers = [catalog, db].filter((part): part is string => Boolean(part));
  return [...qualifiers, name].join('.');
}
/**
 * Appends `join` to the list kept for `tableName`, unless that list already
 * holds a join with the same target table and the same condition. The table
 * always ends up with a list in the map, even when nothing was appended.
 */
function addJoinOnce(
  joinsByTable: Map<string, LiveDatabaseManifestJoinEntry[]>,
  tableName: string,
  join: LiveDatabaseManifestJoinEntry,
): void {
  let bucket = joinsByTable.get(tableName);
  if (bucket === undefined) {
    bucket = [];
    joinsByTable.set(tableName, bucket);
  }
  const isNew = bucket.every((known) => known.to !== join.to || known.on !== join.on);
  if (isNew) {
    bucket.push(join);
  }
}
/**
 * Renders the join condition between two tables: one
 * `left.col = right.col` equality per column pair, joined with ` AND ` in the
 * given column order.
 *
 * @throws Error when no columns are given, when the two column lists differ in
 *   length, or when a target column is empty.
 */
function joinCondition(
  leftTable: string,
  leftColumns: readonly string[],
  rightTable: string,
  rightColumns: readonly string[],
): string {
  const width = leftColumns.length;
  if (width === 0 || width !== rightColumns.length) {
    throw new Error(`Invalid relationship join from ${leftTable} to ${rightTable}: column tuple widths differ`);
  }
  const equalities: string[] = [];
  for (let position = 0; position < width; position += 1) {
    const target = rightColumns[position];
    if (!target) {
      throw new Error(`Invalid relationship join from ${leftTable} to ${rightTable}: missing target column`);
    }
    equalities.push(`${leftTable}.${leftColumns[position]} = ${rightTable}.${target}`);
  }
  return equalities.join(' AND ');
}
/**
 * Collects the joins to record for each table.
 *
 * Every input join whose two tables are both known is added twice: on the
 * source table as given, and mirrored on the target table with the inverse
 * relationship (`one_to_many` when the relationship has no known inverse).
 * Preserved joins are then appended for known tables, provided their target
 * table is known as well. Duplicates (same target and condition) are skipped.
 *
 * @throws Error when a join has empty or mismatched column lists.
 */
function buildJoinsByTable(
  tableNames: Set<string>,
  joins: LiveDatabaseManifestJoinData[],
  preservedJoins: Map<string, LiveDatabaseManifestJoinEntry[]>,
): Map<string, LiveDatabaseManifestJoinEntry[]> {
  const collected = new Map<string, LiveDatabaseManifestJoinEntry[]>();

  for (const { fromTable, fromColumns, toTable, toColumns, relationship: rawRelationship, source } of joins) {
    const bothKnown = tableNames.has(fromTable) && tableNames.has(toTable);
    if (!bothKnown) {
      continue;
    }
    const forward = RELATIONSHIP_MAP[rawRelationship] ?? rawRelationship;
    const forwardOn = joinCondition(fromTable, fromColumns, toTable, toColumns);
    addJoinOnce(collected, fromTable, { to: toTable, on: forwardOn, relationship: forward, source });

    const backward = RELATIONSHIP_INVERSE[forward] ?? 'one_to_many';
    const backwardOn = joinCondition(toTable, toColumns, fromTable, fromColumns);
    addJoinOnce(collected, toTable, { to: fromTable, on: backwardOn, relationship: backward, source });
  }

  preservedJoins.forEach((keptJoins, owner) => {
    if (!tableNames.has(owner)) {
      return;
    }
    keptJoins
      .filter((kept) => tableNames.has(kept.to))
      .forEach((kept) => addJoinOnce(collected, owner, kept));
  });

  return collected;
}
/**
 * Groups the input tables into manifest shards.
 *
 * Each table is placed in the shard chosen from the connection type and the
 * table's catalog / db. Column types go through `mapColumnType`; `pk` is
 * written only when truthy and `nullable` only when explicitly `false`.
 * Stored descriptions are merged with the incoming ones, and joins (generated
 * plus preserved) are attached when the table has any.
 *
 * @returns The shards and the number of input tables.
 * @throws Error when a join between known tables has empty or mismatched columns.
 */
export function buildLiveDatabaseManifestShards(
  input: BuildLiveDatabaseManifestShardsInput,
): BuildLiveDatabaseManifestShardsResult {
  const knownTables = new Set<string>();
  for (const table of input.tables) {
    knownTables.add(table.name);
  }
  const joinsByTable = buildJoinsByTable(knownTables, input.joins, input.existingPreservedJoins ?? new Map());
  const shards = new Map<string, LiveDatabaseManifestShard>();

  for (const table of input.tables) {
    const stored = input.existingDescriptions?.get(table.name);

    const columns: LiveDatabaseManifestColumn[] = [];
    for (const column of table.columns) {
      const manifestColumn: LiveDatabaseManifestColumn = {
        name: column.name,
        type: input.mapColumnType(column.type),
      };
      if (column.pk) {
        manifestColumn.pk = true;
      }
      if (column.nullable === false) {
        manifestColumn.nullable = false;
      }
      const columnDescriptions = mergeDescriptionsPreservingExternal(
        stored?.columns.get(column.name),
        column.descriptions,
      );
      if (columnDescriptions) {
        manifestColumn.descriptions = columnDescriptions;
      }
      columns.push(manifestColumn);
    }

    const entry: LiveDatabaseManifestTableEntry = {
      table: buildTableRef(table.name, table.catalog, table.db),
      columns,
    };
    const tableDescriptions = mergeDescriptionsPreservingExternal(stored?.table, table.descriptions);
    if (tableDescriptions) {
      entry.descriptions = tableDescriptions;
    }
    const tableJoins = joinsByTable.get(table.name) ?? [];
    if (tableJoins.length > 0) {
      entry.joins = tableJoins;
    }

    // Create the shard on first use so shards keep first-seen order.
    const shardKey = getShardKey(input.connectionType, table.catalog, table.db);
    let shard = shards.get(shardKey);
    if (shard === undefined) {
      shard = { tables: {} };
      shards.set(shardKey, shard);
    }
    shard.tables[table.name] = entry;
  }

  return { shards, tablesProcessed: input.tables.length };
}

View file

@ -0,0 +1,152 @@
import { mkdtemp, readFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import {
detectLiveDatabaseStagedDir,
LIVE_DATABASE_FOREIGN_KEYS_FILE,
LIVE_DATABASE_META_FILE,
liveDatabaseTablePath,
readLiveDatabaseTableFiles,
writeLiveDatabaseSnapshot,
} from './stage.js';
import type { KloSchemaSnapshot } from '../../../scan/types.js';
// Fixture: a postgres snapshot for connection 'conn-1' with 'orders' (three
// columns, one unnamed foreign key to 'customers', 200 estimated rows) listed
// before 'customers' (one column, 50 estimated rows).
function snapshot(): KloSchemaSnapshot {
return {
connectionId: 'conn-1',
driver: 'postgres',
extractedAt: '2026-04-27T00:00:00.000Z',
scope: { schemas: ['public'] },
metadata: { dialect: 'postgres' },
tables: [
{
name: 'orders',
catalog: null,
db: 'public',
kind: 'table',
comment: 'Orders placed by customers',
estimatedRows: 200,
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: null,
},
{
name: 'customer_id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
{
name: 'total',
nativeType: 'numeric',
normalizedType: 'numeric',
dimensionType: 'number',
nullable: false,
primaryKey: false,
comment: null,
},
],
foreignKeys: [
{
fromColumn: 'customer_id',
toCatalog: null,
toDb: 'public',
toTable: 'customers',
toColumn: 'id',
constraintName: null,
},
],
},
{
name: 'customers',
catalog: null,
db: 'public',
kind: 'table',
comment: null,
estimatedRows: 50,
columns: [
{
name: 'id',
nativeType: 'integer',
normalizedType: 'integer',
dimensionType: 'number',
nullable: false,
primaryKey: true,
comment: null,
},
],
foreignKeys: [],
},
],
};
}
// Covers writing a snapshot to a staging directory, reading it back, metadata
// redaction, and detection of staged directories.
describe('live-database staged snapshot files', () => {
// Checks the three kinds of staged files, the encoded table path format, and
// that table files are read back sorted ('customers' before 'orders') even
// though the fixture lists 'orders' first.
it('writes deterministic metadata, table, and foreign-key files', async () => {
const dir = await mkdtemp(join(tmpdir(), 'klo-live-db-stage-'));
await writeLiveDatabaseSnapshot(dir, snapshot());
await expect(readFile(join(dir, LIVE_DATABASE_META_FILE), 'utf8')).resolves.toContain('"connectionId": "conn-1"');
await expect(readFile(join(dir, LIVE_DATABASE_FOREIGN_KEYS_FILE), 'utf8')).resolves.toContain(
'"fromTable": "orders"',
);
const connectionJson = await readFile(join(dir, LIVE_DATABASE_META_FILE), 'utf8');
expect(connectionJson).toContain('"driver": "postgres"');
expect(connectionJson).toContain('"schemas"');
const ordersPath = liveDatabaseTablePath({ catalog: null, db: 'public', name: 'orders' });
const customersPath = liveDatabaseTablePath({ catalog: null, db: 'public', name: 'customers' });
expect(ordersPath).toMatch(/^tables\/[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.json$/);
await expect(readFile(join(dir, ordersPath), 'utf8')).resolves.toContain('"name": "orders"');
await expect(readFile(join(dir, customersPath), 'utf8')).resolves.toContain('"name": "customers"');
const ordersJson = await readFile(join(dir, ordersPath), 'utf8');
expect(ordersJson).toContain('"kind": "table"');
expect(ordersJson).toContain('"estimatedRows": 200');
expect(ordersJson).toContain('"nativeType": "integer"');
expect(ordersJson).toContain('"normalizedType": "integer"');
expect(ordersJson).not.toContain('"type": "integer"');
const tableFiles = await readLiveDatabaseTableFiles(dir);
expect(tableFiles.map((file) => file.table.name)).toEqual(['customers', 'orders']);
expect(await detectLiveDatabaseStagedDir(dir)).toBe(true);
});
// Secrets in snapshot metadata (connection url, nested private key) must be
// replaced by '<redacted>' while non-sensitive values stay readable.
it('redacts sensitive snapshot metadata before writing connection metadata', async () => {
const dir = await mkdtemp(join(tmpdir(), 'klo-live-db-redacted-stage-'));
await writeLiveDatabaseSnapshot(dir, {
...snapshot(),
metadata: {
dialect: 'postgres',
url: 'postgres://reader:secret@example.test/db', // pragma: allowlist secret
serviceAccountJson: {
client_email: 'reader@example.test',
private_key: 'pem-value', // pragma: allowlist secret
},
},
});
const connectionJson = await readFile(join(dir, LIVE_DATABASE_META_FILE), 'utf8');
expect(connectionJson).toContain('"dialect": "postgres"');
expect(connectionJson).toContain('"client_email": "reader@example.test"');
expect(connectionJson).toContain('"url": "<redacted>"');
expect(connectionJson).toContain('"private_key": "<redacted>"');
expect(connectionJson).not.toContain('postgres://reader:secret@example.test/db'); // pragma: allowlist secret
expect(connectionJson).not.toContain('pem-value');
});
// An empty directory has no staged metadata and must not be detected.
it('returns false for a directory that is missing live database metadata', async () => {
const dir = await mkdtemp(join(tmpdir(), 'klo-live-db-empty-'));
expect(await detectLiveDatabaseStagedDir(dir)).toBe(false);
});
});

View file

@ -0,0 +1,138 @@
import { Buffer } from 'node:buffer';
import type { Dirent } from 'node:fs';
import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises';
import { join, relative } from 'node:path';
import { redactKloSensitiveMetadata } from '../../../core/redaction.js';
import type { KloSchemaSnapshot, KloSchemaTable, KloTableRef } from '../../../scan/types.js';
/** File name, relative to the staged directory, of the connection metadata document. */
export const LIVE_DATABASE_META_FILE = 'connection.json';
/** File name, relative to the staged directory, of the foreign-key index document. */
export const LIVE_DATABASE_FOREIGN_KEYS_FILE = 'foreign-keys.json';
/** Name of the subdirectory of the staged directory that holds the per-table JSON files. */
const LIVE_DATABASE_TABLES_DIR = 'tables';
/** One table file read back from a staged directory by `readLiveDatabaseTableFiles`. */
interface LiveDatabaseTableFile {
/** Slash-separated path of the file relative to the staged directory, starting with the tables directory name. */
path: string;
/** Parsed JSON content of the file. */
table: KloSchemaTable;
}
/** One row of the foreign-key index produced by `foreignKeyIndex` and written to the foreign-keys file. */
interface ForeignKeyIndexEntry {
/** Name of the table that declares the foreign key. */
fromTable: string;
/** Staged-directory-relative path of that table's file, as returned by `liveDatabaseTablePath`. */
fromTablePath: string;
/** Referencing column, copied from the foreign key. */
fromColumn: string;
/** Catalog of the referenced table, copied from the foreign key; may be null. */
toCatalog: string | null;
/** Database/schema of the referenced table, copied from the foreign key; may be null. */
toDb: string | null;
/** Referenced table name, copied from the foreign key. */
toTable: string;
/** Referenced column name, copied from the foreign key. */
toColumn: string;
/** Constraint name, copied from the foreign key; may be null. */
constraintName: string | null;
}
/**
 * Encodes one component of a table identifier as base64url text so it can be
 * embedded in a file name.
 *
 * @param value - The component to encode. `null` and `undefined` are encoded
 *   as if the component were the string `'_'`.
 * @returns The base64url encoding of the component's UTF-8 bytes.
 */
function encodePathPart(value: string | null | undefined): string {
  const text = value ?? '_';
  const bytes = Buffer.from(text, 'utf8');
  return bytes.toString('base64url');
}
/**
 * Builds the string used to order tables: catalog, db and name joined by a
 * NUL character, with a missing catalog or db contributing an empty string.
 *
 * @param table - Reference to the table being ordered.
 * @returns The composite sort key.
 */
function tableSortKey(table: KloTableRef): string {
  const separator = '\u0000';
  const catalog = table.catalog ?? '';
  const db = table.db ?? '';
  return catalog + separator + db + separator + table.name;
}
/**
 * Computes the path, relative to the staged directory, of the JSON file that
 * stores a table.
 *
 * The file name consists of the encoded catalog, db and name joined by dots,
 * followed by `.json`, inside the tables subdirectory.
 *
 * @param table - Reference to the table.
 * @returns A slash-separated relative path.
 */
export function liveDatabaseTablePath(table: KloTableRef): string {
  const encodedParts = [table.catalog, table.db, table.name].map((part) => encodePathPart(part));
  return `${LIVE_DATABASE_TABLES_DIR}/${encodedParts.join('.')}.json`;
}
/**
 * Recursively lists the regular files below a directory.
 *
 * @param root - Directory the returned paths are made relative to.
 * @param dir - Directory currently being listed; defaults to `root`.
 * @returns Sorted, slash-separated paths relative to `root`. A directory that
 *   cannot be listed contributes no entries, and entries that are neither
 *   directories nor regular files are ignored.
 */
async function walkFiles(root: string, dir = root): Promise<string[]> {
  let children: Dirent[];
  try {
    children = await readdir(dir, { withFileTypes: true });
  } catch {
    // Listing failed (for example the directory does not exist): treat as empty.
    return [];
  }

  const collected: string[] = [];
  for (const child of children) {
    const childPath = join(dir, child.name);
    if (child.isDirectory()) {
      const nested = await walkFiles(root, childPath);
      collected.push(...nested);
      continue;
    }
    if (child.isFile()) {
      // Normalize backslash separators so results use forward slashes.
      collected.push(relative(root, childPath).split('\\').join('/'));
    }
  }
  collected.sort();
  return collected;
}
/**
 * Serializes a value as JSON indented by two spaces and terminated by a
 * newline.
 *
 * @param value - The value to serialize.
 * @returns The formatted JSON text followed by `\n`.
 */
function stableJson(value: unknown): string {
  const indentWidth = 2;
  return JSON.stringify(value, null, indentWidth) + '\n';
}
/**
 * Flattens every foreign key declared by the snapshot's tables into a single
 * sorted list of index entries.
 *
 * Entries are ordered by source table name, source column, target table and
 * target column. Entries that tie on all four (for example same-named tables
 * living in different catalogs or schemas) are further ordered by the source
 * table's file path, target catalog, target db and constraint name, so the
 * result does not depend on the order of `snapshot.tables`.
 *
 * @param snapshot - Snapshot whose tables' foreign keys are indexed.
 * @returns The sorted index entries; empty when no table declares a foreign key.
 */
function foreignKeyIndex(snapshot: KloSchemaSnapshot): ForeignKeyIndexEntry[] {
  const entries: ForeignKeyIndexEntry[] = [];
  for (const table of snapshot.tables) {
    if (table.foreignKeys.length === 0) {
      continue;
    }
    // The path depends only on the table, so compute it once per table.
    const fromTablePath = liveDatabaseTablePath(table);
    for (const fk of table.foreignKeys) {
      entries.push({
        fromTable: table.name,
        fromTablePath,
        fromColumn: fk.fromColumn,
        toCatalog: fk.toCatalog,
        toDb: fk.toDb,
        toTable: fk.toTable,
        toColumn: fk.toColumn,
        constraintName: fk.constraintName,
      });
    }
  }
  entries.sort(
    (a, b) =>
      a.fromTable.localeCompare(b.fromTable) ||
      a.fromColumn.localeCompare(b.fromColumn) ||
      a.toTable.localeCompare(b.toTable) ||
      a.toColumn.localeCompare(b.toColumn) ||
      // Tie-breakers: without these, entries that agree on the four keys above
      // keep their input order, which follows the unsorted snapshot table order.
      a.fromTablePath.localeCompare(b.fromTablePath) ||
      (a.toCatalog ?? '').localeCompare(b.toCatalog ?? '') ||
      (a.toDb ?? '').localeCompare(b.toDb ?? '') ||
      (a.constraintName ?? '').localeCompare(b.constraintName ?? ''),
  );
  return entries;
}
/**
 * Writes a schema snapshot into a staged directory as a set of JSON files.
 *
 * The tables subdirectory is created if needed, then three kinds of files are
 * written in order: the connection metadata document (with the snapshot's
 * metadata passed through `redactKloSensitiveMetadata`), the foreign-key index,
 * and one file per table in sort-key order.
 *
 * @param stagedDir - Directory that receives the files.
 * @param snapshot - Snapshot to persist.
 */
export async function writeLiveDatabaseSnapshot(stagedDir: string, snapshot: KloSchemaSnapshot): Promise<void> {
  const tablesDir = join(stagedDir, LIVE_DATABASE_TABLES_DIR);
  await mkdir(tablesDir, { recursive: true });

  // Sort a copy so the caller's table array is left untouched.
  const sortedTables = Array.from(snapshot.tables);
  sortedTables.sort((left, right) => tableSortKey(left).localeCompare(tableSortKey(right)));

  const connectionDocument = stableJson({
    connectionId: snapshot.connectionId,
    driver: snapshot.driver,
    extractedAt: snapshot.extractedAt,
    scope: snapshot.scope,
    metadata: redactKloSensitiveMetadata(snapshot.metadata),
    tableCount: sortedTables.length,
  });
  await writeFile(join(stagedDir, LIVE_DATABASE_META_FILE), connectionDocument);

  const foreignKeyDocument = stableJson({ foreignKeys: foreignKeyIndex(snapshot) });
  await writeFile(join(stagedDir, LIVE_DATABASE_FOREIGN_KEYS_FILE), foreignKeyDocument);

  for (const table of sortedTables) {
    const destination = join(stagedDir, liveDatabaseTablePath(table));
    await writeFile(destination, stableJson(table));
  }
}
/**
 * Reads every `.json` file under the staged directory's tables subdirectory.
 *
 * A file is kept only when its parsed content has a string `name` and an
 * array `columns`; other files are skipped. A file that is not valid JSON
 * makes the returned promise reject.
 *
 * @param stagedDir - Staged directory to read from.
 * @returns The accepted files with their staged-directory-relative paths,
 *   ordered by table sort key.
 */
export async function readLiveDatabaseTableFiles(stagedDir: string): Promise<LiveDatabaseTableFile[]> {
  const relativeFiles = await walkFiles(join(stagedDir, LIVE_DATABASE_TABLES_DIR));
  const tableFiles: LiveDatabaseTableFile[] = [];
  for (const relativeFile of relativeFiles) {
    if (!relativeFile.endsWith('.json')) {
      continue;
    }
    const path = `${LIVE_DATABASE_TABLES_DIR}/${relativeFile}`;
    const contents = await readFile(join(stagedDir, path), 'utf8');
    const table = JSON.parse(contents) as KloSchemaTable;
    // Minimal shape check: skip JSON documents that do not look like a table.
    const looksLikeTable = Boolean(table) && typeof table.name === 'string' && Array.isArray(table.columns);
    if (!looksLikeTable) {
      continue;
    }
    tableFiles.push({ path, table });
  }
  return tableFiles.sort((left, right) => tableSortKey(left.table).localeCompare(tableSortKey(right.table)));
}
/**
 * Reports whether a directory looks like a staged live-database snapshot.
 *
 * @param stagedDir - Directory to inspect.
 * @returns `true` when the connection metadata file parses to a non-array JSON
 *   object and at least one table file is accepted by
 *   `readLiveDatabaseTableFiles`; `false` otherwise, including when any read
 *   or parse step throws.
 */
export async function detectLiveDatabaseStagedDir(stagedDir: string): Promise<boolean> {
  try {
    const rawMeta = await readFile(join(stagedDir, LIVE_DATABASE_META_FILE), 'utf8');
    const meta: unknown = JSON.parse(rawMeta);
    const isJsonObject = Boolean(meta) && typeof meta === 'object' && !Array.isArray(meta);
    if (!isJsonObject) {
      return false;
    }
    const tableFiles = await readLiveDatabaseTableFiles(stagedDir);
    return tableFiles.length !== 0;
  } catch {
    // Missing or unreadable files and malformed JSON all mean "not a staged directory".
    return false;
  }
}

View file

@ -0,0 +1,428 @@
import { describe, expect, it } from 'vitest';
import { type LiveDatabaseSyncedSchema, planLiveDatabaseStructuralSync } from './structural-sync.js';
/**
 * Creates a deterministic id generator for tests.
 *
 * @returns A function that yields `id-1`, `id-2`, ... on successive calls;
 *   each factory call starts its own independent sequence.
 */
function idFactory(): () => string {
  let issued = 0;
  return () => {
    issued += 1;
    return `id-${issued}`;
  };
}
describe('planLiveDatabaseStructuralSync', () => {
// End-to-end planning scenario: compares a previously synced schema against a fresh extraction
// and checks the resulting stats, operations, change sets and merged schema.
it('plans table and column creates, updates, deletes, and metadata invalidation', () => {
// Previously synced state: `orders` (id, total, removed) and `removed_table`, plus one inferred link.
const current: LiveDatabaseSyncedSchema = {
connectionId: 'conn-1',
tables: [
{
id: 'tbl-orders',
name: 'orders',
catalog: null,
db: 'public',
enabled: true,
descriptions: { ai: 'Old AI order text', db: 'Old DB order text' },
columns: [
{
id: 'col-order-id',
name: 'id',
type: 'number',
nullable: false,
primaryKey: true,
parentColumnId: null,
descriptions: { db: 'Order id' },
embedding: [1, 2, 3],
sampleValues: null,
cardinality: null,
},
{
id: 'col-order-total',
name: 'total',
type: 'number',
nullable: true,
primaryKey: false,
parentColumnId: null,
descriptions: { ai: 'Old AI total text', db: 'Old total text' },
embedding: [4, 5, 6],
sampleValues: ['10'],
cardinality: 12,
},
{
id: 'col-order-removed',
name: 'removed',
type: 'string',
nullable: true,
primaryKey: false,
parentColumnId: null,
descriptions: {},
embedding: null,
sampleValues: null,
cardinality: null,
},
],
},
{
id: 'tbl-removed',
name: 'removed_table',
catalog: null,
db: 'public',
enabled: true,
descriptions: {},
columns: [
{
id: 'col-removed-id',
name: 'id',
type: 'number',
nullable: false,
primaryKey: true,
parentColumnId: null,
descriptions: {},
embedding: null,
sampleValues: null,
cardinality: null,
},
],
},
],
links: [
{
id: 'inferred-total-link',
fromTableId: 'tbl-orders',
fromColumnId: 'col-order-total',
toTableId: 'tbl-orders',
toColumnId: 'col-order-id',
source: 'inferred',
confidence: 0.7,
relationshipType: 'MANY_TO_ONE',
isPrimaryKeyReference: true,
},
],
};
// Fresh extraction: `orders.total` changes type, nullability and comment, `orders.created_at` is new,
// `orders.removed` and `removed_table` are gone, and `customers` is a new table.
const plan = planLiveDatabaseStructuralSync({
connectionId: 'conn-1',
current,
extracted: {
connectionId: 'conn-1',
tables: [
{
name: 'orders',
catalog: null,
db: 'public',
dbComment: 'Fresh DB order text',
columns: [
{
name: 'id',
type: 'number',
nullable: false,
primaryKey: true,
dbComment: 'Order id',
},
{
name: 'total',
type: 'string',
nullable: false,
primaryKey: false,
dbComment: 'Fresh total text',
},
{
name: 'created_at',
type: 'time',
nullable: false,
primaryKey: false,
dbComment: 'Creation timestamp',
},
],
foreignKeys: [],
},
{
name: 'customers',
catalog: null,
db: 'public',
dbComment: 'Customer table',
columns: [
{
name: 'id',
type: 'number',
nullable: false,
primaryKey: true,
dbComment: null,
},
],
foreignKeys: [],
},
],
},
idFactory: idFactory(),
});
// NOTE(review): columnsDeleted is 2 while deleteColumnIds below lists one id; presumably the stat also
// counts the column that belongs to the deleted table — confirm against the planner.
expect(plan.stats).toEqual({
tablesCreated: 1,
tablesDeleted: 1,
columnsCreated: 2,
columnsDeleted: 2,
columnsModified: 1,
formalLinksCreated: 0,
formalLinksDeleted: 0,
});
expect(plan.operations.deleteTableIds).toEqual(['tbl-removed']);
expect(plan.operations.deleteColumnIds).toEqual(['col-order-removed']);
// Expected id allocation from the factory: id-1 for orders.created_at, id-2 for the customers table,
// id-3 for customers.id.
expect(plan.operations.insertTables).toEqual([
{
id: 'id-2',
connectionId: 'conn-1',
name: 'customers',
catalog: null,
db: 'public',
enabled: true,
},
]);
expect(plan.operations.insertColumns).toEqual([
{
id: 'id-1',
tableId: 'tbl-orders',
name: 'created_at',
parentColumnId: null,
},
{
id: 'id-3',
tableId: 'id-2',
name: 'id',
parentColumnId: null,
},
]);
// The modified column is touched, its embedding is invalidated, and the inferred link that uses it
// is flagged for validation.
expect(plan.operations.touchColumnIds).toEqual(['col-order-total']);
expect(plan.operations.invalidateColumnEmbeddingIds).toEqual(['col-order-total']);
expect(plan.inferredLinksToValidate).toEqual(['inferred-total-link']);
expect(plan.changes).toEqual({
newTableIds: ['id-2'],
newColumnIds: ['id-1', 'id-3'],
tablesWithStructuralChanges: ['tbl-orders', 'id-2'],
columnsWithTypeChange: ['col-order-total'],
columnsWithDescriptionChange: ['col-order-total'],
tablesWithDescriptionChange: ['tbl-orders'],
});
// Merged schema: the table keeps only the fresh db description (the earlier ai text is expected to be gone).
const orders = plan.schema.tables.find((table) => table.name === 'orders');
expect(orders?.descriptions).toEqual({ db: 'Fresh DB order text' });
expect(orders?.columns.map((column) => column.name)).toEqual(['id', 'total', 'created_at']);
// The modified column keeps its id, sample values and cardinality, while its embedding is reset to null.
expect(orders?.columns.find((column) => column.name === 'total')).toMatchObject({
id: 'col-order-total',
type: 'string',
nullable: false,
primaryKey: false,
descriptions: { db: 'Fresh total text' },
embedding: null,
sampleValues: ['10'],
cardinality: 12,
});
});
// Runs three plans against the same tables to cover an unchanged foreign key, a removed foreign key,
// and a newly appearing foreign key.
it('builds formal links from extracted foreign keys and preserves valid inferred links', () => {
// Previously synced state: `orders` and `customers`, one formal link (orders.customer_id -> customers.id)
// and one inferred link (orders.id -> customers.id).
const current: LiveDatabaseSyncedSchema = {
connectionId: 'conn-1',
tables: [
{
id: 'tbl-orders',
name: 'orders',
catalog: null,
db: 'public',
enabled: true,
descriptions: {},
columns: [
{
id: 'col-orders-id',
name: 'id',
type: 'number',
nullable: false,
primaryKey: true,
parentColumnId: null,
descriptions: {},
embedding: null,
sampleValues: null,
cardinality: null,
},
{
id: 'col-orders-customer',
name: 'customer_id',
type: 'number',
nullable: false,
primaryKey: false,
parentColumnId: null,
descriptions: {},
embedding: null,
sampleValues: null,
cardinality: null,
},
],
},
{
id: 'tbl-customers',
name: 'customers',
catalog: null,
db: 'public',
enabled: true,
descriptions: {},
columns: [
{
id: 'col-customers-id',
name: 'id',
type: 'number',
nullable: false,
primaryKey: true,
parentColumnId: null,
descriptions: {},
embedding: null,
sampleValues: null,
cardinality: null,
},
],
},
],
links: [
{
id: 'formal-existing',
fromTableId: 'tbl-orders',
fromColumnId: 'col-orders-customer',
toTableId: 'tbl-customers',
toColumnId: 'col-customers-id',
source: 'formal',
confidence: 1,
relationshipType: 'MANY_TO_ONE',
isPrimaryKeyReference: true,
},
{
id: 'inferred-existing',
fromTableId: 'tbl-orders',
fromColumnId: 'col-orders-id',
toTableId: 'tbl-customers',
toColumnId: 'col-customers-id',
source: 'inferred',
confidence: 0.6,
relationshipType: 'MANY_TO_ONE',
isPrimaryKeyReference: true,
},
],
};
// Case 1: the extracted foreign key matches the existing formal link, so nothing is created or deleted.
const plan = planLiveDatabaseStructuralSync({
connectionId: 'conn-1',
current,
extracted: {
connectionId: 'conn-1',
tables: [
{
name: 'orders',
catalog: null,
db: 'public',
dbComment: null,
columns: [
{ name: 'id', type: 'number', nullable: false, primaryKey: true, dbComment: null },
{ name: 'customer_id', type: 'number', nullable: false, primaryKey: false, dbComment: null },
],
foreignKeys: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
},
],
},
{
name: 'customers',
catalog: null,
db: 'public',
dbComment: null,
columns: [{ name: 'id', type: 'number', nullable: false, primaryKey: true, dbComment: null }],
foreignKeys: [],
},
],
},
idFactory: idFactory(),
});
expect(plan.stats.formalLinksCreated).toBe(0);
expect(plan.stats.formalLinksDeleted).toBe(0);
expect(plan.schema.links.map((link) => link.id)).toEqual(['formal-existing', 'inferred-existing']);
// Case 2: the extraction no longer reports the foreign key, so the formal link is deleted
// while the inferred link is kept.
const planAfterForeignKeyRemoval = planLiveDatabaseStructuralSync({
connectionId: 'conn-1',
current,
extracted: {
connectionId: 'conn-1',
tables: [
{
name: 'orders',
catalog: null,
db: 'public',
dbComment: null,
columns: [
{ name: 'id', type: 'number', nullable: false, primaryKey: true, dbComment: null },
{ name: 'customer_id', type: 'number', nullable: false, primaryKey: false, dbComment: null },
],
foreignKeys: [],
},
{
name: 'customers',
catalog: null,
db: 'public',
dbComment: null,
columns: [{ name: 'id', type: 'number', nullable: false, primaryKey: true, dbComment: null }],
foreignKeys: [],
},
],
},
idFactory: idFactory(),
});
expect(planAfterForeignKeyRemoval.stats.formalLinksDeleted).toBe(1);
expect(planAfterForeignKeyRemoval.schema.links.map((link) => link.id)).toEqual(['inferred-existing']);
// Case 3: the current state holds only the inferred link and the extraction reports the foreign key,
// so a new formal link is created using the first id from a fresh factory.
const planAfterForeignKeyCreation = planLiveDatabaseStructuralSync({
connectionId: 'conn-1',
current: { ...current, links: [current.links[1]] },
extracted: {
connectionId: 'conn-1',
tables: [
{
name: 'orders',
catalog: null,
db: 'public',
dbComment: null,
columns: [
{ name: 'id', type: 'number', nullable: false, primaryKey: true, dbComment: null },
{ name: 'customer_id', type: 'number', nullable: false, primaryKey: false, dbComment: null },
],
foreignKeys: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
},
],
},
{
name: 'customers',
catalog: null,
db: 'public',
dbComment: null,
columns: [{ name: 'id', type: 'number', nullable: false, primaryKey: true, dbComment: null }],
foreignKeys: [],
},
],
},
idFactory: idFactory(),
});
expect(planAfterForeignKeyCreation.stats.formalLinksCreated).toBe(1);
// The new formal link is expected at index 0 of the resulting links.
expect(planAfterForeignKeyCreation.schema.links[0]).toMatchObject({
id: 'id-1',
fromTableId: 'tbl-orders',
fromColumnId: 'col-orders-customer',
toTableId: 'tbl-customers',
toColumnId: 'col-customers-id',
source: 'formal',
confidence: 1,
relationshipType: 'MANY_TO_ONE',
isPrimaryKeyReference: true,
});
});
});

Some files were not shown because too many files have changed in this diff Show more