mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-19 08:28:06 +02:00
Initial open-source release
This commit is contained in:
commit
1a42152e6f
1199 changed files with 257054 additions and 0 deletions
330
packages/context/src/agent/agent-runner.service.test.ts
Normal file
330
packages/context/src/agent/agent-runner.service.test.ts
Normal file
|
|
@ -0,0 +1,330 @@
|
|||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
// Swap the `ai` package for lightweight stand-ins: `generateText` becomes a spy the tests
// program per case, while `stepCountIs` and `tool` simply echo their argument so the
// options handed to `generateText` can be asserted on directly.
vi.mock('ai', () => {
  return {
    generateText: vi.fn(),
    stepCountIs: (n: number) => n,
    tool: (def: unknown) => def,
  };
});
|
||||
|
||||
import { generateText } from 'ai';
|
||||
import { AgentRunnerService, type RunLoopStepInfo } from './agent-runner.service.js';
|
||||
|
||||
describe('AgentRunnerService.runLoop', () => {
  type RunLoopInput = Parameters<AgentRunnerService['runLoop']>[0];

  // `generateText` is the vi.fn() installed by the `ai` module mock; the cast is done once
  // here so individual tests can program and inspect it without repeating it.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const generateTextMock = generateText as any;

  /** Prompt-caching settings returned by the fake provider; only `enabled` varies between tests. */
  const promptCaching = (enabled: boolean) => ({
    enabled,
    systemTtl: '1h',
    toolsTtl: '1h',
    historyTtl: '5m',
    cacheSystem: true,
    cacheTools: true,
    cacheHistory: true,
    vertexFallbackTo5m: false,
  });

  let runner: AgentRunnerService;
  const llmProvider = {
    getModel: vi.fn().mockReturnValue({ modelId: 'claude-sonnet-4-6', provider: 'anthropic' }),
    getModelByName: vi.fn(),
    cacheMarker: vi.fn(),
    repairToolCallHandler: vi.fn(),
    thinkingProviderOptions: vi.fn(),
    telemetryConfig: vi.fn(),
    promptCachingConfig: vi.fn(() => promptCaching(false)),
    activeBackend: vi.fn(() => 'anthropic'),
  };

  /** Minimal runLoop input; each test overrides only the fields it asserts on. */
  const loopParams = (overrides: Partial<RunLoopInput> = {}): RunLoopInput => ({
    modelRole: 'candidateExtraction',
    systemPrompt: '',
    userPrompt: '',
    toolSet: {},
    stepBudget: 10,
    telemetryTags: {},
    ...overrides,
  });

  /** Canned successful generateText result for tests that ignore the model output. */
  const okResult = () => ({ text: 'ok', toolCalls: [], steps: [] });

  /** Options object passed to the first generateText invocation of the current test. */
  const firstGenerateTextOptions = () => generateTextMock.mock.calls[0][0];

  /** Runner whose telemetry port copies source/jobId/unitKey into metadata, defaulting source to RESEARCH. */
  const createRunnerWithTelemetry = () =>
    new AgentRunnerService({
      llmProvider: llmProvider as any,
      telemetry: {
        createTelemetry: (tags) => ({
          isEnabled: true,
          metadata: {
            source: tags.source ?? 'RESEARCH',
            jobId: tags.jobId,
            unitKey: tags.unitKey,
          },
        }),
      },
    });

  beforeEach(() => {
    vi.clearAllMocks();
    runner = new AgentRunnerService({ llmProvider: llmProvider as any });
  });

  afterEach(() => {
    vi.clearAllMocks();
  });

  it('passes systemPrompt, userPrompt, tools, and step budget through to generateText', async () => {
    generateTextMock.mockResolvedValue(okResult());
    const tools = { noop: { description: 'noop', inputSchema: {}, execute: vi.fn() } };

    await runner.runLoop(
      loopParams({
        systemPrompt: 'SYS',
        userPrompt: 'USR',
        toolSet: tools as any,
        stepBudget: 17,
        telemetryTags: { source: 'test' },
      }),
    );

    const call = firstGenerateTextOptions();
    expect(call.messages).toEqual([
      { role: 'system', content: 'SYS' },
      { role: 'user', content: 'USR' },
    ]);
    expect(call.system).toBeUndefined();
    expect(call.prompt).toBeUndefined();
    expect(call.tools).toEqual(tools);
    // The mocked stepCountIs echoes its argument, so stopWhen carries the raw budget.
    expect(call.stopWhen).toBe(17);
    expect(call.temperature).toBe(0);
    expect(llmProvider.getModel).toHaveBeenCalledWith('candidateExtraction');
  });

  it('returns stopReason=natural when the loop completes without error', async () => {
    generateTextMock.mockResolvedValue({ text: 'done', toolCalls: [], steps: [] });

    const result = await runner.runLoop(loopParams({ systemPrompt: 'system', userPrompt: 'user' }));

    expect(result.stopReason).toBe('natural');
    expect(result.error).toBeUndefined();
    expect(llmProvider.getModel).toHaveBeenCalledWith('candidateExtraction');
    expect(generateText).toHaveBeenCalledWith(
      expect.objectContaining({
        messages: [
          { role: 'system', content: 'system' },
          { role: 'user', content: 'user' },
        ],
      }),
    );
  });

  it('returns stopReason=error with the error on generateText failure', async () => {
    const err = new Error('LLM unavailable');
    generateTextMock.mockRejectedValue(err);

    const result = await runner.runLoop(loopParams());

    expect(result.stopReason).toBe('error');
    expect(result.error).toBe(err);
  });

  it('invokes caller onStepFinish with incrementing stepIndex and total budget', async () => {
    const calls: RunLoopStepInfo[] = [];
    generateTextMock.mockImplementation(async (opts: any) => {
      // Simulate three finished steps before the loop resolves.
      for (let i = 0; i < 3; i++) {
        await opts.onStepFinish({});
      }
      return okResult();
    });

    await runner.runLoop(
      loopParams({
        onStepFinish: (info) => {
          calls.push(info);
        },
      }),
    );

    expect(calls).toEqual([
      { stepIndex: 1, stepBudget: 10 },
      { stepIndex: 2, stepBudget: 10 },
      { stepIndex: 3, stepBudget: 10 },
    ]);
  });

  it('swallows errors thrown from caller onStepFinish without aborting the loop', async () => {
    generateTextMock.mockImplementation(async (opts: any) => {
      await opts.onStepFinish({});
      return okResult();
    });

    const result = await runner.runLoop(
      loopParams({
        onStepFinish: () => {
          throw new Error('boom');
        },
      }),
    );

    expect(result.stopReason).toBe('natural');
  });

  it('forwards telemetryTags.source through experimental_telemetry metadata', async () => {
    generateTextMock.mockResolvedValue(okResult());

    await createRunnerWithTelemetry().runLoop(
      loopParams({ telemetryTags: { source: 'metabase', jobId: 'job-123', unitKey: 'u/1' } }),
    );

    expect(firstGenerateTextOptions().experimental_telemetry.metadata.source).toBe('metabase');
  });

  it('defaults to source=RESEARCH when telemetryTags omits source', async () => {
    generateTextMock.mockResolvedValue(okResult());

    await createRunnerWithTelemetry().runLoop(
      loopParams({ telemetryTags: { operationName: 'memory-agent-ingest' } }),
    );

    expect(firstGenerateTextOptions().experimental_telemetry.metadata.source).toBe('RESEARCH');
  });

  it('forwards jobId and unitKey through experimental_telemetry metadata', async () => {
    generateTextMock.mockResolvedValue(okResult());

    await createRunnerWithTelemetry().runLoop(
      loopParams({ telemetryTags: { source: 'metabase', jobId: 'job-777', unitKey: 'sources/users' } }),
    );

    const metadata = firstGenerateTextOptions().experimental_telemetry.metadata;
    expect(metadata.jobId).toBe('job-777');
    expect(metadata.unitKey).toBe('sources/users');
  });

  it('records a sanitized LLM debug request when a recorder is injected', async () => {
    generateTextMock.mockResolvedValue(okResult());
    const record = vi.fn();
    // Caching is enabled here so provider options get attached to messages and tools.
    const provider = {
      ...llmProvider,
      cacheMarker: vi.fn((ttl: '5m' | '1h') => ({
        anthropic: { cacheControl: { type: 'ephemeral' as const, ttl } },
      })),
      promptCachingConfig: vi.fn(() => promptCaching(true)),
    };
    const runnerWithDebug = new AgentRunnerService({
      llmProvider: provider as any,
      debugRequestRecorder: { record },
    });

    await runnerWithDebug.runLoop(
      loopParams({
        systemPrompt: 'SECRET SYSTEM PROMPT',
        userPrompt: 'SECRET USER PROMPT',
        toolSet: {
          emit_candidate: {
            description: 'SECRET TOOL DESCRIPTION',
            inputSchema: {},
            execute: vi.fn(),
          } as any,
        },
        telemetryTags: { operationName: 'ingest-bundle-wu', source: 'metabase', jobId: 'job-1', unitKey: 'cards/1' },
      }),
    );

    expect(record).toHaveBeenCalledTimes(1);
    expect(record).toHaveBeenCalledWith(
      expect.objectContaining({
        operationName: 'ingest-bundle-wu',
        source: 'metabase',
        jobId: 'job-1',
        unitKey: 'cards/1',
        modelRole: 'candidateExtraction',
        modelId: 'claude-sonnet-4-6',
        messageCount: 2,
        toolNames: ['emit_candidate'],
      }),
    );

    const recorded = record.mock.calls[0][0];
    const providerOptions = recorded.providerOptions;
    expect(providerOptions).toEqual(
      expect.arrayContaining([
        expect.objectContaining({ target: 'message', index: 0, role: 'system' }),
        expect.objectContaining({ target: 'message-part', index: 1, role: 'user', partIndex: 0 }),
        expect.objectContaining({ target: 'tool', name: 'emit_candidate' }),
      ]),
    );
    expect(providerOptions).toHaveLength(3);

    // The recorded summary must not leak any prompt or tool-description text.
    const serialized = JSON.stringify(recorded);
    expect(serialized).not.toContain('SECRET SYSTEM PROMPT');
    expect(serialized).not.toContain('SECRET USER PROMPT');
    expect(serialized).not.toContain('SECRET TOOL DESCRIPTION');
  });
});
|
||||
101
packages/context/src/agent/agent-runner.service.ts
Normal file
101
packages/context/src/agent/agent-runner.service.ts
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
import { KloMessageBuilder, type KloLlmProvider, type KloModelRole } from '@klo/llm';
|
||||
import { generateText, stepCountIs, type TelemetrySettings, type Tool } from 'ai';
|
||||
import { noopLogger, type KloLogger } from '../core/index.js';
|
||||
import { summarizeKloLlmDebugRequest, type KloLlmDebugRequestRecorder } from '../llm/index.js';
|
||||
|
||||
/**
 * Why a run loop ended. `AgentRunnerService.runLoop` in this file produces `'natural'`
 * when `generateText` resolves and `'error'` when anything in the loop throws; it does
 * not itself produce `'budget'`.
 */
export type RunLoopStopReason = 'budget' | 'natural' | 'error';

/** Progress snapshot handed to `RunLoopParams.onStepFinish` after each finished step. */
export interface RunLoopStepInfo {
  /** 1-based count of steps finished so far in this loop. */
  stepIndex: number;
  /** The `stepBudget` the loop was started with. */
  stepBudget: number;
}

/** Input for a single `AgentRunnerService.runLoop` invocation. */
export interface RunLoopParams {
  /** Role used to look up the model via `llmProvider.getModel`. */
  modelRole: KloModelRole;
  /** Text sent as the system message. */
  systemPrompt: string;
  /** Text sent as the single user message. */
  userPrompt: string;
  /** Tools made available to the model, keyed by tool name. */
  toolSet: Record<string, Tool>;
  /** Step limit passed to `stepCountIs` as the loop's stop condition. */
  stepBudget: number;
  /**
   * Free-form tags forwarded to the telemetry port. The debug recorder additionally
   * reads the `operationName`, `source`, `jobId` and `unitKey` keys.
   */
  telemetryTags: Record<string, string>;
  /** Optional progress callback; errors it throws are logged and ignored. */
  onStepFinish?: (info: RunLoopStepInfo) => void | Promise<void>;
}

/** Outcome of `AgentRunnerService.runLoop`. */
export interface RunLoopResult {
  stopReason: RunLoopStopReason;
  /** Set when `stopReason` is `'error'`. */
  error?: Error;
}

/** Port that turns run-loop telemetry tags into AI SDK telemetry settings. */
export interface AgentTelemetryPort {
  createTelemetry(tags: Record<string, string>): TelemetrySettings;
}

/** Collaborators injected into `AgentRunnerService`. */
export interface AgentRunnerServiceDeps {
  /** Supplies the model and is handed to `KloMessageBuilder`. */
  llmProvider: KloLlmProvider;
  /** When omitted, `experimental_telemetry` is passed as `undefined`. */
  telemetry?: AgentTelemetryPort;
  /** When present, receives a summary of each request before it is sent. */
  debugRequestRecorder?: KloLlmDebugRequestRecorder;
  /** Defaults to `noopLogger`. */
  logger?: KloLogger;
}
|
||||
|
||||
/**
 * Drives one tool-using `generateText` loop for a given model role and reports how it
 * ended as a `RunLoopResult` instead of throwing.
 */
export class AgentRunnerService {
  private readonly logger: KloLogger;

  constructor(private readonly deps: AgentRunnerServiceDeps) {
    this.logger = deps.logger ?? noopLogger;
  }

  /**
   * Builds the request from the system/user prompts and tool set, optionally records a
   * debug summary of it, then runs `generateText` at temperature 0 until the step budget
   * stop condition or the model finishes.
   *
   * @param params - Prompts, tools, budget, telemetry tags and optional step callback.
   * @returns `{ stopReason: 'natural' }` when `generateText` resolves; otherwise
   *   `{ stopReason: 'error', error }` with the failure normalised to an `Error`.
   */
  async runLoop(params: RunLoopParams): Promise<RunLoopResult> {
    let finishedSteps = 0;

    try {
      const provider = this.deps.llmProvider;
      const model = provider.getModel(params.modelRole);
      const request = new KloMessageBuilder(provider).wrapSimple({
        system: params.systemPrompt,
        messages: [{ role: 'user', content: params.userPrompt }],
        tools: params.toolSet,
        model,
      });

      // Summarisation only happens when a recorder is injected (optional call short-circuits).
      const tags = params.telemetryTags;
      await this.deps.debugRequestRecorder?.record(
        summarizeKloLlmDebugRequest({
          operationName: tags.operationName ?? 'klo-agent-runner',
          source: tags.source,
          jobId: tags.jobId,
          unitKey: tags.unitKey,
          modelRole: params.modelRole,
          // Fall back to the role name when the model object exposes no id.
          modelId: (model as { modelId?: string }).modelId ?? params.modelRole,
          messages: request.messages,
          tools: request.tools as Record<string, { providerOptions?: unknown }>,
        }),
      );

      const stopWhen = stepCountIs(params.stepBudget);
      const telemetrySettings = this.deps.telemetry?.createTelemetry(tags);

      await generateText({
        model,
        temperature: 0,
        stopWhen,
        experimental_telemetry: telemetrySettings,
        messages: request.messages,
        tools: request.tools as Record<string, Tool>,
        onStepFinish: async () => {
          // Count every step, even when no caller callback is registered.
          finishedSteps += 1;
          await this.notifyStepFinished(params, finishedSteps);
        },
      });

      return { stopReason: 'natural' };
    } catch (error) {
      const failure = error instanceof Error ? error : new Error(String(error));
      this.logger.warn(`[agent-runner] loop failed: ${failure.message}`);
      return { stopReason: 'error', error: failure };
    }
  }

  /** Invokes the caller's step callback, logging (not propagating) anything it throws. */
  private async notifyStepFinished(params: RunLoopParams, stepIndex: number): Promise<void> {
    if (!params.onStepFinish) {
      return;
    }
    try {
      await params.onStepFinish({ stepIndex, stepBudget: params.stepBudget });
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      this.logger.warn(`[agent-runner] onStepFinish callback threw; ignoring: ${reason}`);
    }
  }
}
|
||||
9
packages/context/src/agent/index.ts
Normal file
9
packages/context/src/agent/index.ts
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
export type {
|
||||
AgentRunnerServiceDeps,
|
||||
AgentTelemetryPort,
|
||||
RunLoopParams,
|
||||
RunLoopResult,
|
||||
RunLoopStepInfo,
|
||||
RunLoopStopReason,
|
||||
} from './agent-runner.service.js';
|
||||
export { AgentRunnerService } from './agent-runner.service.js';
|
||||
28
packages/context/src/connections/connection-type.ts
Normal file
28
packages/context/src/connections/connection-type.ts
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
import { z } from 'zod';
|
||||
|
||||
/** Every connection type identifier accepted by `connectionTypeSchema`, in declaration order. */
const CONNECTION_TYPE_VALUES = [
  'POSTGRESQL',
  'SQLITE',
  'SQLSERVER',
  'BIGQUERY',
  'SNOWFLAKE',
  'CENTRALREACH',
  'EPIC',
  'CERNER',
  'ATHENA',
  'QUICKBOOKS',
  'WORKDAY',
  'REST',
  'S3',
  'SLACK',
  'METABASE',
  'LOOKER',
  'NOTION',
  'POSTHOG',
  'MYSQL',
  'CLICKHOUSE',
  'PLAIN',
  'BETTERSTACK',
] as const;

/** Zod enum validating that a value is one of the known upper-case connection type identifiers. */
export const connectionTypeSchema = z.enum(CONNECTION_TYPE_VALUES);

/** Union of the string literals accepted by `connectionTypeSchema`. */
export type ConnectionType = z.infer<typeof connectionTypeSchema>;
|
||||
27
packages/context/src/connections/index.ts
Normal file
27
packages/context/src/connections/index.ts
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
export type {
|
||||
KloSqlQueryExecutionInput,
|
||||
KloSqlQueryExecutionResult,
|
||||
KloSqlQueryExecutorPort,
|
||||
} from './query-executor.js';
|
||||
export { createDefaultLocalQueryExecutor, type DefaultLocalQueryExecutorOptions } from './local-query-executor.js';
|
||||
export { normalizeQueryRows } from './query-executor.js';
|
||||
export { createPostgresQueryExecutor } from './postgres-query-executor.js';
|
||||
export { assertReadOnlySql, limitSqlForExecution } from './read-only-sql.js';
|
||||
export { createSqliteQueryExecutor, sqliteDatabasePathFromConnection } from './sqlite-query-executor.js';
|
||||
export { connectionTypeSchema, type ConnectionType } from './connection-type.js';
|
||||
export {
|
||||
localConnectionInfoFromConfig,
|
||||
localConnectionToWarehouseDescriptor,
|
||||
localConnectionTypeForConfig,
|
||||
type LocalConnectionInfo,
|
||||
type LocalWarehouseDescriptor,
|
||||
} from './local-warehouse-descriptor.js';
|
||||
export {
|
||||
KLO_NOTION_ORG_KNOWLEDGE_WARNING,
|
||||
notionConnectionToPullConfig,
|
||||
parseNotionConnectionConfig,
|
||||
redactNotionConnectionConfig,
|
||||
resolveNotionAuthToken,
|
||||
type KloNotionConnectionConfig,
|
||||
type RedactedKloNotionConnectionConfig,
|
||||
} from './notion-config.js';
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { createDefaultLocalQueryExecutor } from './local-query-executor.js';
|
||||
|
||||
describe('createDefaultLocalQueryExecutor', () => {
  /** Stub executor returning a one-row SELECT result tagged with `header`, so the test can tell which driver ran. */
  const stubExecutor = (header: string, value: number) => ({
    execute: vi.fn(async () => ({
      headers: [header],
      rows: [[value]],
      totalRows: 1,
      command: 'SELECT',
      rowCount: 1,
    })),
  });

  it('dispatches postgres and sqlite drivers to their executors', async () => {
    const postgres = stubExecutor('pg', 1);
    const sqlite = stubExecutor('sqlite', 2);
    const executor = createDefaultLocalQueryExecutor({ postgres, sqlite });

    const postgresRun = executor.execute({
      connectionId: 'pg',
      connection: { driver: 'postgres', readonly: true },
      sql: 'select 1',
    });
    await expect(postgresRun).resolves.toMatchObject({ headers: ['pg'] });

    const sqliteRun = executor.execute({
      connectionId: 'local',
      connection: { driver: 'sqlite', readonly: true },
      sql: 'select 1',
    });
    await expect(sqliteRun).resolves.toMatchObject({ headers: ['sqlite'] });

    // Each stub must have been reached exactly once, by its own driver.
    expect(postgres.execute).toHaveBeenCalledTimes(1);
    expect(sqlite.execute).toHaveBeenCalledTimes(1);
  });

  it('rejects unsupported local execution drivers', async () => {
    const executor = createDefaultLocalQueryExecutor({
      postgres: { execute: vi.fn() },
      sqlite: { execute: vi.fn() },
    });

    const snowflakeRun = executor.execute({
      connectionId: 'warehouse',
      connection: { driver: 'snowflake', readonly: true },
      sql: 'select 1',
    });

    await expect(snowflakeRun).rejects.toThrow('No local query executor is configured for driver "snowflake".');
  });
});
|
||||
34
packages/context/src/connections/local-query-executor.ts
Normal file
34
packages/context/src/connections/local-query-executor.ts
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
import { createPostgresQueryExecutor } from './postgres-query-executor.js';
|
||||
import type {
|
||||
KloSqlQueryExecutionInput,
|
||||
KloSqlQueryExecutionResult,
|
||||
KloSqlQueryExecutorPort,
|
||||
} from './query-executor.js';
|
||||
import { createSqliteQueryExecutor } from './sqlite-query-executor.js';
|
||||
|
||||
/**
 * Optional executor overrides for `createDefaultLocalQueryExecutor`; any executor left
 * unset is created with the corresponding default factory.
 */
export interface DefaultLocalQueryExecutorOptions {
  /** Executor used for the `postgres` / `postgresql` drivers. */
  postgres?: KloSqlQueryExecutorPort;
  /** Executor used for the `sqlite` / `sqlite3` drivers. */
  sqlite?: KloSqlQueryExecutorPort;
}
|
||||
|
||||
/**
 * Returns the connection's driver as a lower-cased string, or `''` when the input has no
 * connection or no driver.
 */
function driverFor(input: KloSqlQueryExecutionInput): string {
  const rawDriver = input.connection?.driver;
  const driverText = String(rawDriver ?? '');
  return driverText.toLowerCase();
}
|
||||
|
||||
/**
 * Creates an executor that routes each query to the Postgres or SQLite executor based on
 * the connection's driver name (compared case-insensitively).
 *
 * @param options - Executor overrides; missing ones are created via the default factories.
 * @returns A port whose `execute` rejects with an `Error` for any other driver.
 */
export function createDefaultLocalQueryExecutor(options: DefaultLocalQueryExecutorOptions = {}): KloSqlQueryExecutorPort {
  const postgres = options.postgres ?? createPostgresQueryExecutor();
  const sqlite = options.sqlite ?? createSqliteQueryExecutor();

  const execute = async (input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult> => {
    switch (driverFor(input)) {
      case 'postgres':
      case 'postgresql':
        return postgres.execute(input);
      case 'sqlite':
      case 'sqlite3':
        return sqlite.execute(input);
      default:
        // Report the driver as configured (not lower-cased) so the message matches the user's config.
        throw new Error(`No local query executor is configured for driver "${input.connection?.driver ?? 'unknown'}".`);
    }
  };

  return { execute };
}
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
localConnectionInfoFromConfig,
|
||||
localConnectionToWarehouseDescriptor,
|
||||
localConnectionTypeForConfig,
|
||||
} from './local-warehouse-descriptor.js';
|
||||
|
||||
describe('localConnectionToWarehouseDescriptor', () => {
  it('maps local Postgres URLs to canonical warehouse descriptors', () => {
    // Host and database are expected to be derived from the URL alone.
    const descriptor = localConnectionToWarehouseDescriptor('warehouse', {
      driver: 'postgres',
      url: 'postgresql://readonly@db.example.test/analytics',
    });

    expect(descriptor).toMatchObject({
      id: 'warehouse',
      connection_type: 'POSTGRESQL',
      host: 'db.example.test',
      database: 'analytics',
    });
  });

  it('maps BigQuery project and dataset from explicit fields', () => {
    const descriptor = localConnectionToWarehouseDescriptor('bq', {
      driver: 'bigquery',
      project_id: 'acme',
      dataset_id: 'warehouse',
    });

    expect(descriptor).toMatchObject({
      id: 'bq',
      connection_type: 'BIGQUERY',
      project_id: 'acme',
      dataset_id: 'warehouse',
    });
  });

  it('returns null for non-warehouse adapters', () => {
    const descriptor = localConnectionToWarehouseDescriptor('looker', { driver: 'looker' });

    expect(descriptor).toBeNull();
  });
});
|
||||
|
||||
describe('local connection info helpers', () => {
  it('returns canonical warehouse connection types for local catalogs', () => {
    const postgresType = localConnectionTypeForConfig('warehouse', { driver: 'postgres' });
    const bigQueryType = localConnectionTypeForConfig('bq', { driver: 'bigquery', project_id: 'acme' });
    const snowflakeType = localConnectionTypeForConfig('snowflake', { driver: 'snowflake' });

    expect(postgresType).toBe('POSTGRESQL');
    expect(bigQueryType).toBe('BIGQUERY');
    expect(snowflakeType).toBe('SNOWFLAKE');
  });

  it('keeps non-warehouse adapter labels for display-only local connection surfaces', () => {
    // Non-warehouse drivers are echoed back as configured; a missing driver becomes "unknown".
    const metabaseType = localConnectionTypeForConfig('prod-metabase', { driver: 'metabase' });
    const missingDriverType = localConnectionTypeForConfig('missing-driver', {} as never);

    expect(metabaseType).toBe('metabase');
    expect(missingDriverType).toBe('unknown');
  });

  it('builds nullable local connection info records', () => {
    const info = localConnectionInfoFromConfig('warehouse', { driver: 'postgres' });
    const missing = localConnectionInfoFromConfig('missing', undefined);

    expect(info).toEqual({
      id: 'warehouse',
      name: 'warehouse',
      connectionType: 'POSTGRESQL',
    });
    expect(missing).toBeNull();
  });
});
|
||||
102
packages/context/src/connections/local-warehouse-descriptor.ts
Normal file
102
packages/context/src/connections/local-warehouse-descriptor.ts
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
import type { KloProjectConnectionConfig } from '../project/config.js';
|
||||
import type { ConnectionType } from './connection-type.js';
|
||||
|
||||
/**
 * Warehouse-oriented view of a local project connection, as produced by
 * `localConnectionToWarehouseDescriptor`.
 */
export interface LocalWarehouseDescriptor {
  /** Connection id the descriptor was built for. */
  id: string;
  /** Canonical connection type resolved from the configured driver. */
  connection_type: ConnectionType;
  /** Explicit `host` field, else the hostname parsed from `url`, else `null`. */
  host?: string | null;
  /** Explicit `database` field, else the first path segment of `url`, else `null`. */
  database?: string | null;
  /** Explicit `account` field, else `null`. */
  account?: string | null;
  /** Explicit `project_id` field, else a value parsed from a BigQuery `url`, else `null`. */
  project_id?: string | null;
  /** Explicit `dataset_id` field, else a value parsed from a BigQuery `url`, else `null`. */
  dataset_id?: string | null;
  /** Shallow copy of the original connection config. */
  connection_params: Record<string, unknown>;
}

/** Minimal connection record produced by `localConnectionInfoFromConfig`. */
export interface LocalConnectionInfo {
  id: string;
  /** Currently always equal to `id`. */
  name: string;
  /** Canonical warehouse type, the configured driver string, or `'unknown'`. */
  connectionType: string;
}
|
||||
|
||||
/**
 * Lower-cased local driver names that denote SQL warehouses, mapped to their canonical
 * connection type. Drivers absent from this table are treated as non-warehouse adapters.
 */
const DRIVER_TO_CONNECTION_TYPE: Record<string, ConnectionType> = {
  postgres: 'POSTGRESQL',
  postgresql: 'POSTGRESQL',
  sqlite: 'SQLITE',
  sqlserver: 'SQLSERVER',
  mssql: 'SQLSERVER',
  mysql: 'MYSQL',
  clickhouse: 'CLICKHOUSE',
  snowflake: 'SNOWFLAKE',
  bigquery: 'BIGQUERY',
};

/**
 * Builds a warehouse descriptor for a local project connection.
 *
 * Location fields are taken from the explicit config fields when present, otherwise from
 * the connection `url` (unless it is an `env:` or `file:` reference or fails to parse).
 * For BigQuery URLs the project is the URL host and the dataset the first path segment
 * (`bigquery://project/dataset`); when the host is empty the first two path segments are
 * used instead (`bigquery:///project/dataset`).
 *
 * @param id - Connection id copied into the descriptor.
 * @param connection - Local connection config, or `undefined` when the id is unknown.
 * @returns The descriptor, or `null` when there is no config or the driver is not a
 *   known warehouse driver.
 */
export function localConnectionToWarehouseDescriptor(
  id: string,
  connection: KloProjectConnectionConfig | undefined,
): LocalWarehouseDescriptor | null {
  if (!connection) {
    return null;
  }

  // Own-property check: a bare index lookup would also resolve inherited keys such as
  // "constructor" or "__proto__" and yield a truthy non-string "connection type".
  const driverKey = String(connection.driver ?? '').toLowerCase();
  const connectionType = Object.prototype.hasOwnProperty.call(DRIVER_TO_CONNECTION_TYPE, driverKey)
    ? DRIVER_TO_CONNECTION_TYPE[driverKey]
    : undefined;
  if (!connectionType) {
    return null;
  }

  const info: LocalWarehouseDescriptor = {
    id,
    connection_type: connectionType,
    connection_params: { ...connection },
  };

  const url = typeof connection.url === 'string' ? connection.url : null;
  if (url && !url.startsWith('env:') && !url.startsWith('file:')) {
    try {
      const parsed = new URL(url);
      info.host = parsed.hostname || null;
      const [first, second] = parsed.pathname.slice(1).split('/');
      if (connectionType === 'BIGQUERY') {
        // `hostname` is always a string, so an empty host must be detected explicitly
        // before falling back to the path segments.
        const projectFromHost = stringField(parsed.hostname);
        if (projectFromHost) {
          info.project_id = projectFromHost;
          info.dataset_id = stringField(first);
        } else {
          info.project_id = stringField(first);
          info.dataset_id = stringField(second);
        }
      } else if (parsed.pathname.length > 1) {
        info.database = first ?? null;
      }
    } catch {
      // Unparseable URL: rely on the explicit fields only.
      info.host = stringField(connection.host);
    }
  }

  // Explicit config fields always win over anything derived from the URL.
  info.host = stringField(connection.host) ?? info.host ?? null;
  info.database = stringField(connection.database) ?? info.database ?? null;
  info.account = stringField(connection.account) ?? null;
  info.project_id = stringField(connection.project_id) ?? info.project_id ?? null;
  info.dataset_id = stringField(connection.dataset_id) ?? info.dataset_id ?? null;
  return info;
}

/**
 * Returns the canonical warehouse connection type for a local connection, or, for
 * non-warehouse drivers, the trimmed driver string as configured (`'unknown'` when the
 * driver is missing or blank).
 */
export function localConnectionTypeForConfig(id: string, connection: KloProjectConnectionConfig | undefined): string {
  const descriptor = localConnectionToWarehouseDescriptor(id, connection);
  if (descriptor) {
    return descriptor.connection_type;
  }
  const driver = typeof connection?.driver === 'string' ? connection.driver.trim() : '';
  return driver.length > 0 ? driver : 'unknown';
}

/**
 * Builds the minimal info record for a local connection, using the id as its name.
 *
 * @returns `null` when no connection config is given.
 */
export function localConnectionInfoFromConfig(
  id: string,
  connection: KloProjectConnectionConfig | undefined,
): LocalConnectionInfo | null {
  if (!connection) {
    return null;
  }
  return {
    id,
    name: id,
    connectionType: localConnectionTypeForConfig(id, connection),
  };
}

/** Returns the trimmed string when `value` is a non-blank string, otherwise `null`. */
function stringField(value: unknown): string | null {
  return typeof value === 'string' && value.trim().length > 0 ? value.trim() : null;
}
|
||||
120
packages/context/src/connections/notion-config.test.ts
Normal file
120
packages/context/src/connections/notion-config.test.ts
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import {
|
||||
notionConnectionToPullConfig,
|
||||
parseNotionConnectionConfig,
|
||||
redactNotionConnectionConfig,
|
||||
resolveNotionAuthToken,
|
||||
} from './notion-config.js';
|
||||
|
||||
describe('standalone Notion connection config', () => {
  let tempDir: string;

  beforeEach(async () => {
    // Fresh scratch directory per test for the file-based token reference.
    const prefix = join(tmpdir(), 'klo-notion-config-');
    tempDir = await mkdtemp(prefix);
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('parses selected-root Notion config with safe defaults', () => {
    const parsed = parseNotionConnectionConfig({
      driver: 'notion',
      auth_token_ref: 'env:NOTION_AUTH_TOKEN',
      crawl_mode: 'selected_roots',
      root_page_ids: ['page-1'],
    });

    const expected = {
      driver: 'notion',
      auth_token_ref: 'env:NOTION_AUTH_TOKEN',
      crawl_mode: 'selected_roots',
      root_page_ids: ['page-1'],
      root_database_ids: [],
      root_data_source_ids: [],
      max_pages_per_run: 1000,
      max_knowledge_creates_per_run: 5,
      max_knowledge_updates_per_run: 20,
      last_successful_cursor: null,
    };
    expect(parsed).toEqual(expected);
  });

  it('redacts token references from display output', () => {
    const parsed = parseNotionConnectionConfig({
      driver: 'notion',
      auth_token_ref: 'file:/Users/example/.config/notion-token',
      crawl_mode: 'all_accessible',
      max_pages_per_run: 80,
    });

    const redacted = redactNotionConnectionConfig(parsed);

    // Only the presence of a token is exposed, never the reference itself.
    expect(redacted).toEqual({
      driver: 'notion',
      hasAuthToken: true,
      crawlMode: 'all_accessible',
      rootPageIds: [],
      rootDatabaseIds: [],
      rootDataSourceIds: [],
      maxPagesPerRun: 80,
      maxKnowledgeCreatesPerRun: 5,
      maxKnowledgeUpdatesPerRun: 20,
      warning: 'Anything accessible to this Notion integration can become organization knowledge.',
    });
  });

  it('requires at least one selected root in selected_roots mode', () => {
    const parseWithoutRoots = () =>
      parseNotionConnectionConfig({
        driver: 'notion',
        auth_token_ref: 'env:NOTION_AUTH_TOKEN',
        crawl_mode: 'selected_roots',
      });

    expect(parseWithoutRoots).toThrow('selected_roots requires at least one root page, database, or data source id');
  });

  it('resolves env and file token references without exposing the reference in errors', async () => {
    const tokenPath = join(tempDir, 'notion-token.txt');
    await writeFile(tokenPath, 'ntn_file_token\n', 'utf-8');

    const fromEnv = resolveNotionAuthToken('env:NOTION_AUTH_TOKEN', {
      env: { NOTION_AUTH_TOKEN: 'ntn_env_token' },
    });
    await expect(fromEnv).resolves.toBe('ntn_env_token');

    const fromFile = resolveNotionAuthToken(`file:${tokenPath}`);
    await expect(fromFile).resolves.toBe('ntn_file_token');

    const fromMissingEnv = resolveNotionAuthToken('env:MISSING_NOTION_TOKEN', { env: {} });
    await expect(fromMissingEnv).rejects.toThrow(
      'Notion token environment variable MISSING_NOTION_TOKEN is not set',
    );
  });

  it('converts standalone config into adapter pull config', async () => {
    const cursor = '{"phase":"all_accessible_pages","cursor":"cursor-1"}';
    const parsed = parseNotionConnectionConfig({
      driver: 'notion',
      auth_token_ref: 'env:NOTION_AUTH_TOKEN',
      crawl_mode: 'all_accessible',
      max_pages_per_run: 12,
      max_knowledge_creates_per_run: 2,
      max_knowledge_updates_per_run: 7,
      last_successful_cursor: cursor,
    });

    const pullConfig = await notionConnectionToPullConfig(parsed, {
      env: { NOTION_AUTH_TOKEN: 'ntn_env_token' },
    });

    expect(pullConfig).toEqual({
      authToken: 'ntn_env_token',
      crawlMode: 'all_accessible',
      rootPageIds: [],
      rootDatabaseIds: [],
      rootDataSourceIds: [],
      maxPagesPerRun: 12,
      maxKnowledgeCreatesPerRun: 2,
      maxKnowledgeUpdatesPerRun: 7,
      lastSuccessfulCursor: cursor,
    });
  });
});
|
||||
196
packages/context/src/connections/notion-config.ts
Normal file
196
packages/context/src/connections/notion-config.ts
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
import { readFile } from 'node:fs/promises';
|
||||
import { homedir } from 'node:os';
|
||||
import { resolve } from 'node:path';
|
||||
import { type NotionPullConfig, notionPullConfigSchema } from '../ingest/adapters/notion/types.js';
|
||||
import type { KloProjectConnectionConfig } from '../project/config.js';
|
||||
|
||||
export const KLO_NOTION_ORG_KNOWLEDGE_WARNING =
|
||||
'Anything accessible to this Notion integration can become organization knowledge.';
|
||||
|
||||
type KloNotionCrawlMode = 'all_accessible' | 'selected_roots';
|
||||
|
||||
export interface KloNotionConnectionConfig extends KloProjectConnectionConfig {
|
||||
driver: 'notion';
|
||||
auth_token_ref: string;
|
||||
crawl_mode: KloNotionCrawlMode;
|
||||
root_page_ids: string[];
|
||||
root_database_ids: string[];
|
||||
root_data_source_ids: string[];
|
||||
max_pages_per_run: number;
|
||||
max_knowledge_creates_per_run: number;
|
||||
max_knowledge_updates_per_run: number;
|
||||
last_successful_cursor: string | null;
|
||||
}
|
||||
|
||||
export interface RedactedKloNotionConnectionConfig {
|
||||
driver: 'notion';
|
||||
hasAuthToken: boolean;
|
||||
crawlMode: KloNotionCrawlMode;
|
||||
rootPageIds: string[];
|
||||
rootDatabaseIds: string[];
|
||||
rootDataSourceIds: string[];
|
||||
maxPagesPerRun: number;
|
||||
maxKnowledgeCreatesPerRun: number;
|
||||
maxKnowledgeUpdatesPerRun: number;
|
||||
warning: typeof KLO_NOTION_ORG_KNOWLEDGE_WARNING;
|
||||
}
|
||||
|
||||
interface ResolveNotionTokenOptions {
|
||||
env?: Record<string, string | undefined>;
|
||||
readTextFile?: (path: string) => Promise<string>;
|
||||
}
|
||||
|
||||
/**
 * Type guard for keyed objects: true for any non-null, non-array value whose
 * `typeof` is `'object'`.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
|
||||
|
||||
/**
 * Returns `value` unchanged when it passes `isRecord`.
 *
 * @throws Error when `value` is not a non-array object.
 */
function record(value: unknown): Record<string, unknown> {
  if (isRecord(value)) {
    return value;
  }
  throw new Error('Notion connection config must be an object');
}
|
||||
|
||||
/**
 * Returns the trimmed form of `value` when it is a string with non-whitespace
 * content; otherwise returns `fallback`.
 */
function stringValue(value: unknown, fallback: string): string {
  if (typeof value !== 'string') {
    return fallback;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : fallback;
}
|
||||
|
||||
/**
 * Returns the trimmed form of `value` when it is a string with non-whitespace
 * content; returns `null` for everything else.
 */
function optionalString(value: unknown): string | null {
  if (typeof value !== 'string') {
    return null;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}
|
||||
|
||||
/**
 * Extracts the usable string entries from an unknown value.
 *
 * Non-array input yields an empty array. Within an array, non-string items and
 * blank strings are dropped. Surviving entries are returned trimmed, matching
 * how `stringValue` and `optionalString` normalize scalar strings, so an id
 * written with stray surrounding whitespace is not passed along verbatim.
 *
 * @param value - Raw config value, expected to be an array of strings.
 * @returns The trimmed, non-empty string entries in their original order.
 */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const entries: string[] = [];
  for (const item of value) {
    if (typeof item !== 'string') {
      continue;
    }
    const trimmed = item.trim();
    if (trimmed.length > 0) {
      entries.push(trimmed);
    }
  }
  return entries;
}
|
||||
|
||||
/**
 * Reads an optional integer setting.
 *
 * @param value - Raw config value.
 * @param fallback - Returned when `value` is `undefined` or `null`.
 * @param name - Setting name used in the error message.
 * @returns `value` when it is an integer number, otherwise `fallback` for missing input.
 * @throws Error when `value` is present but is not an integer number.
 */
function integerWithFallback(value: unknown, fallback: number, name: string): number {
  if (value == null) {
    return fallback;
  }
  if (typeof value === 'number' && Number.isInteger(value)) {
    return value;
  }
  throw new Error(`${name} must be an integer`);
}
|
||||
|
||||
/**
 * Reads an optional integer setting and enforces an inclusive range.
 *
 * @param value - Raw config value; missing values resolve to `fallback`.
 * @param fallback - Default used when `value` is `undefined` or `null`.
 * @param name - Setting name used in error messages.
 * @param min - Smallest accepted value (inclusive).
 * @param max - Largest accepted value (inclusive).
 * @throws Error when the value is not an integer or lies outside `[min, max]`.
 */
function boundedInteger(value: unknown, fallback: number, name: string, min: number, max: number): number {
  const candidate = integerWithFallback(value, fallback, name);
  const outOfRange = candidate < min || candidate > max;
  if (outOfRange) {
    throw new Error(`${name} must be between ${min} and ${max}`);
  }
  return candidate;
}
|
||||
|
||||
/**
 * Validates a raw Notion connection config and fills in defaults.
 *
 * Checks, in order: the input is an object, `driver` is `'notion'`,
 * `auth_token_ref` is present and uses the `env:` or `file:` prefix,
 * `crawl_mode` is supported (defaulting to `selected_roots`), and
 * `selected_roots` mode has at least one root id. Per-run limits are then
 * range-checked. Keys of the input that are not normalized here are carried
 * over unchanged by the spread.
 *
 * @throws Error with a descriptive message for the first failed check.
 */
export function parseNotionConnectionConfig(raw: unknown): KloNotionConnectionConfig {
  const source = record(raw);
  if (source.driver !== 'notion') {
    throw new Error('Notion connection config requires driver: notion');
  }

  const tokenRef = stringValue(source.auth_token_ref, '');
  if (tokenRef.length === 0) {
    throw new Error('Notion connection config requires auth_token_ref');
  }
  const hasSupportedPrefix = tokenRef.startsWith('env:') || tokenRef.startsWith('file:');
  if (!hasSupportedPrefix) {
    throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
  }

  const mode = stringValue(source.crawl_mode, 'selected_roots');
  if (mode !== 'selected_roots' && mode !== 'all_accessible') {
    throw new Error(`Unsupported Notion crawl_mode: ${mode}`);
  }

  const pageIds = stringArray(source.root_page_ids);
  const databaseIds = stringArray(source.root_database_ids);
  const dataSourceIds = stringArray(source.root_data_source_ids);
  const rootCount = pageIds.length + databaseIds.length + dataSourceIds.length;
  if (mode === 'selected_roots' && rootCount === 0) {
    throw new Error('selected_roots requires at least one root page, database, or data source id');
  }

  const maxPages = boundedInteger(source.max_pages_per_run, 1000, 'max_pages_per_run', 1, 10_000);
  const maxCreates = boundedInteger(
    source.max_knowledge_creates_per_run,
    5,
    'max_knowledge_creates_per_run',
    0,
    25,
  );
  const maxUpdates = boundedInteger(
    source.max_knowledge_updates_per_run,
    20,
    'max_knowledge_updates_per_run',
    0,
    100,
  );
  const cursor = optionalString(source.last_successful_cursor);

  return {
    ...source,
    driver: 'notion',
    auth_token_ref: tokenRef,
    crawl_mode: mode,
    root_page_ids: pageIds,
    root_database_ids: databaseIds,
    root_data_source_ids: dataSourceIds,
    max_pages_per_run: maxPages,
    max_knowledge_creates_per_run: maxCreates,
    max_knowledge_updates_per_run: maxUpdates,
    last_successful_cursor: cursor,
  };
}
|
||||
|
||||
/**
 * Builds the display-safe view of a Notion connection config.
 *
 * The token reference itself is never copied; only whether one is set is
 * reported through `hasAuthToken`. The remaining fields are re-keyed to
 * camelCase and the organization-knowledge warning is attached.
 */
export function redactNotionConnectionConfig(config: KloNotionConnectionConfig): RedactedKloNotionConnectionConfig {
  const {
    auth_token_ref: tokenRef,
    crawl_mode: crawlMode,
    root_page_ids: rootPageIds,
    root_database_ids: rootDatabaseIds,
    root_data_source_ids: rootDataSourceIds,
    max_pages_per_run: maxPagesPerRun,
    max_knowledge_creates_per_run: maxKnowledgeCreatesPerRun,
    max_knowledge_updates_per_run: maxKnowledgeUpdatesPerRun,
  } = config;

  const redacted: RedactedKloNotionConnectionConfig = {
    driver: 'notion',
    hasAuthToken: !!tokenRef,
    crawlMode,
    rootPageIds,
    rootDatabaseIds,
    rootDataSourceIds,
    maxPagesPerRun,
    maxKnowledgeCreatesPerRun,
    maxKnowledgeUpdatesPerRun,
    warning: KLO_NOTION_ORG_KNOWLEDGE_WARNING,
  };
  return redacted;
}
|
||||
|
||||
/**
 * Expands a leading `~` or `~/` to the current user's home directory.
 * Any other path is returned untouched.
 */
function expandHome(path: string): string {
  const isHomeRelative = path === '~' || path.startsWith('~/');
  if (!isHomeRelative) {
    return path;
  }
  // For a bare '~' the slice is empty, so this resolves to the home directory itself.
  return resolve(homedir(), path.slice(2));
}
|
||||
|
||||
/**
 * Resolves a Notion auth token reference to the token value.
 *
 * Supported references:
 * - `env:NAME` reads `NAME` from `options.env`, or from `process.env` when no
 *   env map is supplied.
 * - `file:/path` reads the file through `options.readTextFile`, or
 *   `fs.promises.readFile` by default, after expanding a leading `~`.
 *
 * The resolved value is trimmed in both cases. A value that is missing or
 * blank after trimming is rejected, so a whitespace-only environment variable
 * can no longer produce an empty token.
 *
 * @param authTokenRef - Reference string using the `env:` or `file:` prefix.
 * @param options - Optional env map and file reader overrides.
 * @returns The trimmed, non-empty token.
 * @throws Error when the env variable is unset or blank, the token file is
 *   empty, or the reference uses neither prefix. Errors raised by the file
 *   reader are not caught here.
 */
export async function resolveNotionAuthToken(
  authTokenRef: string,
  options: ResolveNotionTokenOptions = {},
): Promise<string> {
  if (authTokenRef.startsWith('env:')) {
    const envName = authTokenRef.slice('env:'.length);
    // Trim before the emptiness check so "   " counts as unset.
    const value = (options.env ?? process.env)[envName]?.trim();
    if (!value) {
      throw new Error(`Notion token environment variable ${envName} is not set`);
    }
    return value;
  }
  if (authTokenRef.startsWith('file:')) {
    const path = expandHome(authTokenRef.slice('file:'.length));
    const readTextFile = options.readTextFile ?? ((filePath: string) => readFile(filePath, 'utf-8'));
    const value = (await readTextFile(path)).trim();
    if (!value) {
      throw new Error(`Notion token file is empty: ${path}`);
    }
    return value;
  }
  throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
}
|
||||
|
||||
/**
 * Converts a parsed Notion connection config into the adapter's pull config.
 *
 * The token reference is resolved first; the resulting camelCase object is
 * then passed through `notionPullConfigSchema.parse`, whose result is returned.
 *
 * @param config - Parsed connection config.
 * @param options - Forwarded to `resolveNotionAuthToken`.
 */
export async function notionConnectionToPullConfig(
  config: KloNotionConnectionConfig,
  options: ResolveNotionTokenOptions = {},
): Promise<NotionPullConfig> {
  const authToken = await resolveNotionAuthToken(config.auth_token_ref, options);
  const candidate = {
    authToken,
    crawlMode: config.crawl_mode,
    rootPageIds: config.root_page_ids,
    rootDatabaseIds: config.root_database_ids,
    rootDataSourceIds: config.root_data_source_ids,
    maxPagesPerRun: config.max_pages_per_run,
    maxKnowledgeCreatesPerRun: config.max_knowledge_creates_per_run,
    maxKnowledgeUpdatesPerRun: config.max_knowledge_updates_per_run,
    lastSuccessfulCursor: config.last_successful_cursor,
  };
  return notionPullConfigSchema.parse(candidate);
}
|
||||
111
packages/context/src/connections/postgres-query-executor.test.ts
Normal file
111
packages/context/src/connections/postgres-query-executor.test.ts
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { createPostgresQueryExecutor } from './postgres-query-executor.js';
|
||||
|
||||
/**
 * Builds a fake pg-style client for the executor tests.
 *
 * Every argument passed to `query` is appended to `calls`. `BEGIN READ ONLY`
 * and `COMMIT` resolve to empty results; any other input resolves to a fixed
 * two-row SELECT result in array row shape.
 */
function makeClient() {
  const calls: unknown[] = [];
  const emptyResult = (command: string) => ({ rows: [], fields: [], rowCount: null, command });

  const query = vi.fn(async (input: unknown) => {
    calls.push(input);
    switch (input) {
      case 'BEGIN READ ONLY':
        return emptyResult('BEGIN');
      case 'COMMIT':
        return emptyResult('COMMIT');
      default:
        return {
          rows: [
            ['paid', 2],
            ['open', 1],
          ],
          fields: [{ name: 'status' }, { name: 'order_count' }],
          rowCount: 2,
          command: 'SELECT',
        };
    }
  });

  const client = {
    connect: vi.fn(async () => undefined),
    query,
    end: vi.fn(async () => undefined),
  };
  return { client, calls };
}
|
||||
|
||||
describe('createPostgresQueryExecutor', () => {
|
||||
it('runs a read-only transaction in array row mode and closes the client', async () => {
|
||||
const { client, calls } = makeClient();
|
||||
const executor = createPostgresQueryExecutor({
|
||||
clientFactory: vi.fn(() => client),
|
||||
});
|
||||
|
||||
const result = await executor.execute({
|
||||
connectionId: 'warehouse',
|
||||
connection: { driver: 'postgres', url: 'postgres://example/db', readonly: true },
|
||||
sql: 'select status, count(*) as order_count from public.orders group by status',
|
||||
maxRows: 50,
|
||||
});
|
||||
|
||||
expect(client.connect).toHaveBeenCalledTimes(1);
|
||||
expect(calls[0]).toBe('BEGIN READ ONLY');
|
||||
expect(calls[1]).toEqual({
|
||||
text: 'select * from (select status, count(*) as order_count from public.orders group by status) as klo_query_result limit 50',
|
||||
rowMode: 'array',
|
||||
});
|
||||
expect(calls[2]).toBe('COMMIT');
|
||||
expect(client.end).toHaveBeenCalledTimes(1);
|
||||
expect(result).toEqual({
|
||||
headers: ['status', 'order_count'],
|
||||
rows: [
|
||||
['paid', 2],
|
||||
['open', 1],
|
||||
],
|
||||
totalRows: 2,
|
||||
command: 'SELECT',
|
||||
rowCount: 2,
|
||||
});
|
||||
});
|
||||
|
||||
it('rolls back and closes the client when query execution fails', async () => {
|
||||
const client = {
|
||||
connect: vi.fn(async () => undefined),
|
||||
query: vi.fn(async (input: unknown) => {
|
||||
if (input === 'BEGIN READ ONLY' || input === 'ROLLBACK') {
|
||||
return { rows: [], fields: [], rowCount: null, command: String(input) };
|
||||
}
|
||||
throw new Error('syntax error');
|
||||
}),
|
||||
end: vi.fn(async () => undefined),
|
||||
};
|
||||
const executor = createPostgresQueryExecutor({
|
||||
clientFactory: vi.fn(() => client),
|
||||
});
|
||||
|
||||
await expect(
|
||||
executor.execute({
|
||||
connectionId: 'warehouse',
|
||||
connection: { driver: 'postgres', url: 'postgres://example/db', readonly: true },
|
||||
sql: 'select * from broken',
|
||||
maxRows: 10,
|
||||
}),
|
||||
).rejects.toThrow('syntax error');
|
||||
expect(client.query).toHaveBeenCalledWith('ROLLBACK');
|
||||
expect(client.end).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('requires a Postgres url and read-only connection config', async () => {
|
||||
const executor = createPostgresQueryExecutor({ clientFactory: vi.fn() });
|
||||
|
||||
await expect(
|
||||
executor.execute({
|
||||
connectionId: 'warehouse',
|
||||
connection: { driver: 'postgres', readonly: true },
|
||||
sql: 'select 1',
|
||||
}),
|
||||
).rejects.toThrow('Local Postgres execution requires connections.warehouse.url');
|
||||
|
||||
await expect(
|
||||
executor.execute({
|
||||
connectionId: 'warehouse',
|
||||
connection: { driver: 'postgres', url: 'postgres://example/db', readonly: false },
|
||||
sql: 'select 1',
|
||||
}),
|
||||
).rejects.toThrow('Local query execution requires connections.warehouse.readonly: true');
|
||||
});
|
||||
});
|
||||
80
packages/context/src/connections/postgres-query-executor.ts
Normal file
80
packages/context/src/connections/postgres-query-executor.ts
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
import { Client, type ClientConfig } from 'pg';
|
||||
import type {
|
||||
KloSqlQueryExecutionInput,
|
||||
KloSqlQueryExecutionResult,
|
||||
KloSqlQueryExecutorPort,
|
||||
} from './query-executor.js';
|
||||
import { limitSqlForExecution } from './read-only-sql.js';
|
||||
|
||||
interface PgClientLike {
|
||||
connect(): Promise<unknown>;
|
||||
query(input: string | { text: string; rowMode: 'array' }): Promise<{
|
||||
fields: Array<{ name: string }>;
|
||||
rows: unknown[][];
|
||||
command: string;
|
||||
rowCount: number | null;
|
||||
}>;
|
||||
end(): Promise<void>;
|
||||
}
|
||||
|
||||
interface PostgresQueryExecutorOptions {
|
||||
statementTimeoutMs?: number;
|
||||
queryTimeoutMs?: number;
|
||||
connectionTimeoutMs?: number;
|
||||
clientFactory?: (config: ClientConfig) => PgClientLike;
|
||||
}
|
||||
|
||||
/**
 * Returns the connection's `driver` as a lower-cased string, or `''` when the
 * connection or its driver is missing.
 */
function connectionDriver(input: KloSqlQueryExecutionInput): string {
  const driver = input.connection?.driver;
  const text = driver == null ? '' : String(driver);
  return text.toLowerCase();
}
|
||||
|
||||
/** Default client factory: constructs a `pg` `Client` from the given config. */
function createDefaultClient(config: ClientConfig): PgClientLike {
  const client: PgClientLike = new Client(config);
  return client;
}
|
||||
|
||||
/**
 * Creates a query executor that runs SQL against Postgres inside a
 * `BEGIN READ ONLY` transaction and returns rows in array row mode.
 *
 * Before any client is created, `execute` checks that the connection's driver
 * is `postgres`/`postgresql`, that `readonly` is exactly `true`, and that a
 * non-blank `url` is configured. It then passes the SQL through
 * `limitSqlForExecution`, so a statement rejected by that guard fails without
 * opening a database connection. This matches the order used by the SQLite
 * executor.
 *
 * @param options - Optional timeouts in milliseconds (defaults: statement
 *   30 000, query 35 000, connection 5 000) and a `clientFactory` override.
 * @returns An executor whose `execute` resolves with headers, rows and command
 *   metadata. On a query failure it attempts a `ROLLBACK` and rethrows the
 *   original error; the client is ended in every case once connected.
 */
export function createPostgresQueryExecutor(options: PostgresQueryExecutorOptions = {}): KloSqlQueryExecutorPort {
  const clientFactory = options.clientFactory ?? createDefaultClient;
  return {
    async execute(input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult> {
      const driver = connectionDriver(input);
      if (driver !== 'postgres' && driver !== 'postgresql') {
        throw new Error(`Local Postgres execution cannot run driver "${input.connection?.driver ?? 'unknown'}".`);
      }
      if (input.connection?.readonly !== true) {
        throw new Error(`Local query execution requires connections.${input.connectionId}.readonly: true.`);
      }
      if (typeof input.connection.url !== 'string' || input.connection.url.trim().length === 0) {
        throw new Error(`Local Postgres execution requires connections.${input.connectionId}.url.`);
      }

      // Validate and wrap the SQL up front: a rejected statement must not cost a connection.
      const text = limitSqlForExecution(input.sql, input.maxRows);

      const client = clientFactory({
        connectionString: input.connection.url,
        statement_timeout: options.statementTimeoutMs ?? 30_000,
        query_timeout: options.queryTimeoutMs ?? 35_000,
        connectionTimeoutMillis: options.connectionTimeoutMs ?? 5_000,
        application_name: 'klo-local-query',
      });
      await client.connect();
      try {
        await client.query('BEGIN READ ONLY');
        const result = await client.query({ text, rowMode: 'array' });
        await client.query('COMMIT');
        return {
          headers: result.fields.map((field) => field.name),
          rows: result.rows,
          totalRows: result.rows.length,
          command: result.command,
          rowCount: result.rowCount,
        };
      } catch (error) {
        // Best-effort rollback; a rollback failure must not mask the original error.
        await client.query('ROLLBACK').catch(() => undefined);
        throw error;
      } finally {
        await client.end();
      }
    },
  };
}
|
||||
25
packages/context/src/connections/query-executor.ts
Normal file
25
packages/context/src/connections/query-executor.ts
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
import type { KloProjectConnectionConfig } from '../project/index.js';
|
||||
|
||||
export interface KloSqlQueryExecutionInput {
|
||||
connectionId: string;
|
||||
projectDir?: string;
|
||||
connection: KloProjectConnectionConfig | undefined;
|
||||
sql: string;
|
||||
maxRows?: number;
|
||||
}
|
||||
|
||||
export interface KloSqlQueryExecutionResult {
|
||||
headers: string[];
|
||||
rows: unknown[][];
|
||||
totalRows: number;
|
||||
command: string;
|
||||
rowCount: number | null;
|
||||
}
|
||||
|
||||
export interface KloSqlQueryExecutorPort {
|
||||
execute(input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult>;
|
||||
}
|
||||
|
||||
/**
 * Converts driver rows to positional arrays.
 *
 * Rows that are already arrays are returned as-is; any other row is converted
 * with `Object.values`, i.e. its own enumerable values in property order.
 */
export function normalizeQueryRows(rows: unknown[]): unknown[][] {
  const toPositional = (row: unknown): unknown[] => {
    if (Array.isArray(row)) {
      return row;
    }
    return Object.values(row as Record<string, unknown>);
  };
  return rows.map((row) => toPositional(row));
}
|
||||
30
packages/context/src/connections/read-only-sql.test.ts
Normal file
30
packages/context/src/connections/read-only-sql.test.ts
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { assertReadOnlySql, limitSqlForExecution } from './read-only-sql.js';
|
||||
|
||||
describe('assertReadOnlySql', () => {
|
||||
it('allows select and with queries', () => {
|
||||
expect(assertReadOnlySql('select * from orders')).toBe('select * from orders');
|
||||
expect(assertReadOnlySql('with paid as (select * from orders) select * from paid')).toContain('with paid');
|
||||
});
|
||||
|
||||
it('rejects mutating statements before opening a database connection', () => {
|
||||
expect(() => assertReadOnlySql('delete from orders')).toThrow(
|
||||
'Only read-only SELECT/WITH queries can be executed locally',
|
||||
);
|
||||
expect(() => assertReadOnlySql('create table x(id int)')).toThrow(
|
||||
'Only read-only SELECT/WITH queries can be executed locally',
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe('limitSqlForExecution', () => {
|
||||
it('wraps compiled SQL and strips trailing semicolons', () => {
|
||||
expect(limitSqlForExecution('select * from public.orders; ', 25)).toBe(
|
||||
'select * from (select * from public.orders) as klo_query_result limit 25',
|
||||
);
|
||||
});
|
||||
|
||||
it('returns the trimmed SQL when no maxRows value is provided', () => {
|
||||
expect(limitSqlForExecution('select * from orders; ', undefined)).toBe('select * from orders');
|
||||
});
|
||||
});
|
||||
22
packages/context/src/connections/read-only-sql.ts
Normal file
22
packages/context/src/connections/read-only-sql.ts
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
// Both patterns are anchored at the start of the statement, so only the
// leading keyword is inspected. Mutating clauses later in the text are not
// detected here.
const MUTATING_SQL =
  /^\s*(insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh)\b/i;
const READ_SQL = /^\s*(select|with)\b/i;

/**
 * Ensures a statement starts with `SELECT` or `WITH`.
 *
 * @param sql - Statement to check.
 * @returns The statement with surrounding whitespace removed.
 * @throws Error when the statement does not start with `SELECT`/`WITH`.
 */
export function assertReadOnlySql(sql: string): string {
  const trimmed = sql.trim();
  if (!READ_SQL.test(trimmed) || MUTATING_SQL.test(trimmed)) {
    throw new Error('Only read-only SELECT/WITH queries can be executed locally.');
  }
  return trimmed;
}

/**
 * Validates a read-only statement and optionally caps its row count.
 *
 * All trailing semicolons and whitespace are removed so the statement can be
 * embedded as a subquery (`select 1 ; ;` becomes `select 1`). When `maxRows`
 * is omitted the cleaned statement is returned as-is; otherwise it is wrapped
 * in `select * from (...) as klo_query_result limit N`.
 *
 * @param sql - Statement to validate and wrap.
 * @param maxRows - Optional row cap. Only `undefined`/`null` mean "no cap";
 *   `0`, negative, fractional and `NaN` values are rejected instead of being
 *   silently treated as unlimited.
 * @throws Error when the statement is not read-only or `maxRows` is not a
 *   positive integer.
 */
export function limitSqlForExecution(sql: string, maxRows: number | undefined): string {
  const trimmed = assertReadOnlySql(sql).replace(/[;\s]+$/, '');
  if (maxRows == null) {
    return trimmed;
  }
  if (!Number.isInteger(maxRows) || maxRows <= 0) {
    throw new Error('maxRows must be a positive integer.');
  }
  return `select * from (${trimmed}) as klo_query_result limit ${maxRows}`;
}
|
||||
148
packages/context/src/connections/sqlite-query-executor.test.ts
Normal file
148
packages/context/src/connections/sqlite-query-executor.test.ts
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
import { mkdtemp, rm } from 'node:fs/promises';
|
||||
import { writeFileSync } from 'node:fs';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import Database from 'better-sqlite3';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { createSqliteQueryExecutor, sqliteDatabasePathFromConnection } from './sqlite-query-executor.js';
|
||||
|
||||
describe('createSqliteQueryExecutor', () => {
|
||||
let tempDir: string;
|
||||
let dbPath: string;
|
||||
|
||||
beforeEach(async () => {
|
||||
tempDir = await mkdtemp(join(tmpdir(), 'klo-sqlite-query-'));
|
||||
dbPath = join(tempDir, 'warehouse.db');
|
||||
const db = new Database(dbPath);
|
||||
db.exec(`
|
||||
CREATE TABLE orders (
|
||||
id INTEGER PRIMARY KEY,
|
||||
status TEXT NOT NULL,
|
||||
amount INTEGER NOT NULL
|
||||
);
|
||||
INSERT INTO orders (status, amount) VALUES
|
||||
('paid', 20),
|
||||
('paid', 30),
|
||||
('open', 10);
|
||||
`);
|
||||
db.close();
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('executes read-only SELECT SQL against a relative SQLite path', async () => {
|
||||
const executor = createSqliteQueryExecutor();
|
||||
|
||||
const result = await executor.execute({
|
||||
connectionId: 'warehouse',
|
||||
projectDir: tempDir,
|
||||
connection: { driver: 'sqlite', path: 'warehouse.db', readonly: true },
|
||||
sql: 'select status, count(*) as order_count from orders group by status order by status',
|
||||
maxRows: 10,
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
headers: ['status', 'order_count'],
|
||||
rows: [
|
||||
['open', 1],
|
||||
['paid', 2],
|
||||
],
|
||||
totalRows: 2,
|
||||
command: 'SELECT',
|
||||
rowCount: 2,
|
||||
});
|
||||
});
|
||||
|
||||
it('supports file urls for SQLite database paths', async () => {
|
||||
expect(
|
||||
sqliteDatabasePathFromConnection({
|
||||
connectionId: 'warehouse',
|
||||
projectDir: tempDir,
|
||||
connection: { driver: 'sqlite', url: `file://${dbPath}`, readonly: true },
|
||||
sql: 'select 1',
|
||||
}),
|
||||
).toBe(dbPath);
|
||||
});
|
||||
|
||||
it('resolves file references for SQLite path fields', async () => {
|
||||
const pointerPath = join(tempDir, 'sqlite-path.txt');
|
||||
writeFileSync(pointerPath, dbPath, 'utf-8');
|
||||
|
||||
expect(
|
||||
sqliteDatabasePathFromConnection({
|
||||
connectionId: 'warehouse',
|
||||
projectDir: tempDir,
|
||||
connection: { driver: 'sqlite', path: `file:${pointerPath}`, readonly: true },
|
||||
sql: 'select 1',
|
||||
}),
|
||||
).toBe(dbPath);
|
||||
});
|
||||
|
||||
it('resolves env references for SQLite database urls', async () => {
|
||||
const originalDatabaseUrl = process.env.KLO_SQLITE_TEST_URL;
|
||||
process.env.KLO_SQLITE_TEST_URL = `sqlite:${dbPath}`;
|
||||
|
||||
try {
|
||||
expect(
|
||||
sqliteDatabasePathFromConnection({
|
||||
connectionId: 'warehouse',
|
||||
projectDir: tempDir,
|
||||
connection: { driver: 'sqlite', url: 'env:KLO_SQLITE_TEST_URL', readonly: true },
|
||||
sql: 'select 1',
|
||||
}),
|
||||
).toBe(dbPath);
|
||||
} finally {
|
||||
if (originalDatabaseUrl === undefined) {
|
||||
delete process.env.KLO_SQLITE_TEST_URL;
|
||||
} else {
|
||||
process.env.KLO_SQLITE_TEST_URL = originalDatabaseUrl;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it('rejects mutating SQL before opening the database', async () => {
|
||||
const executor = createSqliteQueryExecutor();
|
||||
|
||||
await expect(
|
||||
executor.execute({
|
||||
connectionId: 'warehouse',
|
||||
projectDir: tempDir,
|
||||
connection: { driver: 'sqlite', path: 'warehouse.db', readonly: true },
|
||||
sql: 'delete from orders',
|
||||
}),
|
||||
).rejects.toThrow('Only read-only SELECT/WITH queries can be executed locally');
|
||||
});
|
||||
|
||||
it('requires a SQLite driver, read-only config, and a database path', async () => {
|
||||
const executor = createSqliteQueryExecutor();
|
||||
|
||||
await expect(
|
||||
executor.execute({
|
||||
connectionId: 'warehouse',
|
||||
projectDir: tempDir,
|
||||
connection: { driver: 'postgres', path: 'warehouse.db', readonly: true },
|
||||
sql: 'select 1',
|
||||
}),
|
||||
).rejects.toThrow('Local SQLite execution cannot run driver "postgres"');
|
||||
|
||||
await expect(
|
||||
executor.execute({
|
||||
connectionId: 'warehouse',
|
||||
projectDir: tempDir,
|
||||
connection: { driver: 'sqlite', path: 'warehouse.db', readonly: false },
|
||||
sql: 'select 1',
|
||||
}),
|
||||
).rejects.toThrow('Local query execution requires connections.warehouse.readonly: true');
|
||||
|
||||
await expect(
|
||||
executor.execute({
|
||||
connectionId: 'warehouse',
|
||||
projectDir: tempDir,
|
||||
connection: { driver: 'sqlite', readonly: true },
|
||||
sql: 'select 1',
|
||||
}),
|
||||
).rejects.toThrow('Local SQLite execution requires connections.warehouse.path or connections.warehouse.url');
|
||||
});
|
||||
});
|
||||
94
packages/context/src/connections/sqlite-query-executor.ts
Normal file
94
packages/context/src/connections/sqlite-query-executor.ts
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
import { isAbsolute, resolve } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import Database from 'better-sqlite3';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { homedir } from 'node:os';
|
||||
import type {
|
||||
KloSqlQueryExecutionInput,
|
||||
KloSqlQueryExecutionResult,
|
||||
KloSqlQueryExecutorPort,
|
||||
} from './query-executor.js';
|
||||
import { normalizeQueryRows } from './query-executor.js';
|
||||
import { limitSqlForExecution } from './read-only-sql.js';
|
||||
|
||||
type SqliteConnectionConfig = Record<string, unknown> | undefined;
|
||||
|
||||
/**
 * Lower-cased string form of the connection's `driver`; `''` when the
 * connection or the driver is absent.
 */
function connectionDriver(input: KloSqlQueryExecutionInput): string {
  const rawDriver = input.connection?.driver;
  if (rawDriver == null) {
    return '';
  }
  return String(rawDriver).toLowerCase();
}
|
||||
|
||||
/**
 * Reads a string setting from the connection config.
 *
 * Returns `undefined` when the key is absent, not a string, or blank.
 * Otherwise the trimmed value is passed through `resolveStringReference`.
 */
function stringConfigValue(connection: SqliteConnectionConfig, key: string): string | undefined {
  const raw = connection?.[key];
  if (typeof raw !== 'string') {
    return undefined;
  }
  const trimmed = raw.trim();
  return trimmed.length > 0 ? resolveStringReference(key, trimmed) : undefined;
}
|
||||
|
||||
/**
 * Resolves `env:` and `file:` indirections in a connection setting.
 *
 * - `env:NAME` returns `process.env.NAME`, or `''` when it is unset.
 * - `file:PATH` returns the trimmed contents of the file at `PATH`. This is
 *   skipped for the `url` key, where a `file:` value is a database URL rather
 *   than a pointer file.
 * - Anything else is returned unchanged.
 *
 * A leading `~` or `~/` in `PATH` is expanded to the user's home directory.
 * The prefix is stripped together with its slash: joining the home directory
 * with a remainder that still starts with `/` would resolve to that absolute
 * path and drop the home directory. Other `~name` forms are used literally.
 *
 * @param key - Config key being resolved (`url` disables `file:` handling).
 * @param value - Trimmed config value.
 * @throws Whatever `readFileSync` throws when the referenced file is unreadable.
 */
function resolveStringReference(key: string, value: string): string {
  if (value.startsWith('env:')) {
    return process.env[value.slice('env:'.length)] ?? '';
  }
  if (key !== 'url' && value.startsWith('file:')) {
    const rawPath = value.slice('file:'.length);
    const isHomeRelative = rawPath === '~' || rawPath.startsWith('~/');
    const path = isHomeRelative ? resolve(homedir(), rawPath.slice(2)) : rawPath;
    return readFileSync(path, 'utf-8').trim();
  }
  return value;
}
|
||||
|
||||
/**
 * Extracts a filesystem path from a SQLite connection URL.
 *
 * `file:` URLs are converted with `fileURLToPath`. `sqlite:` URLs yield their
 * percent-decoded pathname when it is non-empty. Every other input, including
 * a `sqlite:` URL with an empty pathname, is returned unchanged.
 */
function sqlitePathFromUrl(url: string): string {
  if (url.startsWith('file:')) {
    return fileURLToPath(url);
  }
  if (!url.startsWith('sqlite:')) {
    return url;
  }
  const { pathname } = new URL(url);
  return pathname.length > 0 ? decodeURIComponent(pathname) : url;
}
|
||||
|
||||
/**
 * Determines the absolute SQLite database path for a query execution input.
 *
 * Requires the connection's driver to be `sqlite`/`sqlite3` and `readonly` to
 * be exactly `true`. The path comes from `connection.path` when it resolves to
 * a non-empty string, otherwise from `connection.url`. An empty `path`, for
 * example an `env:` reference to an unset variable, no longer shadows a usable
 * `url`. Relative results are resolved against `input.projectDir`, or the
 * current working directory when it is not set.
 *
 * @throws Error when the driver is not SQLite, the connection is not marked
 *   read-only, or neither `path` nor `url` resolves to a non-empty value.
 */
export function sqliteDatabasePathFromConnection(input: KloSqlQueryExecutionInput): string {
  const driver = connectionDriver(input);
  if (driver !== 'sqlite' && driver !== 'sqlite3') {
    throw new Error(`Local SQLite execution cannot run driver "${input.connection?.driver ?? 'unknown'}".`);
  }
  if (input.connection?.readonly !== true) {
    throw new Error(`Local query execution requires connections.${input.connectionId}.readonly: true.`);
  }

  const pathValue = stringConfigValue(input.connection, 'path');
  const urlValue = stringConfigValue(input.connection, 'url');

  let candidate: string;
  if (pathValue) {
    candidate = pathValue;
  } else if (urlValue) {
    // Reached for a missing path and for one that resolved to an empty string.
    candidate = sqlitePathFromUrl(urlValue);
  } else {
    throw new Error(
      `Local SQLite execution requires connections.${input.connectionId}.path or connections.${input.connectionId}.url.`,
    );
  }
  return isAbsolute(candidate) ? candidate : resolve(input.projectDir ?? process.cwd(), candidate);
}
|
||||
|
||||
/**
 * Creates a query executor that runs read-only SQL against a SQLite file.
 *
 * `execute` validates and wraps the SQL with `limitSqlForExecution` before
 * resolving the database path, then opens the file with `readonly: true` and
 * `fileMustExist: true`. The statement is switched to raw mode so each row is
 * returned as a positional array aligned with `statement.columns()`. In the
 * default object mode, columns sharing a name (for example `a.id, b.id`)
 * collapse into one property and the row ends up shorter than `headers`.
 *
 * @returns An executor whose result reports `command: 'SELECT'` and uses the
 *   number of returned rows for both `totalRows` and `rowCount`. The database
 *   handle is closed whether or not the query succeeds.
 */
export function createSqliteQueryExecutor(): KloSqlQueryExecutorPort {
  return {
    async execute(input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult> {
      const sql = limitSqlForExecution(input.sql, input.maxRows);
      const dbPath = sqliteDatabasePathFromConnection(input);
      const db = new Database(dbPath, { readonly: true, fileMustExist: true });
      try {
        // Raw mode keeps one value per result column, even when names repeat.
        const statement = db.prepare(sql).raw(true);
        const rows = statement.all() as unknown[];
        return {
          headers: statement.columns().map((column) => column.name),
          rows: normalizeQueryRows(rows),
          totalRows: rows.length,
          command: 'SELECT',
          rowCount: rows.length,
        };
      } finally {
        db.close();
      }
    },
  };
}
|
||||
34
packages/context/src/core/config-reference.test.ts
Normal file
34
packages/context/src/core/config-reference.test.ts
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
import { mkdir, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { resolveKloConfigReference, resolveKloHomePath } from './config-reference.js';
|
||||
|
||||
describe('KLO config references', () => {
|
||||
it('resolves env references without returning empty values', () => {
|
||||
expect(resolveKloConfigReference('env:AI_GATEWAY_API_KEY', { AI_GATEWAY_API_KEY: ' gateway-key ' })).toBe(
|
||||
'gateway-key',
|
||||
);
|
||||
expect(resolveKloConfigReference('env:AI_GATEWAY_API_KEY', { AI_GATEWAY_API_KEY: ' ' })).toBeUndefined();
|
||||
expect(resolveKloConfigReference('env:AI_GATEWAY_API_KEY', {})).toBeUndefined();
|
||||
});
|
||||
|
||||
it('resolves file references and trims file content', async () => {
|
||||
const dir = join(tmpdir(), `klo-config-reference-${process.pid}`);
|
||||
await mkdir(dir, { recursive: true });
|
||||
const keyPath = join(dir, 'gateway-key.txt');
|
||||
await writeFile(keyPath, 'file-gateway-key\n', 'utf8');
|
||||
|
||||
expect(resolveKloConfigReference(`file:${keyPath}`, {})).toBe('file-gateway-key');
|
||||
});
|
||||
|
||||
it('returns literal values unchanged after trimming blank-only values', () => {
|
||||
expect(resolveKloConfigReference('provider/model', {})).toBe('provider/model');
|
||||
expect(resolveKloConfigReference(' ', {})).toBeUndefined();
|
||||
expect(resolveKloConfigReference(undefined, {})).toBeUndefined();
|
||||
});
|
||||
|
||||
it('resolves home-prefixed paths', () => {
|
||||
expect(resolveKloHomePath('~/klo/key.txt')).toContain('/klo/key.txt');
|
||||
});
|
||||
});
|
||||
36
packages/context/src/core/config-reference.ts
Normal file
36
packages/context/src/core/config-reference.ts
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
import { readFileSync } from 'node:fs';
|
||||
import { homedir } from 'node:os';
|
||||
import { resolve } from 'node:path';
|
||||
|
||||
/**
 * Resolves a path to an absolute one, expanding a leading `~` or `~/` to the
 * user's home directory. Other paths go through `path.resolve` unchanged.
 */
export function resolveKloHomePath(path: string): string {
  if (path === '~') {
    return homedir();
  }
  return path.startsWith('~/') ? resolve(homedir(), path.slice(2)) : resolve(path);
}
|
||||
|
||||
/**
 * Resolves a config value that may be a literal, an `env:NAME` reference, or a
 * `file:PATH` reference.
 *
 * - `env:NAME` reads `NAME` (trimmed) from `env`.
 * - `file:PATH` reads the file at `PATH` (trimmed, home-expanded) synchronously.
 * - Anything else is treated as a literal.
 *
 * The resolved text is trimmed in every case, and `undefined` is returned
 * instead of an empty result.
 *
 * @param value - Raw config value; `undefined` or `''` yields `undefined`.
 * @param env - Environment map consulted for `env:` references.
 */
export function resolveKloConfigReference(value: string | undefined, env: NodeJS.ProcessEnv): string | undefined {
  if (!value) {
    return undefined;
  }

  // Trims the text and maps missing or blank input to undefined.
  const nonBlank = (text: string | undefined): string | undefined => {
    const trimmed = text?.trim();
    return trimmed ? trimmed : undefined;
  };

  if (value.startsWith('env:')) {
    const envName = value.slice('env:'.length).trim();
    return nonBlank(env[envName]);
  }

  if (value.startsWith('file:')) {
    const filePath = resolveKloHomePath(value.slice('file:'.length).trim());
    return nonBlank(readFileSync(filePath, 'utf8'));
  }

  return nonBlank(value);
}
|
||||
42
packages/context/src/core/config.ts
Normal file
42
packages/context/src/core/config.ts
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
/**
 * Optional filesystem locations used by klo. Every field may be omitted; the
 * resolve* helpers in this file supply the fallbacks.
 */
export interface KloStorageConfig {
|
||||
/** Explicit config directory. When unset, resolveConfigDir() derives `${homeDir}/klo/config`. */
configDir?: string;
|
||||
/** Base directory for derived paths. resolveConfigDir()/resolveWorktreesDir() fall back to '/tmp' when unset. */
homeDir?: string;
|
||||
/** Explicit worktrees directory. When unset, resolveWorktreesDir() derives `${homeDir}/.worktrees`. */
worktreesDir?: string;
|
||||
}
|
||||
|
||||
/**
 * Git identity and bootstrap-commit settings.
 * NOTE(review): the defaults quoted below are the ones applied by GitService.initialize().
 */
export interface KloGitConfig {
|
||||
/** Written to the repository's `user.name` when GitService initializes a new repo. */
userName: string;
|
||||
/** Written to the repository's `user.email` when GitService initializes a new repo. */
userEmail: string;
|
||||
/** Message of the empty bootstrap commit; GitService defaults to 'Initialize klo project repository'. */
bootstrapMessage?: string;
|
||||
/** Author name of the bootstrap commit; GitService defaults to 'klo system'. */
bootstrapAuthor?: string;
|
||||
/** Author email of the bootstrap commit; GitService defaults to 'system@klo.local'. */
bootstrapAuthorEmail?: string;
|
||||
}
|
||||
|
||||
/** Top-level core configuration: storage locations plus git settings. */
export interface KloCoreConfig {
|
||||
/** Directory overrides; all optional. */
storage: KloStorageConfig;
|
||||
/** Git identity and bootstrap-commit settings. */
git: KloGitConfig;
|
||||
}
|
||||
|
||||
/**
 * Minimal logging contract with four levels. `noopLogger` in this file is an
 * implementation that discards everything.
 */
export interface KloLogger {
|
||||
/** Logs a debug-level message. */
debug(message: string): void;
|
||||
/** Logs a general informational message. */
log(message: string): void;
|
||||
/** Logs a warning. */
warn(message: string): void;
|
||||
/** Logs an error message, optionally with the underlying error value. */
error(message: string, error?: unknown): void;
|
||||
}
|
||||
|
||||
/** A KloLogger whose methods accept any call and do nothing; each returns `undefined`. */
export const noopLogger: KloLogger = {
  debug() {
    return undefined;
  },
  log() {
    return undefined;
  },
  warn() {
    return undefined;
  },
  error() {
    return undefined;
  },
};
|
||||
|
||||
/**
 * Picks the directory that holds klo configuration.
 *
 * @param config - Core configuration; only `config.storage` is read.
 * @returns `storage.configDir` when set, otherwise `<homeDir>/klo/config`, where
 *   `homeDir` itself falls back to '/tmp'.
 */
export function resolveConfigDir(config: KloCoreConfig): string {
  const { storage } = config;

  // An explicit override always wins.
  if (storage.configDir != null) {
    return storage.configDir;
  }

  const baseDir = storage.homeDir != null ? storage.homeDir : '/tmp';
  return [baseDir, 'klo', 'config'].join('/');
}
|
||||
|
||||
/**
 * Picks the directory under which git worktrees are placed.
 *
 * @param config - Core configuration; only `config.storage` is read.
 * @returns `storage.worktreesDir` when set, otherwise `<homeDir>/.worktrees`, where
 *   `homeDir` itself falls back to '/tmp'.
 */
export function resolveWorktreesDir(config: KloCoreConfig): string {
  const { storage } = config;

  // An explicit override always wins.
  if (storage.worktreesDir != null) {
    return storage.worktreesDir;
  }

  const baseDir = storage.homeDir != null ? storage.homeDir : '/tmp';
  return [baseDir, '.worktrees'].join('/');
}
|
||||
5
packages/context/src/core/embedding.ts
Normal file
5
packages/context/src/core/embedding.ts
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
/** Port (adapter contract) for a text-embedding backend. */
export interface KloEmbeddingPort {
|
||||
/** NOTE(review): presumably the largest number of texts accepted per bulk call — confirm against implementations. */
maxBatchSize: number;
|
||||
/** Computes the embedding vector for a single text. */
computeEmbedding(text: string): Promise<number[]>;
|
||||
/** Computes embeddings for several texts; presumably one vector per input, in input order — TODO confirm. */
computeEmbeddingsBulk(texts: string[]): Promise<number[][]>;
|
||||
}
|
||||
43
packages/context/src/core/file-store.ts
Normal file
43
packages/context/src/core/file-store.ts
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
/** Result of a write or delete on a KloFileStorePort. Implementations may attach extra fields. */
export interface KloFileWriteResult {
|
||||
/** Hash of the commit associated with the operation; may be absent or null. */
commitHash?: string | null;
|
||||
/** Open-ended: any additional implementation-specific fields. */
[key: string]: unknown;
|
||||
}
|
||||
|
||||
/** Result of KloFileStorePort.readFile. Implementations may attach extra fields. */
export interface KloFileReadResult {
|
||||
/** The file's text content. */
content: string;
|
||||
/** Open-ended: any additional implementation-specific fields. */
[key: string]: unknown;
|
||||
}
|
||||
|
||||
/** Result of KloFileStorePort.listFiles. */
export interface KloFileListResult {
|
||||
/** Listed file paths. NOTE(review): whether these are relative or absolute is not visible here — confirm. */
files: string[];
|
||||
}
|
||||
|
||||
/** One entry in a file's history. All named fields are optional; extra fields are allowed. */
export interface KloFileHistoryEntry {
|
||||
/** Commit identifier for this entry, when available. */
sha?: string;
|
||||
/** Commit message, when available. */
message?: string;
|
||||
/** Author of the change, when available. */
author?: string;
|
||||
/** When the change was made; either a string or a Date depending on the implementation. */
date?: string | Date;
|
||||
/** Open-ended: any additional implementation-specific fields. */
[key: string]: unknown;
|
||||
}
|
||||
|
||||
/**
 * Port (adapter contract) for a file store whose writes and deletes carry author and
 * commit-message metadata.
 *
 * @typeParam TSelf - The type returned by forWorktree(); defaults to `unknown`.
 */
export interface KloFileStorePort<TSelf = unknown> {
|
||||
/**
 * Writes `content` to `path`, attributing the change to `author` / `authorEmail`
 * with `commitMessage`.
 * NOTE(review): `options.skipLock` presumably bypasses a store-level lock — confirm against implementations.
 */
writeFile(
|
||||
path: string,
|
||||
content: string,
|
||||
author: string,
|
||||
authorEmail: string,
|
||||
commitMessage: string,
|
||||
options?: { skipLock?: boolean },
|
||||
): Promise<KloFileWriteResult>;
|
||||
/** Reads the file at `path`. */
readFile(path: string): Promise<KloFileReadResult>;
|
||||
/**
 * Deletes the file at `path` with the given attribution. May resolve to `null`.
 * NOTE(review): `null` presumably means there was nothing to delete — confirm.
 */
deleteFile(
|
||||
path: string,
|
||||
author: string,
|
||||
authorEmail: string,
|
||||
commitMessage: string,
|
||||
options?: { skipLock?: boolean },
|
||||
): Promise<KloFileWriteResult | null>;
|
||||
/** Lists files under `path`, optionally recursing into subdirectories. */
listFiles(path: string, recursive?: boolean): Promise<KloFileListResult>;
|
||||
/** Returns the history of `path`; the return type is deliberately loose (`unknown` is allowed). */
getFileHistory(path: string): Promise<KloFileHistoryEntry[] | unknown>;
|
||||
/** Returns a store of type TSelf for `workdir`; presumably scoped to that worktree — TODO confirm. */
forWorktree(workdir: string): TSelf;
|
||||
}
|
||||
29
packages/context/src/core/git-env.ts
Normal file
29
packages/context/src/core/git-env.ts
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
import { simpleGit, type SimpleGit } from 'simple-git';
|
||||
|
||||
// Environment variable names that sanitizedGitEnv() removes before the environment is
// handed to simple-git.
// NOTE(review): judging by the name, these are variables git sets while running hooks,
// plus editor/pager variables — confirm the intent with the author.
const GIT_HOOK_ENV_KEYS = [
|
||||
'GIT_ALTERNATE_OBJECT_DIRECTORIES',
|
||||
'GIT_DIR',
|
||||
'GIT_INDEX_FILE',
|
||||
'GIT_OBJECT_DIRECTORY',
|
||||
'GIT_PREFIX',
|
||||
'GIT_QUARANTINE_PATH',
|
||||
'GIT_WORK_TREE',
|
||||
'GIT_EDITOR',
|
||||
'GIT_EXEC_PATH',
|
||||
'GIT_PAGER',
|
||||
'PAGER',
|
||||
'VISUAL',
|
||||
'EDITOR',
|
||||
] as const;
|
||||
|
||||
/**
 * Returns a copy of `env` without any of the variables named in GIT_HOOK_ENV_KEYS.
 * The input object is never modified.
 *
 * @param env - Environment to copy; defaults to `process.env`.
 * @returns A new environment object with the listed keys removed.
 */
function sanitizedGitEnv(env: NodeJS.ProcessEnv = process.env): NodeJS.ProcessEnv {
  // Shallow copy first so deletions never touch the caller's object.
  const cleaned: NodeJS.ProcessEnv = Object.assign({}, env);
  GIT_HOOK_ENV_KEYS.forEach((name) => {
    Reflect.deleteProperty(cleaned, name);
  });
  return cleaned;
}
|
||||
|
||||
/**
 * Creates a simple-git client rooted at `baseDir` whose child processes run with the
 * sanitized environment produced by sanitizedGitEnv().
 *
 * @param baseDir - Directory the git client operates in.
 * @returns The configured SimpleGit instance.
 */
export function createSimpleGit(baseDir: string): SimpleGit {
  const client = simpleGit({ baseDir });
  const cleanEnv = sanitizedGitEnv();
  return client.env(cleanEnv);
}
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import type { SimpleGit } from 'simple-git';
|
||||
import type { KloCoreConfig } from './config.js';
|
||||
import { createSimpleGit } from './git-env.js';
|
||||
import { GitService } from './git.service.js';
|
||||
|
||||
// Exercises GitService.assertWorktreeClean against a real repository created in a temp
// directory. The service is constructed without onModuleInit(); the already-initialized
// simple-git handle and working directory are injected into its private fields instead.
describe('GitService.assertWorktreeClean', () => {
  let workdir: string;
  let git: SimpleGit;
  let gitService: GitService;

  beforeEach(async () => {
    workdir = await mkdtemp(join(tmpdir(), 'gitsvc-clean-'));
    git = createSimpleGit(workdir);
    await git.init();
    await git.addConfig('user.email', 't@test');
    await git.addConfig('user.name', 'Test');
    await writeFile(join(workdir, 'init'), 'init');
    await git.add('.');
    await git.commit('init');

    const coreConfig: KloCoreConfig = {
      storage: { configDir: workdir, homeDir: workdir },
      git: { userName: 'Test', userEmail: 't@test' },
    };
    gitService = new GitService(coreConfig);
    // Private fields are overwritten on purpose so the service operates on the repo built above.
    (gitService as any).git = git;
    (gitService as any).configDir = workdir;
  });

  afterEach(async () => rm(workdir, { recursive: true, force: true }));

  it('does not throw on a clean worktree', async () => {
    await expect(gitService.assertWorktreeClean()).resolves.toBeUndefined();
  });

  // Each of these files under .git marks an operation that was started but never finished.
  it.each(['MERGE_HEAD', 'CHERRY_PICK_HEAD', 'REVERT_HEAD'])('throws when %s exists', async (marker) => {
    await writeFile(join(workdir, '.git', marker), 'deadbeef\n');
    await expect(gitService.assertWorktreeClean()).rejects.toThrow(new RegExp(marker));
  });

  it('throws when sequencer/todo exists (interrupted multi-commit revert/cherry-pick)', async () => {
    await mkdir(join(workdir, '.git', 'sequencer'), { recursive: true });
    await writeFile(join(workdir, '.git', 'sequencer', 'todo'), 'pick deadbeef foo\n');
    await expect(gitService.assertWorktreeClean()).rejects.toThrow(/sequencer/);
  });

  it('throws when the index has unmerged paths', async () => {
    // The initial branch name depends on the machine's init.defaultBranch, so record it
    // instead of guessing between "master" and "main".
    const baseBranch = (await git.revparse(['--abbrev-ref', 'HEAD'])).trim();

    // Creates `branch` from the current HEAD and commits a branch-specific version of `shared`.
    const commitSharedOn = async (branch: string, content: string): Promise<void> => {
      await git.checkoutLocalBranch(branch);
      await writeFile(join(workdir, 'shared'), content);
      await git.add('.');
      await git.commit(branch);
    };

    await commitSharedOn('a', 'A version');
    await git.checkout(baseBranch);
    await commitSharedOn('b', 'B version');

    // The merge is expected to stop on a conflict; its rejection is deliberately ignored.
    await git.raw(['merge', 'a']).catch(() => undefined);

    await expect(gitService.assertWorktreeClean()).rejects.toThrow();
  });
});
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { mkdir, mkdtemp, readdir, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import type { SimpleGit } from 'simple-git';
|
||||
import type { KloCoreConfig } from './config.js';
|
||||
import { createSimpleGit } from './git-env.js';
|
||||
import { GitService } from './git.service.js';
|
||||
|
||||
// Exercises GitService.deleteDirectories against a real repository created in a temp
// directory. The service is constructed without onModuleInit(); the already-initialized
// simple-git handle and working directory are injected into its private fields instead.
describe('GitService.deleteDirectories', () => {
  let workdir: string;
  let git: SimpleGit;
  let gitService: GitService;

  /** Creates `<workdir>/<dir>/<file>` with `content`; staging and committing is left to the caller. */
  const seedDirectory = async (dir: string, file: string, content: string): Promise<void> => {
    await mkdir(join(workdir, dir), { recursive: true });
    await writeFile(join(workdir, dir, file), content);
  };

  beforeEach(async () => {
    workdir = await mkdtemp(join(tmpdir(), 'gitsvc-dd-'));
    git = createSimpleGit(workdir);
    await git.init();
    await git.addConfig('user.email', 't@test');
    await git.addConfig('user.name', 'Test');
    await writeFile(join(workdir, 'keep'), 'k');
    await git.add('.');
    await git.commit('init');

    const coreConfig: KloCoreConfig = {
      storage: { configDir: workdir, homeDir: workdir },
      git: { userName: 'Test', userEmail: 't@test' },
    };
    gitService = new GitService(coreConfig);
    // Private fields are overwritten on purpose so the service operates on the repo built above.
    (gitService as any).git = git;
    (gitService as any).configDir = workdir;
  });

  afterEach(async () => rm(workdir, { recursive: true, force: true }));

  it('removes multiple directories in a single commit', async () => {
    for (const name of ['a', 'b', 'c']) {
      await seedDirectory(name, 'f.txt', name);
    }
    await git.add('.');
    await git.commit('seed 3 dirs');
    const beforeCommits = (await git.log()).total;

    const result = await gitService.deleteDirectories(['a', 'b'], 'gc: drop a+b', 'System User', 'system@example.com');
    expect(result.commitHash).toBeTruthy();

    const entries = await readdir(workdir);
    expect(entries).not.toContain('a');
    expect(entries).not.toContain('b');
    expect(entries).toContain('c');

    // Both removals must land in exactly one new commit.
    const afterCommits = (await git.log()).total;
    expect(afterCommits).toBe(beforeCommits + 1);
  });

  // Title corrected: the no-op result carries an empty-string hash, not null.
  it('no-ops and returns an empty hash when the input list is empty', async () => {
    const result = await gitService.deleteDirectories([], 'empty', 'X', 'x@example.com');
    expect(result.commitHash).toBe('');
    expect(result.created).toBe(false);
  });

  it('ignores paths that have already been deleted — commits only the remaining ones', async () => {
    await seedDirectory('stale', 'x', 'x');
    await git.add('.');
    await git.commit('seed stale');

    // 'missing' never existed in the repo; only 'stale' can actually be removed.
    const result = await gitService.deleteDirectories(
      ['stale', 'missing'],
      'gc: drop stale + missing',
      'System User',
      'system@example.com',
    );

    expect(result.commitHash).toBeTruthy();
    const entries = await readdir(workdir);
    expect(entries).not.toContain('stale');
  });
});
|
||||
56
packages/context/src/core/git.service.reset-hard.test.ts
Normal file
56
packages/context/src/core/git.service.reset-hard.test.ts
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import type { SimpleGit } from 'simple-git';
|
||||
import type { KloCoreConfig } from './config.js';
|
||||
import { createSimpleGit } from './git-env.js';
|
||||
import { GitService } from './git.service.js';
|
||||
|
||||
// Exercises GitService.resetHardTo against a real repository created in a temp directory.
// The service is constructed without onModuleInit(); the already-initialized simple-git
// handle and working directory are injected into its private fields instead.
describe('GitService.resetHardTo', () => {
  let workdir: string;
  let git: SimpleGit;
  let gitService: GitService;

  /** Current HEAD SHA of the test repository. */
  const headSha = async (): Promise<string> => {
    const raw = await git.revparse(['HEAD']);
    return raw.trim();
  };

  /** Content of `<workdir>/<name>`, or null when the file cannot be read. */
  const readOrNull = (name: string): Promise<string | null> =>
    readFile(join(workdir, name), 'utf-8').catch(() => null);

  /** Writes `<workdir>/<name>`, stages everything, and commits using `name` as the message. */
  const commitNewFile = async (name: string, content: string): Promise<void> => {
    await writeFile(join(workdir, name), content);
    await git.add('.');
    await git.commit(name);
  };

  beforeEach(async () => {
    workdir = await mkdtemp(join(tmpdir(), 'gitsvc-reset-'));
    git = createSimpleGit(workdir);
    await git.init();
    await git.addConfig('user.email', 't@test');
    await git.addConfig('user.name', 'Test');
    await writeFile(join(workdir, 'init'), 'init');
    await git.add('.');
    await git.commit('init');

    const coreConfig: KloCoreConfig = {
      storage: { configDir: workdir, homeDir: workdir },
      git: { userName: 'Test', userEmail: 't@test' },
    };
    gitService = new GitService(coreConfig);
    // Private fields are overwritten on purpose so the service operates on the repo built above.
    (gitService as any).git = git;
    (gitService as any).configDir = workdir;
  });

  afterEach(async () => rm(workdir, { recursive: true, force: true }));

  it('rewinds HEAD to the target SHA, removing later commits and their files', async () => {
    const baseSha = await headSha();
    await commitNewFile('a', 'a1');
    await commitNewFile('b', 'b1');

    await gitService.resetHardTo(baseSha);

    expect(await headSha()).toBe(baseSha);
    // Files introduced by the discarded commits must be gone from the working tree.
    expect(await readOrNull('a')).toBeNull();
    expect(await readOrNull('b')).toBeNull();
  });

  it('is a no-op when target SHA equals current HEAD', async () => {
    const sha = await headSha();
    await gitService.resetHardTo(sha);
    expect(await headSha()).toBe(sha);
  });
});
|
||||
358
packages/context/src/core/git.service.test.ts
Normal file
358
packages/context/src/core/git.service.test.ts
Normal file
|
|
@ -0,0 +1,358 @@
|
|||
import { mkdtemp, realpath, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import type { KloCoreConfig } from './config.js';
|
||||
import { GitService } from './git.service.js';
|
||||
|
||||
// These tests drive a real git repo inside a temp directory — simple-git shells out to the
// system `git` binary. They are fast enough to run as unit tests and catch real issues that
// would be invisible with mocked git.
describe('GitService', () => {
  let service: GitService;
  let tempDir: string;

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'git-service-spec-'));

    const coreConfig: KloCoreConfig = {
      storage: { configDir: tempDir, homeDir: tempDir },
      git: {
        userName: 'Test User',
        userEmail: 'test@example.com',
        bootstrapMessage: 'Initialize test config repo',
        bootstrapAuthor: 'test-system',
        bootstrapAuthorEmail: 'system@example.com',
      },
    };

    service = new GitService(coreConfig);
    await service.onModuleInit();
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  /** Writes `content` to `filePath` inside the repo and commits it as the "Test" author. */
  const writeAndCommit = async (filePath: string, content: string, message = 'msg') => {
    await writeFile(join(tempDir, filePath), content, 'utf-8');
    return service.commitFile(filePath, message, 'Test', 'test@example.com');
  };

  /** Direct handle on the service's private simple-git instance, used for assertions only. */
  const rawGit = () => (service as unknown as { git: import('simple-git').SimpleGit }).git;

  // macOS canonicalizes tmp paths (/var/folders → /private/var/folders) when git
  // returns them from `worktree list`. Resolve through realpath() before comparing.
  const canonicalSiblingPath = async (suffix: string): Promise<string> => {
    const parent = await realpath(join(tempDir, '..'));
    return join(parent, `wt-${Date.now()}-${suffix}`);
  };

  /**
   * Adds a worktree for `branch` at `startSha` in a directory next to `tempDir`, runs
   * `body`, and always tears the worktree down afterwards. The worktree lives outside
   * `tempDir`, so afterEach() would not remove it; cleaning up in `finally` keeps a
   * failing assertion from leaking it.
   */
  const withWorktree = async (
    suffix: string,
    branch: string,
    startSha: string,
    body: (wtDir: string) => Promise<void>,
  ): Promise<void> => {
    const wtDir = await canonicalSiblingPath(suffix);
    try {
      await service.addWorktree(wtDir, branch, startSha);
      await body(wtDir);
    } finally {
      // Best-effort: the body may already have removed the worktree itself.
      await service.removeWorktree(wtDir).catch(() => undefined);
      await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
    }
  };

  describe('cold-start bootstrap commit', () => {
    it('writes an empty commit on init so HEAD always resolves', async () => {
      // beforeEach already ran onModuleInit() against an empty temp dir.
      const head = await service.revParseHead();
      expect(head).toMatch(/^[0-9a-f]{40}$/);
    });

    it('does not double-commit when re-initialized', async () => {
      const before = await service.revParseHead();
      await service.onModuleInit();
      const after = await service.revParseHead();
      expect(after).toBe(before);
    });
  });

  describe('commitFile `created` flag', () => {
    it('is true for a real commit', async () => {
      const info = await writeAndCommit('a.md', '# Hello');
      expect(info.created).toBe(true);
    });

    it('is false on a no-op write (content unchanged)', async () => {
      await writeAndCommit('a.md', '# Hello');
      const second = await writeAndCommit('a.md', '# Hello', 'unused');
      expect(second.created).toBe(false);
    });
  });

  describe('addNote / getNote', () => {
    it('attaches a note and reads it back', async () => {
      const info = await writeAndCommit('a.md', '# Hello');
      await service.addNote(info.commitHash, 'Rich message from LLM');
      expect(await service.getNote(info.commitHash)).toBe('Rich message from LLM');
    });

    it('returns undefined when no note exists', async () => {
      const info = await writeAndCommit('a.md', '# Hello');
      expect(await service.getNote(info.commitHash)).toBeUndefined();
    });

    it('overwrites an existing note (idempotent retries)', async () => {
      const info = await writeAndCommit('a.md', '# Hello');
      await service.addNote(info.commitHash, 'First');
      await service.addNote(info.commitHash, 'Second');
      expect(await service.getNote(info.commitHash)).toBe('Second');
    });

    it('skips empty/whitespace messages silently', async () => {
      const info = await writeAndCommit('a.md', '# Hello');
      await service.addNote(info.commitHash, '   ');
      expect(await service.getNote(info.commitHash)).toBeUndefined();
    });
  });

  describe('getFileHistory', () => {
    it('surfaces enhancedMessage when a note is present', async () => {
      const info = await writeAndCommit('a.md', '# Hello');
      await service.addNote(info.commitHash, 'Note body');

      const history = await service.getFileHistory('a.md');
      expect(history[0]?.enhancedMessage).toBe('Note body');
    });

    it('leaves enhancedMessage undefined when no note is attached', async () => {
      await writeAndCommit('a.md', '# Hello');
      const history = await service.getFileHistory('a.md');
      expect(history[0]?.enhancedMessage).toBeUndefined();
    });
  });

  describe('getCommitDiff', () => {
    it('returns the patch scoped to the requested path', async () => {
      const info = await writeAndCommit('a.md', '# Hello');
      const diff = await service.getCommitDiff(info.commitHash, 'a.md');
      expect(diff).toContain('diff --git');
      expect(diff).toContain('Hello');
    });

    it('handles the repository initial commit without throwing', async () => {
      const info = await writeAndCommit('first.md', 'first');
      await expect(service.getCommitDiff(info.commitHash, 'first.md')).resolves.toBeDefined();
    });
  });

  describe('squashTo', () => {
    /** Same as writeAndCommit, but authored by the identity that squashTo is called with. */
    const writeAsSystem = async (filePath: string, content: string, message = 'msg') => {
      await writeFile(join(tempDir, filePath), content, 'utf-8');
      return service.commitFile(filePath, message, 'System User', 'system@example.com');
    };

    it('collapses 3 commits after preHead into a single commit', async () => {
      const pre = await writeAsSystem('a.md', 'v1');
      const preHead = pre.commitHash;

      await writeAsSystem('b.md', 'b', 'add b');
      await writeAsSystem('c.md', 'c', 'add c');
      await writeAsSystem('a.md', 'v2', 'update a');

      const result = await service.squashTo(preHead, {
        message: 'Ingest: bundle 3 writes',
        author: 'System User',
        authorEmail: 'system@example.com',
      });

      expect(result.squashed).toBe(true);
      expect(result.squashedCount).toBe(3);
      expect(result.commitHash).toBeTruthy();
      expect(result.commitHash).not.toBe(preHead);
      const commitHash = result.commitHash;
      if (!commitHash) {
        throw new Error('Expected squash commit hash');
      }

      // The squashed commit should preserve the final tree state.
      const fileAtSquash = await service.getFileAtCommit('a.md', commitHash);
      expect(fileAtSquash).toBe('v2');
      const bAtSquash = await service.getFileAtCommit('b.md', commitHash);
      expect(bAtSquash).toBe('b');
    });

    it('is a no-op when preHead equals HEAD', async () => {
      const pre = await writeAsSystem('a.md', 'v1');

      const result = await service.squashTo(pre.commitHash, {
        message: 'nothing to squash',
        author: 'System User',
        authorEmail: 'system@example.com',
      });

      expect(result.squashed).toBe(false);
      expect(result.commitHash).toBe(pre.commitHash);
    });

    it('skips squash when a foreign-author commit sits between preHead and HEAD', async () => {
      const pre = await writeAsSystem('a.md', 'v1');
      const preHead = pre.commitHash;

      await writeAsSystem('b.md', 'from us', 'ours');
      // Foreign commit
      await writeAndCommit('c.md', 'from someone else', 'foreign');
      await writeAsSystem('d.md', 'ours again', 'ours 2');

      const result = await service.squashTo(preHead, {
        message: 'should be skipped',
        author: 'System User',
        authorEmail: 'system@example.com',
      });

      expect(result.squashed).toBe(false);
      expect(result.reason).toContain('foreign');
      expect(result.squashedCount).toBe(3);
    });

    it('returns cleanly when preHead is empty (no starting commit)', async () => {
      const result = await service.squashTo('', {
        message: 'would have squashed',
        author: 'System User',
        authorEmail: 'system@example.com',
      });

      expect(result.squashed).toBe(false);
      expect(result.commitHash).toBeNull();
    });
  });

  describe('worktree lifecycle', () => {
    it('addWorktree creates a branch + directory at the given startSha', async () => {
      const { commitHash } = await writeAndCommit('seed.md', 'seed');
      await withWorktree('add', 'session/alpha', commitHash, async (wtDir) => {
        const list = await service.listWorktrees();
        expect(list.find((e) => e.path === wtDir && e.branch === 'refs/heads/session/alpha')).toBeTruthy();
      });
    });

    it('removeWorktree detaches the worktree entry', async () => {
      const { commitHash } = await writeAndCommit('seed.md', 'seed');
      await withWorktree('rm', 'session/beta', commitHash, async (wtDir) => {
        await service.removeWorktree(wtDir);
        const list = await service.listWorktrees();
        expect(list.find((e) => e.path === wtDir)).toBeFalsy();
      });
    });

    it('deleteBranch removes a branch ref', async () => {
      const { commitHash } = await writeAndCommit('seed.md', 'seed');
      await withWorktree('br', 'session/gamma', commitHash, async (wtDir) => {
        // The worktree must be gone before its branch can be deleted.
        await service.removeWorktree(wtDir);
        await service.deleteBranch('session/gamma', true);
        const branches = await rawGit().branchLocal();
        expect(branches.all).not.toContain('session/gamma');
      });
    });
  });

  describe('forWorktree', () => {
    it('returns a GitService whose operations run inside the given worktree', async () => {
      const { commitHash } = await writeAndCommit('seed.md', 'seed');
      await withWorktree('fw', 'session/delta', commitHash, async (wtDir) => {
        const scoped = service.forWorktree(wtDir);
        expect(await scoped.revParseHead()).toBe(commitHash);
      });
    });
  });

  describe('squashMergeIntoMain', () => {
    it('merges a session branch as one commit on main, returning the new SHA + touched paths', async () => {
      const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
      await withWorktree('sm', 'session/happy', baseSha, async (wtDir) => {
        const scoped = service.forWorktree(wtDir);
        await writeFile(join(wtDir, 'a.yaml'), 'one: 1\n', 'utf-8');
        await scoped.commitFile('a.yaml', 'wip a', 'System User', 'system@example.com');
        await writeFile(join(wtDir, 'b.yaml'), 'two: 2\n', 'utf-8');
        await scoped.commitFile('b.yaml', 'wip b', 'System User', 'system@example.com');

        const result = await service.squashMergeIntoMain(
          'session/happy',
          'System User',
          'system@example.com',
          'Memory capture: 2 files [chat=abcd1234]',
        );

        expect(result.ok).toBe(true);
        if (!result.ok) {
          throw new Error('unreachable');
        }
        expect(result.squashSha).toMatch(/^[0-9a-f]{40}$/);
        expect(result.touchedPaths.sort()).toEqual(['a.yaml', 'b.yaml']);

        const mainHead = await service.revParseHead();
        expect(mainHead).toBe(result.squashSha);
        expect(mainHead).not.toBe(baseSha);
      });
    });

    it('returns ok with empty touchedPaths when the session branch has no diff vs main', async () => {
      const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
      await withWorktree('sm-empty', 'session/empty', baseSha, async () => {
        const result = await service.squashMergeIntoMain(
          'session/empty',
          'System User',
          'system@example.com',
          'should be a no-op',
        );

        expect(result.ok).toBe(true);
        if (!result.ok) {
          throw new Error('unreachable');
        }
        expect(result.touchedPaths).toEqual([]);
        expect(result.squashSha).toBe(baseSha);
      });
    });

    it('returns conflict=true and leaves main clean when session+main touched same file differently', async () => {
      await writeAndCommit('shared.yaml', 'base\n');
      const base = await service.revParseHead();
      if (!base) {
        throw new Error('no base head');
      }

      await withWorktree('conf', 'session/conf', base, async (wtDir) => {
        const scoped = service.forWorktree(wtDir);
        await writeFile(join(wtDir, 'shared.yaml'), 'session-edit\n', 'utf-8');
        await scoped.commitFile('shared.yaml', 'session edit', 'System User', 'system@example.com');

        // Main edits the same file a different way, after the session branched.
        await writeAndCommit('shared.yaml', 'main-edit\n');

        const result = await service.squashMergeIntoMain(
          'session/conf',
          'System User',
          'system@example.com',
          'Memory capture: 1 file [chat=dead1234]',
        );

        expect(result.ok).toBe(false);
        if (result.ok) {
          throw new Error('unreachable');
        }
        expect(result.conflict).toBe(true);
        expect(result.conflictPaths).toContain('shared.yaml');

        // A failed merge must not leave main mid-merge or dirty.
        const status = await rawGit().status();
        expect(status.isClean()).toBe(true);
      });
    });
  });
});
|
||||
855
packages/context/src/core/git.service.ts
Normal file
855
packages/context/src/core/git.service.ts
Normal file
|
|
@ -0,0 +1,855 @@
|
|||
import { promises as fs } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import type { SimpleGit } from 'simple-git';
|
||||
import { noopLogger, resolveConfigDir, type KloCoreConfig, type KloLogger } from './config.js';
|
||||
import { createSimpleGit } from './git-env.js';
|
||||
|
||||
/** Description of a single git commit, as returned by GitService.commitFile(). */
export interface GitCommitInfo {
|
||||
/** Full commit hash. */
commitHash: string;
|
||||
/** Abbreviated hash; commitFile() uses the first 8 characters of `commitHash`. */
shortHash: string;
|
||||
/** Commit message. */
message: string;
|
||||
/** Author name recorded on the commit. */
author: string;
|
||||
/** Author email recorded on the commit. */
authorEmail: string;
|
||||
/** Commit date exactly as reported by the git log entry. */
timestamp: string;
|
||||
/** The same date normalized with `new Date(...).toISOString()`. */
committedDate: string;
|
||||
/**
|
||||
* True if this call produced a new commit. False when the file was already up-to-date
|
||||
* and the returned info describes the pre-existing HEAD commit (no-op write).
|
||||
*/
|
||||
created: boolean;
|
||||
/** Async LLM-generated commit summary attached as a git note. Undefined if no note present. */
|
||||
enhancedMessage?: string;
|
||||
}
|
||||
|
||||
/** One git worktree, as listed by GitService.listWorktrees(). */
export interface WorktreeEntry {
|
||||
/** Filesystem path of the worktree. */
path: string;
|
||||
/** Full ref name of the checked-out branch (the tests compare against 'refs/heads/<name>'), or null. */
branch: string | null;
|
||||
/** NOTE(review): presumably the commit SHA the worktree's HEAD points at, or null — confirm. */
head: string | null;
|
||||
}
|
||||
|
||||
/**
 * Outcome of a squash merge, discriminated on `ok`:
 * - `ok: true`  — carries the resulting commit SHA and the paths that were touched.
 * - `ok: false` — the merge hit a conflict; carries the conflicting paths.
 */
export type SquashMergeResult =
|
||||
| { ok: true; squashSha: string; touchedPaths: string[] }
|
||||
| { ok: false; conflict: true; conflictPaths: string[] };
|
||||
|
||||
export class GitService {
|
||||
private readonly logger: KloLogger;
|
||||
private git!: SimpleGit;
|
||||
private configDir: string;
|
||||
|
||||
constructor(
|
||||
private readonly config: KloCoreConfig,
|
||||
logger?: KloLogger,
|
||||
) {
|
||||
this.logger = logger ?? noopLogger;
|
||||
this.configDir = resolveConfigDir(config);
|
||||
}
|
||||
|
||||
async onModuleInit(): Promise<void> {
|
||||
// Ensure config directory exists
|
||||
await fs.mkdir(this.configDir, { recursive: true });
|
||||
this.logger.log(`Config directory ensured at: ${this.configDir}`);
|
||||
|
||||
// Initialize simple-git
|
||||
this.git = createSimpleGit(this.configDir);
|
||||
|
||||
// Initialize git repository
|
||||
await this.initialize();
|
||||
}
|
||||
|
||||
/**
 * Make sure the bound directory is a git repository whose HEAD resolves to a commit.
 *
 * When `checkIsRepo()` reports no repository, one is initialised and `user.name` /
 * `user.email` are set from `config.git`. When HEAD does not resolve, an empty bootstrap
 * commit is written so callers (e.g. the memory-agent squash flow) can rely on
 * `revParseHead()` returning a SHA. Both steps are skipped when already satisfied, so the
 * method can run repeatedly.
 *
 * @throws Error starting with "Failed to initialize git repository", followed by the
 *   underlying failure message, when any git step fails.
 */
private async initialize(): Promise<void> {
  try {
    const isRepo = await this.git.checkIsRepo();

    if (!isRepo) {
      await this.git.init();
      const gitConfig = this.config.git;
      await this.git.addConfig('user.name', gitConfig.userName);
      await this.git.addConfig('user.email', gitConfig.userEmail);
      this.logger.log('Initialized git repository');
    }

    // `revParseHead()` yields '' when HEAD cannot be resolved, i.e. there is no commit yet.
    const head = await this.revParseHead();
    if (!head) {
      const bootstrapAuthor = this.config.git.bootstrapAuthor ?? 'klo system';
      const bootstrapEmail = this.config.git.bootstrapAuthorEmail ?? 'system@klo.local';
      await this.git.commit(this.config.git.bootstrapMessage ?? 'Initialize klo project repository', {
        // There is nothing staged in a fresh repo, so the commit has to be allowed to be empty.
        '--allow-empty': null,
        '--author': `${bootstrapAuthor} <${bootstrapEmail}>`,
      });
      this.logger.log('Wrote bootstrap commit to config repo');
    }
  } catch (error) {
    this.logger.error('Failed to initialize git repository', error);
    // Surface the cause in the message, matching how the other methods of this class rethrow.
    throw new Error(
      `Failed to initialize git repository: ${error instanceof Error ? error.message : String(error)}`,
    );
  }
}
|
||||
|
||||
/**
 * Stage `filePath` and commit it with the given message and author.
 *
 * After staging, the index is checked for staged paths. When nothing is staged the commit
 * step is skipped and the newest existing commit is described instead, flagged
 * `created: false`. Otherwise a commit is created and described with `created: true`.
 * Neither the staged-paths check nor the commit is limited to `filePath`; both act on
 * whatever the index holds.
 *
 * @param filePath Path handed to `git add`.
 * @param commitMessage Message for the new commit.
 * @param author Author name, combined with `authorEmail` into the `--author` value.
 * @param authorEmail Author email.
 * @returns Details of the newest commit in the log once the operation is done.
 * @throws Error starting with "Failed to commit file: " when any step fails.
 */
async commitFile(
  filePath: string,
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  try {
    await this.git.add(filePath);

    const stagedPaths = (await this.git.diff(['--cached', '--name-only'])).trim();
    const created = stagedPaths.length > 0;

    if (created) {
      const outcome = await this.git.commit(commitMessage, {
        '--author': `${author} <${authorEmail}>`,
      });
      if (!outcome.commit) {
        throw new Error('No commit hash returned');
      }
    } else {
      // Nothing staged: the file already matches what is committed.
      this.logger.debug(`No changes to commit for ${filePath}, file already up to date`);
    }

    // Either way, report on the newest commit in the log.
    const { latest } = await this.git.log({ maxCount: 1 });
    if (!latest) {
      throw new Error('Failed to retrieve commit details');
    }

    const { hash, date } = latest;
    return {
      commitHash: hash,
      shortHash: hash.substring(0, 8),
      message: latest.message,
      author: latest.author_name,
      authorEmail: latest.author_email,
      timestamp: date,
      committedDate: new Date(date).toISOString(),
      created,
    };
  } catch (error) {
    this.logger.error(`Failed to commit file ${filePath}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to commit file: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Batch variant of `commitFile`: stage every path in `filePaths` (one `git add` per path, in
 * order) and record them in a single commit. Used by the SL capture agent to commit all of
 * its edits at once.
 *
 * When nothing ends up staged, no commit is made and the newest existing commit is returned
 * with `created: false`.
 *
 * @param filePaths Paths to stage.
 * @param commitMessage Message for the new commit.
 * @param author Author name, combined with `authorEmail` into the `--author` value.
 * @param authorEmail Author email.
 * @returns Details of the newest commit in the log once the operation is done.
 * @throws Error starting with "Failed to batch commit: " when any step fails.
 */
async commitFiles(
  filePaths: string[],
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  try {
    for (const target of filePaths) {
      await this.git.add(target);
    }

    const stagedPaths = (await this.git.diff(['--cached', '--name-only'])).trim();
    const created = stagedPaths.length > 0;

    if (created) {
      const outcome = await this.git.commit(commitMessage, {
        '--author': `${author} <${authorEmail}>`,
      });
      if (!outcome.commit) {
        throw new Error('No commit hash returned');
      }
    } else {
      this.logger.debug(`No changes to commit for ${filePaths.length} file(s), already up to date`);
    }

    const { latest } = await this.git.log({ maxCount: 1 });
    if (!latest) {
      throw new Error('Failed to retrieve commit details');
    }

    const { hash, date } = latest;
    return {
      commitHash: hash,
      shortHash: hash.substring(0, 8),
      message: latest.message,
      author: latest.author_name,
      authorEmail: latest.author_email,
      timestamp: date,
      committedDate: new Date(date).toISOString(),
      created,
    };
  } catch (error) {
    this.logger.error(`Failed to batch commit ${filePaths.length} file(s)`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to batch commit: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Discard working-tree changes for `filePaths` via `git checkout -- <paths>`. Used to roll
 * back dirty files when validation fails.
 *
 * Best effort: an empty list is a no-op, and a failing checkout is logged as a warning
 * rather than thrown.
 */
async checkoutFiles(filePaths: string[]): Promise<void> {
  if (filePaths.length > 0) {
    try {
      await this.git.checkout(['--', ...filePaths]);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.warn(`Failed to checkout ${filePaths.length} file(s): ${reason}`);
    }
  }
}
|
||||
|
||||
/**
 * Return the content of `filePath` as recorded in `commitHash`, i.e. the output of
 * `git show <sha>:<path>`. The content comes from the commit rather than from the file
 * currently on disk, so concurrent working-tree edits do not affect it.
 *
 * @throws Error starting with "Failed to read file at commit: " when git cannot show it.
 */
async getFileAtCommit(filePath: string, commitHash: string): Promise<string> {
  const objectSpec = `${commitHash}:${filePath}`;
  try {
    const content = await this.git.show([objectSpec]);
    return content;
  } catch (error) {
    this.logger.error(`Failed to read ${filePath} at ${commitHash}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to read file at commit: ${reason}`);
  }
}
|
||||
|
||||
/**
 * List the commits that touched `filePath`, newest first as returned by `git log`, each
 * enriched with its git note (if any) as `enhancedMessage`.
 *
 * @param filePath Path passed to `git log` as the file filter.
 * @param limit Maximum number of commits to return (default 50).
 * @returns One entry per commit; `created` is always `true` for history entries.
 * @throws Error starting with "Failed to retrieve file history: " when the log lookup or
 *   the mapping of any commit fails.
 */
async getFileHistory(filePath: string, limit: number = 50): Promise<GitCommitInfo[]> {
  try {
    const log = await this.git.log({
      file: filePath,
      maxCount: limit,
    });

    // One notes lookup per commit (N+1). The number of lookups is bounded by `limit`.
    // The `await` is required: without it a rejection from the mapping would escape the
    // `catch` below and reach the caller unlogged and unwrapped.
    return await Promise.all(
      log.all.map(async (commit) => ({
        commitHash: commit.hash,
        shortHash: commit.hash.substring(0, 8),
        message: commit.message,
        author: commit.author_name,
        authorEmail: commit.author_email,
        timestamp: commit.date,
        committedDate: new Date(commit.date).toISOString(),
        created: true,
        enhancedMessage: await this.getNote(commit.hash),
      })),
    );
  } catch (error) {
    this.logger.error(`Failed to get history for ${filePath}`, error);
    throw new Error(`Failed to retrieve file history: ${error instanceof Error ? error.message : String(error)}`);
  }
}
|
||||
|
||||
/**
 * Attach an LLM-generated summary to `commitHash` as a git note.
 *
 * The note is written with `-f`, so a retry replaces an existing note instead of failing.
 * A message that is empty after trimming is ignored. Callers are expected to hold the
 * `config:repo` Redlock: note writes mutate `.git/refs/notes/commits` and must be
 * serialized with commits.
 *
 * @throws Error starting with "Failed to attach git note: " when git rejects the write.
 */
async addNote(commitHash: string, message: string): Promise<void> {
  const body = message.trim();
  if (body.length === 0) {
    return;
  }

  const args = ['notes', 'add', '-f', '-m', body, commitHash];
  try {
    await this.git.raw(args);
  } catch (error) {
    this.logger.error(`Failed to attach note to ${commitHash}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to attach git note: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Fetch the git note attached to `commitHash`.
 *
 * @returns The trimmed note text, or `undefined` when the note is empty or when
 *   `git notes show` fails. Any failure is treated as "no note", which is how git reports a
 *   commit without one.
 */
async getNote(commitHash: string): Promise<string | undefined> {
  let output: string;
  try {
    output = await this.git.raw(['notes', 'show', commitHash]);
  } catch {
    // `git notes show` exits non-zero when the commit has no note.
    return undefined;
  }

  const text = output.trim();
  return text.length > 0 ? text : undefined;
}
|
||||
|
||||
/**
 * Return the patch introduced by `commitHash`, optionally limited to one path.
 *
 * Anything before the first `diff --git` marker is dropped so only the patch body remains.
 * When no marker is present, the trimmed raw output is returned; that is `''` when the
 * commit changed nothing on the requested path. The result is clipped to bound LLM token
 * cost: the limit is applied to the string length (12,000 characters), and a truncation
 * notice is appended when clipping happens.
 *
 * @param commitHash Commit to show.
 * @param path Optional path; when given (and non-empty) it is passed after `--`.
 * @throws Error starting with "Failed to read commit diff: " when `git show` fails.
 */
async getCommitDiff(commitHash: string, path?: string): Promise<string> {
  const MAX_DIFF_BYTES = 12_000;
  const pathArgs = path ? ['--', path] : [];
  const args = ['show', '--format=', '--no-color', '--patch', commitHash, ...pathArgs];

  try {
    const raw = await this.git.raw(args);
    const firstDiff = raw.indexOf('diff --git');
    const body = firstDiff < 0 ? raw.trim() : raw.slice(firstDiff);

    if (body.length <= MAX_DIFF_BYTES) {
      return body;
    }
    return `${body.slice(0, MAX_DIFF_BYTES)}\n… [diff truncated]`;
  } catch (error) {
    this.logger.error(`Failed to read diff for ${commitHash}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to read commit diff: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Remove `filePath` with `git rm` and commit the removal.
 *
 * @param filePath Path to remove.
 * @param commitMessage Message for the deletion commit.
 * @param author Author name, combined with `authorEmail` into the `--author` value.
 * @param authorEmail Author email.
 * @returns Details of the newest commit in the log, flagged `created: true`.
 * @throws Error starting with "Failed to delete file: " when any step fails.
 */
async deleteFile(
  filePath: string,
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  const authorArg = `${author} <${authorEmail}>`;
  try {
    await this.git.rm(filePath);

    const outcome = await this.git.commit(commitMessage, { '--author': authorArg });
    if (!outcome.commit) {
      throw new Error('No commit hash returned');
    }

    const { latest } = await this.git.log({ maxCount: 1 });
    if (!latest) {
      throw new Error('Failed to retrieve commit details');
    }

    const { hash, date } = latest;
    return {
      commitHash: hash,
      shortHash: hash.substring(0, 8),
      message: latest.message,
      author: latest.author_name,
      authorEmail: latest.author_email,
      timestamp: date,
      committedDate: new Date(date).toISOString(),
      created: true,
    };
  } catch (error) {
    this.logger.error(`Failed to delete file ${filePath}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to delete file: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Resolve HEAD to a commit SHA.
 *
 * @returns The trimmed output of `git rev-parse HEAD`, or `''` when that fails (for example
 *   a freshly initialised repo with no commits), so callers can treat it as "nothing to
 *   reconcile".
 */
async revParseHead(): Promise<string> {
  let sha = '';
  try {
    sha = (await this.git.revparse(['HEAD'])).trim();
  } catch {
    // HEAD does not resolve; keep the empty-string result.
  }
  return sha;
}
|
||||
|
||||
/**
 * Check whether `commitHash` names a commit object in the local repository, using
 * `git cat-file -e <sha>^{commit}`. The reconciler uses this to detect rewritten history
 * or a partial clone before attempting `git diff $sha..HEAD`.
 *
 * @returns `false` for an empty hash or when the check fails, `true` otherwise.
 */
async commitExists(commitHash: string): Promise<boolean> {
  let exists = false;
  if (commitHash) {
    try {
      await this.git.raw(['cat-file', '-e', `${commitHash}^{commit}`]);
      exists = true;
    } catch {
      exists = false;
    }
  }
  return exists;
}
|
||||
|
||||
/**
 * Run `git diff --name-status -z $from..$to [-- $pathSpec]` and return one entry per
 * affected path, normalised to the three statuses the reconciler understands.
 *
 * - `A` / `M` / `D` are passed through.
 * - Renames (`R{score}`, two paths) become a `D` for the old path plus an `A` for the new
 *   one; the reconciler treats each path independently and upserts the new path with
 *   whatever content the file has.
 * - Copies (`C{score}`, two paths) become only an `A` for the new path: the source of a
 *   copy still exists, so it must not be reported as deleted.
 * - Any other status (type change, unmerged, unknown) is reported as `M`.
 *
 * Records whose path field is missing (truncated output) are skipped rather than emitted
 * with an undefined path.
 *
 * @param from Older revision of the range.
 * @param to Newer revision of the range.
 * @param pathSpec Optional pathspec passed after `--`.
 * @returns Entries in the order git reported them; empty when git printed nothing.
 */
async diffNameStatus(
  from: string,
  to: string,
  pathSpec?: string,
): Promise<Array<{ status: 'A' | 'M' | 'D'; path: string }>> {
  const args = ['diff', '--name-status', '-z', `${from}..${to}`];
  if (pathSpec) {
    args.push('--', pathSpec);
  }
  const raw = await this.git.raw(args);
  if (!raw) {
    return [];
  }

  // -z output is NUL-separated. A/M/D records are "<status>\0<path>\0";
  // R/C records are "<status>\0<old>\0<new>\0".
  const fields = raw.split('\0').filter((f) => f.length > 0);
  const out: Array<{ status: 'A' | 'M' | 'D'; path: string }> = [];
  let i = 0;
  while (i < fields.length) {
    // Only the first character matters: R and C carry a similarity score suffix.
    const code = fields[i]?.[0];

    if (code === 'R' || code === 'C') {
      const oldPath = fields[i + 1];
      const newPath = fields[i + 2];
      // Only a rename removes the old path; a copy leaves its source in place.
      if (code === 'R' && oldPath) {
        out.push({ status: 'D', path: oldPath });
      }
      if (newPath) {
        out.push({ status: 'A', path: newPath });
      }
      i += 3;
      continue;
    }

    const path = fields[i + 1];
    if (path) {
      out.push({ status: code === 'A' || code === 'D' ? code : 'M', path });
    }
    i += 2;
  }
  return out;
}
|
||||
|
||||
/**
 * List the files recorded at HEAD that match `pathSpec`, via
 * `git ls-tree -r -z --name-only HEAD -- <pathSpec>`. Used for the reconciler's first run,
 * when there is no watermark to diff from.
 *
 * @returns The matching paths; an empty array when there are none or when git fails.
 */
async listFilesAtHead(pathSpec: string): Promise<string[]> {
  let raw: string;
  try {
    raw = await this.git.raw(['ls-tree', '-r', '-z', '--name-only', 'HEAD', '--', pathSpec]);
  } catch {
    return [];
  }
  // Output is NUL-separated; the trailing separator yields an empty field that is dropped.
  return raw ? raw.split('\0').filter((entry) => entry !== '') : [];
}
|
||||
|
||||
/**
 * Collapse every commit after `preHead` up to the current HEAD into a single commit with
 * the given message. Used by the memory agent to squash N per-tool-call commits into one
 * ingest commit.
 *
 * Author guard: if any commit in the range has an author name other than `expectedAuthor`
 * (defaults to `author`), nothing is rewritten and `{ squashed: false, reason }` is
 * returned, so another writer's interleaved commits are never collapsed.
 *
 * Failure safety: HEAD is moved back to `preHead` with a soft reset before the squash commit
 * is written. If anything fails between that reset and the commit, HEAD is soft-reset back
 * to where it was, so the original commits stay on the branch.
 *
 * The caller is responsible for holding the `config:repo` lock so writes and the squash
 * serialize.
 *
 * @param preHead SHA recorded before the writes started; empty means nothing to squash.
 * @param options Commit message, author identity, and optional expected author name.
 * @returns `squashed` tells whether history was collapsed. `commitHash` is the resulting
 *   HEAD, or `null` when there is no usable HEAD or `preHead`. `reason` explains a skipped
 *   or empty squash. `squashedCount` is the number of commits in the range.
 * @throws Error starting with "Failed to squash commits: " when a git step fails.
 */
async squashTo(
  preHead: string,
  options: { message: string; author: string; authorEmail: string; expectedAuthor?: string },
): Promise<{ squashed: boolean; commitHash: string | null; reason?: string; squashedCount?: number }> {
  const { message, author, authorEmail } = options;
  const expectedAuthor = options.expectedAuthor ?? author;

  if (!preHead) {
    return { squashed: false, commitHash: null, reason: 'no pre-head recorded (empty repo at start)' };
  }

  let currentHead: string;
  try {
    currentHead = (await this.git.revparse(['HEAD'])).trim();
  } catch {
    return { squashed: false, commitHash: null, reason: 'no HEAD (repo is empty)' };
  }

  if (currentHead === preHead) {
    return { squashed: false, commitHash: preHead, reason: 'no new commits' };
  }

  // True only while HEAD sits at `preHead` without the squash commit having been written.
  let headRewound = false;
  try {
    const log = await this.git.log({ from: preHead, to: 'HEAD' });
    const commits = log.all;
    if (commits.length === 0) {
      return { squashed: false, commitHash: preHead, reason: 'no new commits' };
    }

    const foreign = commits.find((c) => c.author_name !== expectedAuthor);
    if (foreign) {
      this.logger.warn(
        `Skipping squash: commit ${foreign.hash.substring(0, 8)} authored by "${foreign.author_name}" ` +
          `differs from expected "${expectedAuthor}". Leaving ${commits.length} commit(s) as-is.`,
      );
      return {
        squashed: false,
        commitHash: currentHead,
        reason: `foreign commit by ${foreign.author_name}`,
        squashedCount: commits.length,
      };
    }

    // Soft reset keeps index and working tree, so the combined changes end up staged.
    await this.git.reset(['--soft', preHead]);
    headRewound = true;

    const staged = await this.git.diff(['--cached', '--name-only']);
    if (!staged.trim()) {
      // The intervening commits cancelled each other out: stay at preHead, commit nothing.
      return { squashed: true, commitHash: preHead, reason: 'no net changes', squashedCount: commits.length };
    }

    await this.git.commit(message, { '--author': `${author} <${authorEmail}>` });
    // The squash commit exists now; a later failure must not undo it.
    headRewound = false;

    const newHead = (await this.git.revparse(['HEAD'])).trim();
    this.logger.log(
      `squashTo: collapsed ${commits.length} commit(s) into ${newHead.substring(0, 8)} (was ${currentHead.substring(0, 8)})`,
    );
    return { squashed: true, commitHash: newHead, squashedCount: commits.length };
  } catch (error) {
    this.logger.error('Failed to squash commits', error);

    if (headRewound) {
      // Put the branch back on the original commits instead of leaving them detached.
      try {
        await this.git.reset(['--soft', currentHead]);
      } catch (restoreError) {
        this.logger.error(
          `squashTo: could not restore HEAD to ${currentHead.substring(0, 8)} after failed squash`,
          restoreError,
        );
      }
    }

    throw new Error(`Failed to squash commits: ${error instanceof Error ? error.message : String(error)}`);
  }
}
|
||||
|
||||
/**
 * Squash-merge `branch` into the branch currently checked out in THIS worktree (the main
 * worktree, when called on the root GitService instance), producing a single commit with
 * the given message and author.
 *
 * If the merge fails or leaves unmerged paths, the merge is abandoned, the worktree is
 * hard-reset to HEAD, and `{ ok: false, conflict: true, conflictPaths }` is returned.
 * `conflictPaths` can be empty when the merge command failed for another reason.
 *
 * The caller must hold the `config:repo` lock so interactive writes don't race against the
 * merge window.
 *
 * @param branch Branch to merge from.
 * @param author Author name for the squash commit.
 * @param authorEmail Author email for the squash commit.
 * @param commitMessage Message for the squash commit.
 * @returns On success, the resulting HEAD SHA and the paths `branch` changed since it
 *   diverged from HEAD (empty, with the current HEAD, when it changed nothing).
 */
async squashMergeIntoMain(
  branch: string,
  author: string,
  authorEmail: string,
  commitMessage: string,
): Promise<SquashMergeResult> {
  // For `git diff`, `HEAD...branch` (three dots) compares the merge base with `branch`,
  // i.e. the changes made on `branch` — which is what the squash applies. The two-dot form
  // is a plain HEAD-vs-branch comparison and would also list files changed only on HEAD
  // once it has moved ahead of the branch point.
  const diff = await this.git.raw(['diff', '--name-only', `HEAD...${branch}`]);
  const touchedPaths = diff
    .split('\n')
    .map((l) => l.trim())
    .filter(Boolean);
  if (touchedPaths.length === 0) {
    const head = (await this.git.revparse(['HEAD'])).trim();
    return { ok: true, squashSha: head, touchedPaths: [] };
  }

  // `git merge --squash` may NOT throw on a textual conflict: it stages the clean hunks and
  // leaves conflicted paths unmerged in the index. simple-git may also throw if git exits
  // non-zero. Handle both: attempt the merge, then inspect the index independently.
  let mergeError: unknown = null;
  try {
    await this.git.raw(['merge', '--squash', branch]);
  } catch (error) {
    mergeError = error;
  }

  const unmergedOut = await this.git.raw(['diff', '--name-only', '--diff-filter=U']).catch(() => '');
  const conflictPaths = unmergedOut
    .split('\n')
    .map((l) => l.trim())
    .filter(Boolean);

  if (conflictPaths.length > 0 || mergeError !== null) {
    // `merge --abort` only works for an in-progress merge; squash sets MERGE_MSG but not
    // MERGE_HEAD, so fall back to a hard reset which clears the index and worktree.
    await this.git.raw(['merge', '--abort']).catch(() => undefined);
    await this.git.raw(['reset', '--hard', 'HEAD']).catch(() => undefined);
    this.logger.warn(
      `squashMergeIntoMain: conflict merging ${branch} — aborted. conflictPaths=${conflictPaths.join(',')}` +
        (mergeError ? ` error=${mergeError instanceof Error ? mergeError.message : String(mergeError)}` : ''),
    );
    return { ok: false, conflict: true, conflictPaths };
  }

  await this.git.commit(commitMessage, { '--author': `${author} <${authorEmail}>` });
  const squashSha = (await this.git.revparse(['HEAD'])).trim();
  return { ok: true, squashSha, touchedPaths };
}
|
||||
|
||||
/**
 * Move the current branch to `targetSha` with `git reset --hard`, dropping later commits
 * from the branch along with any uncommitted changes. Stage-3 uses this to back out a
 * failed work-unit's commits on the session worktree; it is simpler and more robust than
 * `git revert` over a multi-commit range, which can pause the sequencer on conflicts.
 *
 * Errors from git propagate to the caller unchanged.
 */
async resetHardTo(targetSha: string): Promise<void> {
  const args = ['reset', '--hard', targetSha];
  await this.git.raw(args);
}
|
||||
|
||||
/**
 * Guard that refuses to continue when the worktree is in a state that would make a
 * downstream merge unsafe.
 *
 * Two checks run in order:
 * 1. Marker files of an in-progress merge, rebase, cherry-pick, revert, or interrupted
 *    sequencer operation. Each is located with `git rev-parse --git-path`; a relative
 *    answer is resolved against this instance's directory.
 * 2. Unmerged paths in the index. A failure of that git command is treated as "none".
 *
 * @throws Error naming the first marker found, or listing up to five unmerged paths.
 */
async assertWorktreeClean(): Promise<void> {
  const headMarkers = ['MERGE_HEAD', 'REBASE_HEAD', 'CHERRY_PICK_HEAD', 'REVERT_HEAD'].map((name) => ({
    relPath: name,
    label: name,
  }));
  const inProgressMarkers: ReadonlyArray<{ relPath: string; label: string }> = [
    ...headMarkers,
    { relPath: 'sequencer/todo', label: 'sequencer (interrupted multi-commit op)' },
  ];

  for (const marker of inProgressMarkers) {
    const reported = await this.git.raw(['rev-parse', '--git-path', marker.relPath]);
    const gitPath = reported.trim();
    const fullPath = gitPath.startsWith('/') ? gitPath : join(this.configDir, gitPath);
    const present = await this.fileExists(fullPath);
    if (present) {
      throw new Error(
        `Worktree has in-progress git operation (${marker.label} present at ${fullPath}); refusing to proceed`,
      );
    }
  }

  const unmergedOut = await this.git.raw(['diff', '--name-only', '--diff-filter=U']).catch(() => '');
  const unmerged = unmergedOut
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  if (unmerged.length > 0) {
    throw new Error(
      `Worktree has ${unmerged.length} unmerged path(s): ${unmerged.slice(0, 5).join(', ')}; refusing to proceed`,
    );
  }
}
|
||||
|
||||
/**
 * Report whether `path` is accessible. Any failure of `fs.access` counts as "does not
 * exist".
 */
private async fileExists(path: string): Promise<boolean> {
  let exists = true;
  try {
    await fs.access(path);
  } catch {
    exists = false;
  }
  return exists;
}
|
||||
|
||||
/**
 * Add a worktree at `path` on a newly created branch `branch` starting from `startSha`
 * (`git worktree add -b <branch> <path> <startSha>`). The memory agent uses this to keep
 * per-session writes apart from interactive saves on main.
 *
 * @throws Error starting with "Failed to add worktree at <path>: " when git fails.
 */
async addWorktree(path: string, branch: string, startSha: string): Promise<void> {
  const args = ['worktree', 'add', '-b', branch, path, startSha];
  try {
    await this.git.raw(args);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to add worktree at ${path}: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Remove the worktree at `path`, including its directory on disk. `--force` is used because
 * session worktrees are klo-internal, so a clean working tree is not required.
 *
 * Never throws: if removal fails, a warning is logged and `git worktree prune` is attempted
 * as a fallback, with its own failure ignored.
 */
async removeWorktree(path: string): Promise<void> {
  try {
    await this.git.raw(['worktree', 'remove', '--force', path]);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    this.logger.warn(`removeWorktree failed for ${path}: ${reason} — attempting prune`);
    try {
      await this.git.raw(['worktree', 'prune']);
    } catch {
      // Best-effort cleanup; nothing more to do if prune fails too.
    }
  }
}
|
||||
|
||||
/**
 * Return every worktree attached to this repository, the main one included, by parsing
 * `git worktree list --porcelain`.
 *
 * Each record starts with a `worktree <path>` line; `HEAD <sha>` and `branch <ref>` lines
 * that follow belong to it. Other lines are ignored. Missing `HEAD` or `branch` values are
 * reported as `null`.
 */
async listWorktrees(): Promise<WorktreeEntry[]> {
  const porcelain = await this.git.raw(['worktree', 'list', '--porcelain']);
  const entries: WorktreeEntry[] = [];
  let pending: Partial<WorktreeEntry> = {};

  // Emit the record collected so far, if it has a path.
  const flush = (): void => {
    if (pending.path) {
      entries.push({
        path: pending.path,
        branch: pending.branch ?? null,
        head: pending.head ?? null,
      });
    }
  };

  for (const line of porcelain.split('\n')) {
    if (line.startsWith('worktree ')) {
      // A new record begins: close the previous one first.
      flush();
      pending = { path: line.slice('worktree '.length) };
      continue;
    }
    if (line.startsWith('HEAD ')) {
      pending.head = line.slice('HEAD '.length);
      continue;
    }
    if (line.startsWith('branch ')) {
      pending.branch = line.slice('branch '.length);
    }
  }

  // The last record has no following `worktree` line to trigger its flush.
  flush();
  return entries;
}
|
||||
|
||||
/**
 * Delete a local branch with `git branch -d`, or `-D` when `force` is true. Errors from
 * git propagate to the caller unchanged.
 */
async deleteBranch(branch: string, force = false): Promise<void> {
  const flag = force ? '-D' : '-d';
  await this.git.raw(['branch', flag, branch]);
}
|
||||
|
||||
/**
 * Build a GitService bound to `workdir` instead of the main config directory. Used for
 * memory-agent session worktrees.
 *
 * The new instance shares this instance's config and logger. `onModuleInit` is not run on
 * it (the main instance has already initialised the repo); its git client and directory
 * are simply pointed at `workdir`.
 */
forWorktree(workdir: string): GitService {
  const scoped = new GitService(this.config, this.logger);
  // Override the directory the constructor resolved from config.
  scoped.configDir = workdir;
  scoped.git = createSimpleGit(workdir);
  return scoped;
}
|
||||
|
||||
/**
 * Remove `directoryPath` recursively with `git rm -r` and commit the removal.
 *
 * @param directoryPath Directory to remove.
 * @param commitMessage Message for the deletion commit.
 * @param author Author name, combined with `authorEmail` into the `--author` value.
 * @param authorEmail Author email.
 * @returns Details of the newest commit in the log, flagged `created: true`.
 * @throws Error starting with "Failed to delete directory: " when any step fails.
 */
async deleteDirectory(
  directoryPath: string,
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  const authorArg = `${author} <${authorEmail}>`;
  try {
    await this.git.rm(['-r', directoryPath]);

    const outcome = await this.git.commit(commitMessage, { '--author': authorArg });
    if (!outcome.commit) {
      throw new Error('No commit hash returned');
    }

    const { latest } = await this.git.log({ maxCount: 1 });
    if (!latest) {
      throw new Error('Failed to retrieve commit details');
    }

    const { hash, date } = latest;
    return {
      commitHash: hash,
      shortHash: hash.substring(0, 8),
      message: latest.message,
      author: latest.author_name,
      authorEmail: latest.author_email,
      timestamp: date,
      committedDate: new Date(date).toISOString(),
      created: true,
    };
  } catch (error) {
    this.logger.error(`Failed to delete directory ${directoryPath}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to delete directory: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Remove several directories recursively and record the removals in one commit.
 *
 * Each path is removed with its own `git rm -r`. A path that cannot be removed (for example
 * one a previous GC run already evicted) is logged as a warning and skipped. When no path
 * was removed — including when the input is empty — no commit is made and a placeholder is
 * returned with `created: false`, empty hashes, and the caller's message and author.
 *
 * Unlike the single-path delete methods, failures of the commit or log step are thrown
 * as-is rather than wrapped.
 *
 * @param directoryPaths Directories to remove.
 * @param commitMessage Message for the deletion commit.
 * @param author Author name, combined with `authorEmail` into the `--author` value.
 * @param authorEmail Author email.
 * @returns Details of the newest commit in the log, or the placeholder described above.
 */
async deleteDirectories(
  directoryPaths: string[],
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  let removedCount = 0;
  for (const dir of directoryPaths) {
    try {
      await this.git.rm(['-r', dir]);
      removedCount += 1;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.warn(`deleteDirectories: skipping ${dir}: ${reason}`);
    }
  }

  // Covers both an empty input list and the case where every removal was skipped.
  if (removedCount === 0) {
    return {
      commitHash: '',
      shortHash: '',
      message: commitMessage,
      author,
      authorEmail,
      timestamp: new Date().toISOString(),
      committedDate: new Date().toISOString(),
      created: false,
    };
  }

  const outcome = await this.git.commit(commitMessage, { '--author': `${author} <${authorEmail}>` });
  if (!outcome.commit) {
    throw new Error('No commit hash returned from deleteDirectories');
  }

  const { latest } = await this.git.log({ maxCount: 1 });
  if (!latest) {
    throw new Error('Failed to retrieve commit details after deleteDirectories');
  }

  const { hash, date } = latest;
  return {
    commitHash: hash,
    shortHash: hash.substring(0, 8),
    message: latest.message,
    author: latest.author_name,
    authorEmail: latest.author_email,
    timestamp: date,
    committedDate: new Date(date).toISOString(),
    created: true,
  };
}
|
||||
}
|
||||
27
packages/context/src/core/index.ts
Normal file
27
packages/context/src/core/index.ts
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
export type { KloCoreConfig, KloGitConfig, KloLogger, KloStorageConfig } from './config.js';
|
||||
export { noopLogger, resolveConfigDir, resolveWorktreesDir } from './config.js';
|
||||
export { resolveKloConfigReference, resolveKloHomePath } from './config-reference.js';
|
||||
export type { KloEmbeddingPort } from './embedding.js';
|
||||
export {
|
||||
REDACTED_KLO_CREDENTIAL_VALUE,
|
||||
redactKloSensitiveMetadata,
|
||||
redactKloSensitiveText,
|
||||
redactKloSensitiveValue,
|
||||
} from './redaction.js';
|
||||
export type {
|
||||
KloFileHistoryEntry,
|
||||
KloFileListResult,
|
||||
KloFileReadResult,
|
||||
KloFileStorePort,
|
||||
KloFileWriteResult,
|
||||
} from './file-store.js';
|
||||
export type { GitCommitInfo, SquashMergeResult, WorktreeEntry } from './git.service.js';
|
||||
export { GitService } from './git.service.js';
|
||||
export type {
|
||||
SentinelPayload,
|
||||
SessionOutcome,
|
||||
SessionWorktree,
|
||||
SessionWorktreeServiceDeps,
|
||||
WorktreeConfigPort,
|
||||
} from './session-worktree.service.js';
|
||||
export { SessionWorktreeService } from './session-worktree.service.js';
|
||||
47
packages/context/src/core/redaction.ts
Normal file
47
packages/context/src/core/redaction.ts
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
/** Placeholder written in place of any redacted value. */
export const REDACTED_KLO_CREDENTIAL_VALUE = '<redacted>';

// A field is sensitive when its name contains any of these fragments (case-insensitive).
const SENSITIVE_FIELD_NAME = /(password|secret|token|api[_-]?key|private[_-]?key|passphrase|credential|authorization|url)/i;
// Matches `scheme://user:password@`; group 2 is the password part that gets replaced.
const URL_CREDENTIAL_PATTERN = /([a-z][a-z0-9+.-]*:\/\/[^:\s/@]+:)([^@\s/]+)(@)/gi;

/** True for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}

/** True when `key` matches the sensitive-name pattern. */
function isSensitiveField(key: string): boolean {
  return SENSITIVE_FIELD_NAME.test(key);
}

/**
 * Redact one value according to the name of the field that holds it.
 *
 * A sensitive `key` replaces the value with the placeholder, whatever its shape. Otherwise
 * arrays are processed item by item under the same key, records are processed field by
 * field, and anything else is returned untouched.
 */
export function redactKloSensitiveValue(key: string, value: unknown): unknown {
  if (isSensitiveField(key)) {
    return REDACTED_KLO_CREDENTIAL_VALUE;
  }
  if (Array.isArray(value)) {
    return value.map((item) => redactKloSensitiveValue(key, item));
  }
  return isRecord(value) ? redactKloSensitiveMetadata(value) : value;
}

/**
 * Return a copy of `metadata` with sensitive fields redacted at every nesting level. The
 * input object is not modified.
 *
 * NOTE(review): records inside an array are always processed field by field, even when the
 * array's own key is sensitive, whereas a record stored directly under a sensitive key is
 * replaced wholesale. Confirm this asymmetry is intended.
 */
export function redactKloSensitiveMetadata(metadata: Record<string, unknown>): Record<string, unknown> {
  const redacted: Record<string, unknown> = {};
  for (const key of Object.keys(metadata)) {
    const value = metadata[key];
    redacted[key] = Array.isArray(value)
      ? value.map((item) =>
          isRecord(item) ? redactKloSensitiveMetadata(item) : redactKloSensitiveValue(key, item),
        )
      : redactKloSensitiveValue(key, value);
  }
  return redacted;
}

/** Replace the password part of every `scheme://user:password@` occurrence in `value`. */
export function redactKloSensitiveText(value: string): string {
  return value.replace(
    URL_CREDENTIAL_PATTERN,
    (_match: string, prefix: string, _secret: string, at: string) =>
      `${prefix}${REDACTED_KLO_CREDENTIAL_VALUE}${at}`,
  );
}
|
||||
124
packages/context/src/core/session-worktree.service.test.ts
Normal file
124
packages/context/src/core/session-worktree.service.test.ts
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
import { mkdtemp, realpath, rm, stat } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import type { KloCoreConfig } from './config.js';
|
||||
import { GitService } from './git.service.js';
|
||||
import { SessionWorktreeService, type WorktreeConfigPort } from './session-worktree.service.js';
|
||||
|
||||
interface TestWorktreeConfig extends WorktreeConfigPort<TestWorktreeConfig> {
|
||||
workdir?: string;
|
||||
}
|
||||
|
||||
// SessionWorktreeService glues a real GitService to a scoped config adapter.
|
||||
describe('SessionWorktreeService', () => {
|
||||
let sessionService: SessionWorktreeService<TestWorktreeConfig>;
|
||||
let gitService: GitService;
|
||||
let homeDir: string;
|
||||
|
||||
beforeEach(async () => {
|
||||
homeDir = await mkdtemp(join(tmpdir(), 'sws-spec-'));
|
||||
homeDir = await realpath(homeDir);
|
||||
|
||||
const coreConfig: KloCoreConfig = {
|
||||
storage: { configDir: homeDir, homeDir },
|
||||
git: {
|
||||
userName: 'System User',
|
||||
userEmail: 'system@example.com',
|
||||
bootstrapMessage: 'Initialize test config repo',
|
||||
bootstrapAuthor: 'test-system',
|
||||
bootstrapAuthorEmail: 'system@example.com',
|
||||
},
|
||||
};
|
||||
|
||||
gitService = new GitService(coreConfig);
|
||||
await gitService.onModuleInit();
|
||||
const configService: TestWorktreeConfig = {
|
||||
forWorktree: vi.fn(
|
||||
(workdir: string): TestWorktreeConfig => ({ workdir, forWorktree: configService.forWorktree }),
|
||||
),
|
||||
};
|
||||
sessionService = new SessionWorktreeService({
|
||||
coreConfig,
|
||||
gitService,
|
||||
configService,
|
||||
});
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await rm(homeDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
describe('create', () => {
|
||||
it('creates a worktree + branch and returns scoped services', async () => {
|
||||
const baseSha = await gitService.revParseHead();
|
||||
if (!baseSha) {
|
||||
throw new Error('no base sha');
|
||||
}
|
||||
|
||||
const session = await sessionService.create('chat-abc', baseSha);
|
||||
|
||||
expect(session.workdir).toBe(join(homeDir, '.worktrees', 'session-chat-abc'));
|
||||
expect(session.branch).toBe('session/chat-abc');
|
||||
expect(session.baseSha).toBe(baseSha);
|
||||
const stats = await stat(session.workdir);
|
||||
expect(stats.isDirectory()).toBe(true);
|
||||
|
||||
// Scoped git instance reports the worktree's HEAD (= baseSha at creation time).
|
||||
expect(await session.git.revParseHead()).toBe(baseSha);
|
||||
|
||||
const list = await gitService.listWorktrees();
|
||||
expect(list.find((e) => e.path === session.workdir)).toBeTruthy();
|
||||
});
|
||||
|
||||
it('appends a timestamp suffix when the primary dir already exists', async () => {
|
||||
const baseSha = await gitService.revParseHead();
|
||||
if (!baseSha) {
|
||||
throw new Error('no base sha');
|
||||
}
|
||||
|
||||
const first = await sessionService.create('chat-dup', baseSha);
|
||||
const second = await sessionService.create('chat-dup', baseSha);
|
||||
|
||||
expect(first.workdir).not.toBe(second.workdir);
|
||||
expect(second.branch).toMatch(/^session\/chat-dup-\d+$/);
|
||||
});
|
||||
});
|
||||
|
||||
describe('cleanup', () => {
|
||||
it('success removes the worktree dir and deletes the branch', async () => {
|
||||
const baseSha = await gitService.revParseHead();
|
||||
if (!baseSha) {
|
||||
throw new Error('no base sha');
|
||||
}
|
||||
|
||||
const session = await sessionService.create('chat-cleanup-ok', baseSha);
|
||||
await sessionService.cleanup(session, 'success');
|
||||
|
||||
const list = await gitService.listWorktrees();
|
||||
expect(list.find((e) => e.path === session.workdir)).toBeFalsy();
|
||||
await expect(stat(session.workdir)).rejects.toThrow();
|
||||
});
|
||||
|
||||
it('conflict keeps the worktree and writes a sentinel file', async () => {
|
||||
const baseSha = await gitService.revParseHead();
|
||||
if (!baseSha) {
|
||||
throw new Error('no base sha');
|
||||
}
|
||||
|
||||
const session = await sessionService.create('chat-cleanup-conflict', baseSha);
|
||||
await sessionService.cleanup(session, 'conflict', { conflictPaths: ['shared.yaml'] });
|
||||
|
||||
// Dir still exists.
|
||||
await expect(stat(session.workdir)).resolves.toBeTruthy();
|
||||
|
||||
const { readFile } = await import('node:fs/promises');
|
||||
const raw = await readFile(join(session.workdir, '.klo-outcome'), 'utf-8');
|
||||
const parsed = JSON.parse(raw);
|
||||
expect(parsed.outcome).toBe('conflict');
|
||||
expect(parsed.chatId).toBe('chat-cleanup-conflict');
|
||||
expect(parsed.conflictPaths).toEqual(['shared.yaml']);
|
||||
expect(typeof parsed.at).toBe('string');
|
||||
});
|
||||
});
|
||||
});
|
||||
113
packages/context/src/core/session-worktree.service.ts
Normal file
113
packages/context/src/core/session-worktree.service.ts
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
import { mkdir, stat, writeFile } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import { noopLogger, resolveWorktreesDir, type KloCoreConfig, type KloLogger } from './config.js';
|
||||
import { GitService } from './git.service.js';
|
||||
|
||||
/**
 * How a session ended. `success` and `empty` make `cleanup` remove the worktree and its branch;
 * any other outcome keeps the worktree and records a sentinel file in it.
 */
export type SessionOutcome = 'success' | 'empty' | 'conflict' | 'crash';

/** JSON document written to `.klo-outcome` inside a worktree that is kept after cleanup. */
export interface SentinelPayload {
  /** Outcome passed to `cleanup`. */
  outcome: SessionOutcome;
  /** ISO-8601 timestamp taken when the sentinel was produced. */
  at: string;
  /** Session key the worktree was created for. */
  chatId: string;
  /** Commit the worktree was created from. */
  baseSha: string;
  /** Present only when the caller supplied conflict paths. */
  conflictPaths?: string[];
}

/** Configuration object able to produce a copy of itself scoped to a worktree directory. */
export interface WorktreeConfigPort<TConfig> {
  forWorktree(workdir: string): TConfig;
}

/** Handle describing a session worktree together with services scoped to it. */
export interface SessionWorktree<TConfig> {
  /** Session key given to `create`. */
  chatId: string;
  /** Absolute path of the worktree directory. */
  workdir: string;
  /** Branch checked out in the worktree. */
  branch: string;
  /** Commit the worktree was created from. */
  baseSha: string;
  /** Moment the handle was built. */
  createdAt: Date;
  /** Git service obtained from `gitService.forWorktree(workdir)`. */
  git: GitService;
  /** Config obtained from `configService.forWorktree(workdir)`. */
  config: TConfig;
}

/** Collaborators required by `SessionWorktreeService`. */
export interface SessionWorktreeServiceDeps<TConfig extends WorktreeConfigPort<TConfig>> {
  coreConfig: KloCoreConfig;
  gitService: GitService;
  configService: TConfig;
  /** Optional; the service falls back to `noopLogger` when omitted. */
  logger?: KloLogger;
}
|
||||
|
||||
/**
 * Creates per-session git worktrees under the configured worktrees directory and disposes of
 * them (or marks them with a sentinel file) once the session is over.
 */
export class SessionWorktreeService<TConfig extends WorktreeConfigPort<TConfig> = WorktreeConfigPort<never>> {
  private readonly logger: KloLogger;
  private readonly worktreesRoot: string;

  constructor(private readonly deps: SessionWorktreeServiceDeps<TConfig>) {
    this.logger = deps.logger ?? noopLogger;
    this.worktreesRoot = resolveWorktreesDir(deps.coreConfig);
  }

  /**
   * Adds a worktree for `sessionKey` on a new branch starting at `baseSha`.
   *
   * The directory is `session-<key>` and the branch `session/<key>`. When that directory can
   * already be stat-ed, a `Date.now()` suffix is appended to both names and a warning is logged.
   *
   * @returns The worktree handle with git and config services scoped to the new directory.
   */
  async create(sessionKey: string, baseSha: string): Promise<SessionWorktree<TConfig>> {
    await mkdir(this.worktreesRoot, { recursive: true });

    let uniqueKey = sessionKey;
    try {
      await stat(join(this.worktreesRoot, `session-${sessionKey}`));
      // stat succeeded, so the primary directory is taken: fall back to a suffixed name.
      const suffix = Date.now().toString();
      uniqueKey = `${sessionKey}-${suffix}`;
      this.logger.warn(`session worktree collision for key=${sessionKey}; using suffix ${suffix}`);
    } catch {
      // no collision: primary name is free
    }

    const branch = `session/${uniqueKey}`;
    const workdir = join(this.worktreesRoot, `session-${uniqueKey}`);
    const { gitService, configService } = this.deps;

    await gitService.addWorktree(workdir, branch, baseSha);

    return {
      chatId: sessionKey,
      workdir,
      branch,
      baseSha,
      createdAt: new Date(),
      git: gitService.forWorktree(workdir),
      config: configService.forWorktree(workdir),
    };
  }

  /**
   * Finishes a session. Failures are logged as warnings and never thrown.
   *
   * - `success` / `empty`: removes the worktree, then deletes its branch.
   * - any other outcome: leaves the worktree in place and writes `.klo-outcome` into it.
   *
   * @param extra - Optional conflict paths to include in the sentinel payload.
   */
  async cleanup(
    session: SessionWorktree<TConfig>,
    outcome: SessionOutcome,
    extra?: { conflictPaths?: string[] },
  ): Promise<void> {
    const disposable = outcome === 'success' || outcome === 'empty';

    if (!disposable) {
      const payload: SentinelPayload = {
        outcome,
        at: new Date().toISOString(),
        chatId: session.chatId,
        baseSha: session.baseSha,
      };
      if (extra?.conflictPaths) {
        payload.conflictPaths = extra.conflictPaths;
      }
      try {
        const sentinelPath = join(session.workdir, '.klo-outcome');
        await writeFile(sentinelPath, JSON.stringify(payload, null, 2), 'utf-8');
      } catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        this.logger.warn(`cleanup(${outcome}) failed to write sentinel for ${session.chatId}: ${reason}`);
      }
      return;
    }

    try {
      await this.deps.gitService.removeWorktree(session.workdir);
      await this.deps.gitService.deleteBranch(session.branch, true);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.warn(`cleanup(${outcome}) failed for ${session.chatId}: ${reason}`);
    }
  }
}
|
||||
1
packages/context/src/daemon/index.ts
Normal file
1
packages/context/src/daemon/index.ts
Normal file
|
|
@ -0,0 +1 @@
|
|||
export * from './semantic-layer-compute.js';
|
||||
339
packages/context/src/daemon/semantic-layer-compute.test.ts
Normal file
339
packages/context/src/daemon/semantic-layer-compute.test.ts
Normal file
|
|
@ -0,0 +1,339 @@
|
|||
import { once } from 'node:events';
|
||||
import { createServer } from 'node:http';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { createHttpSemanticLayerComputePort, createPythonSemanticLayerComputePort } from './semantic-layer-compute.js';
|
||||
|
||||
const source = {
|
||||
name: 'orders',
|
||||
table: 'public.orders',
|
||||
grain: ['id'],
|
||||
columns: [{ name: 'id', type: 'number' }],
|
||||
joins: [],
|
||||
measures: [{ name: 'order_count', expr: 'count(*)' }],
|
||||
};
|
||||
|
||||
const sourceGenerationInput = {
|
||||
tables: [
|
||||
{
|
||||
name: 'orders',
|
||||
db: 'public',
|
||||
comment: 'Orders table',
|
||||
columns: [
|
||||
{ name: 'id', type: 'integer', primaryKey: true, nullable: false, comment: 'Order ID' },
|
||||
{ name: 'customer_id', type: 'integer' },
|
||||
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'customers',
|
||||
db: 'public',
|
||||
columns: [
|
||||
{ name: 'id', type: 'integer', primaryKey: true },
|
||||
{ name: 'email', type: 'varchar' },
|
||||
],
|
||||
},
|
||||
],
|
||||
links: [
|
||||
{
|
||||
fromTable: 'orders',
|
||||
fromColumn: 'customer_id',
|
||||
toTable: 'customers',
|
||||
toColumn: 'id',
|
||||
relationshipType: 'MANY_TO_ONE',
|
||||
},
|
||||
],
|
||||
dialect: 'postgres',
|
||||
};
|
||||
|
||||
const sourceGenerationDaemonPayload = {
|
||||
tables: [
|
||||
{
|
||||
name: 'orders',
|
||||
db: 'public',
|
||||
comment: 'Orders table',
|
||||
columns: [
|
||||
{ name: 'id', type: 'integer', primary_key: true, nullable: false, comment: 'Order ID' },
|
||||
{ name: 'customer_id', type: 'integer' },
|
||||
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
|
||||
],
|
||||
},
|
||||
{
|
||||
name: 'customers',
|
||||
db: 'public',
|
||||
columns: [
|
||||
{ name: 'id', type: 'integer', primary_key: true },
|
||||
{ name: 'email', type: 'varchar' },
|
||||
],
|
||||
},
|
||||
],
|
||||
links: [
|
||||
{
|
||||
from_table: 'orders',
|
||||
from_column: 'customer_id',
|
||||
to_table: 'customers',
|
||||
to_column: 'id',
|
||||
relationship_type: 'MANY_TO_ONE',
|
||||
},
|
||||
],
|
||||
dialect: 'postgres',
|
||||
};
|
||||
|
||||
const sourceGenerationDaemonResponse = {
|
||||
source_count: 2,
|
||||
sources: [
|
||||
{
|
||||
name: 'orders',
|
||||
table: 'public.orders',
|
||||
grain: ['id'],
|
||||
columns: [{ name: 'id', type: 'number' }],
|
||||
joins: [
|
||||
{
|
||||
to: 'customers',
|
||||
on: 'customer_id = customers.id',
|
||||
relationship: 'many_to_one',
|
||||
},
|
||||
],
|
||||
measures: [{ name: 'record_count', expr: 'count(id)' }],
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
describe('createPythonSemanticLayerComputePort', () => {
|
||||
it('calls the semantic-query stdio command', async () => {
|
||||
const runJson = vi.fn(async () => ({
|
||||
sql: 'select count(*) from public.orders',
|
||||
dialect: 'postgres',
|
||||
columns: [{ name: 'orders.order_count' }],
|
||||
plan: { sources_used: ['orders'] },
|
||||
}));
|
||||
const port = createPythonSemanticLayerComputePort({ runJson });
|
||||
|
||||
await expect(
|
||||
port.query({
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
query: { measures: ['orders.order_count'], dimensions: [] },
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
sql: 'select count(*) from public.orders',
|
||||
dialect: 'postgres',
|
||||
columns: [{ name: 'orders.order_count' }],
|
||||
plan: { sources_used: ['orders'] },
|
||||
});
|
||||
|
||||
expect(runJson).toHaveBeenCalledWith('semantic-query', {
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
query: { measures: ['orders.order_count'], dimensions: [] },
|
||||
});
|
||||
});
|
||||
|
||||
it('calls the semantic-validate stdio command', async () => {
|
||||
const runJson = vi.fn(async () => ({
|
||||
valid: true,
|
||||
errors: [],
|
||||
warnings: [],
|
||||
per_source_warnings: {},
|
||||
}));
|
||||
const port = createPythonSemanticLayerComputePort({ runJson });
|
||||
|
||||
await expect(
|
||||
port.validateSources({
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
recentlyTouched: ['orders'],
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
valid: true,
|
||||
errors: [],
|
||||
warnings: [],
|
||||
perSourceWarnings: {},
|
||||
});
|
||||
|
||||
expect(runJson).toHaveBeenCalledWith('semantic-validate', {
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
recently_touched: ['orders'],
|
||||
});
|
||||
});
|
||||
|
||||
it('calls the semantic-generate-sources stdio command', async () => {
|
||||
const runJson = vi.fn(async () => sourceGenerationDaemonResponse);
|
||||
const port = createPythonSemanticLayerComputePort({ runJson });
|
||||
|
||||
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
|
||||
sourceCount: 2,
|
||||
sources: sourceGenerationDaemonResponse.sources,
|
||||
});
|
||||
|
||||
expect(runJson).toHaveBeenCalledWith('semantic-generate-sources', sourceGenerationDaemonPayload);
|
||||
});
|
||||
});
|
||||
|
||||
describe('createHttpSemanticLayerComputePort', () => {
|
||||
it('calls semantic query and validate HTTP endpoints through an injected runner', async () => {
|
||||
const requestJson = vi.fn(async (path: string) => {
|
||||
if (path === '/semantic-layer/query') {
|
||||
return {
|
||||
sql: 'select count(*) from public.orders',
|
||||
dialect: 'postgres',
|
||||
columns: [{ name: 'orders.order_count' }],
|
||||
plan: { sources_used: ['orders'] },
|
||||
};
|
||||
}
|
||||
return {
|
||||
valid: true,
|
||||
errors: [],
|
||||
warnings: [],
|
||||
per_source_warnings: {},
|
||||
};
|
||||
});
|
||||
const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });
|
||||
|
||||
await expect(
|
||||
port.query({
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
query: { measures: ['orders.order_count'], dimensions: [] },
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
sql: 'select count(*) from public.orders',
|
||||
dialect: 'postgres',
|
||||
columns: [{ name: 'orders.order_count' }],
|
||||
plan: { sources_used: ['orders'] },
|
||||
});
|
||||
|
||||
await expect(
|
||||
port.validateSources({
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
recentlyTouched: ['orders'],
|
||||
}),
|
||||
).resolves.toEqual({
|
||||
valid: true,
|
||||
errors: [],
|
||||
warnings: [],
|
||||
perSourceWarnings: {},
|
||||
});
|
||||
|
||||
expect(requestJson).toHaveBeenNthCalledWith(1, '/semantic-layer/query', {
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
query: { measures: ['orders.order_count'], dimensions: [] },
|
||||
});
|
||||
expect(requestJson).toHaveBeenNthCalledWith(2, '/semantic-layer/validate', {
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
recently_touched: ['orders'],
|
||||
});
|
||||
});
|
||||
|
||||
it('calls the semantic source-generation HTTP endpoint through an injected runner', async () => {
|
||||
const requestJson = vi.fn(async () => sourceGenerationDaemonResponse);
|
||||
const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });
|
||||
|
||||
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
|
||||
sourceCount: 2,
|
||||
sources: sourceGenerationDaemonResponse.sources,
|
||||
});
|
||||
|
||||
expect(requestJson).toHaveBeenCalledWith('/semantic-layer/generate-sources', sourceGenerationDaemonPayload);
|
||||
});
|
||||
|
||||
it('posts JSON to a running HTTP daemon endpoint', async () => {
|
||||
const requests: Array<{ url: string | undefined; body: unknown }> = [];
|
||||
const server = createServer((request, response) => {
|
||||
const chunks: Buffer[] = [];
|
||||
request.on('data', (chunk: Buffer) => chunks.push(chunk));
|
||||
request.on('end', () => {
|
||||
requests.push({
|
||||
url: request.url,
|
||||
body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
|
||||
});
|
||||
response.writeHead(200, { 'content-type': 'application/json' });
|
||||
response.end(
|
||||
JSON.stringify({
|
||||
sql: 'select count(*) from public.orders',
|
||||
dialect: 'postgres',
|
||||
columns: [{ name: 'orders.order_count' }],
|
||||
plan: { sources_used: ['orders'] },
|
||||
}),
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
server.listen(0, '127.0.0.1');
|
||||
await once(server, 'listening');
|
||||
try {
|
||||
const address = server.address();
|
||||
if (!address || typeof address === 'string') {
|
||||
throw new Error('expected TCP server address');
|
||||
}
|
||||
const port = createHttpSemanticLayerComputePort({ baseUrl: `http://127.0.0.1:${address.port}` });
|
||||
|
||||
await expect(
|
||||
port.query({
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
query: { measures: ['orders.order_count'], dimensions: [] },
|
||||
}),
|
||||
).resolves.toMatchObject({
|
||||
sql: 'select count(*) from public.orders',
|
||||
dialect: 'postgres',
|
||||
});
|
||||
|
||||
expect(requests).toEqual([
|
||||
{
|
||||
url: '/semantic-layer/query',
|
||||
body: {
|
||||
sources: [source],
|
||||
dialect: 'postgres',
|
||||
query: { measures: ['orders.order_count'], dimensions: [] },
|
||||
},
|
||||
},
|
||||
]);
|
||||
} finally {
|
||||
server.close();
|
||||
}
|
||||
});
|
||||
|
||||
it('posts source-generation JSON to a running HTTP daemon endpoint', async () => {
|
||||
const requests: Array<{ url: string | undefined; body: unknown }> = [];
|
||||
const server = createServer((request, response) => {
|
||||
const chunks: Buffer[] = [];
|
||||
request.on('data', (chunk: Buffer) => chunks.push(chunk));
|
||||
request.on('end', () => {
|
||||
requests.push({
|
||||
url: request.url,
|
||||
body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
|
||||
});
|
||||
response.writeHead(200, { 'content-type': 'application/json' });
|
||||
response.end(JSON.stringify(sourceGenerationDaemonResponse));
|
||||
});
|
||||
});
|
||||
|
||||
server.listen(0, '127.0.0.1');
|
||||
await once(server, 'listening');
|
||||
try {
|
||||
const address = server.address();
|
||||
if (!address || typeof address === 'string') {
|
||||
throw new Error('expected TCP server address');
|
||||
}
|
||||
const port = createHttpSemanticLayerComputePort({ baseUrl: `http://127.0.0.1:${address.port}` });
|
||||
|
||||
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
|
||||
sourceCount: 2,
|
||||
sources: sourceGenerationDaemonResponse.sources,
|
||||
});
|
||||
|
||||
expect(requests).toEqual([
|
||||
{
|
||||
url: '/semantic-layer/generate-sources',
|
||||
body: sourceGenerationDaemonPayload,
|
||||
},
|
||||
]);
|
||||
} finally {
|
||||
server.close();
|
||||
}
|
||||
});
|
||||
});
|
||||
304
packages/context/src/daemon/semantic-layer-compute.ts
Normal file
304
packages/context/src/daemon/semantic-layer-compute.ts
Normal file
|
|
@ -0,0 +1,304 @@
|
|||
import { request as httpRequest } from 'node:http';
|
||||
import { request as httpsRequest } from 'node:https';
|
||||
import { URL } from 'node:url';
|
||||
import { spawn } from 'node:child_process';
|
||||
import type { SemanticLayerQueryInput, SemanticLayerSource } from '../sl/index.js';
|
||||
|
||||
/** Result of `KloSemanticLayerComputePort.query`. */
export interface KloSemanticLayerComputeQueryResult {
  /** SQL reported by the daemon; empty string when the daemon returned no string. */
  sql: string;
  /** Dialect reported by the daemon, or the requested dialect when it reported none. */
  dialect: string;
  /** Column descriptors; non-object entries from the daemon are dropped. */
  columns: Array<Record<string, unknown>>;
  /** Plan object reported by the daemon; `{}` when absent or not an object. */
  plan: Record<string, unknown>;
}

/** Result of `KloSemanticLayerComputePort.validateSources`. */
export interface KloSemanticLayerComputeValidationResult {
  /** `true` only when the daemon returned exactly `true`. */
  valid: boolean;
  errors: string[];
  warnings: string[];
  /** Mirrors the daemon's `per_source_warnings` field. */
  perSourceWarnings: Record<string, string[]>;
}

/** Column description for source generation; `primaryKey` is sent to the daemon as `primary_key`. */
export interface KloSemanticLayerSourceGenerationColumnInput {
  name: string;
  type: string;
  primaryKey?: boolean;
  nullable?: boolean;
  comment?: string | null;
}

/** Table description for source generation. Optional fields are sent only when not `undefined`. */
export interface KloSemanticLayerSourceGenerationTableInput {
  name: string;
  catalog?: string | null;
  db?: string | null;
  comment?: string | null;
  columns: KloSemanticLayerSourceGenerationColumnInput[];
}

/** Relationship between two table columns; field names are sent to the daemon in snake_case. */
export interface KloSemanticLayerSourceGenerationLinkInput {
  fromTable: string;
  fromColumn: string;
  toTable: string;
  toColumn: string;
  relationshipType: string;
}

/** Input of `KloSemanticLayerComputePort.generateSources`. */
export interface KloSemanticLayerSourceGenerationInput {
  tables: KloSemanticLayerSourceGenerationTableInput[];
  links: KloSemanticLayerSourceGenerationLinkInput[];
  /** Sent as `'postgres'` when omitted. */
  dialect?: string;
}

/** Result of `KloSemanticLayerComputePort.generateSources`. */
export interface KloSemanticLayerSourceGenerationResult {
  /** Generated sources; non-object entries from the daemon are dropped. */
  sources: Array<Record<string, unknown>>;
  /** The daemon's `source_count` when numeric, otherwise the length of `sources`. */
  sourceCount: number;
}

/** Transport-independent access to the daemon's semantic-layer operations. */
export interface KloSemanticLayerComputePort {
  query(input: {
    sources: Array<Record<string, unknown> | SemanticLayerSource>;
    query: SemanticLayerQueryInput;
    dialect: string;
  }): Promise<KloSemanticLayerComputeQueryResult>;
  validateSources(input: {
    sources: Array<Record<string, unknown> | SemanticLayerSource>;
    dialect: string;
    recentlyTouched?: string[];
  }): Promise<KloSemanticLayerComputeValidationResult>;
  generateSources(input: KloSemanticLayerSourceGenerationInput): Promise<KloSemanticLayerSourceGenerationResult>;
}

/** Sub-commands understood by the stdio daemon; passed as the last command-line argument. */
export type KloDaemonCommand = 'semantic-query' | 'semantic-validate' | 'semantic-generate-sources';

/** Runs one stdio sub-command with a JSON payload and resolves with the JSON object it printed. */
export type KloDaemonJsonRunner = (
  subcommand: KloDaemonCommand,
  payload: Record<string, unknown>,
) => Promise<Record<string, unknown>>;

/** Posts a JSON payload to a daemon HTTP path and resolves with the JSON object it answered. */
export type KloDaemonHttpJsonRunner = (path: string, payload: Record<string, unknown>) => Promise<Record<string, unknown>>;

/** Options for the stdio-backed compute port. */
export interface PythonSemanticLayerComputeOptions {
  /** Executable to spawn; defaults to `'python'`. */
  command?: string;
  /** Arguments placed before the sub-command; default `['-m', 'klo_daemon']`. */
  args?: string[];
  cwd?: string;
  /** Extra environment variables, merged over `process.env`. */
  env?: NodeJS.ProcessEnv;
  /** Replaces process spawning entirely when provided. */
  runJson?: KloDaemonJsonRunner;
}

/** Options for the HTTP-backed compute port. */
export interface HttpSemanticLayerComputeOptions {
  /** Base URL of the daemon; endpoint paths are resolved relative to it. */
  baseUrl: string;
  /** Replaces the built-in HTTP client when provided. */
  requestJson?: KloDaemonHttpJsonRunner;
}
|
||||
|
||||
/**
 * Parses daemon output and guarantees that it is a JSON object.
 *
 * @param raw - Text produced by the daemon.
 * @param subcommand - Sub-command or HTTP path, used only to label error messages.
 * @returns The parsed object.
 * @throws Error when `raw` is not valid JSON (for example empty output or a stack trace) or when
 *   the parsed value is `null`, an array or a primitive.
 */
function parseJsonObject(raw: string, subcommand: string): Record<string, unknown> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    // A bare SyntaxError ("Unexpected end of JSON input") says nothing about where it came from.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`klo-daemon ${subcommand} returned invalid JSON: ${reason}`);
  }
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error(`klo-daemon ${subcommand} returned non-object JSON`);
  }
  return parsed as Record<string, unknown>;
}
|
||||
|
||||
/**
 * Builds a runner that executes one daemon sub-command per call as a child process.
 *
 * The payload is written to the child's stdin as a single JSON line and the child's stdout is
 * parsed as a JSON object once the process has closed.
 *
 * @param options - Executable, leading arguments, optional working directory and extra
 *   environment variables (merged over `process.env`).
 * @returns A runner that rejects when the process cannot be spawned, ends with a non-zero exit
 *   code or a signal, or prints something that is not a JSON object.
 */
function runProcessJson(
  options: Required<Pick<PythonSemanticLayerComputeOptions, 'command' | 'args'>> &
    Pick<PythonSemanticLayerComputeOptions, 'cwd' | 'env'>,
): KloDaemonJsonRunner {
  return async (subcommand: KloDaemonCommand, payload: Record<string, unknown>): Promise<Record<string, unknown>> =>
    new Promise((resolve, reject) => {
      const child = spawn(options.command, [...options.args, subcommand], {
        cwd: options.cwd,
        env: { ...process.env, ...options.env },
        stdio: ['pipe', 'pipe', 'pipe'],
      });
      const stdout: Buffer[] = [];
      const stderr: Buffer[] = [];

      child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk));
      child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk));
      child.on('error', reject);
      // A child that exits before reading its input makes the stdin pipe emit an error (EPIPE).
      // Without a listener that event is thrown as an uncaught exception; the failure itself is
      // reported by the 'close' handler below through the exit code and stderr.
      child.stdin.on('error', () => undefined);
      child.on('close', (code, signal) => {
        const stdoutText = Buffer.concat(stdout).toString('utf8').trim();
        const stderrText = Buffer.concat(stderr).toString('utf8').trim();
        if (code !== 0) {
          // `code` is null when the process was terminated by a signal.
          const status = signal ? `signal ${signal}` : `exit code ${code}`;
          reject(new Error(`klo-daemon ${subcommand} failed: ${stderrText || status}`));
          return;
        }
        try {
          resolve(parseJsonObject(stdoutText, subcommand));
        } catch (error) {
          reject(error);
        }
      });
      child.stdin.end(`${JSON.stringify(payload)}\n`);
    });
}
|
||||
|
||||
/** Returns `baseUrl` with exactly one trailing slash appended when it has none. */
function normalizedBaseUrl(baseUrl: string): string {
  if (baseUrl.endsWith('/')) {
    return baseUrl;
  }
  return `${baseUrl}/`;
}
|
||||
|
||||
/**
 * Builds a runner that POSTs JSON payloads to daemon endpoints below `baseUrl`.
 *
 * The leading slash of `path` is removed before resolution, so the path is appended to the base
 * URL (including any path prefix the base URL carries) instead of replacing it.
 *
 * @param baseUrl - Base URL of the daemon; `https:` selects the HTTPS client, anything else HTTP.
 * @returns A runner that rejects on request or response stream errors, on a status outside
 *   200-299 (the response body is included in the message) and on a body that is not a JSON
 *   object.
 */
function postJson(baseUrl: string): KloDaemonHttpJsonRunner {
  return async (path, payload) =>
    new Promise((resolve, reject) => {
      const target = new URL(path.replace(/^\//, ''), normalizedBaseUrl(baseUrl));
      const body = JSON.stringify(payload);
      const client = target.protocol === 'https:' ? httpsRequest : httpRequest;
      const request = client(
        target,
        {
          method: 'POST',
          headers: {
            accept: 'application/json',
            'content-type': 'application/json',
            'content-length': Buffer.byteLength(body),
          },
        },
        (response) => {
          const chunks: Buffer[] = [];
          response.on('data', (chunk: Buffer) => chunks.push(chunk));
          // A connection dropped mid-body is reported on the response stream, not on the
          // request; without this listener it would be thrown as an uncaught exception.
          response.on('error', reject);
          response.on('end', () => {
            const text = Buffer.concat(chunks).toString('utf8');
            const statusCode = response.statusCode ?? 0;
            if (statusCode < 200 || statusCode >= 300) {
              reject(new Error(`klo-daemon HTTP ${path} failed with ${statusCode}: ${text}`));
              return;
            }
            try {
              resolve(parseJsonObject(text, path));
            } catch (error) {
              reject(error);
            }
          });
        },
      );
      request.on('error', reject);
      request.end(body);
    });
}
|
||||
|
||||
/** Returns the string elements of `value` in order, or `[]` when `value` is not an array. */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const strings: string[] = [];
  for (const item of value) {
    if (typeof item === 'string') {
      strings.push(item);
    }
  }
  return strings;
}
|
||||
|
||||
/** Returns `value` itself when it is a non-null, non-array object, otherwise a new empty object. */
function recordValue(value: unknown): Record<string, unknown> {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return {};
  }
  return value as Record<string, unknown>;
}
|
||||
|
||||
/**
 * Returns the elements of `value` that are non-null, non-array objects, in order; `[]` when
 * `value` is not an array.
 */
function recordArray(value: unknown): Array<Record<string, unknown>> {
  if (!Array.isArray(value)) {
    return [];
  }
  const isPlainRecord = (item: unknown): item is Record<string, unknown> =>
    typeof item === 'object' && item !== null && !Array.isArray(item);
  return value.filter(isPlainRecord);
}
|
||||
|
||||
/**
 * Converts the camelCase source-generation input into the snake_case payload sent to the daemon.
 *
 * Optional table and column fields are copied only when they are not `undefined` (so `null` is
 * forwarded), and `dialect` falls back to `'postgres'`.
 */
function sourceGenerationPayload(input: KloSemanticLayerSourceGenerationInput): Record<string, unknown> {
  const tables = input.tables.map((table) => {
    const tablePayload: Record<string, unknown> = { name: table.name };
    if (table.catalog !== undefined) {
      tablePayload.catalog = table.catalog;
    }
    if (table.db !== undefined) {
      tablePayload.db = table.db;
    }
    if (table.comment !== undefined) {
      tablePayload.comment = table.comment;
    }
    tablePayload.columns = table.columns.map((column) => {
      const columnPayload: Record<string, unknown> = { name: column.name, type: column.type };
      if (column.primaryKey !== undefined) {
        columnPayload.primary_key = column.primaryKey;
      }
      if (column.nullable !== undefined) {
        columnPayload.nullable = column.nullable;
      }
      if (column.comment !== undefined) {
        columnPayload.comment = column.comment;
      }
      return columnPayload;
    });
    return tablePayload;
  });

  const links = input.links.map(({ fromTable, fromColumn, toTable, toColumn, relationshipType }) => ({
    from_table: fromTable,
    from_column: fromColumn,
    to_table: toTable,
    to_column: toColumn,
    relationship_type: relationshipType,
  }));

  return { tables, links, dialect: input.dialect ?? 'postgres' };
}
|
||||
|
||||
/**
 * Converts the daemon's source-generation response into the camelCase result.
 *
 * `sourceCount` is the daemon's `source_count` when that is a number, otherwise the number of
 * object entries found in `sources`.
 */
function sourceGenerationResult(raw: Record<string, unknown>): KloSemanticLayerSourceGenerationResult {
  const sources = recordArray(raw.sources);
  const reportedCount = raw.source_count;
  const sourceCount = typeof reportedCount === 'number' ? reportedCount : sources.length;
  return { sources, sourceCount };
}
|
||||
|
||||
/**
 * Creates a compute port that talks to the daemon through stdio sub-commands.
 *
 * Unless `options.runJson` is supplied, each call spawns `options.command` (default `'python'`)
 * with `options.args` (default `['-m', 'klo_daemon']`) followed by the sub-command name.
 *
 * Daemon responses are normalised defensively: fields with an unexpected type fall back to an
 * empty value instead of being passed through.
 */
export function createPythonSemanticLayerComputePort(
  options: PythonSemanticLayerComputeOptions = {},
): KloSemanticLayerComputePort {
  const command = options.command ?? 'python';
  const args = options.args ?? ['-m', 'klo_daemon'];
  const runJson = options.runJson ?? runProcessJson({ command, args, cwd: options.cwd, env: options.env });

  return {
    async query(input) {
      const raw = await runJson('semantic-query', {
        sources: input.sources,
        dialect: input.dialect,
        query: input.query,
      });
      return {
        sql: typeof raw.sql === 'string' ? raw.sql : '',
        dialect: typeof raw.dialect === 'string' ? raw.dialect : input.dialect,
        columns: recordArray(raw.columns),
        plan: recordValue(raw.plan),
      };
    },
    async validateSources(input) {
      const raw = await runJson('semantic-validate', {
        sources: input.sources,
        dialect: input.dialect,
        recently_touched: input.recentlyTouched,
      });
      // Normalise every entry so the declared Record<string, string[]> actually holds at
      // runtime, the same way `errors` and `warnings` are normalised.
      const perSourceWarnings: Record<string, string[]> = {};
      for (const [sourceName, warnings] of Object.entries(recordValue(raw.per_source_warnings))) {
        perSourceWarnings[sourceName] = stringArray(warnings);
      }
      return {
        valid: raw.valid === true,
        errors: stringArray(raw.errors),
        warnings: stringArray(raw.warnings),
        perSourceWarnings,
      };
    },
    async generateSources(input) {
      const raw = await runJson('semantic-generate-sources', sourceGenerationPayload(input));
      return sourceGenerationResult(raw);
    },
  };
}
|
||||
|
||||
/**
 * Creates a compute port that talks to a running daemon over HTTP.
 *
 * Requests are POSTed to `/semantic-layer/query`, `/semantic-layer/validate` and
 * `/semantic-layer/generate-sources`, using `options.requestJson` when supplied and the built-in
 * client bound to `options.baseUrl` otherwise.
 *
 * Daemon responses are normalised defensively: fields with an unexpected type fall back to an
 * empty value instead of being passed through.
 */
export function createHttpSemanticLayerComputePort(
  options: HttpSemanticLayerComputeOptions,
): KloSemanticLayerComputePort {
  const requestJson = options.requestJson ?? postJson(options.baseUrl);

  return {
    async query(input) {
      const raw = await requestJson('/semantic-layer/query', {
        sources: input.sources,
        dialect: input.dialect,
        query: input.query,
      });
      return {
        sql: typeof raw.sql === 'string' ? raw.sql : '',
        dialect: typeof raw.dialect === 'string' ? raw.dialect : input.dialect,
        columns: recordArray(raw.columns),
        plan: recordValue(raw.plan),
      };
    },
    async validateSources(input) {
      const raw = await requestJson('/semantic-layer/validate', {
        sources: input.sources,
        dialect: input.dialect,
        recently_touched: input.recentlyTouched,
      });
      // Normalise every entry so the declared Record<string, string[]> actually holds at
      // runtime, the same way `errors` and `warnings` are normalised.
      const perSourceWarnings: Record<string, string[]> = {};
      for (const [sourceName, warnings] of Object.entries(recordValue(raw.per_source_warnings))) {
        perSourceWarnings[sourceName] = stringArray(warnings);
      }
      return {
        valid: raw.valid === true,
        errors: stringArray(raw.errors),
        warnings: stringArray(raw.warnings),
        perSourceWarnings,
      };
    },
    async generateSources(input) {
      const raw = await requestJson('/semantic-layer/generate-sources', sourceGenerationPayload(input));
      return sourceGenerationResult(raw);
    },
  };
}
|
||||
12
packages/context/src/index.test.ts
Normal file
12
packages/context/src/index.test.ts
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
|
||||
import { kloContextPackageInfo } from './index.js';
|
||||
|
||||
// Guards the public package descriptor exported from the package entry point.
describe('kloContextPackageInfo', () => {
  it('identifies the context package', () => {
    const expectedInfo = {
      name: '@klo/context',
      version: '0.0.0-private',
    };

    expect(kloContextPackageInfo).toEqual(expectedInfo);
  });
});
|
||||
144
packages/context/src/index.ts
Normal file
144
packages/context/src/index.ts
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
/** Static descriptor of the context package; both fields are fixed literal types. */
export interface KloContextPackageInfo {
  /** Published package name. */
  name: '@klo/context';
  /** Package version marker. */
  version: '0.0.0-private';
}
|
||||
|
||||
/** Descriptor instance for this package (name and version marker). */
export const kloContextPackageInfo: KloContextPackageInfo = {
  name: '@klo/context',
  version: '0.0.0-private',
};
|
||||
|
||||
export * from './agent/index.js';
|
||||
export * from './core/index.js';
|
||||
export * from './daemon/index.js';
|
||||
export * from './ingest/index.js';
|
||||
export * from './llm/index.js';
|
||||
export type {
|
||||
CaptureSession,
|
||||
CaptureSignals,
|
||||
MemoryAgentInput,
|
||||
MemoryAgentResult,
|
||||
MemoryAgentServiceDeps,
|
||||
MemoryAgentSettings,
|
||||
MemoryAgentSourceType,
|
||||
MemoryCommitMessagePort,
|
||||
MemoryConnectionPort,
|
||||
MemoryFileStorePort,
|
||||
MemoryKnowledgeSlRefsPort,
|
||||
MemoryLockPort,
|
||||
MemorySlSourceReconcilerPort,
|
||||
MemoryTelemetryPort,
|
||||
MemoryToolSetLike,
|
||||
MemoryToolsetFactoryPort,
|
||||
} from './memory/index.js';
|
||||
export * from './project/index.js';
|
||||
export * from './prompts/index.js';
|
||||
export * from './search/index.js';
|
||||
export * from './sql-analysis/index.js';
|
||||
export type {
|
||||
KloColumnAnalysisResult,
|
||||
KloColumnDescriptionPromptInput,
|
||||
KloColumnEmbeddingForeignKeys,
|
||||
KloColumnEmbeddingTextInput,
|
||||
KloColumnSampleInput,
|
||||
KloColumnSampleResult,
|
||||
KloColumnSampleUpdate,
|
||||
KloColumnStatsInput,
|
||||
KloColumnStatsResult,
|
||||
KloConnectionDriver,
|
||||
KloConnectorCapabilities,
|
||||
KloCredentialEnvelope,
|
||||
KloCredentialEnvReference,
|
||||
KloCredentialFileReference,
|
||||
KloDataDictionaryColumnState,
|
||||
KloDataDictionarySampleDecision,
|
||||
KloDataDictionarySettings,
|
||||
KloDataDictionarySkipReason,
|
||||
KloDataSourceDescriptionPromptInput,
|
||||
KloDescriptionCachePort,
|
||||
KloDescriptionColumn,
|
||||
KloDescriptionColumnTable,
|
||||
KloDescriptionGenerationSettings,
|
||||
KloDescriptionGeneratorOptions,
|
||||
KloDescriptionSource,
|
||||
KloDescriptionTableInput,
|
||||
KloDescriptionUpdate,
|
||||
KloEmbeddingPort as KloScanEmbeddingPort,
|
||||
KloEmbeddingUpdate,
|
||||
KloEnrichedColumn,
|
||||
KloEnrichedRelationship,
|
||||
KloEnrichedSchema,
|
||||
KloEnrichedTable,
|
||||
KloEnrichmentScanPhaseResult,
|
||||
KloGenerateColumnDescriptionsInput,
|
||||
KloGenerateDataSourceDescriptionInput,
|
||||
KloGenerateTableDescriptionInput,
|
||||
KloOptionalConnectorCapabilities,
|
||||
KloProgressPort,
|
||||
KloQueryResult as KloScanQueryResult,
|
||||
KloReadOnlyQueryInput,
|
||||
KloRelationshipEndpoint,
|
||||
KloRelationshipSource,
|
||||
KloRelationshipType,
|
||||
KloRelationshipUpdate,
|
||||
KloResolvedCredentialEnvelope,
|
||||
KloScanArtifactPaths,
|
||||
KloScanConnector,
|
||||
KloScanContext,
|
||||
KloScanDiffSummary,
|
||||
KloScanEnrichmentSummary,
|
||||
KloScanInput,
|
||||
KloScanLoggerPort,
|
||||
KloScanMetadataStore,
|
||||
KloScanMode,
|
||||
KloScanOrchestratorOptions,
|
||||
KloScanOrchestratorRunInput,
|
||||
KloScanOrchestratorRunResult,
|
||||
KloScanRelationshipSummary,
|
||||
KloScanReport,
|
||||
KloScanTrigger,
|
||||
KloScanWarning,
|
||||
KloScanWarningCode,
|
||||
KloSchemaColumn,
|
||||
KloSchemaDimensionType,
|
||||
KloSchemaForeignKey,
|
||||
KloSchemaScope,
|
||||
KloSchemaSnapshot,
|
||||
KloSchemaTable,
|
||||
KloSchemaTableKind,
|
||||
KloSkippedRelationship,
|
||||
KloStructuralScanPhaseResult,
|
||||
KloStructuralSyncPlan,
|
||||
KloStructuralSyncStats,
|
||||
KloTableDescriptionPromptInput,
|
||||
KloTableRef,
|
||||
KloTableSampleInput,
|
||||
KloTableSampleResult,
|
||||
KloColumnTypeMapping,
|
||||
} from './scan/index.js';
|
||||
export {
|
||||
appendKloWordLimitInstruction,
|
||||
buildKloColumnDescriptionPrompt,
|
||||
buildKloColumnEmbeddingText,
|
||||
buildKloDataSourceDescriptionPrompt,
|
||||
buildKloTableDescriptionPrompt,
|
||||
createKloConnectorCapabilities,
|
||||
defaultKloDataDictionarySettings,
|
||||
inferKloDimensionType,
|
||||
isKloDataDictionaryCandidate,
|
||||
kloColumnTypeMappingFromNative,
|
||||
KloDescriptionGenerator,
|
||||
KloScanOrchestrator,
|
||||
normalizeKloNativeType,
|
||||
REDACTED_KLO_CREDENTIAL_VALUE,
|
||||
redactKloCredentialEnvelope,
|
||||
redactKloCredentialValue,
|
||||
redactKloScanMetadata,
|
||||
redactKloScanReport,
|
||||
redactKloScanWarning,
|
||||
shouldKloSampleColumnForDictionary,
|
||||
} from './scan/index.js';
|
||||
export * from './skills/index.js';
|
||||
export * from './sl/index.js';
|
||||
export * from './tools/index.js';
|
||||
export * from './wiki/index.js';
|
||||
42
packages/context/src/ingest/action-identity.test.ts
Normal file
42
packages/context/src/ingest/action-identity.test.ts
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { actionTargetConnectionId, memoryActionIdentity } from './action-identity.js';
|
||||
|
||||
describe('memory action target identity', () => {
  it('keys SL actions by target connection and wiki actions by run connection', () => {
    // An SL action with an explicit target connection is keyed by that connection.
    const slWithTarget = memoryActionIdentity(
      { target: 'sl', type: 'created', key: 'orders', detail: '', targetConnectionId: 'warehouse-b' },
      'looker-run',
    );
    expect(slWithTarget).toBe('sl:warehouse-b:orders');

    // Without a target connection the run connection is used.
    const slWithoutTarget = memoryActionIdentity(
      { target: 'sl', type: 'created', key: 'orders', detail: '' },
      'warehouse-a',
    );
    expect(slWithoutTarget).toBe('sl:warehouse-a:orders');

    // Wiki actions ignore targetConnectionId entirely.
    const wikiIdentity = memoryActionIdentity(
      {
        target: 'wiki',
        type: 'created',
        key: 'knowledge/global/orders.md',
        detail: '',
        targetConnectionId: 'ignored',
      },
      'looker-run',
    );
    expect(wikiIdentity).toBe('wiki:looker-run:knowledge/global/orders.md');
  });

  it('resolves action target connection only for SL actions', () => {
    const slConnection = actionTargetConnectionId(
      { target: 'sl', type: 'updated', key: 'orders', detail: '', targetConnectionId: 'warehouse-b' },
      'looker-run',
    );
    expect(slConnection).toBe('warehouse-b');

    const wikiConnection = actionTargetConnectionId(
      { target: 'wiki', type: 'updated', key: 'orders', detail: '' },
      'looker-run',
    );
    expect(wikiConnection).toBe('looker-run');
  });
});
|
||||
9
packages/context/src/ingest/action-identity.ts
Normal file
9
packages/context/src/ingest/action-identity.ts
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
import type { MemoryAction } from '../memory/index.js';
|
||||
|
||||
/**
 * Resolves the connection an action applies to.
 *
 * @param action - The memory action being attributed.
 * @param runConnectionId - Connection of the run that produced the action.
 * @returns `action.targetConnectionId` for `'sl'` actions that carry one;
 *   `runConnectionId` in every other case.
 */
export function actionTargetConnectionId(action: MemoryAction, runConnectionId: string): string {
  if (action.target !== 'sl') {
    return runConnectionId;
  }
  return action.targetConnectionId ?? runConnectionId;
}
|
||||
|
||||
/**
 * Builds the identity string of a memory action in the form
 * `<target>:<connectionId>:<key>`, where the connection id is resolved by
 * `actionTargetConnectionId`.
 *
 * @param action - The memory action to identify.
 * @param runConnectionId - Connection of the run that produced the action.
 * @returns The colon-separated identity string.
 */
export function memoryActionIdentity(action: MemoryAction, runConnectionId: string): string {
  const connectionId = actionTargetConnectionId(action, runConnectionId);
  return `${action.target}:${connectionId}:${action.key}`;
}
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { DbtParsedTable } from './parse-schema.js';
|
||||
import { findMatchingKloTable, matchDbtTables, type DbtHostTableLite } from './match-tables.js';
|
||||
|
||||
const hostTables: DbtHostTableLite[] = [
|
||||
{ id: '1', name: 'orders', catalog: 'warehouse', db: 'analytics', columns: [{ id: 'c1', name: 'id' }] },
|
||||
{ id: '2', name: 'orders', catalog: 'warehouse', db: 'staging', columns: [{ id: 'c2', name: 'id' }] },
|
||||
{ id: '3', name: 'customers', catalog: null, db: null, columns: [{ id: 'c3', name: 'id' }] },
|
||||
];
|
||||
|
||||
/**
 * Test fixture factory: returns a dbt model table named `orders` with no
 * description, database, schema or columns, overridden by the fields in `input`.
 */
function table(input: Partial<DbtParsedTable>): DbtParsedTable {
  const defaults: DbtParsedTable = {
    name: 'orders',
    description: null,
    database: null,
    schema: null,
    columns: [],
    resourceType: 'model',
  };
  return { ...defaults, ...input };
}
|
||||
|
||||
describe('dbt descriptions table matching', () => {
  it('uses schema plus name first and checks catalog when dbt database is present', () => {
    const dbtTable = table({ database: 'warehouse', schema: 'analytics' });

    expect(findMatchingKloTable(dbtTable, hostTables, null)?.id).toBe('1');
  });

  it('does not fall back to name-only for source tables', () => {
    const sourceTable = table({ resourceType: 'source' });

    expect(findMatchingKloTable(sourceTable, hostTables, null)).toBeUndefined();
  });

  it('uses targetSchema for models and name-only only when unique', () => {
    // targetSchema stands in for the schema the model itself does not declare.
    expect(findMatchingKloTable(table({ resourceType: 'model' }), hostTables, 'staging')?.id).toBe('2');

    // "customers" exists once, so the name-only fallback is unambiguous.
    const customers = table({ name: 'customers', resourceType: 'model' });
    expect(findMatchingKloTable(customers, hostTables, null)?.id).toBe('3');

    // "orders" exists in two schemas, so the name-only fallback must give up.
    expect(findMatchingKloTable(table({ resourceType: 'model' }), hostTables, null)).toBeUndefined();
  });

  it('summarizes matched columns and descriptions', () => {
    const customers = table({
      name: 'customers',
      description: 'Customers',
      columns: [
        { name: 'id', description: 'Primary key', dataType: null },
        { name: 'missing', description: 'Missing', dataType: null },
      ],
    });

    const matches = matchDbtTables([customers], hostTables, null);

    expect(matches).toEqual([
      {
        dbtTable: 'customers',
        dbtSchema: null,
        dbtDatabase: null,
        hostTableId: '3',
        hostTableName: 'customers',
        matched: true,
        tableDescriptionAction: 'import',
        tableDescriptionFound: true,
        columnsToImport: 1,
        columnsMatched: 1,
        columnsTotal: 2,
        columnDescriptionsFound: 1,
      },
    ]);
  });
});
|
||||
|
|
@ -0,0 +1,127 @@
|
|||
import type { DbtParsedTable } from './parse-schema.js';
|
||||
|
||||
/** Minimal view of a host table used when matching dbt tables against it. */
export interface DbtHostTableLite {
  /** Host-side identifier, reported back as `hostTableId` on a match. */
  id: string;
  /** Table name; compared case-insensitively with the dbt table name. */
  name: string;
  /** Compared case-insensitively with the dbt `database` when both are set. */
  catalog: string | null;
  /** Compared case-insensitively with the dbt schema (or the target schema). */
  db: string | null;
  /** Columns of the host table; names are compared case-insensitively. */
  columns: { id: string; name: string }[];
}
|
||||
|
||||
/** Outcome of matching one dbt table against the host tables. */
export interface DbtTableMatch {
  /** Name of the dbt table. */
  dbtTable: string;
  /** Schema declared on the dbt table, if any. */
  dbtSchema: string | null;
  /** Database declared on the dbt table, if any. */
  dbtDatabase: string | null;
  /** Id of the matched host table, or `null` when nothing matched. */
  hostTableId: string | null;
  /** Name of the matched host table, or `null` when nothing matched. */
  hostTableName: string | null;
  /** Whether a host table was found. */
  matched: boolean;
  /** `'import'` only for a matched table whose dbt description is non-empty. */
  tableDescriptionAction: 'skip' | 'import';
  /** Whether the dbt table carries a non-empty description. */
  tableDescriptionFound: boolean;
  /** Matched columns that have a description; always 0 for unmatched tables. */
  columnsToImport: number;
  /** dbt columns that exist on the host table; always 0 for unmatched tables. */
  columnsMatched: number;
  /** Number of columns declared on the dbt table. */
  columnsTotal: number;
  /**
   * Columns with a description: counted over matched columns for matched
   * tables, and over all dbt columns for unmatched tables.
   */
  columnDescriptionsFound: number;
}
|
||||
|
||||
/**
 * Matches every dbt table against the host tables and summarises, per table,
 * what could be imported.
 *
 * @param dbtTables - Tables parsed from dbt schema files.
 * @param hostTables - Candidate host tables.
 * @param targetSchema - Schema used for dbt tables that do not declare one.
 * @returns One `DbtTableMatch` per dbt table, in input order.
 */
export function matchDbtTables(
  dbtTables: DbtParsedTable[],
  hostTables: DbtHostTableLite[],
  targetSchema?: string | null,
): DbtTableMatch[] {
  return dbtTables.map((dbtTable): DbtTableMatch => {
    const hostTable = findMatchingKloTable(dbtTable, hostTables, targetSchema);
    const hasDescription = Boolean(dbtTable.description);

    if (hostTable) {
      return {
        dbtTable: dbtTable.name,
        dbtSchema: dbtTable.schema,
        dbtDatabase: dbtTable.database,
        hostTableId: hostTable.id,
        hostTableName: hostTable.name,
        matched: true,
        tableDescriptionAction: hasDescription ? 'import' : 'skip',
        tableDescriptionFound: hasDescription,
        ...analyzeColumns(dbtTable, hostTable),
      };
    }

    // No host table: nothing can be imported, but still report what dbt declares.
    const describedColumns = dbtTable.columns.filter((column) => Boolean(column.description));
    return {
      dbtTable: dbtTable.name,
      dbtSchema: dbtTable.schema,
      dbtDatabase: dbtTable.database,
      hostTableId: null,
      hostTableName: null,
      matched: false,
      tableDescriptionAction: 'skip',
      tableDescriptionFound: hasDescription,
      columnsToImport: 0,
      columnsMatched: 0,
      columnsTotal: dbtTable.columns.length,
      columnDescriptionsFound: describedColumns.length,
    };
  });
}
|
||||
|
||||
/**
 * Finds the host table that corresponds to a dbt table.
 *
 * Matching runs in two stages, all comparisons being case-insensitive:
 * 1. Strict: when a schema is known (the table's own schema, else
 *    `targetSchema`), the first host table with the same name and `db` wins.
 *    If the dbt table declares a database and the host table has a catalog,
 *    those must agree as well.
 * 2. Name-only fallback: skipped for `'source'` tables; otherwise a host table
 *    is returned only when exactly one has the same name.
 *
 * @param dbtTable - Table parsed from a dbt schema file.
 * @param hostTables - Candidate host tables.
 * @param targetSchema - Schema used when the dbt table does not declare one.
 * @returns The matching host table, or `undefined` when there is none or the
 *   name-only fallback is ambiguous.
 */
export function findMatchingKloTable(
  dbtTable: DbtParsedTable,
  hostTables: DbtHostTableLite[],
  targetSchema?: string | null,
): DbtHostTableLite | undefined {
  const wantedName = dbtTable.name.toLowerCase();
  const effectiveSchema = dbtTable.schema ?? targetSchema ?? null;

  if (effectiveSchema) {
    // Normalise the comparison keys once instead of once per host table.
    const wantedSchema = effectiveSchema.toLowerCase();
    const wantedCatalog = dbtTable.database ? dbtTable.database.toLowerCase() : null;

    const strictMatch = hostTables.find((table) => {
      if (table.name.toLowerCase() !== wantedName || table.db?.toLowerCase() !== wantedSchema) {
        return false;
      }
      // The catalog is only checked when both sides actually provide one.
      if (wantedCatalog !== null && table.catalog) {
        return table.catalog.toLowerCase() === wantedCatalog;
      }
      return true;
    });
    if (strictMatch) {
      return strictMatch;
    }
  }

  // Source tables never fall back to a name-only match.
  if (dbtTable.resourceType === 'source') {
    return undefined;
  }

  const sameName = hostTables.filter((table) => table.name.toLowerCase() === wantedName);
  return sameName.length === 1 ? sameName[0] : undefined;
}
|
||||
|
||||
/**
 * Counts how the columns of a dbt table line up with a matched host table.
 *
 * Column names are compared case-insensitively. Only dbt columns that exist on
 * the host table are considered for import, so `columnsToImport` and
 * `columnDescriptionsFound` both count matched columns with a non-empty
 * description.
 *
 * @param dbtTable - Table parsed from a dbt schema file.
 * @param hostTable - Host table the dbt table was matched to.
 * @returns The column counters of a `DbtTableMatch`.
 */
function analyzeColumns(
  dbtTable: DbtParsedTable,
  hostTable: DbtHostTableLite,
): Pick<DbtTableMatch, 'columnsToImport' | 'columnsMatched' | 'columnsTotal' | 'columnDescriptionsFound'> {
  // Index host column names once so each dbt column is a constant-time lookup
  // instead of a linear scan over the host columns.
  const hostColumnNames = new Set(hostTable.columns.map((column) => column.name.toLowerCase()));

  let columnsMatched = 0;
  let describedMatches = 0;

  for (const dbtColumn of dbtTable.columns) {
    if (!hostColumnNames.has(dbtColumn.name.toLowerCase())) {
      continue;
    }
    columnsMatched++;
    if (dbtColumn.description) {
      describedMatches++;
    }
  }

  return {
    columnsToImport: describedMatches,
    columnsMatched,
    columnsTotal: dbtTable.columns.length,
    columnDescriptionsFound: describedMatches,
  };
}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { ParsedSemanticModel } from '../metricflow/deep-parse.js';
|
||||
import { mergeSemanticModelTables } from './merge-semantic-model-tables.js';
|
||||
import type { DbtSchemaParseResult } from './parse-schema.js';
|
||||
|
||||
const semanticModel: ParsedSemanticModel = {
|
||||
name: 'orders_semantic',
|
||||
description: 'Order facts',
|
||||
modelRef: 'fct_orders',
|
||||
dimensions: [
|
||||
{ name: 'status', column: 'status', type: 'categorical', description: 'Order status' },
|
||||
{ name: 'ordered_at', column: 'ordered_at', type: 'time' },
|
||||
],
|
||||
measures: [],
|
||||
entities: [],
|
||||
defaultTimeDimension: null,
|
||||
};
|
||||
|
||||
describe('mergeSemanticModelTables', () => {
  it('adds missing MetricFlow model refs as dbt model tables', () => {
    const input: DbtSchemaParseResult = { projectName: null, dbtVersion: null, tables: [], relationships: [] };

    const merged = mergeSemanticModelTables(input, [semanticModel]);

    // The new table is named after the model ref; time dimensions become TIMESTAMP columns.
    expect(merged).toEqual({
      projectName: null,
      dbtVersion: null,
      relationships: [],
      tables: [
        {
          name: 'fct_orders',
          description: 'Order facts',
          database: null,
          schema: null,
          resourceType: 'model',
          columns: [
            { name: 'status', description: 'Order status', dataType: null },
            { name: 'ordered_at', description: null, dataType: 'TIMESTAMP' },
          ],
        },
      ],
    });
  });

  it('does not add a duplicate table when schema parsing already found the model ref', () => {
    // Same table as the model ref, differing only in letter case.
    const input: DbtSchemaParseResult = {
      projectName: null,
      dbtVersion: null,
      relationships: [],
      tables: [
        {
          name: 'FCT_ORDERS',
          description: 'Existing',
          database: null,
          schema: null,
          resourceType: 'model',
          columns: [],
        },
      ],
    };

    const merged = mergeSemanticModelTables(input, [semanticModel]);

    expect(merged.tables).toHaveLength(1);
  });
});
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
import type { ParsedSemanticModel } from '../metricflow/deep-parse.js';
|
||||
import type { DbtSchemaParseResult } from './parse-schema.js';
|
||||
|
||||
/**
 * Adds a dbt model table for every semantic model whose `modelRef` is not
 * already present among the parsed tables (names compared case-insensitively).
 *
 * The input is not mutated: the result holds fresh `tables` and
 * `relationships` arrays. Added tables take the semantic model's description,
 * one column per dimension, and `'TIMESTAMP'` as data type for time dimensions.
 *
 * @param parseResult - Result of parsing the dbt schema files.
 * @param semanticModels - Parsed MetricFlow semantic models.
 * @returns A copy of `parseResult` extended with the missing tables.
 */
export function mergeSemanticModelTables(
  parseResult: DbtSchemaParseResult,
  semanticModels: ParsedSemanticModel[],
): DbtSchemaParseResult {
  const tables = [...parseResult.tables];
  const relationships = [...parseResult.relationships];

  const knownNames = new Set<string>();
  for (const existing of tables) {
    knownNames.add(existing.name.toLowerCase());
  }

  for (const model of semanticModels) {
    const tableName = model.modelRef;
    const lookupKey = tableName.toLowerCase();
    if (knownNames.has(lookupKey)) {
      continue;
    }

    const columns = model.dimensions.map((dimension) => {
      const dataType = dimension.type === 'time' ? 'TIMESTAMP' : null;
      return {
        name: dimension.column,
        description: dimension.description ?? null,
        dataType,
      };
    });

    tables.push({
      name: tableName,
      description: model.description,
      database: null,
      schema: null,
      columns,
      resourceType: 'model',
    });
    // Remember the name so a later model with the same ref is not added twice.
    knownNames.add(lookupKey);
  }

  return { ...parseResult, tables, relationships };
}
|
||||
|
|
@ -0,0 +1,214 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { parseDbtSchemaFile, parseDbtSchemaFiles } from './parse-schema.js';
|
||||
|
||||
describe('dbt descriptions schema parser', () => {
|
||||
it('resolves shared dbt vars and defaults before parsing schema YAML', () => {
|
||||
const result = parseDbtSchemaFile(
|
||||
`
|
||||
version: 2
|
||||
sources:
|
||||
- name: raw
|
||||
database: "{{ var('database') }}"
|
||||
schema: "{{ var('schema', 'fallback_schema') }}"
|
||||
tables:
|
||||
- name: orders
|
||||
identifier: fct_orders
|
||||
description: "Orders from {{ var('database') }}"
|
||||
columns:
|
||||
- name: customer_id
|
||||
description: "Customer id"
|
||||
tests:
|
||||
- relationships:
|
||||
to: ref('customers')
|
||||
field: id
|
||||
models:
|
||||
- name: "{{ var('model_name', 'orders_model') }}"
|
||||
schema: "{{ var('model_schema') }}"
|
||||
columns:
|
||||
- name: id
|
||||
description: "Order id"
|
||||
`,
|
||||
{ path: 'models/schema.yml', variables: new Map([['database', 'analytics'], ['model_schema', 'mart']]) },
|
||||
);
|
||||
|
||||
expect(result.tables).toEqual([
|
||||
{
|
||||
name: 'fct_orders',
|
||||
description: 'Orders from analytics',
|
||||
database: 'analytics',
|
||||
schema: 'fallback_schema',
|
||||
columns: [
|
||||
{
|
||||
name: 'customer_id',
|
||||
description: 'Customer id',
|
||||
dataType: null,
|
||||
dataTests: [{ name: 'relationships', package: 'dbt', kwargs: { to: "ref('customers')", field: 'id' } }],
|
||||
},
|
||||
],
|
||||
resourceType: 'source',
|
||||
},
|
||||
{
|
||||
name: 'orders_model',
|
||||
description: null,
|
||||
database: null,
|
||||
schema: 'mart',
|
||||
columns: [{ name: 'id', description: 'Order id', dataType: null }],
|
||||
resourceType: 'model',
|
||||
},
|
||||
]);
|
||||
expect(result.relationships).toEqual([
|
||||
{
|
||||
fromTable: 'fct_orders',
|
||||
fromColumn: 'customer_id',
|
||||
toTable: 'customers',
|
||||
toColumn: 'id',
|
||||
fromSchema: 'fallback_schema',
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('deduplicates tables by database schema and name while merging columns', () => {
|
||||
const result = parseDbtSchemaFiles([
|
||||
{
|
||||
path: 'models/a.yml',
|
||||
content: `
|
||||
version: 2
|
||||
models:
|
||||
- name: orders
|
||||
description: Orders
|
||||
columns:
|
||||
- name: id
|
||||
description: Primary key
|
||||
`,
|
||||
},
|
||||
{
|
||||
path: 'models/b.yml',
|
||||
content: `
|
||||
version: 2
|
||||
models:
|
||||
- name: orders
|
||||
columns:
|
||||
- name: status
|
||||
description: Status
|
||||
- name: id
|
||||
data_type: integer
|
||||
`,
|
||||
},
|
||||
]);
|
||||
|
||||
expect(result.tables).toEqual([
|
||||
{
|
||||
name: 'orders',
|
||||
description: 'Orders',
|
||||
database: null,
|
||||
schema: null,
|
||||
resourceType: 'model',
|
||||
columns: [
|
||||
{ name: 'id', description: 'Primary key', dataType: 'integer' },
|
||||
{ name: 'status', description: 'Status', dataType: null },
|
||||
],
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('returns an empty result for malformed YAML and preserves unresolved Jinja text', () => {
|
||||
expect(parseDbtSchemaFile('{{{{ invalid yaml', { path: 'broken.yml' })).toEqual({
|
||||
projectName: null,
|
||||
dbtVersion: null,
|
||||
tables: [],
|
||||
relationships: [],
|
||||
});
|
||||
|
||||
const unresolved = parseDbtSchemaFile(
|
||||
`
|
||||
version: 2
|
||||
models:
|
||||
- name: "{{ var('missing_model') }}"
|
||||
`,
|
||||
{ variables: new Map() },
|
||||
);
|
||||
expect(unresolved.tables[0]?.name).toBe("{{ var('missing_model') }}");
|
||||
});
|
||||
|
||||
it('extracts data tests, constraints, enum values, tags, and freshness', () => {
|
||||
const result = parseDbtSchemaFile(`
|
||||
version: 2
|
||||
sources:
|
||||
- name: raw
|
||||
schema: jaffle
|
||||
tags: ["raw"]
|
||||
tables:
|
||||
- name: customers
|
||||
tags: ["core"]
|
||||
loaded_at_field: updated_at
|
||||
freshness:
|
||||
warn_after: { count: 12, period: hour }
|
||||
columns:
|
||||
- name: id
|
||||
tests:
|
||||
- not_null
|
||||
- unique
|
||||
- name: status
|
||||
data_tests:
|
||||
- accepted_values:
|
||||
values: ['active', 'inactive']
|
||||
models:
|
||||
- name: orders
|
||||
tags: ["finance"]
|
||||
loaded_at_field: run_at
|
||||
columns:
|
||||
- name: status
|
||||
data_tests:
|
||||
- dbt_utils.expression_is_true:
|
||||
expression: "status is not null"
|
||||
- accepted_values: ['placed', 'shipped']
|
||||
`);
|
||||
|
||||
const customers = result.tables.find((table) => table.name === 'customers');
|
||||
expect(customers?.tagsDbt).toEqual(['raw', 'core']);
|
||||
expect(customers?.freshnessDbt?.loadedAtField).toBe('updated_at');
|
||||
expect(customers?.freshnessDbt?.raw).toBeDefined();
|
||||
const id = customers?.columns.find((column) => column.name === 'id');
|
||||
expect(id?.constraints?.dbt).toEqual({ not_null: true, unique: true });
|
||||
const status = customers?.columns.find((column) => column.name === 'status');
|
||||
expect(status?.enumValuesDbt).toEqual(['active', 'inactive']);
|
||||
|
||||
const orders = result.tables.find((table) => table.name === 'orders');
|
||||
expect(orders?.tagsDbt).toEqual(['finance']);
|
||||
expect(orders?.freshnessDbt?.loadedAtField).toBe('run_at');
|
||||
const ordersStatus = orders?.columns.find((column) => column.name === 'status');
|
||||
expect(ordersStatus?.enumValuesDbt).toEqual(['placed', 'shipped']);
|
||||
expect(ordersStatus?.dataTests).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({ package: 'dbt_utils', name: 'expression_is_true' }),
|
||||
expect.objectContaining({ package: 'dbt', name: 'accepted_values' }),
|
||||
]),
|
||||
);
|
||||
});
|
||||
|
||||
it('parses relationships from model column data tests', () => {
|
||||
const result = parseDbtSchemaFile(`
|
||||
version: 2
|
||||
models:
|
||||
- name: orders
|
||||
schema: public
|
||||
columns:
|
||||
- name: customer_id
|
||||
data_tests:
|
||||
- relationships:
|
||||
arguments:
|
||||
to: "ref('customers')"
|
||||
field: id
|
||||
`);
|
||||
|
||||
expect(result.relationships).toEqual([
|
||||
{
|
||||
fromTable: 'orders',
|
||||
fromColumn: 'customer_id',
|
||||
toTable: 'customers',
|
||||
toColumn: 'id',
|
||||
fromSchema: 'public',
|
||||
},
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,655 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import { parse as parseYaml } from 'yaml';
|
||||
import { type KloLogger, noopLogger } from '../../../core/index.js';
|
||||
import { resolveJinjaVariables } from '../../dbt-shared/project-vars.js';
|
||||
|
||||
/** A column read from a dbt schema file. */
export interface DbtParsedColumn {
  /** Column name as written in the schema file. */
  name: string;
  /** Normalised column description; may be `null`. */
  description: string | null;
  /** The column's `data_type`, or `null` when the schema file omits it. */
  dataType: string | null;
  /** Data tests declared on the column; present only when at least one was parsed. */
  dataTests?: Array<DbtDataTestRef>;
  /** Constraints derived from the column's data tests. */
  constraints?: DbtColumnConstraints;
  /** Values taken from the column's data tests (e.g. an `accepted_values` test). */
  enumValuesDbt?: Array<string>;
}
|
||||
|
||||
/** Reference to a data test attached to a column. */
export interface DbtDataTestRef {
  /** Test name without its package prefix, e.g. `relationships`. */
  name: string;
  /** Package the test belongs to, e.g. `dbt` or `dbt_utils`. */
  package: string;
  /** Arguments the test was declared with, when it has any. */
  kwargs?: { [key: string]: unknown };
}
|
||||
|
||||
/** Column constraints, grouped under the tool that declared them. */
export interface DbtColumnConstraints {
  /** Constraints declared through dbt. */
  dbt: {
    /** Set when the column is declared not-null. */
    not_null?: boolean;
    /** Set when the column is declared unique. */
    unique?: boolean;
  };
}
|
||||
|
||||
/** A relationship between two table columns, read from a dbt schema file. */
export interface DbtParsedRelationship {
  /** Table that owns the referencing column. */
  fromTable: string;
  /** Referencing column. */
  fromColumn: string;
  /** Referenced table. */
  toTable: string;
  /** Referenced column. */
  toColumn: string;
  /** Schema of the referencing table, when known. */
  fromSchema?: string;
  /** Schema of the referenced table, when known. */
  toSchema?: string;
  /** Optional free-text description of the relationship. */
  description?: string;
}
|
||||
|
||||
/** A source table or model read from a dbt schema file. */
export interface DbtParsedTable {
  /** Table name; for source tables the `identifier` is preferred over `name`. */
  name: string;
  /** Normalised table description; may be `null`. */
  description: string | null;
  /** Declared database, or `null` when none is declared. */
  database: string | null;
  /** Declared schema; for source tables it falls back to the source name. */
  schema: string | null;
  /** Columns declared for the table. */
  columns: Array<DbtParsedColumn>;
  /** Whether the table came from a `sources:` or a `models:` entry. */
  resourceType?: 'source' | 'model';
  /** dbt tags; for source tables the source's tags are merged with the table's. */
  tagsDbt?: Array<string>;
  /** Freshness settings built from `freshness` and `loaded_at_field`. */
  freshnessDbt?: {
    raw?: unknown;
    loadedAtField?: string | null;
  };
}
|
||||
|
||||
/** Combined output of parsing one or more dbt schema files. */
export interface DbtSchemaParseResult {
  /** Project name passed in by the caller, or `null`. */
  projectName: string | null;
  /** dbt version, or `null` when it is not known. */
  dbtVersion: string | null;
  /** Source tables and models found in the schema files. */
  tables: Array<DbtParsedTable>;
  /** Relationships found in the schema files. */
  relationships: Array<DbtParsedRelationship>;
}
|
||||
|
||||
/** A dbt schema file held in memory. */
export interface DbtSchemaFile {
  /** Raw YAML text of the file. */
  content: string;
  /** Path of the file; used in log messages and when hashing a file set. */
  path: string;
}
|
||||
|
||||
/** Options accepted when parsing a single dbt schema file. */
interface ParseDbtSchemaOptions {
  /** File path, used only in log messages. */
  path?: string;
  /** dbt variables substituted into the YAML text before it is parsed. */
  variables?: Map<string, string>;
  /** Project name copied into the parse result. */
  projectName?: string | null;
  /** Logger for diagnostics; a no-op logger is used when omitted. */
  logger?: KloLogger;
}
|
||||
|
||||
/** Top-level shape of a dbt schema YAML document, as far as the parser reads it. */
interface DbtSchemaYaml {
  /** Value of the `version:` key. */
  version?: number;
  /** Entries of the `sources:` list. */
  sources?: Array<DbtSchemaSource>;
  /** Entries of the `models:` list. */
  models?: Array<DbtSchemaModel>;
}
|
||||
|
||||
/** Raw YAML shape of one entry in the `sources:` list. */
interface DbtSchemaSource {
  /** Source name; also used as the schema when `schema` is absent. */
  name: string;
  description?: string;
  /** Database applied to every table of the source. */
  database?: string;
  /** Schema applied to every table of the source. */
  schema?: string;
  /** Tags merged into the tags of every table of the source. */
  tags?: Array<string>;
  /** Tables declared under the source. */
  tables?: Array<DbtSchemaTable>;
}
|
||||
|
||||
/** Raw YAML shape of one table declared under a source. */
interface DbtSchemaTable {
  name: string;
  description?: string;
  /** When present, used as the parsed table name instead of `name`. */
  identifier?: string;
  tags?: Array<string>;
  /** Passed on, together with `freshness`, to build the table's freshness settings. */
  loaded_at_field?: string;
  /** Freshness block, kept as-is because its shape is not validated here. */
  freshness?: unknown;
  columns?: Array<DbtSchemaColumn>;
}
|
||||
|
||||
/** Raw YAML shape of one entry in the `models:` list. */
interface DbtSchemaModel {
  /** Model name; entries without one are skipped by the parser. */
  name: string;
  description?: string;
  database?: string;
  schema?: string;
  tags?: Array<string>;
  /** Passed on, together with `freshness`, to build the model's freshness settings. */
  loaded_at_field?: string;
  /** Freshness block, kept as-is because its shape is not validated here. */
  freshness?: unknown;
  columns?: Array<DbtSchemaColumn>;
}
|
||||
|
||||
/** Raw YAML shape of one column entry. */
interface DbtSchemaColumn {
  name: string;
  description?: string;
  data_type?: string;
  /** Data tests under the `data_tests:` key; preferred over `tests` when both exist. */
  data_tests?: Array<DbtSchemaDataTest>;
  /** Data tests under the `tests:` key. */
  tests?: Array<DbtSchemaDataTest>;
}
|
||||
|
||||
/**
 * Raw YAML shape of one data test entry: either a bare test name or a mapping
 * keyed by test name, with typed slots for the tests the parser knows about.
 */
type DbtSchemaDataTest =
  | string
  | {
      /** `to` / `field` may sit directly on the test or under `arguments`. */
      relationships?: {
        to?: string;
        field?: string;
        arguments?: { to?: string; field?: string };
      };
      not_null?: unknown;
      unique?: unknown;
      /** Either a mapping with a `values` key or some other value, e.g. a plain list. */
      accepted_values?: { values?: unknown } | unknown;
      /** Any other test, keyed by its (possibly package-qualified) name. */
      [key: string]: unknown;
    };
|
||||
|
||||
/**
 * Parses the text of a single dbt schema file.
 *
 * @param content - Raw YAML text.
 * @param options - Optional path, variables, project name and logger; a no-op
 *   logger is used when none is given.
 * @returns The tables and relationships found in the file.
 */
export function parseDbtSchemaFile(content: string, options: ParseDbtSchemaOptions = {}): DbtSchemaParseResult {
  const parser = new DbtSchemaParser(options.logger ?? noopLogger);
  return parser.parseFile(content, options);
}
|
||||
|
||||
/**
 * Parses several dbt schema files and merges them into one result.
 *
 * @param files - Schema files to parse.
 * @param variables - dbt variables substituted into each file before parsing.
 * @param options - Optional project name (defaults to `null`) and logger
 *   (defaults to a no-op logger).
 * @returns The combined, de-duplicated tables and relationships.
 */
export function parseDbtSchemaFiles(
  files: DbtSchemaFile[],
  variables?: Map<string, string>,
  options: { projectName?: string | null; logger?: KloLogger } = {},
): DbtSchemaParseResult {
  const parser = new DbtSchemaParser(options.logger ?? noopLogger);
  const projectName = options.projectName ?? null;
  return parser.parseFiles(files, variables, projectName);
}
|
||||
|
||||
/**
 * Computes a short fingerprint of a set of schema files.
 *
 * Files are ordered by path (via `localeCompare`) on a copy of the input, so
 * the result does not depend on input order and the input is left untouched.
 *
 * @param files - Schema files to fingerprint.
 * @returns The first 16 hex characters of the SHA-256 digest of the
 *   `path:content` entries joined by newlines.
 */
export function computeDbtSchemaHash(files: DbtSchemaFile[]): string {
  const ordered = files.slice().sort((left, right) => left.path.localeCompare(right.path));
  const entries = ordered.map((file) => `${file.path}:${file.content}`);

  const hasher = createHash('sha256');
  hasher.update(entries.join('\n'));
  return hasher.digest('hex').substring(0, 16);
}
|
||||
|
||||
class DbtSchemaParser {
|
||||
/** Creates a parser that writes its diagnostics to `logger`. */
constructor(
  private readonly logger: KloLogger,
) {}
|
||||
|
||||
/**
 * Parses one schema file into tables and relationships.
 *
 * When `options.variables` is given, dbt variables are substituted into the
 * text first and any that stay unresolved are logged as a warning. YAML that
 * fails to parse, or that does not parse to an object, yields an empty result
 * instead of an exception.
 *
 * @param yamlContent - Raw YAML text of the schema file.
 * @param options - Optional path (for log messages), variables and project name.
 * @returns The parsed tables and relationships; `dbtVersion` is always `null`.
 */
parseFile(yamlContent: string, options: ParseDbtSchemaOptions = {}): DbtSchemaParseResult {
  const { path, variables } = options;
  const projectName = options.projectName ?? null;

  this.logger.debug(`Parsing schema file: ${path ?? 'unknown'}`);

  const substitution = variables
    ? resolveJinjaVariables(yamlContent, variables)
    : { content: yamlContent, unresolvedVars: [] };
  if (substitution.unresolvedVars.length > 0) {
    this.logger.warn(
      `Unresolved dbt variables in ${path ?? 'schema file'}: ${substitution.unresolvedVars.join(', ')}`,
    );
  }

  let schema: DbtSchemaYaml;
  try {
    schema = parseYaml(substitution.content) as DbtSchemaYaml;
  } catch (error) {
    // Malformed YAML is reported and treated as an empty file.
    this.logger.warn(`Failed to parse YAML${path ? ` at ${path}` : ''}: ${error}`);
    return this.emptyResult(projectName);
  }

  // Empty documents and scalar documents carry no sources or models.
  const isObjectDocument = Boolean(schema) && typeof schema === 'object';
  if (!isObjectDocument) {
    return this.emptyResult(projectName);
  }

  const sourceTables = this.parseSources(schema.sources);
  const modelTables = this.parseModels(schema.models);
  const sourceRelationships = this.parseSourceRelationships(schema.sources);
  const modelRelationships = this.parseModelRelationships(schema.models);

  return {
    projectName,
    dbtVersion: null,
    tables: [...sourceTables, ...modelTables],
    relationships: [...sourceRelationships, ...modelRelationships],
  };
}
|
||||
|
||||
/**
 * Parses several schema files and merges their results.
 *
 * Files are parsed in input order with the same variables and project name;
 * the collected tables and relationships are then de-duplicated.
 *
 * @param files - Schema files to parse.
 * @param variables - dbt variables substituted into each file before parsing.
 * @param projectName - Project name stored on the result.
 * @returns The combined result; `dbtVersion` is always `null`.
 */
parseFiles(
  files: DbtSchemaFile[],
  variables?: Map<string, string>,
  projectName: string | null = null,
): DbtSchemaParseResult {
  const perFile = files.map((file) =>
    this.parseFile(file.content, { path: file.path, variables, projectName }),
  );

  const collectedTables = perFile.flatMap((result) => result.tables);
  const collectedRelationships = perFile.flatMap((result) => result.relationships);

  return {
    projectName,
    dbtVersion: null,
    tables: this.deduplicateTables(collectedTables),
    relationships: this.deduplicateRelationships(collectedRelationships),
  };
}
|
||||
|
||||
/**
 * Converts the `sources:` section of a schema file into parsed tables.
 *
 * Each table inherits its database, schema (falling back to the source name)
 * and tags from the enclosing source. The table name is the `identifier` when
 * present, otherwise the `name`.
 *
 * The input comes straight from YAML, so malformed entries are skipped rather
 * than allowed to throw: non-object list items, sources without a `tables`
 * list, and tables that have neither an `identifier` nor a `name` (matching
 * how unnamed models are skipped).
 *
 * @param sources - Raw `sources` value from the YAML document.
 * @returns One parsed table per valid source table, in document order.
 */
private parseSources(sources: DbtSchemaSource[] | undefined): DbtParsedTable[] {
  if (!Array.isArray(sources)) {
    return [];
  }

  const tables: DbtParsedTable[] = [];

  for (const source of sources) {
    // A dangling "-" in YAML produces a null list item; scalars are equally unusable.
    if (!source || typeof source !== 'object' || !Array.isArray(source.tables)) {
      continue;
    }

    // Keep the schema within `string | null` even when the source has no name.
    const sourceSchema = source.schema ?? source.name ?? null;
    const sourceDatabase = source.database ?? null;
    const sourceTags = this.normalizeTagList(source.tags);

    for (const table of source.tables) {
      if (!table || typeof table !== 'object') {
        continue;
      }

      const tableName = table.identifier ?? table.name;
      if (!tableName) {
        // Without a name the table cannot be matched against anything later on.
        continue;
      }

      const tagsDbt = this.mergeTagsDbt(sourceTags, this.normalizeTagList(table.tags));
      const freshnessDbt = this.buildFreshnessDbt(table.freshness, table.loaded_at_field);
      tables.push({
        name: tableName,
        description: this.normalizeDescription(table.description),
        database: sourceDatabase,
        schema: sourceSchema,
        columns: this.parseColumns(table.columns),
        resourceType: 'source',
        ...(tagsDbt ? { tagsDbt } : {}),
        ...(freshnessDbt ? { freshnessDbt } : {}),
      });
    }
  }

  return tables;
}
|
||||
|
||||
/**
 * Converts the `models` section into parsed tables tagged `resourceType: 'model'`.
 *
 * Entries without a `name` are ignored. `database` and `schema` default to `null`
 * when the model does not declare them; tags and freshness are attached only when
 * they are present.
 */
private parseModels(models: DbtSchemaModel[] | undefined): DbtParsedTable[] {
  const parsed: DbtParsedTable[] = [];
  if (!Array.isArray(models)) {
    return parsed;
  }

  for (const entry of models) {
    if (entry.name) {
      const tagsDbt = this.mergeTagsDbt(this.normalizeTagList(entry.tags));
      const freshnessDbt = this.buildFreshnessDbt(entry.freshness, entry.loaded_at_field);
      const optionalFields = {
        ...(tagsDbt ? { tagsDbt } : {}),
        ...(freshnessDbt ? { freshnessDbt } : {}),
      };

      parsed.push({
        name: entry.name,
        description: this.normalizeDescription(entry.description),
        database: entry.database ?? null,
        schema: entry.schema ?? null,
        columns: this.parseColumns(entry.columns),
        resourceType: 'model',
        ...optionalFields,
      });
    }
  }

  return parsed;
}
|
||||
|
||||
/**
 * Maps raw schema columns to parsed columns.
 *
 * Tests are read from `data_tests`, falling back to `tests`. The derived test refs,
 * constraints and enum values are attached only when non-empty.
 */
private parseColumns(columns: DbtSchemaColumn[] | undefined): DbtParsedColumn[] {
  if (!Array.isArray(columns)) {
    return [];
  }

  return columns.map((entry) => {
    const testInfo = this.parseDataTests(entry.data_tests ?? entry.tests);
    const optionalFields = {
      ...(testInfo.refs.length > 0 ? { dataTests: testInfo.refs } : {}),
      ...(testInfo.constraints ? { constraints: testInfo.constraints } : {}),
      ...(testInfo.enumValues.length > 0 ? { enumValuesDbt: testInfo.enumValues } : {}),
    };

    return {
      name: entry.name,
      description: this.normalizeDescription(entry.description),
      dataType: entry.data_type ?? null,
      ...optionalFields,
    };
  });
}
|
||||
|
||||
/**
 * Summarises a column's data tests.
 *
 * @param tests - Raw test entries: either a test name string (optionally
 *   `package.name`) or an object keyed by test name.
 * @returns
 *   - `refs`: one entry per test, with `kwargs` when the test carried an object config;
 *   - `constraints`: `{ dbt: { not_null?, unique? } }` when either built-in test is present,
 *     otherwise `undefined`;
 *   - `enumValues`: stringified values collected from `accepted_values` tests.
 *
 * `accepted_values` is understood in three shapes: a bare array, `{ values: [...] }`,
 * and the nested `{ arguments: { values: [...] } }` form (the same `arguments` nesting
 * that `parseRelationshipTest` already accepts). Entries that are neither a string nor
 * an object (for example an empty YAML list item, which parses to `null`) are skipped
 * instead of crashing `Object.entries`.
 */
private parseDataTests(tests: DbtSchemaDataTest[] | undefined): {
  refs: DbtDataTestRef[];
  constraints: DbtColumnConstraints | undefined;
  enumValues: string[];
} {
  const refs: DbtDataTestRef[] = [];
  const dbt: { not_null?: boolean; unique?: boolean } = {};
  const enumValues: string[] = [];
  if (!Array.isArray(tests) || tests.length === 0) {
    return { refs, constraints: undefined, enumValues };
  }

  for (const test of tests) {
    if (typeof test === 'string') {
      const parsed = this.parseTestNameString(test);
      refs.push(parsed);
      if (parsed.package === 'dbt' && parsed.name === 'not_null') {
        dbt.not_null = true;
      }
      if (parsed.package === 'dbt' && parsed.name === 'unique') {
        dbt.unique = true;
      }
      continue;
    }

    // Malformed entry (null, number, ...): nothing to extract.
    if (!test || typeof test !== 'object') {
      continue;
    }

    for (const [key, value] of Object.entries(test)) {
      // Only plain-object configs are forwarded as kwargs.
      const kwargs =
        value && typeof value === 'object' && !Array.isArray(value) ? (value as Record<string, unknown>) : undefined;

      if (key === 'relationships') {
        refs.push({ name: 'relationships', package: 'dbt', ...(kwargs ? { kwargs } : {}) });
        continue;
      }
      if (key === 'not_null') {
        refs.push({ name: 'not_null', package: 'dbt' });
        dbt.not_null = true;
        continue;
      }
      if (key === 'unique') {
        refs.push({ name: 'unique', package: 'dbt' });
        dbt.unique = true;
        continue;
      }
      if (key === 'accepted_values') {
        if (Array.isArray(value)) {
          enumValues.push(...value.map((item) => String(item)));
          refs.push({ name: 'accepted_values', package: 'dbt', kwargs: { values: value } });
          continue;
        }
        if (kwargs) {
          const nested = kwargs.arguments;
          const nestedValues =
            nested && typeof nested === 'object' && !Array.isArray(nested)
              ? (nested as { values?: unknown }).values
              : undefined;
          // Top-level `values` wins; fall back to `arguments.values`.
          const values = Array.isArray(kwargs.values) ? kwargs.values : nestedValues;
          if (Array.isArray(values)) {
            enumValues.push(...values.map((item) => String(item)));
          }
          refs.push({ name: 'accepted_values', package: 'dbt', kwargs });
          continue;
        }
        // A scalar/empty `accepted_values` falls through to the generic handling below.
      }

      refs.push({
        ...this.parseTestNameString(key),
        ...(kwargs ? { kwargs } : {}),
      });
    }
  }

  const constraints = dbt.not_null || dbt.unique ? { dbt } : undefined;
  return { refs, constraints, enumValues };
}
|
||||
|
||||
/**
 * Splits a test identifier at its first dot into `package` and `name`.
 * An identifier without a dot is attributed to the `dbt` package.
 */
private parseTestNameString(value: string): { name: string; package: string } {
  const firstDot = value.indexOf('.');
  if (firstDot === -1) {
    return { package: 'dbt', name: value };
  }
  return { package: value.slice(0, firstDot), name: value.slice(firstDot + 1) };
}
|
||||
|
||||
/**
 * Collects relationship tests declared on source table columns.
 *
 * The originating table is named by the table's `identifier` (or `name`), and the
 * originating schema is the source's `schema` (or `name`). Tests are read from
 * `data_tests`, falling back to `tests`.
 */
private parseSourceRelationships(sources: DbtSchemaSource[] | undefined): DbtParsedRelationship[] {
  const found: DbtParsedRelationship[] = [];
  if (!Array.isArray(sources)) {
    return found;
  }

  for (const src of sources) {
    const schema = src.schema ?? src.name;
    const sourceTables = Array.isArray(src.tables) ? src.tables : [];

    for (const entry of sourceTables) {
      const owner = entry.identifier ?? entry.name;
      const ownerColumns = Array.isArray(entry.columns) ? entry.columns : [];

      for (const col of ownerColumns) {
        for (const test of col.data_tests ?? col.tests ?? []) {
          const parsed = this.parseRelationshipTest(test, owner, col.name, schema);
          if (parsed) {
            found.push(parsed);
          }
        }
      }
    }
  }

  return found;
}
|
||||
|
||||
/**
 * Collects relationship tests declared on model columns.
 *
 * Models without a `name` or without a `columns` array are ignored. The model's
 * `schema`, when set, is passed along as the originating schema.
 */
private parseModelRelationships(models: DbtSchemaModel[] | undefined): DbtParsedRelationship[] {
  const found: DbtParsedRelationship[] = [];
  if (!Array.isArray(models)) {
    return found;
  }

  for (const entry of models) {
    if (entry.name && Array.isArray(entry.columns)) {
      for (const col of entry.columns) {
        for (const test of col.data_tests ?? col.tests ?? []) {
          const parsed = this.parseRelationshipTest(test, entry.name, col.name, entry.schema ?? undefined);
          if (parsed) {
            found.push(parsed);
          }
        }
      }
    }
  }

  return found;
}
|
||||
|
||||
/**
 * Turns one column test into a relationship, when it is a `relationships` test.
 *
 * The target is read from `to` / `field`, falling back to the nested
 * `arguments.to` / `arguments.field`.
 *
 * @param test - Raw test entry from the schema file.
 * @param fromTable - Table that owns the tested column.
 * @param fromColumn - Name of the tested column.
 * @param fromSchema - Schema of the owning table; included in the result only when truthy.
 * @returns The parsed relationship, or `null` when the entry is not a relationship
 *   test, is incomplete, or its target reference cannot be parsed.
 */
private parseRelationshipTest(
  test: DbtSchemaDataTest,
  fromTable: string,
  fromColumn: string,
  fromSchema?: string,
): DbtParsedRelationship | null {
  // String tests and malformed entries (e.g. an empty YAML list item parsed as null)
  // cannot describe a relationship; skip them rather than throwing on property access.
  if (!test || typeof test !== 'object' || !test.relationships) {
    return null;
  }

  const relationship = test.relationships;
  const toRef = relationship.to ?? relationship.arguments?.to;
  const toColumn = relationship.field ?? relationship.arguments?.field;

  // `parseRef` needs a string; a non-string `to` is treated like a missing one.
  if (typeof toRef !== 'string' || !toRef || !toColumn) {
    this.logger.debug(`Skipping incomplete relationship test for ${fromTable}.${fromColumn}`);
    return null;
  }

  const toTable = this.parseRef(toRef);
  if (!toTable) {
    this.logger.debug(`Could not parse ref: ${toRef}`);
    return null;
  }

  return {
    fromTable,
    fromColumn,
    toTable,
    toColumn,
    ...(fromSchema ? { fromSchema } : {}),
  };
}
|
||||
|
||||
/**
 * Extracts the target table name from a dbt `ref(...)` or `source(...)` expression.
 *
 * Recognised forms (single or double quotes, flexible whitespace):
 * - `ref('model')` -> `model`
 * - `ref('package', 'model')` -> `model` (the second argument names the model)
 * - `ref('model', v=1)` / `ref('model', version=1)` -> `model`
 * - `source('source_name', 'table')` -> `table`
 *
 * @returns The table/model name, or `null` when the string matches none of the forms.
 */
private parseRef(refString: string): string | null {
  const refMatch = refString.match(
    /ref\s*\(\s*['"]([^'"]+)['"]\s*(?:,\s*['"]([^'"]+)['"]\s*)?(?:,\s*(?:v|version)\s*=\s*[^,)]+)?\)/,
  );
  if (refMatch) {
    // With two quoted arguments the first is the package/project, the second the model.
    return refMatch[2] ?? refMatch[1] ?? null;
  }

  const sourceMatch = refString.match(/source\s*\(\s*['"][^'"]+['"]\s*,\s*['"]([^'"]+)['"]\s*\)/);
  if (sourceMatch) {
    return sourceMatch[1] ?? null;
  }

  return null;
}
|
||||
|
||||
/**
 * Trims a description and collapses missing, empty or whitespace-only text to `null`.
 */
private normalizeDescription(description: string | undefined): string | null {
  // `''` is falsy, so an empty trim result falls through to null as well.
  return description?.trim() || null;
}
|
||||
|
||||
/**
 * Normalises a dbt `tags` value to a list of strings.
 *
 * dbt accepts `tags` either as a list or as a single string, so a lone non-empty
 * string becomes a one-element list instead of being dropped. Array items are
 * stringified; any other value yields an empty list.
 */
private normalizeTagList(tags: string | string[] | undefined): string[] {
  if (typeof tags === 'string') {
    return tags.length > 0 ? [tags] : [];
  }
  if (!Array.isArray(tags)) {
    return [];
  }
  return tags.map((tag) => String(tag));
}
|
||||
|
||||
/**
 * Concatenates the given lists, keeping the first occurrence of each string.
 *
 * @returns The de-duplicated list in first-seen order, or `undefined` when it is empty.
 */
private mergeTagsDbt(...lists: Array<string[] | undefined>): string[] | undefined {
  // A Set iterates in insertion order, which preserves first-seen ordering.
  const unique = new Set<string>();
  for (const list of lists) {
    for (const item of list ?? []) {
      unique.add(item);
    }
  }
  return unique.size > 0 ? Array.from(unique) : undefined;
}
|
||||
|
||||
/**
 * Builds the freshness metadata for a table.
 *
 * @param freshness - Raw `freshness` value; `undefined` and `null` both count as absent.
 * @param loadedAtField - Raw `loaded_at_field`; surrounding whitespace is trimmed.
 * @returns
 *   - `undefined` when there is neither a freshness value nor a non-blank field;
 *   - `{ raw, loadedAtField }` when freshness is present, with `loadedAtField` set to
 *     `null` if the field is missing or blank;
 *   - `{ loadedAtField }` when only the field is present.
 */
private buildFreshnessDbt(freshness: unknown, loadedAtField: string | undefined): DbtParsedTable['freshnessDbt'] {
  // `|| undefined` folds a whitespace-only field into "absent" so it can never be
  // stored as an empty string (which would also defeat `??` fallbacks when merging).
  const loadedTrim = loadedAtField?.trim() || undefined;
  const hasFreshness = freshness !== undefined && freshness !== null;
  if (!hasFreshness && !loadedTrim) {
    return undefined;
  }
  if (hasFreshness) {
    return { raw: freshness, loadedAtField: loadedTrim ?? null };
  }
  return { loadedAtField: loadedTrim };
}
|
||||
|
||||
/**
 * Merges tables that share the same case-insensitive `database.schema.name` key.
 *
 * The first occurrence wins for scalar fields; its description is filled from a later
 * duplicate only when it has none. Columns, tags and freshness are merged.
 * Output order follows the first appearance of each key.
 */
private deduplicateTables(tables: DbtParsedTable[]): DbtParsedTable[] {
  const byKey = new Map<string, DbtParsedTable>();

  for (const candidate of tables) {
    const key = `${candidate.database ?? ''}.${candidate.schema ?? ''}.${candidate.name}`.toLowerCase();
    const current = byKey.get(key);

    byKey.set(
      key,
      current
        ? {
            ...current,
            description: current.description ?? candidate.description,
            columns: this.mergeColumns(current.columns, candidate.columns),
            tagsDbt: this.mergeTagsDbt(current.tagsDbt, candidate.tagsDbt),
            freshnessDbt: this.mergeFreshnessDbt(current.freshnessDbt, candidate.freshnessDbt),
          }
        : candidate,
    );
  }

  return [...byKey.values()];
}
|
||||
|
||||
/**
 * Merges two column lists by case-insensitive column name.
 *
 * Existing columns keep their position and their own `description` / `dataType` when
 * set; tests, constraints and enum values are combined. Incoming columns with a new
 * name are appended.
 */
private mergeColumns(existing: DbtParsedColumn[], incoming: DbtParsedColumn[]): DbtParsedColumn[] {
  const byName = new Map<string, DbtParsedColumn>(
    existing.map((column): [string, DbtParsedColumn] => [column.name.toLowerCase(), column]),
  );

  for (const candidate of incoming) {
    const key = candidate.name.toLowerCase();
    const current = byName.get(key);

    byName.set(
      key,
      current
        ? {
            ...current,
            description: current.description ?? candidate.description,
            dataType: current.dataType ?? candidate.dataType,
            dataTests: this.mergeDbtDataTests(current.dataTests, candidate.dataTests),
            constraints: this.mergeDbtConstraints(current.constraints, candidate.constraints),
            enumValuesDbt: this.mergeStringList(current.enumValuesDbt, candidate.enumValuesDbt),
          }
        : candidate,
    );
  }

  return [...byName.values()];
}
|
||||
|
||||
/**
 * Drops relationships whose case-insensitive
 * `fromTable.fromColumn->toTable.toColumn` key was already seen, keeping the first.
 */
private deduplicateRelationships(relationships: DbtParsedRelationship[]): DbtParsedRelationship[] {
  const seenKeys = new Set<string>();

  return relationships.filter((rel) => {
    const key = `${rel.fromTable}.${rel.fromColumn}->${rel.toTable}.${rel.toColumn}`.toLowerCase();
    if (seenKeys.has(key)) {
      return false;
    }
    seenKeys.add(key);
    return true;
  });
}
|
||||
|
||||
/**
 * Merges two freshness records, preferring values from `existing`.
 *
 * `raw` comes from `existing` unless it is `undefined` there; `loadedAtField` comes
 * from `existing` unless it is `null`/`undefined` there. Keys that end up `undefined`
 * are omitted. Returns `undefined` when both inputs are missing.
 */
private mergeFreshnessDbt(
  existing?: DbtParsedTable['freshnessDbt'],
  incoming?: DbtParsedTable['freshnessDbt'],
): DbtParsedTable['freshnessDbt'] {
  if (!(existing || incoming)) {
    return undefined;
  }

  let raw = incoming?.raw;
  if (existing?.raw !== undefined) {
    raw = existing.raw;
  }
  const loadedAtField = existing?.loadedAtField ?? incoming?.loadedAtField;

  const rawPart = raw === undefined ? {} : { raw };
  const loadedPart = loadedAtField === undefined ? {} : { loadedAtField };
  return { ...rawPart, ...loadedPart };
}
|
||||
|
||||
/**
 * ORs the `not_null` / `unique` flags of two constraint records.
 *
 * @returns `{ dbt: {...} }` containing only the flags that are set, or `undefined`
 *   when neither flag is set on either side.
 */
private mergeDbtConstraints(
  existing?: DbtColumnConstraints,
  incoming?: DbtColumnConstraints,
): DbtColumnConstraints | undefined {
  const sides = [existing, incoming];
  const notNull = sides.some((side) => side?.dbt.not_null);
  const unique = sides.some((side) => side?.dbt.unique);

  if (!(notNull || unique)) {
    return undefined;
  }

  const dbt: { not_null?: boolean; unique?: boolean } = {};
  if (notNull) {
    dbt.not_null = true;
  }
  if (unique) {
    dbt.unique = true;
  }
  return { dbt };
}
|
||||
|
||||
/**
 * Concatenates two optional string lists, keeping the first occurrence of each value.
 * Returns `undefined` when the merged list is empty.
 */
private mergeStringList(existing?: string[], incoming?: string[]): string[] | undefined {
  const merged = Array.from(new Set<string>([...(existing ?? []), ...(incoming ?? [])]));
  return merged.length > 0 ? merged : undefined;
}
|
||||
|
||||
/**
 * Merges two lists of data-test refs.
 *
 * When one side is empty the other is returned as a shallow copy (or `undefined` if
 * both are empty). Otherwise refs are keyed by `package:name`, extended with a short
 * SHA-256 digest of the JSON-serialised kwargs when kwargs are non-empty; a later ref
 * with the same key replaces the earlier one while keeping its position.
 */
private mergeDbtDataTests(existing?: DbtDataTestRef[], incoming?: DbtDataTestRef[]): DbtDataTestRef[] | undefined {
  const left = existing ?? [];
  const right = incoming ?? [];

  if (left.length === 0) {
    return right.length > 0 ? [...right] : undefined;
  }
  if (right.length === 0) {
    return [...left];
  }

  const byIdentity = new Map<string, DbtDataTestRef>();
  for (const ref of left.concat(right)) {
    let identity = `${ref.package}:${ref.name}`;
    if (ref.kwargs && Object.keys(ref.kwargs).length > 0) {
      const digest = createHash('sha256').update(JSON.stringify(ref.kwargs)).digest('hex').slice(0, 16);
      identity += `:${digest}`;
    }
    byIdentity.set(identity, ref);
  }

  return Array.from(byIdentity.values());
}
|
||||
|
||||
/** Builds a parse result with no tables or relationships for the given project name. */
private emptyResult(projectName: string | null): DbtSchemaParseResult {
  const empty: DbtSchemaParseResult = { projectName, dbtVersion: null, tables: [], relationships: [] };
  return empty;
}
|
||||
}
|
||||
|
|
@ -0,0 +1,102 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { DbtSchemaParseResult } from './parse-schema.js';
|
||||
import { toDescriptionUpdates } from './to-description-updates.js';
|
||||
import type { DbtHostTableLite } from './match-tables.js';
|
||||
|
||||
/** Host catalog fixture: a single `orders` table exposing `id` and `amount`. */
const hostTables: DbtHostTableLite[] = [
  {
    id: '1',
    name: 'orders',
    catalog: 'warehouse',
    db: 'analytics',
    columns: [
      { id: 'c1', name: 'id' },
      { id: 'c2', name: 'amount' },
    ],
  },
];

/**
 * Builds a parse result holding one `orders` model.
 *
 * @param description - Table-level description (or `null`).
 * @param columnDescription - Description of the `id` column (or `null`).
 *
 * The model also carries a described `missing` column that the host table does not have.
 */
function parseResult(description: string | null, columnDescription: string | null): DbtSchemaParseResult {
  const orders: DbtSchemaParseResult['tables'][number] = {
    name: 'orders',
    description,
    database: 'warehouse',
    schema: 'analytics',
    resourceType: 'model',
    columns: [
      { name: 'id', description: columnDescription, dataType: null },
      { name: 'missing', description: 'not imported', dataType: null },
    ],
  };

  return { projectName: null, dbtVersion: null, relationships: [], tables: [orders] };
}

describe('dbt descriptions update payloads', () => {
  const ordersRef = { catalog: 'warehouse', db: 'analytics', name: 'orders' };
  const aiInvalidation = { connectionId: 'conn-1', table: ordersRef, source: 'ai' };

  // Runs the converter with the shared fixture inputs.
  const updatesFor = (parsed: DbtSchemaParseResult) =>
    toDescriptionUpdates({ connectionId: 'conn-1', parseResult: parsed, hostTables, targetSchema: null });

  it('emits dbt writes and matching ai invalidations when descriptions exist', () => {
    const actual = updatesFor(parseResult('Orders table', 'Primary key'));

    expect(actual).toEqual({
      dbt: [
        {
          connectionId: 'conn-1',
          table: ordersRef,
          source: 'dbt',
          tableDescription: 'Orders table',
          columnDescriptions: { id: 'Primary key' },
        },
      ],
      aiInvalidations: [aiInvalidation],
    });
  });

  it('does not emit spurious dbt writes or ai invalidations when no descriptions exist', () => {
    const actual = updatesFor(parseResult(null, null));

    expect(actual).toEqual({ dbt: [], aiInvalidations: [] });
  });

  it('emits ai invalidation without a dbt description write when only structural metadata exists', () => {
    const tagged = parseResult(null, null);
    tagged.tables[0]!.tagsDbt = ['finance'];

    const actual = updatesFor(tagged);

    expect(actual).toEqual({ dbt: [], aiInvalidations: [aiInvalidation] });
  });
});
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
import type { KloDescriptionUpdate } from '../../../scan/enrichment-types.js';
|
||||
import { findMatchingKloTable, type DbtHostTableLite } from './match-tables.js';
|
||||
import type { DbtSchemaParseResult } from './parse-schema.js';
|
||||
|
||||
/** Output of {@link toDescriptionUpdates}. */
export interface DbtDescriptionUpdates {
  /** Description writes emitted with source `dbt`. */
  dbt: KloDescriptionUpdate[];
  /** Entries emitted with source `ai`, one per matched table that carried descriptions or dbt metadata. */
  aiInvalidations: KloDescriptionUpdate[];
}

/**
 * Converts parsed dbt tables into description updates for matching host tables.
 *
 * For every dbt table that `findMatchingKloTable` resolves to a host table:
 * - column descriptions are collected for columns whose names match a host column
 *   case-insensitively (keyed by the host column's name);
 * - a `dbt` update is emitted when there is a table description or at least one
 *   column description;
 * - an `ai` entry is emitted when there is any description or any dbt metadata
 *   (tags, freshness, column constraints, enum values or data tests).
 *
 * Tables without a host match, or with nothing to report, are skipped.
 */
export function toDescriptionUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
  targetSchema: string | null;
}): DbtDescriptionUpdates {
  const { connectionId, parseResult, hostTables, targetSchema } = input;
  const result: DbtDescriptionUpdates = { dbt: [], aiInvalidations: [] };

  for (const dbtTable of parseResult.tables) {
    const hostTable = findMatchingKloTable(dbtTable, hostTables, targetSchema);
    if (!hostTable) {
      continue;
    }

    const tableDescription = dbtTable.description ?? undefined;
    const columnDescriptions: Record<string, string | null> = {};
    for (const dbtColumn of dbtTable.columns) {
      if (dbtColumn.description) {
        const hostColumn = hostTable.columns.find(
          (candidate) => candidate.name.toLowerCase() === dbtColumn.name.toLowerCase(),
        );
        if (hostColumn) {
          columnDescriptions[hostColumn.name] = dbtColumn.description;
        }
      }
    }

    const hasColumnDescriptions = Object.keys(columnDescriptions).length > 0;
    const describesSomething = tableDescription !== undefined || hasColumnDescriptions;
    const carriesMetadata =
      !!dbtTable.tagsDbt?.length ||
      dbtTable.freshnessDbt !== undefined ||
      dbtTable.columns.some(
        (column) => column.constraints !== undefined || !!column.enumValuesDbt?.length || !!column.dataTests?.length,
      );
    if (!(describesSomething || carriesMetadata)) {
      continue;
    }

    const table = { catalog: hostTable.catalog, db: hostTable.db, name: hostTable.name };
    if (describesSomething) {
      result.dbt.push({
        connectionId,
        table,
        source: 'dbt',
        ...(tableDescription !== undefined ? { tableDescription } : {}),
        ...(hasColumnDescriptions ? { columnDescriptions } : {}),
      });
    }
    result.aiInvalidations.push({ connectionId, table, source: 'ai' });
  }

  return result;
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { toMetadataUpdates } from './to-metadata-updates.js';
|
||||
|
||||
describe('toMetadataUpdates', () => {
  it('emits source-keyed dbt metadata updates for matched tables and columns', () => {
    // One host table (`orders`) and one dbt model carrying tags, freshness and column tests.
    const input: Parameters<typeof toMetadataUpdates>[0] = {
      connectionId: 'conn_1',
      targetSchema: 'analytics',
      hostTables: [
        {
          id: 'orders-id',
          name: 'orders',
          catalog: 'warehouse',
          db: 'analytics',
          columns: [
            { id: 'status-id', name: 'status' },
            { id: 'created-id', name: 'created_at' },
          ],
        },
      ],
      parseResult: {
        projectName: null,
        dbtVersion: null,
        relationships: [],
        tables: [
          {
            name: 'orders',
            description: null,
            database: 'warehouse',
            schema: 'analytics',
            resourceType: 'model',
            tagsDbt: ['finance'],
            freshnessDbt: { loadedAtField: 'created_at' },
            columns: [
              {
                name: 'status',
                description: null,
                dataType: null,
                enumValuesDbt: ['placed', 'shipped'],
                constraints: { dbt: { not_null: true } },
                dataTests: [{ name: 'accepted_values', package: 'dbt', kwargs: { values: ['placed', 'shipped'] } }],
              },
            ],
          },
        ],
      },
    };

    const expected = [
      {
        connectionId: 'conn_1',
        table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
        source: 'dbt',
        tableFields: {
          tags: ['finance'],
          freshness: { loaded_at_field: 'created_at' },
        },
        columnFields: {
          status: {
            constraints: { not_null: true },
            enum_values: ['placed', 'shipped'],
            tests: [{ name: 'accepted_values', package: 'dbt', kwargs: { values: ['placed', 'shipped'] } }],
          },
        },
      },
    ];

    expect(toMetadataUpdates(input)).toEqual(expected);
  });
});
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
import type { KloMetadataUpdate } from '../../../scan/enrichment-types.js';
|
||||
import { findMatchingKloTable, type DbtHostTableLite } from './match-tables.js';
|
||||
import type { DbtSchemaParseResult } from './parse-schema.js';
|
||||
|
||||
/**
 * Converts parsed dbt tables into `dbt`-sourced metadata updates for matching host tables.
 *
 * Table-level fields: `tags` (from `tagsDbt`) and `freshness` (`raw` and/or
 * `loaded_at_field`). Column-level fields, keyed by the host column's name for columns
 * that match case-insensitively: `constraints`, `enum_values` and `tests`.
 *
 * Tables without a host match, or without any table or column fields, emit nothing.
 */
export function toMetadataUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
  targetSchema: string | null;
}): KloMetadataUpdate[] {
  const { connectionId, parseResult, hostTables, targetSchema } = input;
  const updates: KloMetadataUpdate[] = [];

  for (const dbtTable of parseResult.tables) {
    const hostTable = findMatchingKloTable(dbtTable, hostTables, targetSchema);
    if (!hostTable) {
      continue;
    }

    const tableFields: Record<string, unknown> = {};
    if (dbtTable.tagsDbt?.length) {
      tableFields.tags = dbtTable.tagsDbt;
    }
    if (dbtTable.freshnessDbt) {
      const { raw, loadedAtField } = dbtTable.freshnessDbt;
      const freshness: Record<string, unknown> = {};
      if (raw !== undefined) {
        freshness.raw = raw;
      }
      if (loadedAtField !== undefined) {
        freshness.loaded_at_field = loadedAtField;
      }
      tableFields.freshness = freshness;
    }

    const columnFields: Record<string, Record<string, unknown>> = {};
    for (const dbtColumn of dbtTable.columns) {
      const hostColumn = hostTable.columns.find(
        (candidate) => candidate.name.toLowerCase() === dbtColumn.name.toLowerCase(),
      );
      if (!hostColumn) {
        continue;
      }

      const fields: Record<string, unknown> = {};
      if (dbtColumn.constraints) {
        fields.constraints = dbtColumn.constraints.dbt;
      }
      if (dbtColumn.enumValuesDbt?.length) {
        fields.enum_values = dbtColumn.enumValuesDbt;
      }
      if (dbtColumn.dataTests?.length) {
        fields.tests = dbtColumn.dataTests.map(({ name, package: pkg, kwargs }) => ({
          name,
          package: pkg,
          ...(kwargs ? { kwargs } : {}),
        }));
      }
      if (Object.keys(fields).length > 0) {
        columnFields[hostColumn.name] = fields;
      }
    }

    const hasTableFields = Object.keys(tableFields).length > 0;
    const hasColumnFields = Object.keys(columnFields).length > 0;
    if (hasTableFields || hasColumnFields) {
      updates.push({
        connectionId,
        table: { catalog: hostTable.catalog, db: hostTable.db, name: hostTable.name },
        source: 'dbt',
        ...(hasTableFields ? { tableFields } : {}),
        ...(hasColumnFields ? { columnFields } : {}),
      });
    }
  }

  return updates;
}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { DbtHostTableLite } from './match-tables.js';
|
||||
import type { DbtSchemaParseResult } from './parse-schema.js';
|
||||
import { toRelationshipUpdates } from './to-relationship-updates.js';
|
||||
|
||||
// Expected author email, assembled from two fragments exactly as the implementation does.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');

/** Host catalog fixture: `orders` (analytics) and `customers` (staging). */
const hostTables: DbtHostTableLite[] = [
  { id: '1', name: 'orders', catalog: 'warehouse', db: 'analytics', columns: [{ id: 'c1', name: 'customer_id' }] },
  { id: '2', name: 'customers', catalog: 'warehouse', db: 'staging', columns: [{ id: 'c2', name: 'id' }] },
];

/** One resolvable relationship followed by two that cannot be matched (missing column, missing table). */
const parseResult: DbtSchemaParseResult = {
  projectName: null,
  dbtVersion: null,
  tables: [],
  relationships: [
    {
      fromTable: 'orders',
      fromColumn: 'customer_id',
      toTable: 'customers',
      toColumn: 'id',
      fromSchema: 'analytics',
      toSchema: 'analytics',
      description: 'schema intentionally differs from the host customers table',
    },
    { fromTable: 'orders', fromColumn: 'missing', toTable: 'customers', toColumn: 'id' },
    { fromTable: 'orders', fromColumn: 'customer_id', toTable: 'missing_table', toColumn: 'id' },
  ],
};

describe('dbt relationship update payloads', () => {
  it('validates relationships using the current name-only matching behavior and dbt provenance', () => {
    const actual = toRelationshipUpdates({ connectionId: 'conn-1', parseResult, hostTables });

    const expectedJoin = {
      connectionId: 'conn-1',
      fromTable: 'orders',
      fromColumns: ['customer_id'],
      toTable: 'customers',
      toColumns: ['id'],
      relationship: 'many_to_one',
      author: 'dbt',
      authorEmail: DBT_SYSTEM_EMAIL,
    };
    expect(actual).toEqual({ joins: [expectedJoin], skippedNoMatch: 2 });
  });
});
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
import type { KloJoinUpdate } from '../../../scan/enrichment-types.js';
|
||||
import type { DbtHostTableLite } from './match-tables.js';
|
||||
import type { DbtSchemaParseResult } from './parse-schema.js';
|
||||
|
||||
/** Output of {@link toRelationshipUpdates}. */
export interface DbtRelationshipUpdates {
  /** Join updates for relationships whose tables and columns were all found on the host. */
  joins: KloJoinUpdate[];
  /** Number of relationships dropped because a table or column could not be matched. */
  skippedNoMatch: number;
}

// Author email stamped on every emitted join; assembled from two fragments at load time.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');

/**
 * Converts parsed dbt relationships into join updates against the host tables.
 *
 * Tables are matched by case-insensitive name only (when several host tables share a
 * name, the last one in `hostTables` is used); columns are matched by case-insensitive
 * name within the matched table. Each emitted join is `many_to_one`, authored by `dbt`,
 * and uses the host's spelling of table and column names.
 */
export function toRelationshipUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
}): DbtRelationshipUpdates {
  const { connectionId, parseResult, hostTables } = input;

  const hostByName = new Map<string, DbtHostTableLite>(
    hostTables.map((table): [string, DbtHostTableLite] => [table.name.toLowerCase(), table]),
  );

  const result: DbtRelationshipUpdates = { joins: [], skippedNoMatch: 0 };

  for (const rel of parseResult.relationships) {
    const source = hostByName.get(rel.fromTable.toLowerCase());
    const target = hostByName.get(rel.toTable.toLowerCase());
    if (!(source && target)) {
      result.skippedNoMatch += 1;
      continue;
    }

    const sourceColumn = source.columns.find(
      (candidate) => candidate.name.toLowerCase() === rel.fromColumn.toLowerCase(),
    );
    const targetColumn = target.columns.find(
      (candidate) => candidate.name.toLowerCase() === rel.toColumn.toLowerCase(),
    );
    if (!(sourceColumn && targetColumn)) {
      result.skippedNoMatch += 1;
      continue;
    }

    result.joins.push({
      connectionId,
      fromTable: source.name,
      fromColumns: [sourceColumn.name],
      toTable: target.name,
      toColumns: [targetColumn.name],
      relationship: 'many_to_one',
      author: 'dbt',
      authorEmail: DBT_SYSTEM_EMAIL,
    });
  }

  return result;
}
|
||||
|
|
@ -0,0 +1,410 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { type DbtHostTableLite, matchDbtTables } from './dbt-descriptions/match-tables.js';
|
||||
import { mergeSemanticModelTables } from './dbt-descriptions/merge-semantic-model-tables.js';
|
||||
import { parseDbtSchemaFiles } from './dbt-descriptions/parse-schema.js';
|
||||
import { toDescriptionUpdates } from './dbt-descriptions/to-description-updates.js';
|
||||
import { toRelationshipUpdates } from './dbt-descriptions/to-relationship-updates.js';
|
||||
import { parseMetricflowFiles } from './metricflow/deep-parse.js';
|
||||
import { mapCrossModelMetricToSource, mapSemanticModelToSource } from './metricflow/semantic-models.js';
|
||||
|
||||
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');
|
||||
|
||||
const metricflowYaml = `
|
||||
semantic_models:
|
||||
- name: orders_semantic
|
||||
description: MetricFlow order facts
|
||||
model: ref('fct_orders')
|
||||
defaults:
|
||||
agg_time_dimension: ordered_at
|
||||
entities:
|
||||
- name: customer
|
||||
type: foreign
|
||||
expr: customer_id
|
||||
description: Customer relationship
|
||||
dimensions:
|
||||
- name: status
|
||||
type: categorical
|
||||
expr: status
|
||||
description: Order status
|
||||
- name: ordered_at
|
||||
type: time
|
||||
expr: ordered_at
|
||||
measures:
|
||||
- name: total_revenue
|
||||
agg: sum
|
||||
expr: amount
|
||||
description: Revenue
|
||||
- name: customers_semantic
|
||||
description: Customer dimension
|
||||
model: ref('dim_customers')
|
||||
entities:
|
||||
- name: customer
|
||||
type: primary
|
||||
expr: id
|
||||
dimensions:
|
||||
- name: country
|
||||
type: categorical
|
||||
expr: country
|
||||
description: Customer country
|
||||
measures:
|
||||
- name: customer_count
|
||||
agg: count
|
||||
expr: id
|
||||
description: Customer count
|
||||
metrics:
|
||||
- name: total_revenue
|
||||
type: simple
|
||||
type_params:
|
||||
measure: total_revenue
|
||||
- name: customer_count
|
||||
type: simple
|
||||
type_params:
|
||||
measure: customer_count
|
||||
- name: revenue_per_customer
|
||||
description: Revenue per customer
|
||||
type: derived
|
||||
type_params:
|
||||
expr: total_revenue / NULLIF(customer_count, 0)
|
||||
metrics:
|
||||
- name: total_revenue
|
||||
alias: total_revenue
|
||||
- name: customer_count
|
||||
alias: customer_count
|
||||
`;
|
||||
|
||||
const schemaYaml = `
|
||||
version: 2
|
||||
sources:
|
||||
- name: raw
|
||||
database: warehouse
|
||||
schema: landing
|
||||
tables:
|
||||
- name: customers
|
||||
identifier: dim_customers
|
||||
description: Raw customer dimension
|
||||
columns:
|
||||
- name: id
|
||||
description: Customer primary key
|
||||
- name: country
|
||||
description: Country name
|
||||
models:
|
||||
- name: "{{ var('orders_model', 'fct_orders') }}"
|
||||
schema: "{{ var('mart_schema', 'analytics') }}"
|
||||
description: Modeled orders
|
||||
columns:
|
||||
- name: customer_id
|
||||
description: Linked customer id
|
||||
tests:
|
||||
- relationships:
|
||||
to: ref('dim_customers')
|
||||
field: id
|
||||
- name: status
|
||||
description: Order status
|
||||
- name: amount
|
||||
description: Gross amount
|
||||
`;
|
||||
|
||||
const hostTables: DbtHostTableLite[] = [
|
||||
{
|
||||
id: 'orders-table',
|
||||
name: 'fct_orders',
|
||||
catalog: 'warehouse',
|
||||
db: 'analytics',
|
||||
columns: [
|
||||
{ id: 'orders-customer-id', name: 'customer_id' },
|
||||
{ id: 'orders-status', name: 'status' },
|
||||
{ id: 'orders-amount', name: 'amount' },
|
||||
{ id: 'orders-ordered-at', name: 'ordered_at' },
|
||||
],
|
||||
},
|
||||
{
|
||||
id: 'customers-table',
|
||||
name: 'dim_customers',
|
||||
catalog: 'warehouse',
|
||||
db: 'landing',
|
||||
columns: [
|
||||
{ id: 'customers-id', name: 'id' },
|
||||
{ id: 'customers-country', name: 'country' },
|
||||
],
|
||||
},
|
||||
];
|
||||
|
||||
describe('dbt extraction golden parity fixture', () => {
|
||||
it('freezes the relocated MetricFlow and dbt-description contract together', () => {
|
||||
const metricflow = parseMetricflowFiles([{ path: 'semantic_models/orders.yml', content: metricflowYaml }]);
|
||||
|
||||
expect(metricflow).toEqual({
|
||||
semanticModels: [
|
||||
{
|
||||
name: 'orders_semantic',
|
||||
description: 'MetricFlow order facts',
|
||||
modelRef: 'fct_orders',
|
||||
dimensions: [
|
||||
{
|
||||
name: 'status',
|
||||
column: 'status',
|
||||
type: 'string',
|
||||
label: 'Status',
|
||||
description: 'Order status',
|
||||
},
|
||||
{
|
||||
name: 'ordered_at',
|
||||
column: 'ordered_at',
|
||||
type: 'time',
|
||||
label: 'Ordered At',
|
||||
description: undefined,
|
||||
},
|
||||
],
|
||||
measures: [
|
||||
{
|
||||
type: 'simple',
|
||||
name: 'total_revenue',
|
||||
column: 'amount',
|
||||
aggregation: 'sum',
|
||||
label: 'Total Revenue',
|
||||
description: 'Revenue',
|
||||
},
|
||||
],
|
||||
entities: [{ name: 'customer', type: 'foreign', expr: 'customer_id', description: 'Customer relationship' }],
|
||||
defaultTimeDimension: 'ordered_at',
|
||||
},
|
||||
{
|
||||
name: 'customers_semantic',
|
||||
description: 'Customer dimension',
|
||||
modelRef: 'dim_customers',
|
||||
dimensions: [
|
||||
{
|
||||
name: 'country',
|
||||
column: 'country',
|
||||
type: 'string',
|
||||
label: 'Country',
|
||||
description: 'Customer country',
|
||||
},
|
||||
],
|
||||
measures: [
|
||||
{
|
||||
type: 'simple',
|
||||
name: 'customer_count',
|
||||
column: 'id',
|
||||
aggregation: 'count',
|
||||
label: 'Customer Count',
|
||||
description: 'Customer count',
|
||||
},
|
||||
],
|
||||
entities: [{ name: 'customer', type: 'primary', expr: 'id' }],
|
||||
defaultTimeDimension: null,
|
||||
},
|
||||
],
|
||||
crossModelMetrics: [
|
||||
{
|
||||
name: 'revenue_per_customer',
|
||||
label: null,
|
||||
description: 'Revenue per customer',
|
||||
type: 'derived',
|
||||
expr: 'total_revenue / NULLIF(customer_count, 0)',
|
||||
dependsOn: [
|
||||
{ metricName: 'orders_semantic', alias: 'total_revenue' },
|
||||
{ metricName: 'customers_semantic', alias: 'customer_count' },
|
||||
],
|
||||
filter: null,
|
||||
},
|
||||
],
|
||||
relationships: [
|
||||
{
|
||||
fromTable: 'fct_orders',
|
||||
fromColumn: 'customer_id',
|
||||
toTable: 'dim_customers',
|
||||
toColumn: 'id',
|
||||
description: 'Customer relationship',
|
||||
},
|
||||
],
|
||||
warnings: [],
|
||||
});
|
||||
|
||||
expect(mapSemanticModelToSource(metricflow.semanticModels[0], 'analytics.fct_orders')).toEqual({
|
||||
name: 'fct-orders',
|
||||
table: 'analytics.fct_orders',
|
||||
grain: ['status', 'ordered_at'],
|
||||
columns: [
|
||||
{ name: 'status', type: 'string', description: 'Order status' },
|
||||
{ name: 'ordered_at', type: 'time' },
|
||||
],
|
||||
measures: [
|
||||
{
|
||||
name: 'total_revenue',
|
||||
expr: 'sum(amount)',
|
||||
description: 'Revenue',
|
||||
},
|
||||
],
|
||||
joins: [],
|
||||
descriptions: { dbt: 'MetricFlow order facts' },
|
||||
});
|
||||
|
||||
expect(mapCrossModelMetricToSource(metricflow.crossModelMetrics[0])).toEqual({
|
||||
name: 'revenue-per-customer',
|
||||
sql: 'total_revenue / NULLIF(customer_count, 0)',
|
||||
descriptions: { dbt: 'Revenue per customer' },
|
||||
grain: [],
|
||||
columns: [],
|
||||
measures: [
|
||||
{
|
||||
name: 'revenue_per_customer',
|
||||
expr: 'total_revenue / NULLIF(customer_count, 0)',
|
||||
description: 'Revenue per customer',
|
||||
},
|
||||
],
|
||||
joins: [],
|
||||
});
|
||||
|
||||
const schema = parseDbtSchemaFiles(
|
||||
[{ path: 'models/schema.yml', content: schemaYaml }],
|
||||
new Map([
|
||||
['orders_model', 'fct_orders'],
|
||||
['mart_schema', 'analytics'],
|
||||
]),
|
||||
);
|
||||
const merged = mergeSemanticModelTables(schema, metricflow.semanticModels);
|
||||
|
||||
expect(merged).toEqual({
|
||||
projectName: null,
|
||||
dbtVersion: null,
|
||||
tables: [
|
||||
{
|
||||
name: 'dim_customers',
|
||||
description: 'Raw customer dimension',
|
||||
database: 'warehouse',
|
||||
schema: 'landing',
|
||||
columns: [
|
||||
{ name: 'id', description: 'Customer primary key', dataType: null },
|
||||
{ name: 'country', description: 'Country name', dataType: null },
|
||||
],
|
||||
resourceType: 'source',
|
||||
},
|
||||
{
|
||||
name: 'fct_orders',
|
||||
description: 'Modeled orders',
|
||||
database: null,
|
||||
schema: 'analytics',
|
||||
columns: [
|
||||
{
|
||||
name: 'customer_id',
|
||||
description: 'Linked customer id',
|
||||
dataType: null,
|
||||
dataTests: [
|
||||
{
|
||||
name: 'relationships',
|
||||
package: 'dbt',
|
||||
kwargs: { to: "ref('dim_customers')", field: 'id' },
|
||||
},
|
||||
],
|
||||
},
|
||||
{ name: 'status', description: 'Order status', dataType: null },
|
||||
{ name: 'amount', description: 'Gross amount', dataType: null },
|
||||
],
|
||||
resourceType: 'model',
|
||||
},
|
||||
],
|
||||
relationships: [
|
||||
{
|
||||
fromTable: 'fct_orders',
|
||||
fromColumn: 'customer_id',
|
||||
toTable: 'dim_customers',
|
||||
toColumn: 'id',
|
||||
fromSchema: 'analytics',
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
expect(matchDbtTables(merged.tables, hostTables, 'analytics')).toEqual([
|
||||
{
|
||||
dbtTable: 'dim_customers',
|
||||
dbtSchema: 'landing',
|
||||
dbtDatabase: 'warehouse',
|
||||
hostTableId: 'customers-table',
|
||||
hostTableName: 'dim_customers',
|
||||
matched: true,
|
||||
tableDescriptionAction: 'import',
|
||||
tableDescriptionFound: true,
|
||||
columnsToImport: 2,
|
||||
columnsMatched: 2,
|
||||
columnsTotal: 2,
|
||||
columnDescriptionsFound: 2,
|
||||
},
|
||||
{
|
||||
dbtTable: 'fct_orders',
|
||||
dbtSchema: 'analytics',
|
||||
dbtDatabase: null,
|
||||
hostTableId: 'orders-table',
|
||||
hostTableName: 'fct_orders',
|
||||
matched: true,
|
||||
tableDescriptionAction: 'import',
|
||||
tableDescriptionFound: true,
|
||||
columnsToImport: 3,
|
||||
columnsMatched: 3,
|
||||
columnsTotal: 3,
|
||||
columnDescriptionsFound: 3,
|
||||
},
|
||||
]);
|
||||
|
||||
expect(
|
||||
toDescriptionUpdates({
|
||||
connectionId: 'warehouse-1',
|
||||
parseResult: merged,
|
||||
hostTables,
|
||||
targetSchema: 'analytics',
|
||||
}),
|
||||
).toEqual({
|
||||
dbt: [
|
||||
{
|
||||
connectionId: 'warehouse-1',
|
||||
table: { catalog: 'warehouse', db: 'landing', name: 'dim_customers' },
|
||||
source: 'dbt',
|
||||
tableDescription: 'Raw customer dimension',
|
||||
columnDescriptions: {
|
||||
id: 'Customer primary key',
|
||||
country: 'Country name',
|
||||
},
|
||||
},
|
||||
{
|
||||
connectionId: 'warehouse-1',
|
||||
table: { catalog: 'warehouse', db: 'analytics', name: 'fct_orders' },
|
||||
source: 'dbt',
|
||||
tableDescription: 'Modeled orders',
|
||||
columnDescriptions: {
|
||||
customer_id: 'Linked customer id',
|
||||
status: 'Order status',
|
||||
amount: 'Gross amount',
|
||||
},
|
||||
},
|
||||
],
|
||||
aiInvalidations: [
|
||||
{
|
||||
connectionId: 'warehouse-1',
|
||||
table: { catalog: 'warehouse', db: 'landing', name: 'dim_customers' },
|
||||
source: 'ai',
|
||||
},
|
||||
{
|
||||
connectionId: 'warehouse-1',
|
||||
table: { catalog: 'warehouse', db: 'analytics', name: 'fct_orders' },
|
||||
source: 'ai',
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
expect(toRelationshipUpdates({ connectionId: 'warehouse-1', parseResult: merged, hostTables })).toEqual({
|
||||
joins: [
|
||||
{
|
||||
connectionId: 'warehouse-1',
|
||||
fromTable: 'fct_orders',
|
||||
fromColumns: ['customer_id'],
|
||||
toTable: 'dim_customers',
|
||||
toColumns: ['id'],
|
||||
relationship: 'many_to_one',
|
||||
author: 'dbt',
|
||||
authorEmail: DBT_SYSTEM_EMAIL,
|
||||
},
|
||||
],
|
||||
skippedNoMatch: 0,
|
||||
});
|
||||
});
|
||||
});
|
||||
36
packages/context/src/ingest/adapters/dbt/chunk.test.ts
Normal file
36
packages/context/src/ingest/adapters/dbt/chunk.test.ts
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { chunkDbtProject } from './chunk.js';
|
||||
|
||||
describe('chunkDbtProject', () => {
  /** Diff set in which only `modified` is populated. */
  const diffSet = (modified: string[]) => ({ added: [], modified, deleted: [], unchanged: [] });

  /** Sorted listing of `dbt_project.yml`, any extra paths, and `count` files named `models/m<i>.yml`. */
  const projectPaths = (count: number, extras: string[] = []): string[] => {
    const modelPaths = Array.from({ length: count }, (_, i) => `models/m${i}.yml`);
    return ['dbt_project.yml', ...extras, ...modelPaths].sort();
  };

  it('caps peerFileIndex when the project has very many yaml files', () => {
    const result = chunkDbtProject({ allPaths: projectPaths(201) });
    const first = result.workUnits[0];

    expect(first).toBeDefined();
    expect(first?.peerFileIndex).toHaveLength(200);
    expect(first?.notes).toMatch(/capped at 200/);
  });

  it('keeps large-project model work units when dbt_project.yml changes', () => {
    const changed = diffSet(['dbt_project.yml']);
    const { workUnits } = chunkDbtProject({ allPaths: projectPaths(30) }, { diffSet: changed });

    expect(workUnits).toHaveLength(30);
    expect(workUnits[0]?.rawFiles).toEqual(['models/m0.yml']);
    expect(workUnits[0]?.dependencyPaths).toContain('dbt_project.yml');
  });

  it('keeps large-project model work units when non-model yaml peers change', () => {
    const allPaths = projectPaths(30, ['seeds/seed_properties.yml']);
    const changed = diffSet(['seeds/seed_properties.yml']);
    const { workUnits } = chunkDbtProject({ allPaths }, { diffSet: changed });

    expect(workUnits).toHaveLength(30);
    expect(workUnits[0]?.rawFiles).toEqual(['models/m0.yml']);
    expect(workUnits[0]?.dependencyPaths).toContain('seeds/seed_properties.yml');
  });
});
|
||||
130
packages/context/src/ingest/adapters/dbt/chunk.ts
Normal file
130
packages/context/src/ingest/adapters/dbt/chunk.ts
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
import type { ChunkResult, DiffSet, WorkUnit } from '../../types.js';
|
||||
import type { ParsedDbtProject } from './parse.js';
|
||||
|
||||
interface ChunkOptions {
|
||||
diffSet?: DiffSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Per-model work units (when the project has more than 25 YAML files) only name `rawFiles` under
|
||||
* `models/**`. Other `.yml` (e.g. some `seeds/` or custom layouts) still appear in `peerFileIndex`
|
||||
* or in the small-project / no-models fallbacks — v1 does not emit one WU per non-models file.
|
||||
*/
|
||||
const MODELS_PREFIX = 'models/';
|
||||
|
||||
/** `peerFileIndex` is a hint only (agents may not read those paths). Cap to limit prompt size. */
|
||||
const MAX_PEER_FILE_INDEX = 200;
|
||||
|
||||
/**
 * Finds the dbt project manifest among the collected paths.
 *
 * `dbt_project.yml` takes precedence over `dbt_project.yaml` when both are listed. Matching is an
 * exact string comparison against each entry of `allPaths`.
 *
 * @param allPaths - Relative YAML paths of the staged project.
 * @returns The manifest path that is present, or `undefined` when neither spelling is listed.
 */
function projectYamlPath(allPaths: string[]): string | undefined {
  const candidates = ['dbt_project.yml', 'dbt_project.yaml'];
  return candidates.find((candidate) => allPaths.includes(candidate));
}
|
||||
|
||||
/**
 * Selects the paths that live under `models/`, sorted.
 *
 * Backslashes are treated as separators for the prefix check only; the returned entries are the
 * original strings from `allPaths`.
 */
function modelRelativePaths(allPaths: string[]): string[] {
  const isUnderModels = (candidate: string): boolean => candidate.replace(/\\/g, '/').startsWith(MODELS_PREFIX);
  const modelPaths = allPaths.filter(isUnderModels);
  modelPaths.sort();
  return modelPaths;
}
|
||||
|
||||
/**
 * Derives a work-unit key from a model file path.
 *
 * The `.yml` / `.yaml` extension is dropped, every run of non-alphanumeric characters collapses to
 * a single `-`, leading and trailing dashes are trimmed, and the lowercased result is prefixed
 * with `dbt-`.
 *
 * @param mf - Model file path (either separator style).
 * @returns The key, e.g. `models/marts/Orders.yml` -> `dbt-models-marts-orders`.
 */
function unitKeyForModelFile(mf: string): string {
  const withoutExtension = mf.replace(/\.(ya?ml)$/i, '');
  const posixPath = withoutExtension.replace(/\\/g, '/');
  const dashed = posixPath.replace(/[^a-zA-Z0-9]+/g, '-');
  const slug = dashed.replace(/^-+|-+$/g, '');
  return 'dbt-' + slug.toLowerCase();
}
|
||||
|
||||
/**
 * Builds the full (non-incremental) set of work units for a dbt project.
 *
 * - No paths: no work units.
 * - Up to 25 paths: a single `dbt-all` unit holding every path.
 * - More than 25 paths but nothing under `models/`: a single `dbt-all` unit that lists `dbtDep`
 *   (when given) as a dependency.
 * - Otherwise: one unit per model file. Every other path goes into `peerFileIndex` (sorted, capped
 *   at `MAX_PEER_FILE_INDEX`), and `dbtDep` becomes the unit's only dependency when it is listed
 *   in `allPaths`.
 *
 * @param allPaths - Relative YAML paths of the project.
 * @param dbtDep - Path of the project manifest, if one was found.
 */
function emitFirstRunWorkUnits(allPaths: string[], dbtDep: string | undefined): WorkUnit[] {
  const pathCount = allPaths.length;
  if (pathCount === 0) {
    return [];
  }

  // Both single-unit fallbacks share the same shape; only label, dependencies and notes differ.
  const wholeProjectUnit = (displayLabel: string, dependencyPaths: string[], notes: string): WorkUnit => ({
    unitKey: 'dbt-all',
    displayLabel,
    rawFiles: [...allPaths],
    peerFileIndex: [],
    dependencyPaths,
    notes,
  });

  if (pathCount <= 25) {
    return [wholeProjectUnit('dbt project (all yaml)', [], 'dbt project — all YAML in one WorkUnit (≤25 files)')];
  }

  const modelFiles = modelRelativePaths(allPaths);
  if (modelFiles.length === 0) {
    return [
      wholeProjectUnit(
        'dbt project (all yaml, no models/**)',
        dbtDep ? [dbtDep] : [],
        'dbt: no models/**/*.yml — single slice with dbt_project as dependency if present',
      ),
    ];
  }

  const units: WorkUnit[] = [];
  for (const mf of modelFiles) {
    const allPeers = allPaths.filter((p) => p !== mf).sort();
    const truncated = allPeers.length > MAX_PEER_FILE_INDEX;

    const dependencyPaths: string[] = [];
    if (dbtDep && mf !== dbtDep && allPaths.includes(dbtDep)) {
      dependencyPaths.push(dbtDep);
    }

    let peerFileIndex = allPeers;
    let notes = 'dbt model schema slice';
    if (truncated) {
      peerFileIndex = allPeers.slice(0, MAX_PEER_FILE_INDEX);
      notes = `dbt model schema slice (peer index capped at ${MAX_PEER_FILE_INDEX} of ${allPeers.length} paths)`;
    }

    units.push({
      unitKey: unitKeyForModelFile(mf),
      displayLabel: `dbt ${mf}`,
      rawFiles: [mf],
      peerFileIndex,
      dependencyPaths,
      notes,
    });
  }
  return units;
}
|
||||
|
||||
/**
 * Narrows first-run work units to the ones affected by an incremental diff.
 *
 * A unit is kept when any of its raw files, dependency paths, or peer-index entries was added or
 * modified. For a kept unit:
 * - `rawFiles` becomes the touched raw files, or all of the unit's raw files when none of them was
 *   touched (the unit was kept because of a dependency or peer change); sorted.
 * - `dependencyPaths` becomes the sorted union of the original dependencies, the raw files that
 *   were left out of `rawFiles`, and the touched peers.
 *
 * The units in `firstRunUnits` and their arrays are not mutated.
 *
 * @param firstRunUnits - Work units as produced for a full run.
 * @param diffSet - Added / modified / deleted paths since the previous run.
 * @returns The kept units, plus `eviction.deletedRawPaths` (sorted) when the diff lists deletions.
 */
function applyDiffSet(firstRunUnits: WorkUnit[], diffSet: DiffSet): ChunkResult {
  const touched = new Set([...diffSet.added, ...diffSet.modified]);
  const isTouched = (p: string): boolean => touched.has(p);
  const kept: WorkUnit[] = [];

  for (const wu of firstRunUnits) {
    const touchedRawFiles = wu.rawFiles.filter(isTouched);
    const touchedPeerFiles = wu.peerFileIndex.filter(isTouched);
    const dependencyTouched = wu.dependencyPaths.some(isTouched);
    if (touchedRawFiles.length === 0 && touchedPeerFiles.length === 0 && !dependencyTouched) {
      continue;
    }

    const rawChanged = touchedRawFiles.length > 0;
    // Copy before sorting: sorting `wu.rawFiles` directly would reorder the caller's array.
    const rawFiles = rawChanged ? touchedRawFiles : [...wu.rawFiles];
    // Raw files that did not change are demoted to dependencies of the narrowed unit.
    const unchangedRaw = rawChanged ? wu.rawFiles.filter((p) => !touched.has(p)) : [];
    const combinedDeps = new Set<string>([...wu.dependencyPaths, ...unchangedRaw, ...touchedPeerFiles]);

    kept.push({
      ...wu,
      rawFiles: rawFiles.sort(),
      dependencyPaths: [...combinedDeps].sort(),
    });
  }

  const eviction = diffSet.deleted.length > 0 ? { deletedRawPaths: [...diffSet.deleted].sort() } : undefined;
  return { workUnits: kept, eviction };
}
|
||||
|
||||
/**
 * Splits a parsed dbt project into work units.
 *
 * Without a diff set the full first-run units are returned. With one, the first-run units are
 * narrowed to those affected by the diff.
 *
 * @param project - Parsed project; only `allPaths` is read.
 * @param opts - Optional `diffSet` enabling incremental chunking.
 */
export function chunkDbtProject(project: ParsedDbtProject, opts: ChunkOptions = {}): ChunkResult {
  const { allPaths } = project;
  const workUnits = emitFirstRunWorkUnits(allPaths, projectYamlPath(allPaths));
  return opts.diffSet ? applyDiffSet(workUnits, opts.diffSet) : { workUnits };
}
|
||||
51
packages/context/src/ingest/adapters/dbt/dbt.adapter.test.ts
Normal file
51
packages/context/src/ingest/adapters/dbt/dbt.adapter.test.ts
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import type { SourceAdapter } from '../../types.js';
|
||||
import { DbtSourceAdapter } from './dbt.adapter.js';
|
||||
|
||||
describe('DbtSourceAdapter', () => {
  let stagedDir: string;
  let adapter: SourceAdapter;

  // Each test gets its own scratch directory and a fresh adapter.
  beforeEach(async () => {
    stagedDir = await mkdtemp(join(tmpdir(), 'dbt-adapter-'));
    adapter = new DbtSourceAdapter();
  });

  afterEach(async () => {
    await rm(stagedDir, { recursive: true, force: true });
  });

  it('declares the expected source key and skill list', () => {
    expect(adapter.source).toBe('dbt');
    expect(adapter.skillNames).toEqual(['dbt_ingest']);
  });

  it('detects a staged dbt project root (dbt_project.yml)', async () => {
    await writeFile(join(stagedDir, 'dbt_project.yml'), "name: 'jaffle'\nversion: '1.0.0'\n", 'utf-8');
    expect(await adapter.detect(stagedDir)).toBe(true);
  });

  it('chunk: dbt_project.yml + models/a.yml yields one WU (≤25 files)', async () => {
    await writeFile(join(stagedDir, 'dbt_project.yml'), "name: 'jaffle'\n", 'utf-8');
    await mkdir(join(stagedDir, 'models'), { recursive: true });
    // `description` must be nested under the `- name:` list item for the schema to describe one
    // model with a description; keep the two-space / four-space indentation intact.
    await writeFile(
      join(stagedDir, 'models/a.yml'),
      'version: 2\nmodels:\n  - name: orders\n    description: Orders\n',
      'utf-8',
    );
    const result = await adapter.chunk(stagedDir);
    expect(result.workUnits).toHaveLength(1);
    expect(result.workUnits[0].unitKey).toBe('dbt-all');
    expect(result.parseArtifacts).toMatchObject({
      projectName: 'jaffle',
      tables: [{ name: 'orders', description: 'Orders' }],
    });
  });

  it('implements fetch() for git-backed dbt source setup', () => {
    expect(adapter.fetch).toBeTypeOf('function');
  });
});
|
||||
48
packages/context/src/ingest/adapters/dbt/dbt.adapter.ts
Normal file
48
packages/context/src/ingest/adapters/dbt/dbt.adapter.ts
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
import { join } from 'node:path';
|
||||
import type { ChunkResult, DiffSet, SourceAdapter } from '../../types.js';
|
||||
import type { FetchContext } from '../../types.js';
|
||||
import { loadProjectInfo } from '../../dbt-shared/project-vars.js';
|
||||
import { loadDbtSchemaFiles } from '../../dbt-shared/schema-files.js';
|
||||
import { parseDbtSchemaFiles } from '../dbt-descriptions/parse-schema.js';
|
||||
import { chunkDbtProject } from './chunk.js';
|
||||
import { detectDbtStagedDir } from './detect.js';
|
||||
import { fetchDbtRepo, type DbtPullConfig } from './fetch.js';
|
||||
import { parseDbtStagedDir } from './parse.js';
|
||||
|
||||
interface DbtSourceAdapterOptions {
|
||||
homeDir?: string;
|
||||
}
|
||||
|
||||
/**
 * Source adapter for dbt projects: detects a staged project, fetches one from a git repository,
 * and chunks its YAML files into work units alongside the parsed schema artifacts.
 */
export class DbtSourceAdapter implements SourceAdapter {
  readonly source = 'dbt' as const;
  /** Runner merges: ingest_triage, sl_capture, knowledge_capture (see ingest-bundle.runner.ts) */
  readonly skillNames: string[] = ['dbt_ingest'];

  /** @param options - `homeDir` overrides the base directory used for the repository cache. */
  constructor(private readonly options: DbtSourceAdapterOptions = {}) {}

  /** Delegates to `detectDbtStagedDir`. */
  detect(stagedDir: string): Promise<boolean> {
    return detectDbtStagedDir(stagedDir);
  }

  /**
   * Fetches the configured repository into `stagedDir`.
   *
   * The cache directory is `<homeDir or '.klo/cache'>/dbt/<connectionId>`.
   *
   * @throws Error when `pullConfig` is missing or has no `repoUrl`.
   */
  async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
    const config = pullConfig as DbtPullConfig | undefined;
    if (!config?.repoUrl) {
      throw new Error('dbt fetch requires repoUrl');
    }

    const cacheRoot = this.options.homeDir ?? '.klo/cache';
    const cacheDir = join(cacheRoot, 'dbt', ctx.connectionId);
    await fetchDbtRepo({ config, cacheDir, stagedDir });
  }

  /**
   * Chunks the staged project and attaches the schema parse result as `parseArtifacts`.
   *
   * @param stagedDir - Directory holding the staged YAML files.
   * @param diffSet - When given, only work units affected by the diff are returned.
   */
  async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    const project = await parseDbtStagedDir(stagedDir);
    const { projectName, variables } = await loadProjectInfo(stagedDir);
    const schemaFiles = await loadDbtSchemaFiles(stagedDir);
    const parseArtifacts = parseDbtSchemaFiles(schemaFiles, variables, { projectName });
    const chunked = chunkDbtProject(project, { diffSet });
    return { ...chunked, parseArtifacts };
  }
}
|
||||
12
packages/context/src/ingest/adapters/dbt/detect.ts
Normal file
12
packages/context/src/ingest/adapters/dbt/detect.ts
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
import { access } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
|
||||
/**
 * Reports whether `stagedDir` directly contains a dbt project manifest.
 *
 * `dbt_project.yml` is probed first and `dbt_project.yaml` only if that probe fails. Any failure
 * of `access` counts as "not present".
 */
export async function detectDbtStagedDir(stagedDir: string): Promise<boolean> {
  const isAccessible = async (name: 'dbt_project.yml' | 'dbt_project.yaml'): Promise<boolean> => {
    try {
      await access(join(stagedDir, name));
      return true;
    } catch {
      return false;
    }
  };

  // `||` short-circuits, so the second probe only runs when the first one failed.
  return (await isAccessible('dbt_project.yml')) || (await isAccessible('dbt_project.yaml'));
}
|
||||
38
packages/context/src/ingest/adapters/dbt/fetch.test.ts
Normal file
38
packages/context/src/ingest/adapters/dbt/fetch.test.ts
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import { fetchDbtRepo } from './fetch.js';
|
||||
|
||||
describe('fetchDbtRepo', () => {
  let tempDir: string;

  // Fresh scratch directory per test, removed afterwards.
  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-dbt-fetch-'));
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('copies dbt yaml files from a fetched repo subpath into staged dir', async () => {
    const cacheDir = join(tempDir, 'cache');
    const stagedDir = join(tempDir, 'staged');
    const projectDir = join(cacheDir, 'analytics');

    // Pre-populate the cache as if the clone had already happened; the clone itself is stubbed.
    await mkdir(join(projectDir, 'models'), { recursive: true });
    await writeFile(join(projectDir, 'dbt_project.yml'), 'name: analytics\n', 'utf-8');
    await writeFile(join(projectDir, 'models', 'orders.yml'), 'models: []\n', 'utf-8');
    const cloneOrPull = vi.fn(async () => ({ commitHash: 'abc123' }));

    const outcome = await fetchDbtRepo({
      config: { repoUrl: 'https://github.com/acme/dbt.git', path: 'analytics' },
      cacheDir,
      stagedDir,
      deps: { cloneOrPull },
    });
    expect(outcome).toEqual({ commitHash: 'abc123', filesCopied: 2 });

    const stagedProject = await readFile(join(stagedDir, 'dbt_project.yml'), 'utf-8');
    expect(stagedProject).toContain('analytics');
    const stagedModel = await readFile(join(stagedDir, 'models', 'orders.yml'), 'utf-8');
    expect(stagedModel).toContain('models');
  });
});
|
||||
60
packages/context/src/ingest/adapters/dbt/fetch.ts
Normal file
60
packages/context/src/ingest/adapters/dbt/fetch.ts
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
import { access, copyFile, mkdir, readdir } from 'node:fs/promises';
|
||||
import { dirname, join, relative } from 'node:path';
|
||||
import { cloneOrPull, sanitizeRepoError } from '../../repo-fetch.js';
|
||||
|
||||
export interface DbtPullConfig {
|
||||
repoUrl: string;
|
||||
branch?: string;
|
||||
path?: string;
|
||||
authToken?: string | null;
|
||||
}
|
||||
|
||||
export interface FetchDbtRepoParams {
|
||||
config: DbtPullConfig;
|
||||
cacheDir: string;
|
||||
stagedDir: string;
|
||||
deps?: {
|
||||
cloneOrPull?: typeof cloneOrPull;
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Clones or updates the configured repository into `cacheDir`, then copies its YAML files into
 * `stagedDir`.
 *
 * The branch defaults to `main`. When `config.path` is set, only that sub-directory of the cache
 * is copied. `deps.cloneOrPull` replaces the real clone step when supplied.
 *
 * @returns The commit hash reported by the clone step and the number of YAML files copied.
 * @throws Error whose message is produced by `sanitizeRepoError` from the underlying failure and
 *   the auth token.
 */
export async function fetchDbtRepo(params: FetchDbtRepoParams): Promise<{ commitHash: string; filesCopied: number }> {
  try {
    const { config, cacheDir, stagedDir, deps } = params;
    const runCloneOrPull = deps?.cloneOrPull ?? cloneOrPull;
    const { commitHash } = await runCloneOrPull({
      repoUrl: config.repoUrl,
      authToken: config.authToken,
      cacheDir,
      branch: config.branch ?? 'main',
    });

    const sourceRoot = config.path ? join(cacheDir, config.path) : cacheDir;
    const filesCopied = await copyYamlFilesRecursive(sourceRoot, stagedDir);
    return { commitHash, filesCopied };
  } catch (error) {
    throw new Error(sanitizeRepoError(error, params.config.authToken));
  }
}
|
||||
|
||||
/**
 * Copies every `.yml` / `.yaml` file (extension matched case-insensitively) found anywhere under
 * `sourceRoot` to the same relative location under `destRoot`.
 *
 * @returns The number of files copied; `0` when `sourceRoot` is not accessible, in which case
 *   `destRoot` is not created.
 */
async function copyYamlFilesRecursive(sourceRoot: string, destRoot: string): Promise<number> {
  try {
    await access(sourceRoot);
  } catch {
    return 0;
  }

  await mkdir(destRoot, { recursive: true });
  const dirents = await readdir(sourceRoot, { withFileTypes: true, recursive: true });
  const yamlFiles = dirents.filter((dirent) => dirent.isFile() && /\.ya?ml$/i.test(dirent.name));

  for (const file of yamlFiles) {
    const from = join(file.parentPath, file.name);
    const to = join(destRoot, relative(sourceRoot, from));
    // Nested source directories may not exist on the destination side yet.
    await mkdir(dirname(to), { recursive: true });
    await copyFile(from, to);
  }
  return yamlFiles.length;
}
|
||||
8
packages/context/src/ingest/adapters/dbt/parse.test.ts
Normal file
8
packages/context/src/ingest/adapters/dbt/parse.test.ts
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { normalizeDbtPath } from './parse.js';
|
||||
|
||||
describe('normalizeDbtPath', () => {
  it('normalizes Windows separators to POSIX separators', () => {
    const windowsPath = 'models\\marts\\orders.yml';
    const normalized = normalizeDbtPath(windowsPath);
    expect(normalized).toBe('models/marts/orders.yml');
  });
});
|
||||
32
packages/context/src/ingest/adapters/dbt/parse.ts
Normal file
32
packages/context/src/ingest/adapters/dbt/parse.ts
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
import { readdir } from 'node:fs/promises';
|
||||
import { join, relative } from 'node:path';
|
||||
|
||||
const YAML_EXT_RE = /\.(ya?ml)$/i;
|
||||
|
||||
/**
 * Converts every backslash in `path` to a forward slash.
 *
 * @param path - Path using either separator style.
 * @returns The same path with `/` as the only separator.
 */
export function normalizeDbtPath(path: string): string {
  return path.split('\\').join('/');
}
|
||||
|
||||
/**
 * Lists every `.yml` / `.yaml` file under `stagedDir`, recursively.
 *
 * @returns Paths relative to `stagedDir`, with `/` separators, sorted.
 */
async function collectYamlFiles(stagedDir: string): Promise<string[]> {
  const dirents = await readdir(stagedDir, { withFileTypes: true, recursive: true });
  const yamlPaths = dirents
    .filter((dirent) => dirent.isFile() && YAML_EXT_RE.test(dirent.name))
    .map((dirent) => normalizeDbtPath(relative(stagedDir, join(dirent.parentPath, dirent.name))));
  yamlPaths.sort();
  return yamlPaths;
}
|
||||
|
||||
export interface ParsedDbtProject {
|
||||
/** All `.yml` / `.yaml` paths under stagedDir, relative + sorted. */
|
||||
allPaths: string[];
|
||||
}
|
||||
|
||||
/**
 * Produces the parsed view of a staged dbt project, which is the list of its YAML paths.
 *
 * @param stagedDir - Directory holding the staged project files.
 */
export async function parseDbtStagedDir(stagedDir: string): Promise<ParsedDbtProject> {
  return { allPaths: await collectYamlFiles(stagedDir) };
}
|
||||
48
packages/context/src/ingest/adapters/fake/fake.adapter.ts
Normal file
48
packages/context/src/ingest/adapters/fake/fake.adapter.ts
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
import { readdir } from 'node:fs/promises';
|
||||
import { join, relative } from 'node:path';
|
||||
import type { ChunkResult, DiffSet, SourceAdapter, WorkUnit } from '../../types.js';
|
||||
|
||||
/**
 * Minimal adapter that always detects successfully and emits one work unit per non-empty
 * top-level sub-directory of the staged directory.
 */
export class FakeSourceAdapter implements SourceAdapter {
  readonly source = 'fake';
  readonly skillNames: string[] = [];

  /** Always resolves to `true`. */
  detect(): Promise<boolean> {
    return Promise.resolve(true);
  }

  /**
   * Builds a `fake-<subDir>` work unit for each top-level directory that contains at least one
   * file (searched recursively). Files sitting directly in `stagedDir` are ignored.
   *
   * With a `diffSet`, a directory is emitted only if one of its files was added or modified, and
   * the diff's deleted paths are reported as `eviction.deletedRawPaths`.
   */
  async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    const topLevel = await readdir(stagedDir, { withFileTypes: true });
    const subDirs = topLevel
      .filter((dirent) => dirent.isDirectory())
      .map((dirent) => dirent.name)
      .sort();

    const workUnits: WorkUnit[] = [];
    for (const subDir of subDirs) {
      const nested = await readdir(join(stagedDir, subDir), { withFileTypes: true, recursive: true });
      const rawFiles = nested
        .filter((dirent) => dirent.isFile())
        .map((dirent) => relative(stagedDir, join(dirent.parentPath, dirent.name)))
        .sort();
      if (rawFiles.length === 0) {
        continue;
      }

      if (diffSet) {
        const touched = new Set([...diffSet.added, ...diffSet.modified]);
        if (!rawFiles.some((p) => touched.has(p))) {
          continue;
        }
      }

      workUnits.push({
        unitKey: `fake-${subDir}`,
        displayLabel: subDir,
        rawFiles,
        peerFileIndex: [],
        dependencyPaths: [],
      });
    }

    const deleted = diffSet ? diffSet.deleted : [];
    const eviction = deleted.length > 0 ? { deletedRawPaths: [...deleted] } : undefined;
    return { workUnits, eviction };
  }
}
|
||||
|
|
@ -0,0 +1,146 @@
|
|||
{
|
||||
"name": "eviction-churn",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": [
|
||||
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn"
|
||||
]
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"deallocCount": 3,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "501",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 20,
|
||||
"totalExecTime": 500,
|
||||
"meanExecTime": 25,
|
||||
"totalRows": 40
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": null,
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q501": {
|
||||
"firstObservedAt": "2026-05-08T12:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 20,
|
||||
"totalExecTime": 500,
|
||||
"totalRows": 40
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T08:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [
|
||||
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn",
|
||||
"pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn",
|
||||
"baseline_first_run:no_previous_pgss_baseline"
|
||||
],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"baselineFirstRun": true,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 3,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q501",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q501/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q501/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q501",
|
||||
"title": "postgres · analytics.orders [db5_q501]",
|
||||
"path": "templates/db5_q501/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "mid",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q501/page.md": {
|
||||
"text": "# db5_q501\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q501/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 20,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T12:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 25,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 40
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,144 @@
|
|||
{
|
||||
"name": "first-run",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": []
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"deallocCount": 0,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "101",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 10,
|
||||
"totalExecTime": 250,
|
||||
"meanExecTime": 25,
|
||||
"totalRows": 20
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [
|
||||
"^svc_"
|
||||
],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": null,
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q101": {
|
||||
"firstObservedAt": "2026-05-08T12:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 10,
|
||||
"totalExecTime": 250,
|
||||
"totalRows": 20
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T08:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [
|
||||
"baseline_first_run:no_previous_pgss_baseline"
|
||||
],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"baselineFirstRun": true,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 0,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q101",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q101/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q101/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q101",
|
||||
"title": "postgres · analytics.orders [db5_q101]",
|
||||
"path": "templates/db5_q101/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "mid",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q101/page.md": {
|
||||
"text": "# db5_q101\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q101/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 10,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T12:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 25,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 20
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,181 @@
|
|||
{
|
||||
"name": "normal-delta",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": []
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"deallocCount": 0,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "201",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 12,
|
||||
"totalExecTime": 160,
|
||||
"meanExecTime": 13.333333333333334,
|
||||
"totalRows": 58
|
||||
},
|
||||
{
|
||||
"queryid": "201",
|
||||
"userid": "12",
|
||||
"username": "svc_loader",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 5,
|
||||
"totalExecTime": 50,
|
||||
"meanExecTime": 10,
|
||||
"totalRows": 25
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [
|
||||
"^svc_"
|
||||
],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T10:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q201": {
|
||||
"firstObservedAt": "2026-05-08T09:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 10,
|
||||
"totalExecTime": 100,
|
||||
"totalRows": 50
|
||||
},
|
||||
"12": {
|
||||
"calls": 5,
|
||||
"totalExecTime": 50,
|
||||
"totalRows": 25
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q201": {
|
||||
"firstObservedAt": "2026-05-08T09:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 12,
|
||||
"totalExecTime": 160,
|
||||
"totalRows": 58
|
||||
},
|
||||
"12": {
|
||||
"calls": 5,
|
||||
"totalExecTime": 50,
|
||||
"totalRows": 25
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T10:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"baselineFirstRun": false,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 0,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q201",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q201/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q201/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q201",
|
||||
"title": "postgres · analytics.orders [db5_q201]",
|
||||
"path": "templates/db5_q201/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "low",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q201/page.md": {
|
||||
"text": "# db5_q201\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q201/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 2,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T09:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 30,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 8
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
{
|
||||
"name": "reset-detected",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": []
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T11:00:00.000Z",
|
||||
"deallocCount": 0,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "301",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 3,
|
||||
"totalExecTime": 90,
|
||||
"meanExecTime": 30,
|
||||
"totalRows": 9
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T10:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q301": {
|
||||
"firstObservedAt": "2026-05-08T09:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 100,
|
||||
"totalExecTime": 1000,
|
||||
"totalRows": 500
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T11:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q301": {
|
||||
"firstObservedAt": "2026-05-08T12:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 3,
|
||||
"totalExecTime": 90,
|
||||
"totalRows": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T10:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [
|
||||
"baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z"
|
||||
],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T11:00:00.000Z",
|
||||
"baselineFirstRun": true,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 0,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q301",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q301/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q301/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q301",
|
||||
"title": "postgres · analytics.orders [db5_q301]",
|
||||
"path": "templates/db5_q301/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "mid",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q301/page.md": {
|
||||
"text": "# db5_q301\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q301/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 3,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T12:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 30,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 9
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
{
|
||||
"name": "version-change",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": []
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"deallocCount": 0,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "401",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 4,
|
||||
"totalExecTime": 80,
|
||||
"meanExecTime": 20,
|
||||
"totalRows": 8
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T10:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 15.7",
|
||||
"templates": {
|
||||
"db5_q401": {
|
||||
"firstObservedAt": "2026-05-08T09:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 100,
|
||||
"totalExecTime": 1000,
|
||||
"totalRows": 500
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q401": {
|
||||
"firstObservedAt": "2026-05-08T12:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 4,
|
||||
"totalExecTime": 80,
|
||||
"totalRows": 8
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T10:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [
|
||||
"baseline_reset:pg_server_major changed from 15 to 16"
|
||||
],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"baselineFirstRun": true,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 0,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q401",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q401/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q401/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q401",
|
||||
"title": "postgres · analytics.orders [db5_q401]",
|
||||
"path": "templates/db5_q401/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "mid",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q401/page.md": {
|
||||
"text": "# db5_q401\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q401/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 4,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T12:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 20,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 8
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,200 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { BigQueryHistoricSqlQueryHistoryReader } from './bigquery-query-history-reader.js';
|
||||
import { HistoricSqlGrantsMissingError } from './errors.js';
|
||||
|
||||
/** Canned response handed back by the stub query client, one per expected query. */
interface FakeQueryResult {
  headers: string[];
  rows: unknown[][];
  totalRows: number;
  error?: string;
}

/**
 * Builds a stub query client that replays `results` in order.
 *
 * Each call to `executeQuery` consumes (removes) the next queued result from the
 * array that was passed in; once the queue is empty any further call throws, so a
 * test fails loudly when the reader issues more queries than expected.
 *
 * @param results - Responses to return, first call first. The array is mutated.
 * @returns An object exposing `executeQuery` as a vitest mock.
 */
function queryClient(results: FakeQueryResult[]) {
  const executeQuery = vi.fn(async (_query: string) => {
    const [upcoming] = results.splice(0, 1);
    if (upcoming === undefined) {
      throw new Error('unexpected query');
    }
    return upcoming;
  });
  return { executeQuery };
}
|
||||
|
||||
/**
 * Returns the SQL text of the first query the stub client received.
 *
 * @param client - Stub produced by `queryClient`.
 * @returns The first argument of the first recorded `executeQuery` call.
 * @throws Error when the stub was never called.
 */
function firstQuery(client: ReturnType<typeof queryClient>): string {
  const [earliestCall] = client.executeQuery.mock.calls;
  if (earliestCall === undefined) {
    throw new Error('expected query client to be called');
  }
  const [sql] = earliestCall;
  return sql;
}
|
||||
|
||||
describe('BigQueryHistoricSqlQueryHistoryReader', () => {
  /** Builds a reader for the project shared by most cases, varying only the region. */
  const readerIn = (region: string) =>
    new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region });

  /** Drains an async iterable into an array so it can be asserted on in one go. */
  async function collect<T>(source: AsyncIterable<T>): Promise<T[]> {
    const items: T[] = [];
    for await (const item of source) {
      items.push(item);
    }
    return items;
  }

  it('probes region-qualified INFORMATION_SCHEMA.JOBS_BY_PROJECT', async () => {
    const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);

    await expect(readerIn('US').probe(client)).resolves.toBeUndefined();

    expect(client.executeQuery).toHaveBeenCalledWith(
      'SELECT 1 FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` LIMIT 1',
    );
  });

  it('turns probe result errors into HistoricSqlGrantsMissingError', async () => {
    // The client resolves normally but reports the failure in the result payload.
    const deniedResult = { headers: [], rows: [], totalRows: 0, error: 'Access Denied: jobs.listAll' };
    const client = queryClient([deniedResult]);

    await expect(readerIn('us-central1').probe(client)).rejects.toMatchObject({
      name: 'HistoricSqlGrantsMissingError',
      dialect: 'bigquery',
      remediation:
        'Grant roles/bigquery.resourceViewer on the BigQuery project, or grant a custom role containing bigquery.jobs.listAll.',
    });
  });

  it('turns thrown probe failures into HistoricSqlGrantsMissingError', async () => {
    // Here the client itself rejects instead of returning an error payload.
    const executeQuery = vi.fn(async () => {
      throw new Error('permission denied');
    });
    const client = { executeQuery };

    await expect(readerIn('US').probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
  });

  it('fetches BigQuery jobs with cursor and maps them into RawQueryRow shape without rowsProduced', async () => {
    const jobColumns = [
      'job_id',
      'query',
      'user_email',
      'creation_time',
      'end_time',
      'runtime_ms',
      'total_slot_ms',
      'total_bytes_processed',
      'state',
      'error_reason',
      'error_message',
      'statement_type',
    ];
    // A finished job with string timestamps and no error.
    const succeededJob = [
      'bquxjob_1',
      "SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
      'analyst-a@example.test',
      '2026-05-04T10:00:00.000Z',
      '2026-05-04T10:00:01.250Z',
      1250,
      3106,
      161164718,
      'DONE',
      null,
      null,
      'SELECT',
    ];
    // A failed job: Date-typed creation time, no end time or runtime, error populated.
    const failedJob = [
      'bquxjob_2',
      'SELECT * FROM `project-1.analytics.missing_table`',
      'analyst-b@example.test',
      new Date('2026-05-04T10:05:00.000Z'),
      null,
      null,
      0,
      0,
      'DONE',
      'notFound',
      'Not found: Table project-1.analytics.missing_table',
      'SELECT',
    ];
    const client = queryClient([{ headers: jobColumns, rows: [succeededJob, failedJob], totalRows: 2 }]);
    const window = {
      start: new Date('2026-05-01T00:00:00.000Z'),
      end: new Date('2026-05-04T12:00:00.000Z'),
    };

    const rows = await collect(readerIn('US').fetch(client, window, '2026-05-03T00:00:00.000Z'));

    expect(client.executeQuery).toHaveBeenCalledTimes(1);
    const sql = firstQuery(client);
    const expectedFragments = [
      'FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`',
      // The cursor, not the window start, is the lower bound.
      "creation_time >= TIMESTAMP('2026-05-03T00:00:00.000Z')",
      "creation_time < TIMESTAMP('2026-05-04T12:00:00.000Z')",
      "job_type = 'QUERY'",
      "(statement_type IS NULL OR statement_type != 'SCRIPT')",
      'ORDER BY creation_time ASC, job_id ASC',
      'total_slot_ms',
      'total_bytes_processed',
    ];
    for (const fragment of expectedFragments) {
      expect(sql).toContain(fragment);
    }
    expect(sql).not.toMatch(/total_rows/i);

    expect(rows).toEqual([
      {
        id: 'bquxjob_1',
        sql: "SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
        user: 'analyst-a@example.test',
        startedAt: '2026-05-04T10:00:00.000Z',
        endedAt: '2026-05-04T10:00:01.250Z',
        runtimeMs: 1250,
        success: true,
        errorMessage: null,
      },
      {
        id: 'bquxjob_2',
        sql: 'SELECT * FROM `project-1.analytics.missing_table`',
        user: 'analyst-b@example.test',
        startedAt: '2026-05-04T10:05:00.000Z',
        endedAt: null,
        runtimeMs: null,
        success: false,
        errorMessage: 'notFound: Not found: Table project-1.analytics.missing_table',
      },
    ]);
  });

  it('uses the window start when no cursor is available', async () => {
    const client = queryClient([{ headers: ['job_id'], rows: [], totalRows: 0 }]);
    const window = {
      start: new Date('2026-02-03T12:00:00.000Z'),
      end: new Date('2026-05-04T12:00:00.000Z'),
    };

    for await (const _row of readerIn('EU').fetch(client, window)) {
      throw new Error('empty result should not yield rows');
    }

    const sql = firstQuery(client);
    expect(sql).toContain('FROM `project-1.region-eu.INFORMATION_SCHEMA.JOBS_BY_PROJECT`');
    expect(sql).toContain("creation_time >= TIMESTAMP('2026-02-03T12:00:00.000Z')");
  });

  it('throws a clear error when the query client cannot execute SQL', async () => {
    const reader = readerIn('US');

    // An empty object has no executeQuery, so the first pull from the iterator must fail.
    const iterateWithBrokenClient = async () => {
      for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) {
        throw new Error('unreachable');
      }
    };

    await expect(iterateWithBrokenClient).rejects.toThrow(
      'Historic SQL BigQuery reader requires a query client with executeQuery(query)',
    );
  });

  it('rejects unsafe project and region identifiers before building SQL', () => {
    const withBacktickInProject = () =>
      new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project`1', region: 'US' });
    const withStatementInRegion = () =>
      new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US;DROP' });

    expect(withBacktickInProject).toThrow('Invalid BigQuery project id for historic-SQL ingest: project`1');
    expect(withStatementInRegion).toThrow('Invalid BigQuery region for historic-SQL ingest: US;DROP');
  });
});
|
||||
|
|
@ -0,0 +1,219 @@
|
|||
import { HistoricSqlGrantsMissingError } from './errors.js';
|
||||
import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js';
|
||||
|
||||
/**
 * Minimal structural view of a query result as consumed by this reader.
 *
 * `headers` names the columns positionally and `rows` holds one array of cells per
 * record. A truthy `error` is treated by the reader as a failed query.
 */
interface QueryResultLike {
  headers: string[];
  rows: unknown[][];
  totalRows: number;
  error?: string;
}

/** Minimal structural view of a client able to run a SQL string. */
interface QueryClientLike {
  executeQuery(query: string): Promise<QueryResultLike>;
}

/** Construction options identifying which project/region job history to read. */
export interface BigQueryHistoricSqlQueryHistoryReaderOptions {
  projectId: string;
  region: string;
}

/** Remediation text attached to every grants error raised by this reader. */
const BIGQUERY_GRANTS_REMEDIATION =
  'Grant roles/bigquery.resourceViewer on the BigQuery project, or grant a custom role containing bigquery.jobs.listAll.';
|
||||
|
||||
/**
 * Narrows an untyped client handle to one that exposes `executeQuery`.
 *
 * @param client - Value supplied by the caller; nothing is assumed about its shape.
 * @returns The same object, typed as a query client.
 * @throws Error when `client` is not a non-null object with a callable `executeQuery`.
 */
function queryClient(client: unknown): QueryClientLike {
  if (typeof client === 'object' && client !== null && 'executeQuery' in client) {
    const candidate = (client as { executeQuery?: unknown }).executeQuery;
    if (typeof candidate === 'function') {
      return client as QueryClientLike;
    }
  }
  throw new Error('Historic SQL BigQuery reader requires a query client with executeQuery(query)');
}
|
||||
|
||||
/**
 * Wraps a failure in a `HistoricSqlGrantsMissingError` for the BigQuery dialect.
 *
 * The detail appended to the message is the `Error` message, or the string itself
 * when a string was given; any other kind of cause gets a generic description.
 *
 * @param cause - The original failure; passed through unchanged as `cause`.
 * @returns A new error carrying the standard BigQuery remediation text.
 */
function grantsError(cause: unknown): HistoricSqlGrantsMissingError {
  let detail = 'BigQuery principal cannot query INFORMATION_SCHEMA.JOBS_BY_PROJECT.';
  if (cause instanceof Error) {
    detail = cause.message;
  } else if (typeof cause === 'string') {
    detail = cause;
  }

  return new HistoricSqlGrantsMissingError({
    dialect: 'bigquery',
    message: `Missing BigQuery audit grants for historic-SQL ingest: ${detail}`,
    remediation: BIGQUERY_GRANTS_REMEDIATION,
    cause,
  });
}
|
||||
|
||||
/**
 * Validates a project id before it is interpolated into a backtick-quoted path.
 *
 * Only ASCII letters, digits, `_` and `-` are accepted; the value is returned as-is.
 *
 * @param value - Project id from the reader options.
 * @returns `value`, unchanged.
 * @throws Error when `value` is empty or contains any other character.
 */
function normalizeProjectId(value: string): string {
  const wellFormed = /^[A-Za-z0-9_-]+$/.test(value);
  if (wellFormed) {
    return value;
  }
  throw new Error(`Invalid BigQuery project id for historic-SQL ingest: ${value}`);
}
|
||||
|
||||
/**
 * Canonicalizes a region name for use in a `region-<name>` path qualifier.
 *
 * The input is trimmed and lower-cased, and a leading `region-` prefix is dropped so
 * callers may pass either form. What remains must consist of `a-z`, digits and `-`.
 *
 * @param value - Region from the reader options.
 * @returns The bare lower-case region, without the `region-` prefix.
 * @throws Error (quoting the original input) when the remainder is empty or has other characters.
 */
function normalizeRegion(value: string): string {
  const lowered = value.trim().toLowerCase();
  const region = lowered.replace(/^region-/, '');
  if (/^[a-z0-9-]+$/.test(region)) {
    return region;
  }
  throw new Error(`Invalid BigQuery region for historic-SQL ingest: ${value}`);
}
|
||||
|
||||
/**
 * Renders a point in time as a BigQuery `TIMESTAMP('…')` literal.
 *
 * The text placed inside the quotes is always the output of `Date.prototype.toISOString()`,
 * never the caller's string, so it cannot contain a quote character and needs no escaping.
 * Input that does not parse to a valid date is rejected before any SQL is produced.
 *
 * @param value - A `Date`, or a string accepted by the `Date` constructor.
 * @returns The literal, e.g. `TIMESTAMP('2026-05-03T00:00:00.000Z')`.
 * @throws Error when `value` does not represent a valid date.
 */
function timestampExpression(value: Date | string): string {
  const date = value instanceof Date ? value : new Date(value);
  if (Number.isNaN(date.getTime())) {
    throw new Error(`Invalid BigQuery query-history timestamp: ${String(value)}`);
  }
  return `TIMESTAMP('${date.toISOString()}')`;
}
|
||||
|
||||
/**
 * Builds a lookup from upper-cased column name to its position in a result row.
 *
 * When the same name (ignoring case) appears more than once, the last position wins.
 *
 * @param headers - Column names in result order.
 * @returns Map keyed by `header.toUpperCase()`.
 */
function indexByHeader(headers: string[]): Map<string, number> {
  const pairs = headers.map((header, position): [string, number] => [header.toUpperCase(), position]);
  return new Map<string, number>(pairs);
}
|
||||
|
||||
/**
 * Reads one cell from a result row by column name, ignoring the name's case.
 *
 * @param row - Positional cells of a single record.
 * @param indexes - Lookup of upper-cased column name to position.
 * @param name - Column to read.
 * @returns The cell at that column's position, or `null` when the column is not in `indexes`.
 */
function value(row: unknown[], indexes: Map<string, number>, name: string): unknown {
  const position = indexes.get(name.toUpperCase());
  if (position === undefined) {
    return null;
  }
  return row[position];
}
|
||||
|
||||
/**
 * Converts a cell to text, collapsing "no value" to `null`.
 *
 * @param raw - Any cell value.
 * @returns `null` for `null`, `undefined`, or anything whose `String()` form is empty;
 *   otherwise the `String()` form.
 */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text === '' ? null : text;
}
|
||||
|
||||
/**
 * Converts a cell to text and insists that it is present.
 *
 * @param raw - Cell value.
 * @param field - Column name, used only in the error message.
 * @returns The non-empty text form of `raw`.
 * @throws Error when `nullableString` yields no text for `raw`.
 */
function requiredString(raw: unknown, field: string): string {
  const text = nullableString(raw);
  if (text) {
    return text;
  }
  throw new Error(`BigQuery JOBS_BY_PROJECT row is missing ${field}`);
}
|
||||
|
||||
/**
 * Converts a cell to a non-negative finite number, or `null` when there is no usable value.
 *
 * @param raw - Cell value; numbers are used directly, anything else goes through `Number()`.
 * @returns `null` for `null`, `undefined`, empty or whitespace-only strings, and anything
 *   that does not convert to a finite number; otherwise the number, with negatives raised to 0.
 */
function nullableNumber(raw: unknown): number | null {
  if (raw === null || raw === undefined) {
    return null;
  }
  // Number('') and Number('   ') are both 0, which would report a blank cell as a
  // measured zero; treat every blank string as "no value" instead.
  if (typeof raw === 'string' && raw.trim() === '') {
    return null;
  }
  const parsed = typeof raw === 'number' ? raw : Number(raw);
  if (!Number.isFinite(parsed)) {
    return null;
  }
  return Math.max(0, parsed);
}
|
||||
|
||||
/**
 * Normalizes a timestamp cell to an ISO-8601 string.
 *
 * `Date` instances are serialized directly. Any other value must have a non-empty
 * text form that the `Date` constructor can parse.
 *
 * @param raw - Cell value, either a `Date` or something with a parseable text form.
 * @param field - Column name, used only in error messages.
 * @returns The timestamp as produced by `toISOString()`.
 * @throws Error when the value is missing or its text does not parse to a valid date.
 */
function isoTimestamp(raw: unknown, field: string): string {
  if (!(raw instanceof Date)) {
    const text = requiredString(raw, field);
    const parsed = new Date(text);
    if (Number.isNaN(parsed.getTime())) {
      throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${text}`);
    }
    return parsed.toISOString();
  }
  return raw.toISOString();
}
|
||||
|
||||
/**
 * Like `isoTimestamp`, but lets an absent value through as `null`.
 *
 * Errors for present-but-invalid values name the column as `end_time`.
 *
 * @param raw - Cell value.
 * @returns `null` for `null`, `undefined` or `''`; otherwise the ISO-8601 string.
 */
function nullableIsoTimestamp(raw: unknown): string | null {
  const absent = raw === null || raw === undefined || raw === '';
  return absent ? null : isoTimestamp(raw, 'end_time');
}
|
||||
|
||||
/**
 * Decides whether a job row represents a successful execution.
 *
 * Any non-empty error reason or message means failure. Without one, the job counts as
 * successful when its state is absent or equals `DONE` ignoring case.
 *
 * @param state - Job state, or `null` when not reported.
 * @param errorReason - Error reason, or `null`.
 * @param errorMessage - Error message, or `null`.
 * @returns `true` only for an error-free job that is done or has no state.
 */
function executionSucceeded(state: string | null, errorReason: string | null, errorMessage: string | null): boolean {
  const reportedError = Boolean(errorReason || errorMessage);
  return !reportedError && (state === null || state.toUpperCase() === 'DONE');
}
|
||||
|
||||
/**
 * Merges an error reason and message into a single display string.
 *
 * @param errorReason - Error reason, or `null`.
 * @param errorMessage - Error message, or `null`.
 * @returns `"<reason>: <message>"` when both are non-empty; otherwise the message if it
 *   is not `null`, else the reason (which may itself be `null`).
 */
function combinedErrorMessage(errorReason: string | null, errorMessage: string | null): string | null {
  const haveBoth = Boolean(errorReason) && Boolean(errorMessage);
  return haveBoth ? `${errorReason}: ${errorMessage}` : (errorMessage ?? errorReason);
}
|
||||
|
||||
/**
 * Translates one positional result row into the reader's raw query-row shape.
 *
 * `job_id`, `query` and `creation_time` are mandatory and raise when absent; every other
 * column degrades to `null`. Success and the error text are both derived from the
 * `error_reason` / `error_message` pair (plus `state` for success).
 *
 * @param row - Positional cells of a single record.
 * @param indexes - Lookup of upper-cased column name to position.
 * @returns The mapped row.
 * @throws Error when a mandatory column is missing or `creation_time`/`end_time` is unparseable.
 */
function mapRow(row: unknown[], indexes: Map<string, number>): HistoricSqlRawQueryRow {
  const cell = (column: string): unknown => value(row, indexes, column);

  const errorReason = nullableString(cell('error_reason'));
  const errorMessage = nullableString(cell('error_message'));

  return {
    id: requiredString(cell('job_id'), 'job_id'),
    sql: requiredString(cell('query'), 'query'),
    user: nullableString(cell('user_email')),
    startedAt: isoTimestamp(cell('creation_time'), 'creation_time'),
    endedAt: nullableIsoTimestamp(cell('end_time')),
    runtimeMs: nullableNumber(cell('runtime_ms')),
    success: executionSucceeded(nullableString(cell('state')), errorReason, errorMessage),
    errorMessage: combinedErrorMessage(errorReason, errorMessage),
  };
}
|
||||
|
||||
/**
 * Reads query history for historic-SQL ingest from a project's region-qualified
 * `INFORMATION_SCHEMA.JOBS_BY_PROJECT` view.
 *
 * The client passed to `probe`/`fetch` is untyped; it only has to expose
 * `executeQuery(query)` resolving to a headers/rows result.
 */
export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader {
  /** Backtick-quoted `project.region-<region>.INFORMATION_SCHEMA.JOBS_BY_PROJECT` path. */
  private readonly viewPath: string;

  /**
   * @param options - Project id and region; both are validated before being placed in SQL.
   * @throws Error when the project id or region contains characters outside the allowed sets.
   */
  constructor(options: BigQueryHistoricSqlQueryHistoryReaderOptions) {
    const projectId = normalizeProjectId(options.projectId);
    const region = normalizeRegion(options.region);
    this.viewPath = `\`${projectId}.region-${region}.INFORMATION_SCHEMA.JOBS_BY_PROJECT\``;
  }

  /**
   * Checks that the jobs view can be queried by running a one-row select against it.
   *
   * @param client - Query client exposing `executeQuery(query)`.
   * @throws Error when `client` has no callable `executeQuery` (a wiring mistake, reported
   *   as-is so it is not mistaken for a permissions problem).
   * @throws HistoricSqlGrantsMissingError when the query rejects or the result carries an `error`.
   */
  async probe(client: unknown): Promise<void> {
    // Resolve the client outside the try block: an unusable handle is not a missing
    // grant and must not be wrapped with IAM remediation advice.
    const runner = queryClient(client);

    let result: QueryResultLike;
    try {
      result = await runner.executeQuery(`SELECT 1 FROM ${this.viewPath} LIMIT 1`);
    } catch (error) {
      throw grantsError(error);
    }
    if (result.error) {
      throw grantsError(result.error);
    }
  }

  /**
   * Streams query jobs created in `[cursor ?? window.start, window.end)`, oldest first.
   *
   * Only `QUERY` jobs with non-null text are selected, and `SCRIPT` statements are
   * excluded. The whole result comes from a single `executeQuery` call and is then
   * yielded row by row.
   *
   * @param client - Query client exposing `executeQuery(query)`.
   * @param window - Time window; `end` is always the exclusive upper bound.
   * @param cursor - When given (not `null`/`undefined`), replaces `window.start` as the lower bound.
   * @returns An async iterable of mapped rows.
   * @throws Error when a bound is not a valid date, the client is unusable, or a row lacks a
   *   mandatory column.
   * @throws HistoricSqlGrantsMissingError when the result carries an `error`.
   *   NOTE(review): every result error is reported as a grants problem here, including
   *   ones that may be unrelated to permissions — confirm this is intended.
   */
  async *fetch(
    client: unknown,
    window: HistoricSqlTimeWindow,
    cursor?: string | null,
  ): AsyncIterable<HistoricSqlRawQueryRow> {
    const start = timestampExpression(cursor ?? window.start);
    const end = timestampExpression(window.end);
    const sql = `
      SELECT
        job_id,
        query,
        user_email,
        creation_time,
        end_time,
        TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND) AS runtime_ms,
        total_slot_ms,
        total_bytes_processed,
        state,
        error_result.reason AS error_reason,
        error_result.message AS error_message,
        statement_type
      FROM ${this.viewPath}
      WHERE creation_time >= ${start}
        AND creation_time < ${end}
        AND job_type = 'QUERY'
        AND query IS NOT NULL
        AND (statement_type IS NULL OR statement_type != 'SCRIPT')
      ORDER BY creation_time ASC, job_id ASC`.trim();

    const result = await queryClient(client).executeQuery(sql);
    if (result.error) {
      throw grantsError(result.error);
    }

    // Columns are located by name, so the mapping does not depend on result column order.
    const indexes = indexByHeader(result.headers);
    for (const row of result.rows) {
      yield mapRow(row, indexes);
    }
  }
}
|
||||
251
packages/context/src/ingest/adapters/historic-sql/chunk.test.ts
Normal file
251
packages/context/src/ingest/adapters/historic-sql/chunk.test.ts
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
|
||||
|
||||
/** Creates a fresh, uniquely named scratch directory for a chunk test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-chunk-');
  return mkdtemp(prefix);
}
|
||||
|
||||
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline) to
 * `relPath` under `root`, creating any missing parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const destination = join(root, relPath);
  const parentDir = join(destination, '..');
  await mkdir(parentDir, { recursive: true });
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(destination, payload, 'utf-8');
}
|
||||
|
||||
/**
 * Stages a single-template historic-SQL fixture under `root`:
 * `manifest.json` plus `metadata.json`, `page.md`, and `usage.json` for template `fp_1`.
 */
async function writeTemplate(root: string): Promise<void> {
  const manifest = {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 1,
    capped: false,
    warnings: ['source warning'],
    templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }],
  };

  const metadata = {
    id: 'fp_1',
    title: 'snowflake · analytics.orders [fp_1]',
    path: 'templates/fp_1/page.md',
    objectType: 'historic_sql_template',
    lastEditedAt: null,
    properties: {
      fingerprint: 'fp_1',
      sub_cluster_id: null,
      dialect: 'snowflake',
      tables_touched: ['analytics.orders'],
      literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
      triage_signals: {
        executions_bucket: 'high',
        distinct_users_bucket: 'team',
        error_rate_bucket: 'ok',
        recency_bucket: 'active',
        service_account_only: 'false',
        slot_summary: '1 constant, 0 runtime',
      },
    },
  };

  const usage = {
    stats: {
      executions: 20,
      distinct_users: 3,
      first_seen: '2026-05-01T00:00:00.000Z',
      last_seen: '2026-05-04T11:55:00.000Z',
      p50_runtime_ms: 100,
      p95_runtime_ms: 200,
      error_rate: 0,
      rows_produced: 20,
    },
    literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
    samples: [],
  };

  // metadata.json is written before page.md so the template directory already exists.
  await writeJson(root, 'manifest.json', manifest);
  await writeJson(root, 'templates/fp_1/metadata.json', metadata);
  await writeFile(join(root, 'templates/fp_1/page.md'), '# fp_1\n', 'utf-8');
  await writeJson(root, 'templates/fp_1/usage.json', usage);
}
|
||||
|
||||
/**
 * Stages a fixture with two categorical sub-clusters of the same fingerprint
 * (`fp_order_status`): a manifest listing both, plus `metadata.json`, `page.md`,
 * and `usage.json` for each sub-cluster template.
 */
async function writeSubclusterTemplates(root: string): Promise<void> {
  const fingerprint = 'fp_order_status';
  const templates = ['cat_2b2ff2318877', 'cat_34f037ddcbfa'].map((subClusterId) => ({
    id: `${fingerprint}__${subClusterId}`,
    subClusterId,
  }));

  await writeJson(root, 'manifest.json', {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 2,
    capped: false,
    warnings: [],
    templates: templates.map(({ id, subClusterId }) => ({
      id,
      fingerprint,
      subClusterId,
      path: `templates/${id}/page.md`,
    })),
  });

  for (const { id, subClusterId } of templates) {
    const templateDir = `templates/${id}`;

    await writeJson(root, `${templateDir}/metadata.json`, {
      id,
      title: `snowflake · analytics.orders [fp_ord:${subClusterId.slice(-6)}]`,
      path: `${templateDir}/page.md`,
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint,
        sub_cluster_id: subClusterId,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
        triage_signals: {
          executions_bucket: 'mid',
          distinct_users_bucket: 'team',
          error_rate_bucket: 'ok',
          recency_bucket: 'active',
          service_account_only: 'false',
          slot_summary: '0 constant, 0 runtime',
        },
      },
    });

    await writeFile(join(root, `${templateDir}/page.md`), `# ${id}\n`, 'utf-8');

    await writeJson(root, `${templateDir}/usage.json`, {
      stats: {
        executions: 3,
        distinct_users: 3,
        first_seen: '2026-05-04T10:00:00.000Z',
        last_seen: '2026-05-04T10:05:00.000Z',
        p50_runtime_ms: 120,
        p95_runtime_ms: 150,
        error_rate: 0,
        rows_produced: 36,
      },
      literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }],
      samples: [],
    });
  }
}
|
||||
|
||||
// Behavioural tests for the historic-SQL chunker and its scope descriptor.
describe('chunkHistoricSqlStagedDir', () => {
  it('emits one WorkUnit per changed template and keeps usage as dependency', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);

    // Only metadata.json is reported as added; page.md and usage.json are unchanged.
    const { workUnits, contextReport } = await chunkHistoricSqlStagedDir(stagedDir, {
      added: ['templates/fp_1/metadata.json'],
      modified: [],
      deleted: [],
      unchanged: ['templates/fp_1/page.md', 'templates/fp_1/usage.json', 'manifest.json'],
    });

    const expectedUnit = {
      unitKey: 'historic-sql-fp-1',
      displayLabel: 'snowflake · analytics.orders [fp_1]',
      rawFiles: ['templates/fp_1/metadata.json'],
      dependencyPaths: ['manifest.json', 'templates/fp_1/usage.json'],
      peerFileIndex: ['templates/fp_1/page.md'],
      notes:
        'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
    };
    expect(workUnits).toEqual([expectedUnit]);
    expect(contextReport).toEqual({ capped: false, warnings: ['source warning'] });
  });

  it('emits one WorkUnit per changed categorical sub-cluster', async () => {
    const stagedDir = await tempDir();
    await writeSubclusterTemplates(stagedDir);

    const result = await chunkHistoricSqlStagedDir(stagedDir, {
      added: [
        'templates/fp_order_status__cat_2b2ff2318877/metadata.json',
        'templates/fp_order_status__cat_34f037ddcbfa/metadata.json',
      ],
      modified: [],
      deleted: [],
      unchanged: [
        'manifest.json',
        'templates/fp_order_status__cat_2b2ff2318877/page.md',
        'templates/fp_order_status__cat_2b2ff2318877/usage.json',
        'templates/fp_order_status__cat_34f037ddcbfa/page.md',
        'templates/fp_order_status__cat_34f037ddcbfa/usage.json',
      ],
    });

    // Compare only the identifying fields; peerFileIndex and notes are covered above.
    const summaries = result.workUnits.map(({ unitKey, displayLabel, rawFiles, dependencyPaths }) => ({
      unitKey,
      displayLabel,
      rawFiles,
      dependencyPaths,
    }));

    expect(summaries).toEqual([
      {
        unitKey: 'historic-sql-fp-order-status-cat-2b2ff2318877',
        displayLabel: 'snowflake · analytics.orders [fp_ord:318877]',
        rawFiles: ['templates/fp_order_status__cat_2b2ff2318877/metadata.json'],
        dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_2b2ff2318877/usage.json'],
      },
      {
        unitKey: 'historic-sql-fp-order-status-cat-34f037ddcbfa',
        displayLabel: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
        rawFiles: ['templates/fp_order_status__cat_34f037ddcbfa/metadata.json'],
        dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'],
      },
    ]);
  });

  it('emits zero WorkUnits for usage-only diffs', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);

    const { workUnits, eviction } = await chunkHistoricSqlStagedDir(stagedDir, {
      added: [],
      modified: ['templates/fp_1/usage.json'],
      deleted: [],
      unchanged: ['templates/fp_1/metadata.json', 'templates/fp_1/page.md', 'manifest.json'],
    });

    expect(workUnits).toEqual([]);
    expect(eviction).toBeUndefined();
  });

  it('emits eviction only for deleted metadata or page files', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);

    // The deleted usage.json must be ignored; only the deleted page.md counts.
    const { eviction } = await chunkHistoricSqlStagedDir(stagedDir, {
      added: [],
      modified: [],
      deleted: ['templates/fp_1/usage.json', 'templates/fp_2/page.md'],
      unchanged: [],
    });

    expect(eviction).toEqual({ deletedRawPaths: ['templates/fp_2/page.md'] });
  });

  it('describes historic-sql scope without including unrelated paths', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);

    const { fingerprint, isPathInScope } = await describeHistoricSqlScope(stagedDir);

    expect(fingerprint).toHaveLength(64);
    expect(isPathInScope('manifest.json')).toBe(true);
    expect(isPathInScope('templates/fp_1/usage.json')).toBe(true);
    expect(isPathInScope('pages/notion/page.md')).toBe(false);
  });
});
|
||||
86
packages/context/src/ingest/adapters/historic-sql/chunk.ts
Normal file
86
packages/context/src/ingest/adapters/historic-sql/chunk.ts
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import { readFile, readdir } from 'node:fs/promises';
|
||||
import { join, relative } from 'node:path';
|
||||
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
|
||||
import { historicSqlManifestSchema, historicSqlMetadataSchema } from './types.js';
|
||||
|
||||
/**
 * Recursively lists every regular file under `root`.
 *
 * @returns Paths relative to `root`, with backslashes rewritten to `/`, in default sort order.
 */
async function walk(root: string): Promise<string[]> {
  const dirents = await readdir(root, { withFileTypes: true, recursive: true });
  const relPaths: string[] = [];
  for (const dirent of dirents) {
    if (!dirent.isFile()) {
      continue;
    }
    const location = join(dirent.parentPath, dirent.name);
    relPaths.push(relative(root, location).replace(/\\/g, '/'));
  }
  return relPaths.sort();
}
|
||||
|
||||
/**
 * Derives a WorkUnit key from a template id: every run of non-alphanumeric
 * characters becomes a single `-`, with none left at either end, and the result
 * is prefixed with `historic-sql-`.
 */
function safeUnitKey(id: string): string {
  const alphanumericParts = id.split(/[^a-zA-Z0-9]+/).filter((part) => part.length > 0);
  return `historic-sql-${alphanumericParts.join('-')}`;
}
|
||||
|
||||
/**
 * Reads `manifest.json` from the staged directory and validates it against
 * `historicSqlManifestSchema`.
 *
 * @param stagedDir - Root of the staged historic-SQL directory.
 * @returns The parsed manifest.
 * @throws Error with an `Invalid historic-SQL manifest: …` message when the file
 *         cannot be read, is not valid JSON, or fails schema validation. The
 *         underlying failure is attached as `cause` so its type and stack survive.
 */
async function readManifest(stagedDir: string) {
  try {
    const raw = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    return historicSqlManifestSchema.parse(JSON.parse(raw));
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid historic-SQL manifest: ${detail}`, { cause: error });
  }
}
|
||||
|
||||
/**
 * Splits a staged historic-SQL directory into one WorkUnit per template.
 *
 * A template is any `templates/<id>/page.md`; its `metadata.json` and `page.md`
 * are the primary files and `usage.json` is only ever a dependency. When a
 * `diffSet` is supplied, a template yields a WorkUnit only if one of its primary
 * files was added or modified, and only those touched primaries become `rawFiles`.
 *
 * @param stagedDir - Root of the staged directory (must contain a valid `manifest.json`).
 * @param diffSet - Optional change set; without it every template is emitted.
 * @returns WorkUnits, an eviction entry for deleted `metadata.json` / `page.md`
 *          paths (if any), a reconcile note, and the manifest's capped/warnings report.
 * @throws If the manifest is invalid or a selected template's metadata cannot be read or parsed.
 */
export async function chunkHistoricSqlStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const pagePattern = /^templates\/[^/]+\/page\.md$/;
  const primaryPattern = /^templates\/[^/]+\/(metadata\.json|page\.md)$/;

  const files = await walk(stagedDir);
  // Set-backed membership test so each template does not rescan the whole listing.
  const fileSet = new Set(files);
  const manifest = await readManifest(stagedDir);
  const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;
  const workUnits: WorkUnit[] = [];

  for (const pagePath of files) {
    if (!pagePattern.test(pagePath)) {
      continue;
    }
    const metadataPath = pagePath.replace(/\/page\.md$/, '/metadata.json');
    const usagePath = pagePath.replace(/\/page\.md$/, '/usage.json');
    const primary = [metadataPath, pagePath].filter((path) => fileSet.has(path));
    if (touched && !primary.some((path) => touched.has(path))) {
      continue;
    }

    // NOTE(review): metadata.json is read unconditionally; a template that has page.md
    // but no metadata.json rejects here with the raw fs error — confirm this is intended.
    const metadata = historicSqlMetadataSchema.parse(JSON.parse(await readFile(join(stagedDir, metadataPath), 'utf-8')));
    const rawFiles = touched ? primary.filter((path) => touched.has(path)).sort() : primary.sort();
    // Dependencies: the manifest plus usage.json when present, minus anything already raw.
    const dependencyPaths = ['manifest.json', fileSet.has(usagePath) ? usagePath : null]
      .filter((path): path is string => typeof path === 'string' && !rawFiles.includes(path))
      .sort();
    // Everything else in the staged directory is exposed as the peer index.
    const excluded = new Set([...rawFiles, ...dependencyPaths]);
    const peerFileIndex = files.filter((path) => !excluded.has(path)).sort();

    workUnits.push({
      unitKey: safeUnitKey(metadata.id),
      displayLabel: metadata.title,
      rawFiles,
      dependencyPaths,
      peerFileIndex,
      notes:
        'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
    });
  }

  // Deleting usage.json alone never evicts; only a deleted primary file does.
  const deletedPrimary = diffSet?.deleted.filter((path) => primaryPattern.test(path));

  return {
    workUnits,
    eviction: deletedPrimary && deletedPrimary.length > 0 ? { deletedRawPaths: deletedPrimary.sort() } : undefined,
    reconcileNotes: [`Historic-SQL staged templates=${manifest.templateCount}`],
    contextReport: {
      capped: manifest.capped,
      warnings: manifest.warnings,
    },
  };
}
|
||||
|
||||
/**
 * Builds the scope descriptor for a staged historic-SQL directory.
 *
 * The fingerprint is the hex SHA-256 of the manifest's connection id, dialect,
 * and window bounds (serialised as JSON in that order). A path is in scope when
 * it is `manifest.json` or lives under `templates/`.
 *
 * @throws If the manifest cannot be read or validated.
 */
export async function describeHistoricSqlScope(stagedDir: string): Promise<ScopeDescriptor> {
  const { connectionId, dialect, windowStart, windowEnd } = await readManifest(stagedDir);
  const fingerprint = createHash('sha256')
    .update(JSON.stringify({ connectionId, dialect, windowStart, windowEnd }))
    .digest('hex');

  const isPathInScope = (rawPath: string): boolean => rawPath === 'manifest.json' || rawPath.startsWith('templates/');

  return { fingerprint, isPathInScope };
}
|
||||
197
packages/context/src/ingest/adapters/historic-sql/detect.test.ts
Normal file
197
packages/context/src/ingest/adapters/historic-sql/detect.test.ts
Normal file
|
|
@ -0,0 +1,197 @@
|
|||
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { detectHistoricSqlStagedDir } from './detect.js';
|
||||
import {
|
||||
HISTORIC_SQL_SOURCE_KEY,
|
||||
historicSqlManifestSchema,
|
||||
historicSqlMetadataSchema,
|
||||
historicSqlPullConfigSchema,
|
||||
historicSqlUsageSchema,
|
||||
} from './types.js';
|
||||
|
||||
/** Creates a fresh, uniquely named scratch directory for a detection test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-detect-');
  return mkdtemp(prefix);
}
|
||||
|
||||
/**
 * Serialises `value` as indented JSON followed by a newline and stores it at
 * `relPath` beneath `root`; parent directories are created on demand.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const filePath = join(root, relPath);
  const containingDir = join(filePath, '..');
  await mkdir(containingDir, { recursive: true });
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(filePath, serialized, 'utf-8');
}
|
||||
|
||||
// Detection tests: by manifest source key, by template directory structure, and negative case.
describe('historic-sql staged dir detection', () => {
  it('detects manifest source', async () => {
    const stagedDir = await tempDir();
    const manifest = {
      source: HISTORIC_SQL_SOURCE_KEY,
      connectionId: 'conn_1',
      dialect: 'snowflake',
      fetchedAt: '2026-05-04T12:00:00.000Z',
      windowStart: '2026-02-03T12:00:00.000Z',
      windowEnd: '2026-05-04T12:00:00.000Z',
      nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
      templateCount: 0,
      capped: false,
      warnings: [],
      templates: [],
    };
    await writeJson(stagedDir, 'manifest.json', manifest);

    await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
  });

  it('detects document-shaped template structure without manifest', async () => {
    const stagedDir = await tempDir();
    const templateDir = join(stagedDir, 'templates', 'fp_1');

    // An unrelated file at the root must not interfere with structural detection.
    await writeFile(join(stagedDir, 'not-a-match.txt'), 'x', 'utf-8');
    await mkdir(templateDir, { recursive: true });
    await writeFile(join(templateDir, 'metadata.json'), '{}', 'utf-8');
    await writeFile(join(templateDir, 'page.md'), '# fp_1\n', 'utf-8');

    await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
  });

  it('does not detect unrelated directories', async () => {
    const stagedDir = await tempDir();
    await writeJson(stagedDir, 'manifest.json', { source: 'notion' });

    await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(false);
  });
});
|
||||
|
||||
// Schema tests: parser defaults, postgres-specific fields, and the metadata envelope.
describe('historic-sql schemas', () => {
  // Defaults the pull-config parser is expected to fill in regardless of dialect.
  const pullConfigDefaults = {
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns: [],
    redactionPatterns: [],
    maxTemplatesPerRun: 5000,
  };

  it('defaults disabled optional pull-config fields through the parser', () => {
    const parsed = historicSqlPullConfigSchema.parse({ dialect: 'bigquery' });

    expect(parsed).toEqual({
      dialect: 'bigquery',
      ...pullConfigDefaults,
      minCalls: 5,
    });
  });

  it('accepts postgres pull config with a minCalls floor', () => {
    const parsed = historicSqlPullConfigSchema.parse({ dialect: 'postgres', minCalls: 12 });

    expect(parsed).toEqual({
      dialect: 'postgres',
      ...pullConfigDefaults,
      minCalls: 12,
    });
  });

  it('accepts postgres manifest fields with defaults for older dialects', () => {
    const postgresManifest = historicSqlManifestSchema.parse({
      source: HISTORIC_SQL_SOURCE_KEY,
      connectionId: 'conn_pg',
      dialect: 'postgres',
      fetchedAt: '2026-05-08T12:00:00.000Z',
      windowStart: '2026-05-08T11:00:00.000Z',
      windowEnd: '2026-05-08T12:00:00.000Z',
      nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
      templateCount: 0,
      capped: false,
      warnings: [],
      templates: [],
      degraded: true,
      statsResetAt: '2026-05-01T00:00:00.000Z',
      baselineFirstRun: true,
      pgServerVersion: 'PostgreSQL 16.4',
      deallocCount: 3,
    });
    expect(postgresManifest).toMatchObject({
      dialect: 'postgres',
      degraded: true,
      statsResetAt: '2026-05-01T00:00:00.000Z',
      baselineFirstRun: true,
      pgServerVersion: 'PostgreSQL 16.4',
      deallocCount: 3,
    });

    // A manifest without the postgres-only fields must come back with their defaults.
    const snowflakeManifest = historicSqlManifestSchema.parse({
      source: HISTORIC_SQL_SOURCE_KEY,
      connectionId: 'conn_sf',
      dialect: 'snowflake',
      fetchedAt: '2026-05-08T12:00:00.000Z',
      windowStart: '2026-05-01T12:00:00.000Z',
      windowEnd: '2026-05-08T12:00:00.000Z',
      nextSuccessfulCursor: null,
      templateCount: 0,
      capped: false,
      warnings: [],
      templates: [],
    });
    expect(snowflakeManifest).toMatchObject({
      degraded: false,
      statsResetAt: null,
      baselineFirstRun: false,
      pgServerVersion: null,
      deallocCount: null,
    });
  });

  it('accepts postgres usage stats with mean_runtime_ms and empty samples', () => {
    const { stats, samples } = historicSqlUsageSchema.parse({
      stats: {
        executions: 25,
        distinct_users: 2,
        first_seen: '2026-05-08T10:00:00.000Z',
        last_seen: '2026-05-08T12:00:00.000Z',
        p50_runtime_ms: null,
        p95_runtime_ms: null,
        mean_runtime_ms: 32.5,
        error_rate: 0,
        rows_produced: 1042,
      },
      literal_slots: [],
      samples: [],
    });

    expect(stats.mean_runtime_ms).toBe(32.5);
    expect(samples).toEqual([]);
  });

  it('pins the Notion-compatible metadata envelope', () => {
    const envelope = {
      id: 'fp_1',
      title: 'snowflake · analytics.orders [fp_1]',
      path: 'templates/fp_1/page.md',
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint: 'fp_1',
        sub_cluster_id: null,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
        triage_signals: {
          executions_bucket: 'high',
          distinct_users_bucket: 'team',
          error_rate_bucket: 'ok',
          recency_bucket: 'active',
          service_account_only: 'false',
          slot_summary: '1 constant, 0 runtime',
        },
      },
    };

    const parsed = historicSqlMetadataSchema.parse(envelope);

    expect(parsed.objectType).toBe('historic_sql_template');
    expect(parsed.lastEditedAt).toBeNull();
    expect(parsed.properties.triage_signals.service_account_only).toBe('false');
  });
});
|
||||
37
packages/context/src/ingest/adapters/historic-sql/detect.ts
Normal file
37
packages/context/src/ingest/adapters/historic-sql/detect.ts
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
import { readFile, readdir } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import { HISTORIC_SQL_SOURCE_KEY } from './types.js';
|
||||
|
||||
/**
 * Tells whether `stagedDir` looks like a historic-SQL staged directory.
 *
 * Detection order:
 * 1. If `manifest.json` parses and declares a `source`, that value alone decides.
 * 2. Otherwise (manifest missing, unreadable, or without `source`) the directory
 *    matches when some folder under `templates/` holds both `metadata.json` and `page.md`.
 *
 * Never rejects: filesystem and parse failures are treated as "not detected".
 */
export async function detectHistoricSqlStagedDir(stagedDir: string): Promise<boolean> {
  try {
    const rawManifest = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    const { source } = JSON.parse(rawManifest) as { source?: unknown };
    if (source === HISTORIC_SQL_SOURCE_KEY) {
      return true;
    }
    if (source !== undefined) {
      return false;
    }
  } catch {
    // Fall through to structural detection for stage-only fixtures.
  }

  try {
    const entries = await readdir(join(stagedDir, 'templates'), { withFileTypes: true, recursive: true });
    const dirsWithMetadata = new Set<string>();
    const dirsWithPage = new Set<string>();

    for (const entry of entries) {
      if (entry.isFile()) {
        if (entry.name === 'metadata.json') {
          dirsWithMetadata.add(entry.parentPath);
        } else if (entry.name === 'page.md') {
          dirsWithPage.add(entry.parentPath);
        }
      }
    }

    // A match needs both files side by side in the same directory.
    for (const dir of dirsWithMetadata) {
      if (dirsWithPage.has(dir)) {
        return true;
      }
    }
    return false;
  } catch {
    return false;
  }
}
|
||||
61
packages/context/src/ingest/adapters/historic-sql/errors.ts
Normal file
61
packages/context/src/ingest/adapters/historic-sql/errors.ts
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
import type { HistoricSqlDialect } from './types.js';
|
||||
|
||||
/** Constructor options for {@link HistoricSqlGrantsMissingError}. */
interface HistoricSqlGrantsMissingErrorOptions {
  dialect: HistoricSqlDialect;
  message: string;
  remediation: string;
  cause?: unknown;
}

/**
 * Error carrying the affected dialect and a remediation hint alongside the message.
 * The optional `cause` is forwarded to `Error` only when one was supplied.
 */
export class HistoricSqlGrantsMissingError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly remediation: string;

  constructor(options: HistoricSqlGrantsMissingErrorOptions) {
    super(options.message, options.cause !== undefined ? { cause: options.cause } : undefined);
    const { dialect, remediation } = options;
    this.name = 'HistoricSqlGrantsMissingError';
    this.dialect = dialect;
    this.remediation = remediation;
  }
}
|
||||
|
||||
/** Constructor options for {@link HistoricSqlExtensionMissingError}. */
interface HistoricSqlExtensionMissingErrorOptions {
  dialect: HistoricSqlDialect;
  message: string;
  remediation: string;
  cause?: unknown;
}

/**
 * Error carrying the affected dialect and a remediation hint alongside the message.
 * The optional `cause` is forwarded to `Error` only when one was supplied.
 */
export class HistoricSqlExtensionMissingError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly remediation: string;

  constructor(options: HistoricSqlExtensionMissingErrorOptions) {
    super(options.message, options.cause !== undefined ? { cause: options.cause } : undefined);
    const { dialect, remediation } = options;
    this.name = 'HistoricSqlExtensionMissingError';
    this.dialect = dialect;
    this.remediation = remediation;
  }
}
|
||||
|
||||
/** Constructor options for {@link HistoricSqlVersionUnsupportedError}. */
interface HistoricSqlVersionUnsupportedErrorOptions {
  dialect: HistoricSqlDialect;
  detectedVersion: string;
  minimumVersion: string;
}

/**
 * Error describing a dialect version below the supported minimum. The message is
 * built from the dialect, the detected version, and the minimum version, and all
 * three are also exposed as read-only fields.
 */
export class HistoricSqlVersionUnsupportedError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly detectedVersion: string;
  readonly minimumVersion: string;

  constructor(options: HistoricSqlVersionUnsupportedErrorOptions) {
    super(
      `Unsupported ${options.dialect} version for historic-SQL ingest: detected ${options.detectedVersion}; requires ${options.minimumVersion} or newer.`,
    );
    const { dialect, detectedVersion, minimumVersion } = options;
    this.name = 'HistoricSqlVersionUnsupportedError';
    this.dialect = dialect;
    this.detectedVersion = detectedVersion;
    this.minimumVersion = minimumVersion;
  }
}
|
||||
|
|
@ -0,0 +1,304 @@
|
|||
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js';
|
||||
import { pgssBaselinePath } from './stage-pgss.js';
|
||||
import type { HistoricSqlQueryHistoryReader, PostgresPgssReader } from './types.js';
|
||||
|
||||
/** Creates a fresh, uniquely named scratch directory for an adapter test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-adapter-');
  return mkdtemp(prefix);
}
|
||||
|
||||
/**
 * Stores `value` as 2-space-indented JSON with a trailing newline at `relPath`
 * inside `root`, creating the parent directory chain when it does not exist.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const outputPath = join(root, relPath);
  const outputDir = join(outputPath, '..');
  await mkdir(outputDir, { recursive: true });
  const body = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(outputPath, body, 'utf-8');
}
|
||||
|
||||
// Stub SQL-analysis port: every query resolves to the same fixed fingerprint result.
const sqlAnalysis: SqlAnalysisPort = {
  analyzeForFingerprint: async () => ({
    fingerprint: 'fp_1',
    normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
    tablesTouched: ['analytics.orders'],
    literalSlots: [{ position: 1, type: 'string', exampleValue: 'paid' }],
  }),
};
|
||||
|
||||
// Stub query-history reader: probing is a no-op and fetching yields one successful query row.
const reader: HistoricSqlQueryHistoryReader = {
  async probe() {
    // Nothing to verify for the in-memory stub.
  },
  async *fetch() {
    // Built per call so each iteration hands out a fresh object.
    const onlyRow = {
      id: 'q1',
      sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'",
      user: 'analyst',
      startedAt: '2026-05-04T11:00:00.000Z',
      endedAt: null,
      runtimeMs: 10,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    };
    yield onlyRow;
  },
};
|
||||
|
||||
describe('HistoricSqlSourceAdapter', () => {
|
||||
it('declares canonical adapter metadata', () => {
|
||||
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
|
||||
|
||||
expect(adapter.source).toBe('historic-sql');
|
||||
expect(adapter.skillNames).toEqual(['historic_sql_ingest']);
|
||||
expect(adapter.reconcileSkillNames).toEqual(['historic_sql_curator']);
|
||||
expect(adapter.evidenceIndexing).toBe('documents');
|
||||
expect(adapter.triageSupported).toBe(true);
|
||||
});
|
||||
|
||||
it('fetches staged templates through injected reader and SqlAnalysisPort', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
const adapter = new HistoricSqlSourceAdapter({
|
||||
sqlAnalysis,
|
||||
reader,
|
||||
queryClient: {},
|
||||
now: () => new Date('2026-05-04T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
await adapter.fetch(
|
||||
{
|
||||
dialect: 'snowflake',
|
||||
windowDays: 90,
|
||||
lastSuccessfulCursor: null,
|
||||
serviceAccountUserPatterns: [],
|
||||
redactionPatterns: [],
|
||||
maxTemplatesPerRun: 5000,
|
||||
},
|
||||
stagedDir,
|
||||
{ connectionId: 'conn_1', sourceKey: 'historic-sql' },
|
||||
);
|
||||
|
||||
await expect(adapter.detect(stagedDir)).resolves.toBe(true);
|
||||
});
|
||||
|
||||
// getTriageSignals() combines two staged files: objectType and propertyHints are read from
// metadata.json, while lastEditedAt is read from usage.json (stats.last_seen).
it('reads triage signals from usage.json and metadata properties', async () => {
  const stagedDir = await tempDir();
  const pagePath = 'templates/fp_1/page.md';
  const lastSeen = '2026-05-04T11:55:00.000Z';
  // Used both as the staged metadata value and as the expected propertyHints.
  const triageSignals = {
    executions_bucket: 'high',
    distinct_users_bucket: 'team',
    error_rate_bucket: 'ok',
    recency_bucket: 'active',
    service_account_only: 'false',
    slot_summary: '1 constant, 0 runtime',
  };

  await writeJson(stagedDir, 'manifest.json', {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 1,
    capped: false,
    warnings: [],
    templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: pagePath }],
  });
  await writeJson(stagedDir, 'templates/fp_1/metadata.json', {
    id: 'fp_1',
    title: 'snowflake · analytics.orders [fp_1]',
    path: pagePath,
    objectType: 'historic_sql_template',
    // Deliberately null: the adapter is expected to report usage.stats.last_seen instead.
    lastEditedAt: null,
    properties: {
      fingerprint: 'fp_1',
      sub_cluster_id: null,
      dialect: 'snowflake',
      tables_touched: ['analytics.orders'],
      literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
      triage_signals: triageSignals,
    },
  });
  await writeFile(join(stagedDir, pagePath), '# fp_1\n', 'utf-8');
  await writeJson(stagedDir, 'templates/fp_1/usage.json', {
    stats: {
      executions: 20,
      distinct_users: 3,
      first_seen: '2026-05-01T00:00:00.000Z',
      last_seen: lastSeen,
      p50_runtime_ms: 100,
      p95_runtime_ms: 200,
      error_rate: 0,
    },
    literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
    samples: [],
  });

  const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
  const signals = await adapter.getTriageSignals(stagedDir, 'fp_1');

  expect(signals).toEqual({
    objectType: 'historic_sql_template',
    lastEditedAt: lastSeen,
    propertyHints: triageSignals,
  });
});
|
||||
|
||||
// For dialect 'postgres' the adapter must stage via the PGSS reader (never the per-execution
// reader) and must not write the baseline file until onPullSucceeded() is called.
it('dispatches postgres fetches through PGSS staging and writes the baseline only after pull success', async () => {
  const stagedDir = await tempDir();
  const baselineRootDir = await tempDir();
  const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');

  // Any use of the per-execution reader fails the test immediately.
  const forbidden = () => new Error('per-execution reader must not be used for postgres');
  const unusedPerExecutionReader: HistoricSqlQueryHistoryReader = {
    async probe() {
      throw forbidden();
    },
    async *fetch() {
      throw forbidden();
    },
  };

  // Single pg_stat_statements row: database 5, query 901, user 11, 9 calls.
  const pgssRow = {
    queryid: '901',
    userid: '11',
    username: 'analyst',
    dbid: '5',
    database: 'warehouse',
    query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
    calls: 9,
    totalExecTime: 90,
    meanExecTime: 10,
    totalRows: 18,
  };
  const postgresReader: PostgresPgssReader = {
    async probe() {
      return { pgServerVersion: 'PostgreSQL 16.4', warnings: [] };
    },
    async readSnapshot() {
      return {
        statsResetAt: '2026-05-08T08:00:00.000Z',
        deallocCount: 0,
        rows: [pgssRow],
      };
    },
  };

  const adapter = new HistoricSqlSourceAdapter({
    sqlAnalysis,
    reader: unusedPerExecutionReader,
    queryClient: {},
    postgresReader,
    postgresQueryClient: {
      async executeQuery() {
        return { headers: [], rows: [] };
      },
    },
    postgresBaselineRootDir: baselineRootDir,
    now: () => new Date('2026-05-08T12:00:00.000Z'),
  });

  const pullConfig = {
    dialect: 'postgres',
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns: [],
    redactionPatterns: [],
    maxTemplatesPerRun: 5000,
    minCalls: 5,
  };
  await adapter.fetch(pullConfig, stagedDir, { connectionId: 'conn_pg', sourceKey: 'historic-sql' });

  const manifestText = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
  const manifest = JSON.parse(manifestText) as {
    dialect: string;
    baselineFirstRun: boolean;
    templates: Array<{ id: string }>;
  };
  expect(manifest.dialect).toBe('postgres');
  expect(manifest.baselineFirstRun).toBe(true);
  expect(manifest.templates).toEqual([
    { id: 'db5_q901', fingerprint: 'fp_1', subClusterId: null, path: 'templates/db5_q901/page.md' },
  ]);
  // Fetch alone must not have persisted the baseline.
  await expect(readFile(baselinePath, 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' });

  await adapter.onPullSucceeded({
    connectionId: 'conn_pg',
    sourceKey: 'historic-sql',
    syncId: 'sync_pg',
    trigger: 'scheduled_pull',
    completedAt: new Date('2026-05-08T12:01:00.000Z'),
    stagedDir,
  });

  const baselineText = await readFile(baselinePath, 'utf-8');
  const baseline = JSON.parse(baselineText) as {
    fetchedAt: string;
    templates: Record<string, { perUser: Record<string, { calls: number }> }>;
  };
  // fetchedAt reflects the injected `now` at fetch time, not the later completedAt.
  expect(baseline.fetchedAt).toBe('2026-05-08T12:00:00.000Z');
  expect(baseline.templates.db5_q901.perUser['11'].calls).toBe(9);
});
|
||||
|
||||
// Without deps.postgresReader a postgres fetch must reject with an explicit configuration error.
it('fails postgres fetches clearly when no PGSS reader is configured', async () => {
  const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
  const stagedDir = await tempDir();
  const pullConfig = {
    dialect: 'postgres',
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns: [],
    redactionPatterns: [],
    maxTemplatesPerRun: 5000,
    minCalls: 5,
  };

  const pending = adapter.fetch(pullConfig, stagedDir, { connectionId: 'conn_pg', sourceKey: 'historic-sql' });

  await expect(pending).rejects.toThrow('Historic SQL Postgres fetch requires deps.postgresReader');
});
|
||||
|
||||
// The adapter's onPullSucceeded() takes the standard context and forwards it to the injected
// deps.onPullSucceeded callback, adding nextSuccessfulCursor read from the staged manifest.
it('forwards manifest cursor through onPullSucceeded without changing the SourceAdapter signature', async () => {
  const stagedDir = await tempDir();
  const cursor = '2026-05-04T11:55:00.000Z';
  await writeJson(stagedDir, 'manifest.json', {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: cursor,
    templateCount: 0,
    capped: false,
    warnings: [],
    templates: [],
  });

  const onPullSucceeded = vi.fn(async () => {});
  const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {}, onPullSucceeded });
  const completedAt = new Date('2026-05-04T12:01:00.000Z');

  await adapter.onPullSucceeded({
    connectionId: 'conn_1',
    sourceKey: 'historic-sql',
    syncId: 'sync_1',
    trigger: 'scheduled_pull',
    completedAt,
    stagedDir,
  });

  // Same context fields as passed in, plus the cursor taken from manifest.json.
  expect(onPullSucceeded).toHaveBeenCalledWith({
    connectionId: 'conn_1',
    sourceKey: 'historic-sql',
    syncId: 'sync_1',
    trigger: 'scheduled_pull',
    completedAt,
    stagedDir,
    nextSuccessfulCursor: cursor,
  });
});
|
||||
});
|
||||
|
|
@ -0,0 +1,135 @@
|
|||
import { readFile } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import type {
|
||||
ChunkResult,
|
||||
DiffSet,
|
||||
FetchContext,
|
||||
IngestTrigger,
|
||||
ScopeDescriptor,
|
||||
SourceAdapter,
|
||||
TriageSignals,
|
||||
} from '../../types.js';
|
||||
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
|
||||
import { detectHistoricSqlStagedDir } from './detect.js';
|
||||
import { stageHistoricSqlTemplates } from './stage.js';
|
||||
import {
|
||||
pgssBaselinePath,
|
||||
stagePgStatStatementsTemplates,
|
||||
writePgssBaselineAtomic,
|
||||
type StagePgStatStatementsTemplatesResult,
|
||||
} from './stage-pgss.js';
|
||||
import {
|
||||
historicSqlManifestSchema,
|
||||
historicSqlMetadataSchema,
|
||||
historicSqlPullConfigSchema,
|
||||
historicSqlUsageSchema,
|
||||
type HistoricSqlSourceAdapterDeps,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Source adapter for the `historic-sql` source.
 *
 * Staging is delegated by dialect: `postgres` pull configs go through the
 * pg_stat_statements (PGSS) staging path, every other dialect through the
 * per-execution query-history staging path. For Postgres, the staging result is
 * held in memory and its baseline is only written to disk in
 * {@link onPullSucceeded}.
 */
export class HistoricSqlSourceAdapter implements SourceAdapter {
  readonly source = 'historic-sql';
  readonly skillNames = ['historic_sql_ingest'];
  readonly reconcileSkillNames = ['historic_sql_curator'];
  readonly evidenceIndexing = 'documents' as const;
  readonly triageSupported = true;

  /** PGSS staging results keyed by staged directory, awaiting a successful pull. */
  private readonly pendingPgssBaselines = new Map<string, StagePgStatStatementsTemplatesResult>();

  constructor(private readonly deps: HistoricSqlSourceAdapterDeps) {}

  /** Delegates staged-directory detection to `detectHistoricSqlStagedDir`. */
  detect(stagedDir: string): Promise<boolean> {
    return detectHistoricSqlStagedDir(stagedDir);
  }

  /**
   * Validates `pullConfig` and stages templates into `stagedDir`.
   *
   * @throws Error when the dialect is `postgres` and `deps.postgresReader` is missing, or
   *   when no query client exposing an `executeQuery` function is available.
   */
  async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
    const config = historicSqlPullConfigSchema.parse(pullConfig);

    if (config.dialect !== 'postgres') {
      await stageHistoricSqlTemplates({
        stagedDir,
        connectionId: ctx.connectionId,
        queryClient: this.deps.queryClient,
        reader: this.deps.reader,
        sqlAnalysis: this.deps.sqlAnalysis,
        pullConfig: config,
        now: this.deps.now?.(),
      });
      return;
    }

    const { postgresReader } = this.deps;
    if (!postgresReader) {
      throw new Error('Historic SQL Postgres fetch requires deps.postgresReader');
    }

    // Prefer the dedicated Postgres client; fall back to the generic one if it can run queries.
    const postgresQueryClient = this.deps.postgresQueryClient ?? this.deps.queryClient;
    if (
      !postgresQueryClient ||
      typeof postgresQueryClient !== 'object' ||
      !('executeQuery' in postgresQueryClient) ||
      typeof (postgresQueryClient as { executeQuery?: unknown }).executeQuery !== 'function'
    ) {
      throw new Error('Historic SQL Postgres fetch requires deps.postgresQueryClient with executeQuery(sql, params?)');
    }

    const staged = await stagePgStatStatementsTemplates({
      stagedDir,
      connectionId: ctx.connectionId,
      queryClient: postgresQueryClient as NonNullable<HistoricSqlSourceAdapterDeps['postgresQueryClient']>,
      reader: postgresReader,
      sqlAnalysis: this.deps.sqlAnalysis,
      pullConfig: config,
      baselinePath: pgssBaselinePath(this.deps.postgresBaselineRootDir, ctx.connectionId),
      now: this.deps.now?.(),
    });
    // The baseline is persisted later, in onPullSucceeded.
    this.pendingPgssBaselines.set(stagedDir, staged);
  }

  /** Delegates chunking to `chunkHistoricSqlStagedDir`. */
  chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    return chunkHistoricSqlStagedDir(stagedDir, diffSet);
  }

  /** Delegates scope description to `describeHistoricSqlScope`. */
  describeScope(stagedDir: string): Promise<ScopeDescriptor> {
    return describeHistoricSqlScope(stagedDir);
  }

  /**
   * Builds triage signals for the template whose manifest `id` equals `externalId`.
   *
   * @returns An empty object when the manifest lists no such template; otherwise
   *   `objectType` and `propertyHints` from the template's metadata.json and
   *   `lastEditedAt` from `stats.last_seen` in its usage.json.
   */
  async getTriageSignals(stagedDir: string, externalId: string): Promise<TriageSignals> {
    const manifest = historicSqlManifestSchema.parse(await this.readJsonFile(join(stagedDir, 'manifest.json')));
    const entry = manifest.templates.find((candidate) => candidate.id === externalId);
    if (!entry) {
      return {};
    }

    // The manifest path points at page.md; metadata.json and usage.json sit beside it.
    const templateDir = entry.path.replace(/\/page\.md$/, '');
    const metadata = historicSqlMetadataSchema.parse(
      await this.readJsonFile(join(stagedDir, templateDir, 'metadata.json')),
    );
    const usage = historicSqlUsageSchema.parse(await this.readJsonFile(join(stagedDir, templateDir, 'usage.json')));

    return {
      objectType: metadata.objectType,
      lastEditedAt: usage.stats.last_seen,
      propertyHints: metadata.properties.triage_signals,
    };
  }

  /**
   * Post-pull hook. For Postgres manifests, writes any pending PGSS baseline for
   * `ctx.stagedDir` and drops it from the pending map. Then invokes the optional
   * `deps.onPullSucceeded` with `ctx` plus the manifest's `nextSuccessfulCursor`.
   */
  async onPullSucceeded(ctx: {
    connectionId: string;
    sourceKey: string;
    syncId: string;
    trigger: IngestTrigger;
    completedAt: Date;
    stagedDir: string;
  }): Promise<void> {
    const manifest = historicSqlManifestSchema.parse(await this.readJsonFile(join(ctx.stagedDir, 'manifest.json')));

    const pending = manifest.dialect === 'postgres' ? this.pendingPgssBaselines.get(ctx.stagedDir) : undefined;
    if (pending) {
      await writePgssBaselineAtomic(pending.baselinePath, pending.baseline);
      this.pendingPgssBaselines.delete(ctx.stagedDir);
    }

    await this.deps.onPullSucceeded?.({ ...ctx, nextSuccessfulCursor: manifest.nextSuccessfulCursor });
  }

  /** Reads a UTF-8 file and parses it as JSON; the result is validated by the caller's schema. */
  private async readJsonFile(filePath: string): Promise<unknown> {
    return JSON.parse(await readFile(filePath, 'utf-8'));
  }
}
|
||||
|
|
@ -0,0 +1,281 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import {
|
||||
HistoricSqlExtensionMissingError,
|
||||
HistoricSqlGrantsMissingError,
|
||||
HistoricSqlVersionUnsupportedError,
|
||||
} from './errors.js';
|
||||
import { PostgresPgssQueryHistoryReader } from './postgres-pgss-query-history-reader.js';
|
||||
|
||||
/** Shape of a scripted result handed back by the fake query client in these tests. */
interface FakeQueryResult {
  /** Column names, positionally aligned with each entry of `rows`. */
  headers: string[];
  /** Result rows as positional cell arrays. */
  rows: unknown[][];
  totalRows?: number;
  error?: string;
}
|
||||
|
||||
/**
 * Builds a fake query client whose `executeQuery` replays `results` in order.
 *
 * Each call consumes the next scripted entry (the array is mutated): an `Error`
 * entry is thrown, any other entry is returned. A call made after the script is
 * exhausted throws `unexpected query`.
 */
function queryClient(results: Array<FakeQueryResult | Error>) {
  return {
    executeQuery: vi.fn(async (_query: string, _params?: unknown[]) => {
      const scripted = results.shift();
      if (scripted instanceof Error) {
        throw scripted;
      }
      if (!scripted) {
        throw new Error('unexpected query');
      }
      return scripted;
    }),
  };
}
|
||||
|
||||
/**
 * Returns the SQL text (first argument) of the `index`-th recorded `executeQuery` call.
 *
 * @throws Error when fewer than `index + 1` calls were recorded.
 */
function executedSql(client: ReturnType<typeof queryClient>, index: number): string {
  const recorded = client.executeQuery.mock.calls[index];
  if (recorded) {
    const [sql] = recorded;
    return sql;
  }
  throw new Error(`expected query client call ${index}`);
}
|
||||
|
||||
describe('PostgresPgssQueryHistoryReader', () => {
  // Scripted result for the version probe, the first query probe() issues.
  const versionProbe = (versionNum: number, versionText: string): FakeQueryResult => ({
    headers: ['server_version_num', 'server_version'],
    rows: [[versionNum, versionText]],
  });

  // Scripted single-column, single-row result.
  const scalar = (header: string, cell: unknown): FakeQueryResult => ({ headers: [header], rows: [[cell]] });

  // Scripted pg_stat_statements_info result.
  const statsInfo = (): FakeQueryResult => ({
    headers: ['stats_reset', 'dealloc'],
    rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]],
  });

  it('probes version, extension presence, grants, and tracking state', async () => {
    const client = queryClient([
      versionProbe(160004, 'PostgreSQL 16.4 on x86_64-apple-darwin'),
      scalar('?column?', 1),
      scalar('has_role', true),
      scalar('track', 'top'),
      scalar('max', '5000'),
    ]);
    const reader = new PostgresPgssQueryHistoryReader();

    const outcome = await reader.probe(client);

    expect(outcome).toEqual({
      pgServerVersion: 'PostgreSQL 16.4 on x86_64-apple-darwin',
      warnings: [],
    });
    // The five probe queries must be issued in this exact order.
    expect(executedSql(client, 0)).toContain("current_setting('server_version_num')::int");
    expect(executedSql(client, 1)).toBe('SELECT 1 FROM pg_stat_statements LIMIT 1');
    expect(executedSql(client, 2)).toBe(
      "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role",
    );
    expect(executedSql(client, 3)).toBe("SELECT current_setting('pg_stat_statements.track') AS track");
    expect(executedSql(client, 4)).toBe("SELECT current_setting('pg_stat_statements.max') AS max");
  });

  it('rejects PostgreSQL versions older than 14 without probing the extension', async () => {
    // The second scripted result must stay unconsumed: only the version query may run.
    const client = queryClient([versionProbe(130012, 'PostgreSQL 13.12'), statsInfo()]);
    const reader = new PostgresPgssQueryHistoryReader();

    const promise = reader.probe(client);

    await expect(promise).rejects.toMatchObject({
      name: 'HistoricSqlVersionUnsupportedError',
      dialect: 'postgres',
      detectedVersion: 'PostgreSQL 13.12',
      minimumVersion: 'PostgreSQL 14',
    });
    await expect(promise).rejects.toBeInstanceOf(HistoricSqlVersionUnsupportedError);
    expect(client.executeQuery).toHaveBeenCalledTimes(1);
  });

  it('maps a missing pg_stat_statements relation to HistoricSqlExtensionMissingError', async () => {
    const client = queryClient([
      versionProbe(160004, 'PostgreSQL 16.4'),
      new Error('relation "pg_stat_statements" does not exist'),
    ]);
    const reader = new PostgresPgssQueryHistoryReader();

    const promise = reader.probe(client);

    await expect(promise).rejects.toMatchObject({
      name: 'HistoricSqlExtensionMissingError',
      dialect: 'postgres',
    });
    await expect(promise).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
  });

  it('maps pg_stat_statements preload failures to HistoricSqlExtensionMissingError with preload remediation', async () => {
    const client = queryClient([
      versionProbe(160004, 'PostgreSQL 16.4'),
      new Error('pg_stat_statements must be loaded via shared_preload_libraries'),
    ]);
    const reader = new PostgresPgssQueryHistoryReader();

    const promise = reader.probe(client);

    await expect(promise).rejects.toMatchObject({
      name: 'HistoricSqlExtensionMissingError',
      dialect: 'postgres',
      message: 'pg_stat_statements is installed but not loaded via shared_preload_libraries.',
      remediation: expect.stringContaining("shared_preload_libraries includes 'pg_stat_statements'"),
    });
    await expect(promise).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
  });

  it('maps missing pg_read_all_stats membership to HistoricSqlGrantsMissingError', async () => {
    const client = queryClient([
      versionProbe(160004, 'PostgreSQL 16.4'),
      scalar('?column?', 1),
      scalar('has_role', false),
    ]);
    const reader = new PostgresPgssQueryHistoryReader();

    const promise = reader.probe(client);

    await expect(promise).rejects.toMatchObject({
      name: 'HistoricSqlGrantsMissingError',
      dialect: 'postgres',
      remediation: 'GRANT pg_read_all_stats TO <connection role>;',
    });
    await expect(promise).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
  });

  it('returns a warning instead of failing when pg_stat_statements.track is none', async () => {
    const client = queryClient([
      versionProbe(160004, 'PostgreSQL 16.4'),
      scalar('?column?', 1),
      scalar('has_role', true),
      scalar('track', 'none'),
      scalar('max', '5000'),
    ]);
    const reader = new PostgresPgssQueryHistoryReader();

    const outcome = await reader.probe(client);

    expect(outcome).toEqual({
      pgServerVersion: 'PostgreSQL 16.4',
      warnings: [
        'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config',
      ],
    });
  });

  it('warns when pg_stat_statements.max is below the recommended floor', async () => {
    const client = queryClient([
      versionProbe(160004, 'PostgreSQL 16.4'),
      scalar('?column?', 1),
      scalar('has_role', true),
      scalar('track', 'top'),
      scalar('max', '1000'),
    ]);
    const reader = new PostgresPgssQueryHistoryReader();

    const outcome = await reader.probe(client);

    expect(outcome).toEqual({
      pgServerVersion: 'PostgreSQL 16.4',
      warnings: [
        'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn',
      ],
    });
  });

  it('reads a parameterized pg_stat_statements snapshot and stats info', async () => {
    const snapshotHeaders = [
      'queryid',
      'userid',
      'username',
      'dbid',
      'database',
      'query',
      'calls',
      'total_exec_time',
      'mean_exec_time',
      'total_rows',
    ];
    // First row carries numerics as strings, second as numbers: both must map to numbers.
    const stringTypedRow = [
      '922337203685477580',
      '16384',
      'analyst',
      '16385',
      'warehouse',
      'SELECT count(*) FROM public.orders WHERE status = $1',
      '42',
      '2100.5',
      '50.0119',
      '9001',
    ];
    const numberTypedRow = [
      '922337203685477581',
      '16386',
      'unknown',
      '16385',
      'warehouse',
      'SELECT * FROM public.customers WHERE id = $1',
      5,
      30,
      6,
      5,
    ];
    const client = queryClient([{ headers: snapshotHeaders, rows: [stringTypedRow, numberTypedRow] }, statsInfo()]);
    const reader = new PostgresPgssQueryHistoryReader();

    const snapshot = await reader.readSnapshot(client, { minCalls: 5, maxTemplates: 500 });

    expect(snapshot).toEqual({
      statsResetAt: '2026-05-01T00:00:00.000Z',
      deallocCount: 7,
      rows: [
        {
          queryid: '922337203685477580',
          userid: '16384',
          username: 'analyst',
          dbid: '16385',
          database: 'warehouse',
          query: 'SELECT count(*) FROM public.orders WHERE status = $1',
          calls: 42,
          totalExecTime: 2100.5,
          meanExecTime: 50.0119,
          totalRows: 9001,
        },
        {
          queryid: '922337203685477581',
          userid: '16386',
          username: 'unknown',
          dbid: '16385',
          database: 'warehouse',
          query: 'SELECT * FROM public.customers WHERE id = $1',
          calls: 5,
          totalExecTime: 30,
          meanExecTime: 6,
          totalRows: 5,
        },
      ],
    });

    const snapshotSql = executedSql(client, 0);
    const expectedFragments = [
      'FROM pg_stat_statements s',
      'LEFT JOIN pg_roles',
      'LEFT JOIN pg_database',
      'WHERE s.toplevel = true',
      'AND s.calls >= $1',
      'ORDER BY s.total_exec_time DESC',
      'LIMIT $2',
    ];
    for (const fragment of expectedFragments) {
      expect(snapshotSql).toContain(fragment);
    }
    // minCalls and maxTemplates are passed as bind parameters, not interpolated.
    expect(client.executeQuery.mock.calls[0]?.[1]).toEqual([5, 500]);
    expect(executedSql(client, 1)).toBe('SELECT stats_reset, dealloc FROM pg_stat_statements_info');
  });
});
|
||||
|
|
@ -0,0 +1,262 @@
|
|||
import {
|
||||
HistoricSqlExtensionMissingError,
|
||||
HistoricSqlGrantsMissingError,
|
||||
HistoricSqlVersionUnsupportedError,
|
||||
} from './errors.js';
|
||||
import type {
|
||||
KloPostgresQueryClient,
|
||||
PostgresPgssProbeResult,
|
||||
PostgresPgssReader,
|
||||
PostgresPgssRow,
|
||||
PostgresPgssSnapshot,
|
||||
} from './types.js';
|
||||
|
||||
/** Minimal tabular query result consumed by this reader. */
interface QueryResultLike {
  /** Column names, positionally aligned with each entry of `rows`. */
  headers: string[];
  /** Result rows as positional cell arrays. */
  rows: unknown[][];
  totalRows?: number;
  /** When a non-empty string, `execute` throws it as an Error. */
  error?: string;
}
|
||||
|
||||
// Numeric and textual server version, fetched in one round trip.
const VERSION_SQL = `
SELECT current_setting('server_version_num')::int AS server_version_num,
version() AS server_version
`.trim();

// Preflight probes, run by probe() in this order after the version check.
const EXTENSION_PROBE_SQL = 'SELECT 1 FROM pg_stat_statements LIMIT 1';
const GRANTS_PROBE_SQL = "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role";
const TRACKING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.track') AS track";
const MAX_SETTING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.max') AS max";

// probe() warns when pg_stat_statements.max is below this value.
const RECOMMENDED_PGSS_MAX = 5000;

const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info';

// Top-level statements only; $1 = minimum call count, $2 = row limit.
const SNAPSHOT_SQL = `
SELECT
s.queryid::text AS queryid,
s.userid::text AS userid,
COALESCE(r.rolname, 'unknown') AS username,
s.dbid::text AS dbid,
d.datname AS database,
s.query,
s.calls,
s.total_exec_time,
s.mean_exec_time,
s.rows AS total_rows
FROM pg_stat_statements s
LEFT JOIN pg_roles r ON s.userid = r.oid
LEFT JOIN pg_database d ON s.dbid = d.oid
WHERE s.toplevel = true
AND s.calls >= $1
ORDER BY s.total_exec_time DESC
LIMIT $2
`.trim();

// Remediation text attached to the extension-missing error: two sentences joined by one space.
const POSTGRES_EXTENSION_REMEDIATION =
  'Run CREATE EXTENSION pg_stat_statements; against the connection database.' +
  ' ' +
  "Ensure shared_preload_libraries includes 'pg_stat_statements' in the Postgres parameter group or config.";

// Remediation text attached to the grants-missing error.
const POSTGRES_GRANTS_REMEDIATION = 'GRANT pg_read_all_stats TO <connection role>;';
|
||||
|
||||
/**
 * Narrows an untyped client to `KloPostgresQueryClient`.
 *
 * @param client Any value; accepted only if it is a non-null object with a function-valued `executeQuery`.
 * @returns The same object, typed as a query client.
 * @throws Error when the value does not expose `executeQuery`.
 */
function queryClient(client: unknown): KloPostgresQueryClient {
  const usable =
    typeof client === 'object' &&
    client !== null &&
    'executeQuery' in client &&
    typeof (client as { executeQuery?: unknown }).executeQuery === 'function';
  if (!usable) {
    throw new Error('Historic SQL Postgres PGSS reader requires a query client with executeQuery(sql, params?)');
  }
  return client as KloPostgresQueryClient;
}
|
||||
|
||||
/**
 * Runs one statement through the client and normalizes in-band failures.
 *
 * @returns The client's result when it carries no error text.
 * @throws Error built from `result.error` when that field is a non-empty string;
 *   anything thrown by `executeQuery` itself propagates unchanged.
 */
async function execute(client: KloPostgresQueryClient, sql: string, params?: unknown[]): Promise<QueryResultLike> {
  const outcome = await client.executeQuery(sql, params);
  // Some failures are reported in-band on the result object rather than thrown.
  if ('error' in outcome && typeof outcome.error === 'string' && outcome.error.length > 0) {
    throw new Error(outcome.error);
  }
  return outcome;
}
|
||||
|
||||
/**
 * Maps lower-cased header names to their column positions.
 * When two headers collide after lower-casing, the later position wins.
 */
function indexes(headers: string[]): Map<string, number> {
  return new Map(headers.map((header, position): [string, number] => [header.toLowerCase(), position]));
}
|
||||
|
||||
/**
 * Looks up a cell by header name (case-insensitive on `header`).
 *
 * @returns `null` when the header is not in `headerIndexes`; otherwise whatever
 *   the row holds at that position (which may be `undefined` for a short row).
 */
function value(row: unknown[], headerIndexes: Map<string, number>, header: string): unknown {
  const position = headerIndexes.get(header.toLowerCase());
  if (position === undefined) {
    return null;
  }
  return row[position];
}
|
||||
|
||||
/**
 * Stringifies a cell, collapsing "no value" to `null`.
 *
 * @returns `null` for `null`, `undefined`, or anything whose string form is empty;
 *   otherwise `String(raw)`.
 */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text === '' ? null : text;
}
|
||||
|
||||
/**
 * Stringifies a cell that must be present.
 *
 * @param field Column name used in the error message.
 * @throws Error when `nullableString` yields no text for `raw`.
 */
function requiredString(raw: unknown, field: string): string {
  const text = nullableString(raw);
  if (text) {
    return text;
  }
  throw new Error(`Postgres pg_stat_statements row is missing ${field}`);
}
|
||||
|
||||
/**
 * Coerces a cell that must hold a finite number.
 *
 * Numbers pass through; other values are converted with `Number()`, so numeric
 * strings are accepted.
 *
 * @param field Column name used in the error message.
 * @throws Error when the value is absent (`null`, `undefined`, or a blank string) or
 *   does not convert to a finite number.
 */
function requiredFiniteNumber(raw: unknown, field: string): number {
  // Number(null), Number('') and Number('  ') are all 0, so an absent cell must be
  // rejected before coercion. Otherwise a missing column would silently read as 0.
  const absent = raw === null || raw === undefined || (typeof raw === 'string' && raw.trim().length === 0);
  const number = absent ? Number.NaN : typeof raw === 'number' ? raw : Number(raw);
  if (!Number.isFinite(number)) {
    throw new Error(`Postgres pg_stat_statements row has invalid ${field}: ${String(raw)}`);
  }
  return number;
}
|
||||
|
||||
/**
 * Coerces an optional cell to an integer, truncating toward zero.
 *
 * @returns `null` when the value is absent (`null`, `undefined`, empty or
 *   whitespace-only string) or does not convert to a finite number.
 */
function nullableInteger(raw: unknown): number | null {
  if (raw === null || raw === undefined) {
    return null;
  }
  // Number('  ') is 0; treat blank strings as absent, the same as ''.
  if (typeof raw === 'string' && raw.trim().length === 0) {
    return null;
  }
  const number = typeof raw === 'number' ? raw : Number(raw);
  return Number.isFinite(number) ? Math.trunc(number) : null;
}
|
||||
|
||||
/**
 * Normalizes an optional timestamp cell to an ISO-8601 string.
 *
 * `Date` instances are used as-is; any other value is stringified and parsed
 * with `new Date(...)`.
 *
 * @returns `null` when the value is absent (`null`, `undefined`, `''`) or does not
 *   represent a valid date, including an invalid `Date` instance.
 */
function nullableIsoTimestamp(raw: unknown): string | null {
  if (raw === null || raw === undefined || raw === '') {
    return null;
  }
  const date = raw instanceof Date ? raw : new Date(String(raw));
  // toISOString() throws RangeError on an invalid Date, so check validity on both paths.
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}
|
||||
|
||||
/**
 * Extracts the first row of a result together with its header index.
 *
 * @param context Short label for the query, used in the error message.
 * @throws Error when the result has no first row.
 */
function firstRow(result: QueryResultLike, context: string): { row: unknown[]; headers: Map<string, number> } {
  const [row] = result.rows;
  if (row) {
    return { row, headers: indexes(result.headers) };
  }
  throw new Error(`Postgres historic-SQL ${context} query returned no rows`);
}
|
||||
|
||||
/**
 * Reports whether an error's text says the `pg_stat_statements` relation does not exist.
 * The relation name may be unquoted, single-quoted or double-quoted; matching is case-insensitive.
 */
function isMissingPgssRelation(error: unknown): boolean {
  let text: string;
  if (error instanceof Error) {
    text = error.message;
  } else {
    text = String(error);
  }
  return /relation ["']?pg_stat_statements["']? does not exist/i.test(text);
}
|
||||
|
||||
/**
 * Reports whether an error's text mentions `pg_stat_statements` followed, on the same
 * line, by `shared_preload_libraries` (case-insensitive).
 */
function isPgssPreloadRequired(error: unknown): boolean {
  let text: string;
  if (error instanceof Error) {
    text = error.message;
  } else {
    text = String(error);
  }
  return /pg_stat_statements.*shared_preload_libraries/i.test(text);
}
|
||||
|
||||
/**
 * Builds the Postgres extension-missing error with the standard remediation text.
 *
 * @param cause Underlying error, attached as `cause`.
 * @param message Optional override; defaults to the "not installed" wording.
 */
function extensionMissingError(cause: unknown, message?: string): HistoricSqlExtensionMissingError {
  const text = message ?? 'pg_stat_statements extension is not installed in the connection database.';
  return new HistoricSqlExtensionMissingError({
    dialect: 'postgres',
    message: text,
    remediation: POSTGRES_EXTENSION_REMEDIATION,
    cause,
  });
}
|
||||
|
||||
/** Builds the Postgres grants-missing error with the `pg_read_all_stats` remediation text. */
function grantsMissingError(): HistoricSqlGrantsMissingError {
  const details = {
    dialect: 'postgres' as const,
    message: 'Postgres connection role lacks pg_read_all_stats for historic-SQL ingest.',
    remediation: POSTGRES_GRANTS_REMEDIATION,
  };
  return new HistoricSqlGrantsMissingError(details);
}
|
||||
|
||||
/**
 * Converts one positional snapshot row into a typed `PostgresPgssRow`.
 *
 * Identifier columns and the query text are required strings; `username` and
 * `database` may be null. `calls` and `total_rows` are truncated to integers.
 *
 * @throws Error when a required string is missing or a numeric column is not finite.
 */
function mapSnapshotRow(row: unknown[], headerIndexes: Map<string, number>): PostgresPgssRow {
  const cell = (header: string): unknown => value(row, headerIndexes, header);
  const text = (header: string): string => requiredString(cell(header), header);
  const numeric = (header: string): number => requiredFiniteNumber(cell(header), header);

  return {
    queryid: text('queryid'),
    userid: text('userid'),
    username: nullableString(cell('username')),
    dbid: text('dbid'),
    database: nullableString(cell('database')),
    query: text('query'),
    calls: Math.trunc(numeric('calls')),
    totalExecTime: numeric('total_exec_time'),
    meanExecTime: numeric('mean_exec_time'),
    totalRows: Math.trunc(numeric('total_rows')),
  };
}
|
||||
|
||||
/**
 * Reads query history from the `pg_stat_statements` extension through a client
 * that exposes `executeQuery(sql, params?)`.
 */
export class PostgresPgssQueryHistoryReader implements PostgresPgssReader {
  /**
   * Runs the preflight checks, in order: server version, extension queryable,
   * `pg_read_all_stats` membership, then the `track` and `max` settings.
   *
   * @returns The server version text plus warnings for `track = none` and for a
   *   `max` setting below the recommended value.
   * @throws HistoricSqlVersionUnsupportedError when `server_version_num` is below 140000.
   * @throws HistoricSqlExtensionMissingError when the extension relation is missing or not preloaded.
   * @throws HistoricSqlGrantsMissingError when the grants probe does not return `true`.
   */
  async probe(client: unknown): Promise<PostgresPgssProbeResult> {
    const pgClient = queryClient(client);

    const version = firstRow(await execute(pgClient, VERSION_SQL), 'version probe');
    const serverVersionNum = requiredFiniteNumber(
      value(version.row, version.headers, 'server_version_num'),
      'server_version_num',
    );
    const pgServerVersion = requiredString(value(version.row, version.headers, 'server_version'), 'server_version');
    if (serverVersionNum < 140000) {
      throw new HistoricSqlVersionUnsupportedError({
        dialect: 'postgres',
        detectedVersion: pgServerVersion,
        minimumVersion: 'PostgreSQL 14',
      });
    }

    await this.ensureExtensionQueryable(pgClient);

    const grants = firstRow(await execute(pgClient, GRANTS_PROBE_SQL), 'grant probe');
    const hasRole = value(grants.row, grants.headers, 'has_role');
    // Only a strict boolean true counts as granted.
    if (hasRole !== true) {
      throw grantsMissingError();
    }

    const tracking = firstRow(await execute(pgClient, TRACKING_PROBE_SQL), 'tracking probe');
    const track = nullableString(value(tracking.row, tracking.headers, 'track'));

    const maxSetting = firstRow(await execute(pgClient, MAX_SETTING_PROBE_SQL), 'max-setting probe');
    const pgssMax = nullableInteger(value(maxSetting.row, maxSetting.headers, 'max'));

    // Misconfiguration is reported as warnings rather than failing the probe.
    const warnings: string[] = [];
    if (track === 'none') {
      warnings.push('pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config');
    }
    if (pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX) {
      warnings.push(
        `pg_stat_statements.max is ${pgssMax}; set it to at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`,
      );
    }

    return { pgServerVersion, warnings };
  }

  /**
   * Reads the statement snapshot, then the stats-info row.
   *
   * @param options `minCalls` and `maxTemplates` are passed as bind parameters `$1` and `$2`.
   * @returns Stats reset time (ISO string or null), dealloc count, and the mapped rows.
   */
  async readSnapshot(
    client: unknown,
    options: { minCalls: number; maxTemplates: number },
  ): Promise<PostgresPgssSnapshot> {
    const pgClient = queryClient(client);
    const { minCalls, maxTemplates } = options;

    const snapshotResult = await execute(pgClient, SNAPSHOT_SQL, [minCalls, maxTemplates]);
    const snapshotHeaders = indexes(snapshotResult.headers);

    const stats = firstRow(await execute(pgClient, STATS_INFO_SQL), 'stats-info');
    const statsResetAt = nullableIsoTimestamp(value(stats.row, stats.headers, 'stats_reset'));
    const deallocCount = nullableInteger(value(stats.row, stats.headers, 'dealloc'));
    const rows = snapshotResult.rows.map((row) => mapSnapshotRow(row, snapshotHeaders));

    return { statsResetAt, deallocCount, rows };
  }

  /**
   * Runs the extension probe and translates its known failures into
   * `HistoricSqlExtensionMissingError`; any other error is rethrown unchanged.
   */
  private async ensureExtensionQueryable(pgClient: KloPostgresQueryClient): Promise<void> {
    try {
      await execute(pgClient, EXTENSION_PROBE_SQL);
    } catch (error) {
      if (isMissingPgssRelation(error)) {
        throw extensionMissingError(error);
      }
      if (!isPgssPreloadRequired(error)) {
        throw error;
      }
      throw extensionMissingError(
        error,
        'pg_stat_statements is installed but not loaded via shared_preload_libraries.',
      );
    }
  }
}
|
||||
|
|
@@ -0,0 +1,193 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { HistoricSqlGrantsMissingError } from './errors.js';
|
||||
import { SnowflakeHistoricSqlQueryHistoryReader } from './snowflake-query-history-reader.js';
|
||||
|
||||
/** Canned result handed back, one per call, by the fake query client below. */
interface FakeQueryResult {
  /** Column names, positionally aligned with each entry of `rows`. */
  headers: string[];
  /** Result rows; each row is an array of cell values. */
  rows: Array<unknown[]>;
  totalRows: number;
  /** Optional error text carried on the result instead of being thrown. */
  error?: string;
}
|
||||
|
||||
/**
 * Builds a fake query client whose `executeQuery` mock replays `results` in order.
 *
 * Each call removes the first remaining entry from `results` (the array is
 * mutated) and resolves with it; once the queue is empty the mock rejects with
 * `unexpected query`.
 */
function queryClient(results: FakeQueryResult[]) {
  return {
    executeQuery: vi.fn(async (_query: string) => {
      const queued = results.shift();
      if (queued) {
        return queued;
      }
      throw new Error('unexpected query');
    }),
  };
}
|
||||
|
||||
/**
 * Returns the SQL text passed to the first `executeQuery` call on a fake client.
 *
 * @throws Error when the mock has not been called yet.
 */
function firstQuery(client: ReturnType<typeof queryClient>): string {
  const [firstCall] = client.executeQuery.mock.calls;
  if (firstCall) {
    return firstCall[0];
  }
  throw new Error('expected query client to be called');
}
|
||||
|
||||
describe('SnowflakeHistoricSqlQueryHistoryReader', () => {
  // Same column order as the SELECT list issued by the reader.
  const queryHistoryHeaders = [
    'QUERY_ID',
    'QUERY_TEXT',
    'USER_NAME',
    'ROLE_NAME',
    'WAREHOUSE_NAME',
    'DATABASE_NAME',
    'SCHEMA_NAME',
    'START_TIME',
    'END_TIME',
    'TOTAL_ELAPSED_TIME',
    'ROWS_PRODUCED',
    'EXECUTION_STATUS',
    'ERROR_CODE',
    'ERROR_MESSAGE',
  ];

  /** Drains an async iterable into an array so it can be asserted in one step. */
  async function collect<T>(source: AsyncIterable<T>): Promise<T[]> {
    const collected: T[] = [];
    for await (const item of source) {
      collected.push(item);
    }
    return collected;
  }

  it('probes SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY', async () => {
    const reader = new SnowflakeHistoricSqlQueryHistoryReader();
    const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);

    await expect(reader.probe(client)).resolves.toBeUndefined();

    expect(client.executeQuery).toHaveBeenCalledWith(
      'SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY LIMIT 1',
    );
  });

  it('turns probe result errors into HistoricSqlGrantsMissingError', async () => {
    const reader = new SnowflakeHistoricSqlQueryHistoryReader();
    // The failure is reported on the result object rather than thrown.
    const client = queryClient([
      { headers: [], rows: [], totalRows: 0, error: 'Object does not exist or not authorized' },
    ]);

    await expect(reader.probe(client)).rejects.toMatchObject({
      name: 'HistoricSqlGrantsMissingError',
      dialect: 'snowflake',
      remediation: 'GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE <connection role>;',
    });
  });

  it('turns thrown probe failures into HistoricSqlGrantsMissingError', async () => {
    const reader = new SnowflakeHistoricSqlQueryHistoryReader();
    const client = {
      executeQuery: vi.fn(async () => {
        throw new Error('permission denied');
      }),
    };

    await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
  });

  it('fetches query-history rows with cursor and maps them into RawQueryRow shape', async () => {
    // First row: successful query with string timestamps.
    const succeededRow = [
      '01a',
      "SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'",
      'ANALYST_A',
      'ANALYST_ROLE',
      'WH_XS',
      'ANALYTICS',
      'PUBLIC',
      '2026-05-04T10:00:00.000Z',
      '2026-05-04T10:00:01.250Z',
      1250,
      12,
      'SUCCESS',
      null,
      null,
    ];
    // Second row: failed query with a Date start time and no end/runtime/rows.
    const failedRow = [
      '01b',
      'SELECT * FROM MISSING_TABLE',
      'ANALYST_B',
      'ANALYST_ROLE',
      'WH_XS',
      'ANALYTICS',
      'PUBLIC',
      new Date('2026-05-04T10:05:00.000Z'),
      null,
      null,
      null,
      'FAILED_WITH_ERROR',
      '002003',
      'SQL compilation error',
    ];
    const client = queryClient([
      { headers: queryHistoryHeaders, rows: [succeededRow, failedRow], totalRows: 2 },
    ]);
    const reader = new SnowflakeHistoricSqlQueryHistoryReader();

    const rows = await collect(
      reader.fetch(
        client,
        {
          start: new Date('2026-05-01T00:00:00.000Z'),
          end: new Date('2026-05-04T12:00:00.000Z'),
        },
        '2026-05-03T00:00:00.000Z',
      ),
    );

    expect(client.executeQuery).toHaveBeenCalledTimes(1);
    const sql = firstQuery(client);
    expect(sql).toContain('FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY');
    // The cursor, not the window start, is the lower bound.
    expect(sql).toContain("START_TIME >= '2026-05-03T00:00:00.000Z'::TIMESTAMP_TZ");
    expect(sql).toContain("START_TIME < '2026-05-04T12:00:00.000Z'::TIMESTAMP_TZ");
    expect(sql).toContain('ORDER BY START_TIME ASC, QUERY_ID ASC');
    expect(sql).toContain('ROWS_PRODUCED');

    expect(rows).toEqual([
      {
        id: '01a',
        sql: "SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'",
        user: 'ANALYST_A',
        startedAt: '2026-05-04T10:00:00.000Z',
        endedAt: '2026-05-04T10:00:01.250Z',
        runtimeMs: 1250,
        rowsProduced: 12,
        success: true,
        errorMessage: null,
      },
      {
        id: '01b',
        sql: 'SELECT * FROM MISSING_TABLE',
        user: 'ANALYST_B',
        startedAt: '2026-05-04T10:05:00.000Z',
        endedAt: null,
        runtimeMs: null,
        rowsProduced: null,
        success: false,
        errorMessage: '002003: SQL compilation error',
      },
    ]);
  });

  it('uses the window start when no cursor is available', async () => {
    const reader = new SnowflakeHistoricSqlQueryHistoryReader();
    const client = queryClient([{ headers: ['QUERY_ID'], rows: [], totalRows: 0 }]);
    const window = {
      start: new Date('2026-02-03T12:00:00.000Z'),
      end: new Date('2026-05-04T12:00:00.000Z'),
    };

    for await (const _row of reader.fetch(client, window)) {
      throw new Error('empty result should not yield rows');
    }

    const sql = firstQuery(client);
    expect(sql).toContain("START_TIME >= '2026-02-03T12:00:00.000Z'::TIMESTAMP_TZ");
  });

  it('throws a clear error when the query client cannot execute SQL', async () => {
    const reader = new SnowflakeHistoricSqlQueryHistoryReader();

    // `fetch` is a lazy async generator, so iteration is needed to trigger the failure.
    const iterate = async () => {
      for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) {
        throw new Error('unreachable');
      }
    };

    await expect(iterate).rejects.toThrow(
      'Historic SQL Snowflake reader requires a query client with executeQuery(query)',
    );
  });
});
|
||||
|
|
@@ -0,0 +1,203 @@
|
|||
import { HistoricSqlGrantsMissingError } from './errors.js';
|
||||
import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js';
|
||||
|
||||
/** Minimal result shape this reader consumes from a query client. */
interface QueryResultLike {
  /** Column names, positionally aligned with each entry of `rows`. */
  headers: string[];
  rows: Array<unknown[]>;
  totalRows: number;
  /** Error text reported on the result itself rather than by throwing. */
  error?: string;
}

/** Minimal client contract: one method that runs a single SQL string. */
interface QueryClientLike {
  executeQuery(query: string): Promise<QueryResultLike>;
}
|
||||
|
||||
/** Cheapest statement that still requires read access to the query-history view. */
const PROBE_SQL = 'SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY LIMIT 1';

/** Remediation statement attached to every grants-missing error raised by this reader. */
const SNOWFLAKE_GRANTS_REMEDIATION = 'GRANT IMPORTED PRIVILEGES ON DATABASE SNOWFLAKE TO ROLE <connection role>;';
|
||||
|
||||
/**
 * Narrows an untyped client to `QueryClientLike`.
 *
 * @param client - Any value; accepted only when it is a non-null object that
 *   exposes an `executeQuery` function.
 * @returns The same object, typed as `QueryClientLike`.
 * @throws Error when the value does not expose `executeQuery`.
 */
function queryClient(client: unknown): QueryClientLike {
  const isObject = typeof client === 'object' && Boolean(client);
  const canExecute =
    isObject &&
    'executeQuery' in (client as object) &&
    typeof (client as { executeQuery?: unknown }).executeQuery === 'function';
  if (!canExecute) {
    throw new Error('Historic SQL Snowflake reader requires a query client with executeQuery(query)');
  }
  return client as QueryClientLike;
}
|
||||
|
||||
/**
 * Wraps a probe/fetch failure in a `HistoricSqlGrantsMissingError` for Snowflake.
 *
 * The detail text is the `Error` message or the string itself; any other kind
 * of cause falls back to a generic sentence. The original cause is preserved
 * on the returned error.
 */
function grantsError(cause: unknown): HistoricSqlGrantsMissingError {
  let detail = 'Snowflake role cannot query SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY.';
  if (cause instanceof Error) {
    detail = cause.message;
  } else if (typeof cause === 'string') {
    detail = cause;
  }

  return new HistoricSqlGrantsMissingError({
    dialect: 'snowflake',
    message: `Missing Snowflake audit grants for historic-SQL ingest: ${detail}`,
    remediation: SNOWFLAKE_GRANTS_REMEDIATION,
    cause,
  });
}
|
||||
|
||||
/**
 * Renders a timestamp as a quoted ISO-8601 string cast to `TIMESTAMP_TZ`.
 *
 * @param value - A `Date`, or a string accepted by the `Date` constructor.
 * @returns Text such as `'2026-05-03T00:00:00.000Z'::TIMESTAMP_TZ`.
 * @throws Error when the value does not parse to a valid date.
 */
function timestampLiteral(value: Date | string): string {
  const parsed = value instanceof Date ? value : new Date(value);
  const isValid = !Number.isNaN(parsed.getTime());
  if (!isValid) {
    throw new Error(`Invalid Snowflake query-history timestamp: ${String(value)}`);
  }
  // Single quotes are doubled before the text is embedded in the SQL literal.
  const quoted = parsed.toISOString().replace(/'/g, "''");
  return "'" + quoted + "'::TIMESTAMP_TZ";
}
|
||||
|
||||
/**
 * Builds the QUERY_HISTORY select for one time window.
 *
 * The lower bound is the cursor when one is given, otherwise `window.start`;
 * it is inclusive (`>=`). The upper bound `window.end` is exclusive (`<`).
 * Rows without query text are excluded and output is ordered by start time,
 * then query id.
 */
function queryHistorySql(window: HistoricSqlTimeWindow, cursor?: string | null): string {
  const lowerBound = timestampLiteral(cursor ?? window.start);
  const upperBound = timestampLiteral(window.end);

  const columns = [
    'QUERY_ID',
    'QUERY_TEXT',
    'USER_NAME',
    'ROLE_NAME',
    'WAREHOUSE_NAME',
    'DATABASE_NAME',
    'SCHEMA_NAME',
    'START_TIME',
    'END_TIME',
    'TOTAL_ELAPSED_TIME',
    'ROWS_PRODUCED',
    'EXECUTION_STATUS',
    'ERROR_CODE',
    'ERROR_MESSAGE',
  ];

  const statement = [
    'SELECT',
    columns.join(',\n'),
    'FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY',
    `WHERE START_TIME >= ${lowerBound}`,
    `AND START_TIME < ${upperBound}`,
    'AND QUERY_TEXT IS NOT NULL',
    'ORDER BY START_TIME ASC, QUERY_ID ASC',
  ];
  return statement.join('\n').trim();
}
|
||||
|
||||
/**
 * Maps each upper-cased header name to its column position.
 *
 * When two headers collide after upper-casing, the later position wins.
 */
function indexByHeader(headers: string[]): Map<string, number> {
  return headers.reduce(
    (positions, header, position) => positions.set(header.toUpperCase(), position),
    new Map<string, number>(),
  );
}
|
||||
|
||||
/**
 * Looks up one cell of `row` by column name.
 *
 * @returns `null` when `name` is not in `indexes`; otherwise whatever the row
 *   holds at that position.
 */
function value(row: unknown[], indexes: Map<string, number>, name: string): unknown {
  const position = indexes.get(name);
  if (position === undefined) {
    return null;
  }
  return row[position];
}
|
||||
|
||||
/**
 * Converts a cell to text via `String(raw)`.
 *
 * @returns `null` for `null`, `undefined`, and values that stringify to the
 *   empty string.
 */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text === '' ? null : text;
}
|
||||
|
||||
/**
 * Like `nullableString`, but a missing value is an error.
 *
 * @param field - Column name used in the error message.
 * @throws Error when `nullableString` yields no text for the cell.
 */
function requiredString(raw: unknown, field: string): string {
  const text = nullableString(raw);
  if (text) {
    return text;
  }
  throw new Error(`Snowflake QUERY_HISTORY row is missing ${field}`);
}
|
||||
|
||||
/**
 * Converts a cell to a finite number.
 *
 * Strings are trimmed before conversion, so blank or whitespace-only text is
 * treated as missing; `Number('  ')` would otherwise evaluate to `0` and turn
 * an absent value into a zero.
 *
 * @returns The numeric value, or `null` when the cell is `null`, `undefined`,
 *   blank, or does not convert to a finite number.
 */
function nullableNumber(raw: unknown): number | null {
  if (raw === null || raw === undefined) {
    return null;
  }

  let parsed: number;
  if (typeof raw === 'number') {
    parsed = raw;
  } else if (typeof raw === 'string') {
    const trimmed = raw.trim();
    if (trimmed === '') {
      return null;
    }
    parsed = Number(trimmed);
  } else {
    parsed = Number(raw);
  }

  return Number.isFinite(parsed) ? parsed : null;
}
|
||||
|
||||
/**
 * Converts a cell with `nullableNumber` and drops any fractional part.
 *
 * Truncation is toward zero (`Math.trunc`).
 */
function nullableInteger(raw: unknown): number | null {
  const parsed = nullableNumber(raw);
  if (parsed === null) {
    return null;
  }
  return Math.trunc(parsed);
}
|
||||
|
||||
/**
 * Normalizes a required timestamp cell to an ISO-8601 string.
 *
 * Accepts a `Date` instance or any value whose text form the `Date`
 * constructor can parse.
 *
 * @param field - Column name used in error messages.
 * @throws Error when the cell is missing, or when it is not a valid date.
 *   This includes an invalid `Date` instance, which would otherwise surface
 *   as a bare `RangeError` from `toISOString()` with no column context.
 */
function isoTimestamp(raw: unknown, field: string): string {
  if (raw instanceof Date) {
    if (Number.isNaN(raw.getTime())) {
      throw new Error(`Snowflake QUERY_HISTORY row has invalid ${field}: Invalid Date`);
    }
    return raw.toISOString();
  }

  const text = requiredString(raw, field);
  const date = new Date(text);
  if (Number.isNaN(date.getTime())) {
    throw new Error(`Snowflake QUERY_HISTORY row has invalid ${field}: ${text}`);
  }
  return date.toISOString();
}
|
||||
|
||||
/**
 * Normalizes an optional timestamp cell to an ISO-8601 string.
 *
 * @param raw - Cell value; `null`, `undefined`, and `''` mean "no timestamp".
 * @param field - Column name reported if the value is present but invalid.
 *   Defaults to `'END_TIME'`, which was the previously hard-coded label, so
 *   existing single-argument callers are unaffected.
 * @returns The ISO string, or `null` when the cell is empty.
 * @throws Error when the cell is present but not a valid timestamp.
 */
function nullableIsoTimestamp(raw: unknown, field = 'END_TIME'): string | null {
  if (raw === null || raw === undefined || raw === '') {
    return null;
  }
  return isoTimestamp(raw, field);
}
|
||||
|
||||
/**
 * Decides whether a query-history row represents a successful execution.
 *
 * Any error code or error message marks the row as failed. Otherwise a missing
 * status counts as success, and a present status must start with `SUCCESS`
 * (compared case-insensitively).
 */
function executionSucceeded(status: string | null, errorCode: string | null, errorMessage: string | null): boolean {
  const hasError = Boolean(errorCode) || Boolean(errorMessage);
  if (hasError) {
    return false;
  }
  if (status === null) {
    return true;
  }
  return status.toUpperCase().startsWith('SUCCESS');
}
|
||||
|
||||
/**
 * Merges an error code and message into one display string.
 *
 * @returns `"<code>: <message>"` when both are present; otherwise the message
 *   if it is non-null, else the code (which may itself be `null`).
 */
function combinedErrorMessage(errorCode: string | null, errorMessage: string | null): string | null {
  if (!errorCode || !errorMessage) {
    return errorMessage ?? errorCode;
  }
  return errorCode + ': ' + errorMessage;
}
|
||||
|
||||
/**
 * Converts one QUERY_HISTORY result row into a `HistoricSqlRawQueryRow`.
 *
 * `QUERY_ID`, `QUERY_TEXT`, and `START_TIME` are required and throw when
 * absent; the remaining columns degrade to `null`. Columns are looked up by
 * upper-case name in `indexes`.
 */
function mapRow(row: unknown[], indexes: Map<string, number>): HistoricSqlRawQueryRow {
  const cell = (column: string): unknown => value(row, indexes, column);

  const errorCode = nullableString(cell('ERROR_CODE'));
  const errorMessage = nullableString(cell('ERROR_MESSAGE'));
  const rowsProduced = nullableInteger(cell('ROWS_PRODUCED'));

  return {
    id: requiredString(cell('QUERY_ID'), 'QUERY_ID'),
    sql: requiredString(cell('QUERY_TEXT'), 'QUERY_TEXT'),
    user: nullableString(cell('USER_NAME')),
    startedAt: isoTimestamp(cell('START_TIME'), 'START_TIME'),
    endedAt: nullableIsoTimestamp(cell('END_TIME')),
    runtimeMs: nullableNumber(cell('TOTAL_ELAPSED_TIME')),
    rowsProduced,
    success: executionSucceeded(nullableString(cell('EXECUTION_STATUS')), errorCode, errorMessage),
    errorMessage: combinedErrorMessage(errorCode, errorMessage),
  };
}
|
||||
|
||||
/**
 * Reads historic SQL from `SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY` through any
 * client that exposes `executeQuery(query)`.
 */
export class SnowflakeHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader {
  /**
   * Checks that the connection can read the query-history view.
   *
   * @param client - Untyped client; must expose `executeQuery`.
   * @throws Error when `client` is not a usable query client.
   * @throws HistoricSqlGrantsMissingError when the probe query throws or
   *   reports an error on its result.
   */
  async probe(client: unknown): Promise<void> {
    // Validate the client outside the try block: a wiring mistake should
    // surface as itself (as it already does in `fetch`) rather than being
    // reported as missing Snowflake grants.
    const snowflake = queryClient(client);

    let result: QueryResultLike;
    try {
      result = await snowflake.executeQuery(PROBE_SQL);
    } catch (error) {
      throw grantsError(error);
    }
    if (result.error) {
      throw grantsError(result.error);
    }
  }

  /**
   * Yields mapped query-history rows for a time window.
   *
   * Issues a single query; the lower bound is `cursor` when provided,
   * otherwise `window.start`.
   *
   * @throws Error when `client` is not a usable query client, or when a row
   *   lacks a required column.
   * @throws HistoricSqlGrantsMissingError when the result carries an error.
   *   NOTE(review): every result error is classified as a grants problem here;
   *   confirm that is intended for non-permission failures.
   */
  async *fetch(
    client: unknown,
    window: HistoricSqlTimeWindow,
    cursor?: string | null,
  ): AsyncIterable<HistoricSqlRawQueryRow> {
    const result = await queryClient(client).executeQuery(queryHistorySql(window, cursor));
    if (result.error) {
      throw grantsError(result.error);
    }
    const indexes = indexByHeader(result.headers);
    for (const row of result.rows) {
      yield mapRow(row, indexes);
    }
  }
}
|
||||
|
|
@@ -0,0 +1,152 @@
|
|||
import { mkdir, mkdtemp, readdir, readFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { dirname, join, relative } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import { stagePgStatStatementsTemplates, writePgssBaselineAtomic, type PgssBaseline } from './stage-pgss.js';
|
||||
import type { HistoricSqlPullConfig, KloPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
|
||||
|
||||
/** Directory, next to this test file, that holds one sub-directory per golden fixture. */
const FIXTURE_ROOT = join(__dirname, '__fixtures__', 'postgres');
|
||||
|
||||
/**
 * Shape of a fixture's `input.json`: everything needed to replay one staging
 * run and the outputs it is expected to produce.
 */
interface GoldenFixture {
  name: string;
  /** Timestamp text passed to `new Date(...)` and used as the run's `now`. */
  now: string;
  connectionId: string;
  /** Canned return value for the reader's `probe()`. */
  probe: {
    pgServerVersion: string;
    warnings: string[];
  };
  /** Canned return value for the reader's `readSnapshot()` (rows are capped by `maxTemplates`). */
  snapshot: {
    statsResetAt: string | null;
    deallocCount: number | null;
    rows: PostgresPgssRow[];
  };
  pullConfig: HistoricSqlPullConfig & { dialect: 'postgres' };
  /** Canned SQL-analysis results, keyed by the exact SQL text being analyzed. */
  analysisBySql: Record<
    string,
    {
      fingerprint: string;
      normalizedSql: string;
      tablesTouched: string[];
      literalSlots: [];
      error?: string;
    }
  >;
  /** Baseline written before the run; `null` means no baseline file is written. */
  baseline: PgssBaseline | null;
  expectedBaseline: PgssBaseline;
  /** Expected staged files keyed by relative path; compared as JSON when `json` is present, else as text. */
  expectedFiles: Record<string, { json?: unknown; text?: string }>;
}
|
||||
|
||||
/**
 * Loads and parses `<FIXTURE_ROOT>/<name>/input.json`.
 *
 * The parsed JSON is cast to `GoldenFixture` without validation.
 */
async function readFixture(name: string): Promise<GoldenFixture> {
  const inputPath = join(FIXTURE_ROOT, name, 'input.json');
  const raw = await readFile(inputPath, 'utf-8');
  return JSON.parse(raw) as GoldenFixture;
}
|
||||
|
||||
/** Creates a fresh directory under the OS temp dir whose name starts with `prefix`. */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return mkdtemp(template);
}
|
||||
|
||||
/** Stub Postgres client whose `executeQuery` always resolves to an empty result. */
function fakePgClient(): KloPostgresQueryClient {
  return {
    executeQuery: async () => ({ headers: [], rows: [] }),
  };
}
|
||||
|
||||
/**
 * Builds a PGSS reader that replays the fixture's canned probe and snapshot.
 *
 * `readSnapshot` returns at most `options.maxTemplates` of the fixture rows.
 */
function fixtureReader(fixture: GoldenFixture): PostgresPgssReader {
  return {
    probe: async () => fixture.probe,
    readSnapshot: async (_client, options) => {
      const { statsResetAt, deallocCount, rows } = fixture.snapshot;
      return {
        statsResetAt,
        deallocCount,
        rows: rows.slice(0, options.maxTemplates),
      };
    },
  };
}
|
||||
|
||||
/**
 * Builds a SQL-analysis port that answers from `fixture.analysisBySql`.
 *
 * SQL with no fixture entry gets an empty analysis whose `error` names the
 * missing statement.
 */
function fixtureSqlAnalysis(fixture: GoldenFixture): SqlAnalysisPort {
  return {
    async analyzeForFingerprint(sql) {
      const canned = fixture.analysisBySql[sql];
      if (canned) {
        return canned;
      }
      return {
        fingerprint: '',
        normalizedSql: '',
        tablesTouched: [],
        literalSlots: [],
        error: `missing fixture analysis for ${sql}`,
      };
    },
  };
}
|
||||
|
||||
/** Writes the fixture's prior baseline to `path`; does nothing when the fixture has none. */
async function writeFixtureBaseline(path: string, baseline: PgssBaseline | null): Promise<void> {
  if (baseline) {
    await writePgssBaselineAtomic(path, baseline);
  }
}
|
||||
|
||||
/**
 * Recursively lists every non-directory entry under `current`, as paths
 * relative to `root`.
 *
 * Paths always use `/` as the separator so they can be compared with fixture
 * keys regardless of platform; `path.relative` returns backslash-separated
 * paths on Windows. Order follows `readdir` and is not sorted here.
 *
 * @param root - Directory the returned paths are relative to.
 * @param current - Directory to scan; defaults to `root` and is used for recursion.
 */
async function listFiles(root: string, current = root): Promise<string[]> {
  const entries = await readdir(current, { withFileTypes: true });
  const files: string[] = [];
  for (const entry of entries) {
    const fullPath = join(current, entry.name);
    if (entry.isDirectory()) {
      files.push(...(await listFiles(root, fullPath)));
    } else {
      // Normalize Windows separators to the POSIX form used by fixture keys.
      files.push(relative(root, fullPath).replace(/\\/g, '/'));
    }
  }
  return files;
}
|
||||
|
||||
/**
 * Asserts that `stagedDir` contains exactly the expected files with the
 * expected contents.
 *
 * The set of relative paths must match exactly. Each file is then compared as
 * parsed JSON when its expectation has a `json` key, otherwise as raw text.
 */
async function expectGoldenFiles(stagedDir: string, expectedFiles: GoldenFixture['expectedFiles']): Promise<void> {
  const stagedPaths = (await listFiles(stagedDir)).sort();
  const expectedPaths = Object.keys(expectedFiles).sort();
  expect(stagedPaths).toEqual(expectedPaths);

  for (const path of expectedPaths) {
    const expected = expectedFiles[path];
    const contents = await readFile(join(stagedDir, path), 'utf-8');
    if (!('json' in expected)) {
      expect(contents).toBe(expected.text);
      continue;
    }
    expect(JSON.parse(contents)).toEqual(expected.json);
  }
}
|
||||
|
||||
describe('stagePgStatStatementsTemplates golden fixtures', () => {
  // One sub-directory of the fixture root per scenario.
  const fixtureNames = ['first-run', 'normal-delta', 'reset-detected', 'version-change', 'eviction-churn'] as const;

  it.each(fixtureNames)('matches the committed %s golden output', async (fixtureName) => {
    const fixture = await readFixture(fixtureName);
    const workDir = await tempDir(`pgss-golden-${fixtureName}-`);
    const stagedDir = join(workDir, 'staged');
    const baselinePath = join(workDir, 'cache', fixture.connectionId, 'pgss-baseline.json');

    // The baseline directory is created even when the fixture has no prior baseline.
    await mkdir(dirname(baselinePath), { recursive: true });
    await writeFixtureBaseline(baselinePath, fixture.baseline);

    const { baseline } = await stagePgStatStatementsTemplates({
      stagedDir,
      connectionId: fixture.connectionId,
      queryClient: fakePgClient(),
      reader: fixtureReader(fixture),
      sqlAnalysis: fixtureSqlAnalysis(fixture),
      pullConfig: fixture.pullConfig,
      baselinePath,
      now: new Date(fixture.now),
    });

    await expectGoldenFiles(stagedDir, fixture.expectedFiles);
    expect(baseline).toEqual(fixture.expectedBaseline);
  });
});
|
||||
|
|
@@ -0,0 +1,652 @@
|
|||
import { mkdtemp, readFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import {
|
||||
pgssBaselinePath,
|
||||
readPgssBaseline,
|
||||
stagePgStatStatementsTemplates,
|
||||
writePgssBaselineAtomic,
|
||||
type PgssBaseline,
|
||||
} from './stage-pgss.js';
|
||||
import { historicSqlManifestSchema, historicSqlMetadataSchema, historicSqlUsageSchema } from './types.js';
|
||||
import type { KloPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
|
||||
|
||||
/** Creates a fresh directory under the OS temp dir whose name starts with `prefix`. */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return mkdtemp(template);
}
|
||||
|
||||
/**
 * Reads `<root>/<relPath>` as UTF-8 and parses it as JSON.
 *
 * The result is cast to `T` without validation; callers that need a checked
 * shape parse it with a schema afterwards.
 */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const raw = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(raw) as T;
}
|
||||
|
||||
/** Stub Postgres client whose `executeQuery` always resolves to an empty result. */
function fakePgClient(): KloPostgresQueryClient {
  return {
    executeQuery: async () => ({ headers: [], rows: [] }),
  };
}
|
||||
|
||||
/**
 * Builds a PGSS snapshot row from defaults plus overrides.
 *
 * `queryid` and `query` must be supplied; every other field falls back to the
 * defaults below and can be overridden individually.
 */
function row(overrides: Partial<PostgresPgssRow> & Pick<PostgresPgssRow, 'queryid' | 'query'>): PostgresPgssRow {
  const defaults = {
    userid: '11',
    username: 'analyst',
    dbid: '5',
    database: 'warehouse',
    calls: 10,
    totalExecTime: 250,
    meanExecTime: 25,
    totalRows: 20,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Builds a mocked PGSS reader that returns canned probe and snapshot data.
 *
 * Omitted fields use the defaults below. Because the fallbacks use `??`, an
 * explicit `null` for `statsResetAt` or `deallocCount` also yields the default.
 * `readSnapshot` returns at most `options.maxTemplates` rows.
 */
function fakeReader(input: {
  pgServerVersion?: string;
  warnings?: string[];
  statsResetAt?: string | null;
  deallocCount?: number | null;
  rows: PostgresPgssRow[];
}): PostgresPgssReader {
  const reader: PostgresPgssReader = {
    probe: vi.fn(async () => {
      const pgServerVersion = input.pgServerVersion ?? 'PostgreSQL 16.4';
      const warnings = input.warnings ?? [];
      return { pgServerVersion, warnings };
    }),
    readSnapshot: vi.fn(async (_client, options) => {
      const statsResetAt = input.statsResetAt ?? '2026-05-08T08:00:00.000Z';
      const deallocCount = input.deallocCount ?? 0;
      return {
        statsResetAt,
        deallocCount,
        rows: input.rows.slice(0, options.maxTemplates),
      };
    }),
  };
  return reader;
}
|
||||
|
||||
/**
 * Keyword-driven SQL-analysis stub shared by the tests in this file.
 *
 * SQL containing `broken` yields an empty analysis with a `parse failed`
 * error; SQL containing `customers` yields the customers fingerprint; anything
 * else yields the orders fingerprint. The checks run in that order.
 */
const sqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const mentions = (needle: string): boolean => sql.includes(needle);

    if (mentions('broken')) {
      return {
        fingerprint: '',
        normalizedSql: '',
        tablesTouched: [],
        literalSlots: [],
        error: 'parse failed',
      };
    }

    if (!mentions('customers')) {
      return {
        fingerprint: 'fp_orders',
        normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
        tablesTouched: ['analytics.orders'],
        literalSlots: [],
      };
    }

    return {
      fingerprint: 'fp_customers',
      normalizedSql: 'SELECT count(*) FROM analytics.customers',
      tablesTouched: ['analytics.customers'],
      literalSlots: [],
    };
  },
};
|
||||
|
||||
/**
 * Returns the Postgres pull configuration used by these tests.
 *
 * @param maxTemplatesPerRun - Cap forwarded as `maxTemplatesPerRun`; defaults to 5000.
 * @returns A fresh config object on every call (arrays are not shared).
 */
function postgresPullConfig(maxTemplatesPerRun = 5000) {
  const config = {
    dialect: 'postgres' as const,
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns: ['^svc_'],
    redactionPatterns: ['secret'],
    maxTemplatesPerRun,
    minCalls: 5,
  };
  return config;
}
|
||||
|
||||
describe('stagePgStatStatementsTemplates', () => {
|
||||
it('stages first-run PGSS templates as degraded aggregate templates and builds a next baseline', async () => {
|
||||
const stagedDir = await tempDir('pgss-stage-first-');
|
||||
const baselineRootDir = await tempDir('pgss-baseline-first-');
|
||||
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
|
||||
|
||||
const result = await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'conn_pg',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
warnings: ['pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config'],
|
||||
deallocCount: 2,
|
||||
rows: [
|
||||
row({
|
||||
queryid: '101',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 10,
|
||||
totalExecTime: 250,
|
||||
totalRows: 20,
|
||||
}),
|
||||
row({
|
||||
queryid: '102',
|
||||
query: 'SELECT * FROM pg_catalog.pg_class',
|
||||
calls: 50,
|
||||
totalExecTime: 500,
|
||||
}),
|
||||
row({
|
||||
queryid: '103',
|
||||
query: 'BEGIN',
|
||||
calls: 75,
|
||||
totalExecTime: 75,
|
||||
}),
|
||||
row({
|
||||
queryid: '104',
|
||||
query: 'SELECT broken FROM analytics.orders',
|
||||
calls: 8,
|
||||
totalExecTime: 80,
|
||||
}),
|
||||
],
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(),
|
||||
baselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
|
||||
expect(manifest).toMatchObject({
|
||||
source: 'historic-sql',
|
||||
connectionId: 'conn_pg',
|
||||
dialect: 'postgres',
|
||||
fetchedAt: '2026-05-08T12:00:00.000Z',
|
||||
windowEnd: '2026-05-08T12:00:00.000Z',
|
||||
nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
|
||||
templateCount: 1,
|
||||
capped: false,
|
||||
degraded: true,
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
baselineFirstRun: true,
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
deallocCount: 2,
|
||||
});
|
||||
expect(manifest.warnings).toEqual([
|
||||
'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config',
|
||||
'pgss_dealloc_count:2; pg_stat_statements.max may be too low, causing template eviction churn',
|
||||
'baseline_first_run:no_previous_pgss_baseline',
|
||||
'analysis_failed:db5_q104',
|
||||
]);
|
||||
expect(manifest.templates).toEqual([
|
||||
{
|
||||
id: 'db5_q101',
|
||||
fingerprint: 'fp_orders',
|
||||
subClusterId: null,
|
||||
path: 'templates/db5_q101/page.md',
|
||||
},
|
||||
]);
|
||||
|
||||
const metadata = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q101/metadata.json'));
|
||||
expect(metadata).toMatchObject({
|
||||
id: 'db5_q101',
|
||||
title: 'postgres · analytics.orders [db5_q101]',
|
||||
path: 'templates/db5_q101/page.md',
|
||||
objectType: 'historic_sql_template',
|
||||
lastEditedAt: null,
|
||||
properties: {
|
||||
fingerprint: 'fp_orders',
|
||||
sub_cluster_id: null,
|
||||
dialect: 'postgres',
|
||||
tables_touched: ['analytics.orders'],
|
||||
literal_slots: [],
|
||||
},
|
||||
});
|
||||
expect(metadata.properties.triage_signals).toEqual({
|
||||
executions_bucket: 'mid',
|
||||
distinct_users_bucket: 'solo',
|
||||
error_rate_bucket: 'ok',
|
||||
recency_bucket: 'active',
|
||||
service_account_only: 'false',
|
||||
runtime_bucket: 'fast',
|
||||
});
|
||||
|
||||
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q101/usage.json'));
|
||||
expect(usage).toEqual({
|
||||
stats: {
|
||||
executions: 10,
|
||||
distinct_users: 1,
|
||||
first_seen: '2026-05-08T12:00:00.000Z',
|
||||
last_seen: '2026-05-08T12:00:00.000Z',
|
||||
p50_runtime_ms: null,
|
||||
p95_runtime_ms: null,
|
||||
mean_runtime_ms: 25,
|
||||
error_rate: 0,
|
||||
rows_produced: 20,
|
||||
},
|
||||
literal_slots: [],
|
||||
samples: [],
|
||||
});
|
||||
|
||||
expect(await readFile(join(stagedDir, 'templates/db5_q101/page.md'), 'utf-8')).toContain(
|
||||
'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
);
|
||||
expect(result.baselinePath).toBe(baselinePath);
|
||||
expect(result.baseline.templates.db5_q101.perUser['11']).toEqual({
|
||||
calls: 10,
|
||||
totalExecTime: 250,
|
||||
totalRows: 20,
|
||||
});
|
||||
await expect(readPgssBaseline(baselinePath)).resolves.toBeNull();
|
||||
});
|
||||
|
||||
it('warns when pg_stat_statements reports dealloc churn', async () => {
|
||||
const root = await tempDir('pgss-churn-');
|
||||
const stagedDir = join(root, 'staged');
|
||||
const baselinePath = join(root, 'cache', 'warehouse', 'pgss-baseline.json');
|
||||
|
||||
await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'warehouse',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
rows: [
|
||||
row({
|
||||
queryid: '901',
|
||||
query: 'SELECT COUNT(*) FROM public.orders WHERE status = $1',
|
||||
calls: 20,
|
||||
totalExecTime: 500,
|
||||
meanExecTime: 25,
|
||||
}),
|
||||
],
|
||||
deallocCount: 3,
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(50),
|
||||
baselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = await readJson<{ warnings: string[]; deallocCount: number }>(stagedDir, 'manifest.json');
|
||||
expect(manifest.deallocCount).toBe(3);
|
||||
expect(manifest.warnings).toContain(
|
||||
'pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn',
|
||||
);
|
||||
});
|
||||
|
||||
it('uses the saved cumulative baseline to stage only positive deltas on later runs', async () => {
|
||||
const stagedDir = await tempDir('pgss-stage-delta-');
|
||||
const baselineRootDir = await tempDir('pgss-baseline-delta-');
|
||||
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
|
||||
const baseline: PgssBaseline = {
|
||||
version: 1,
|
||||
fetchedAt: '2026-05-08T10:00:00.000Z',
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
templates: {
|
||||
db5_q201: {
|
||||
firstObservedAt: '2026-05-08T09:00:00.000Z',
|
||||
perUser: {
|
||||
'11': { calls: 10, totalExecTime: 100, totalRows: 50 },
|
||||
'12': { calls: 5, totalExecTime: 50, totalRows: 25 },
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
await writePgssBaselineAtomic(baselinePath, baseline);
|
||||
|
||||
await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'conn_pg',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
rows: [
|
||||
row({
|
||||
queryid: '201',
|
||||
userid: '11',
|
||||
username: 'analyst',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 12,
|
||||
totalExecTime: 160,
|
||||
totalRows: 58,
|
||||
}),
|
||||
row({
|
||||
queryid: '201',
|
||||
userid: '12',
|
||||
username: 'svc_loader',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 5,
|
||||
totalExecTime: 50,
|
||||
totalRows: 25,
|
||||
}),
|
||||
row({
|
||||
queryid: '202',
|
||||
userid: '13',
|
||||
username: 'analyst_2',
|
||||
query: 'SELECT count(*) FROM analytics.customers',
|
||||
calls: 7,
|
||||
totalExecTime: 210,
|
||||
totalRows: 7,
|
||||
}),
|
||||
],
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(),
|
||||
baselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
|
||||
expect(manifest.baselineFirstRun).toBe(false);
|
||||
expect(manifest.windowStart).toBe('2026-05-08T10:00:00.000Z');
|
||||
expect(manifest.templateCount).toBe(2);
|
||||
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q202', 'db5_q201']);
|
||||
|
||||
const usage201 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q201/usage.json'));
|
||||
expect(usage201.stats).toMatchObject({
|
||||
executions: 2,
|
||||
distinct_users: 1,
|
||||
first_seen: '2026-05-08T09:00:00.000Z',
|
||||
last_seen: '2026-05-08T12:00:00.000Z',
|
||||
mean_runtime_ms: 30,
|
||||
rows_produced: 8,
|
||||
});
|
||||
const metadata201 = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q201/metadata.json'));
|
||||
expect(metadata201.properties.triage_signals.service_account_only).toBe('false');
|
||||
|
||||
const usage202 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q202/usage.json'));
|
||||
expect(usage202.stats).toMatchObject({
|
||||
executions: 7,
|
||||
distinct_users: 1,
|
||||
first_seen: '2026-05-08T12:00:00.000Z',
|
||||
mean_runtime_ms: 30,
|
||||
rows_produced: 7,
|
||||
});
|
||||
});
|
||||
|
||||
// The same queryid reported by two databases must stay two templates, each
// diffed against (and written back to) its own baseline entry.
it('keeps matching queryid values from different databases as distinct templates and baseline entries', async () => {
  const countOrdersSql = 'SELECT count(*) FROM analytics.orders WHERE status = $1';
  const outputDir = await tempDir('pgss-stage-db-key-');
  const cacheRoot = await tempDir('pgss-baseline-db-key-');
  const baselineFile = pgssBaselinePath(cacheRoot, 'conn_pg');

  // Previous snapshot: queryid 701 was already seen in dbid 5 and dbid 6.
  await writePgssBaselineAtomic(baselineFile, {
    version: 1,
    fetchedAt: '2026-05-08T10:00:00.000Z',
    statsResetAt: '2026-05-08T08:00:00.000Z',
    pgServerVersion: 'PostgreSQL 16.4',
    templates: {
      db5_q701: {
        firstObservedAt: '2026-05-08T09:00:00.000Z',
        perUser: {
          '11': { calls: 10, totalExecTime: 100, totalRows: 50 },
        },
      },
      db6_q701: {
        firstObservedAt: '2026-05-08T09:30:00.000Z',
        perUser: {
          '11': { calls: 4, totalExecTime: 40, totalRows: 20 },
        },
      },
    },
  });

  const staged = await stagePgStatStatementsTemplates({
    stagedDir: outputDir,
    connectionId: 'conn_pg',
    queryClient: fakePgClient(),
    reader: fakeReader({
      rows: [
        row({
          queryid: '701',
          dbid: '5',
          database: 'warehouse',
          query: countOrdersSql,
          calls: 12,
          totalExecTime: 160,
          totalRows: 58,
        }),
        row({
          queryid: '701',
          dbid: '6',
          database: 'app',
          query: countOrdersSql,
          calls: 9,
          totalExecTime: 130,
          totalRows: 35,
        }),
      ],
    }),
    sqlAnalysis,
    pullConfig: postgresPullConfig(),
    baselinePath: baselineFile,
    now: new Date('2026-05-08T12:00:00.000Z'),
  });

  const stagedManifest = historicSqlManifestSchema.parse(await readJson(outputDir, 'manifest.json'));
  const stagedIds = stagedManifest.templates.map((entry) => entry.id).sort();
  expect(stagedIds).toEqual(['db5_q701', 'db6_q701']);

  // dbid 5: 12 - 10 calls and 58 - 50 rows since the baseline.
  const usageForWarehouse = historicSqlUsageSchema.parse(await readJson(outputDir, 'templates/db5_q701/usage.json'));
  expect(usageForWarehouse.stats).toMatchObject({
    executions: 2,
    rows_produced: 8,
    first_seen: '2026-05-08T09:00:00.000Z',
  });

  // dbid 6: 9 - 4 calls and 35 - 20 rows since the baseline.
  const usageForApp = historicSqlUsageSchema.parse(await readJson(outputDir, 'templates/db6_q701/usage.json'));
  expect(usageForApp.stats).toMatchObject({
    executions: 5,
    rows_produced: 15,
    first_seen: '2026-05-08T09:30:00.000Z',
  });

  // The returned baseline carries the raw cumulative counters, one entry per database.
  const nextTemplates = staged.baseline.templates;
  expect(nextTemplates.db5_q701.perUser['11']).toEqual({
    calls: 12,
    totalExecTime: 160,
    totalRows: 58,
  });
  expect(nextTemplates.db6_q701.perUser['11']).toEqual({
    calls: 9,
    totalExecTime: 130,
    totalRows: 35,
  });
});
|
||||
|
||||
// Two independent scenarios: a newer stats_reset timestamp, and a PostgreSQL
// major-version change. Both must discard the stored baseline.
it('treats stats_reset advancement and major-version changes as fresh baselines', async () => {
  const countOrdersSql = 'SELECT count(*) FROM analytics.orders WHERE status = $1';
  const runAt = '2026-05-08T12:00:00.000Z';

  // --- Scenario 1: stats_reset moved from 08:00 (baseline) to 11:00 (snapshot). ---
  const resetOutputDir = await tempDir('pgss-stage-reset-');
  const resetCacheRoot = await tempDir('pgss-baseline-reset-');
  const resetBaselineFile = pgssBaselinePath(resetCacheRoot, 'conn_pg');
  await writePgssBaselineAtomic(resetBaselineFile, {
    version: 1,
    fetchedAt: '2026-05-08T10:00:00.000Z',
    statsResetAt: '2026-05-08T08:00:00.000Z',
    pgServerVersion: 'PostgreSQL 16.4',
    templates: {
      db5_q301: {
        firstObservedAt: '2026-05-08T09:00:00.000Z',
        perUser: {
          '11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
        },
      },
    },
  });

  await stagePgStatStatementsTemplates({
    stagedDir: resetOutputDir,
    connectionId: 'conn_pg',
    queryClient: fakePgClient(),
    reader: fakeReader({
      statsResetAt: '2026-05-08T11:00:00.000Z',
      rows: [
        row({
          queryid: '301',
          query: countOrdersSql,
          calls: 3,
          totalExecTime: 90,
          totalRows: 9,
        }),
      ],
    }),
    sqlAnalysis,
    pullConfig: postgresPullConfig(),
    baselinePath: resetBaselineFile,
    now: new Date(runAt),
  });

  const manifestAfterReset = historicSqlManifestSchema.parse(await readJson(resetOutputDir, 'manifest.json'));
  expect(manifestAfterReset.baselineFirstRun).toBe(true);
  expect(manifestAfterReset.warnings).toContain(
    'baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z',
  );
  // With the baseline ignored, the full current counter (3 calls) is reported.
  const usageAfterReset = historicSqlUsageSchema.parse(
    await readJson(resetOutputDir, 'templates/db5_q301/usage.json'),
  );
  expect(usageAfterReset.stats.executions).toBe(3);

  // --- Scenario 2: baseline recorded on PostgreSQL 15.7, server now reports 16.4. ---
  const versionOutputDir = await tempDir('pgss-stage-version-');
  const versionCacheRoot = await tempDir('pgss-baseline-version-');
  const versionBaselineFile = pgssBaselinePath(versionCacheRoot, 'conn_pg');
  await writePgssBaselineAtomic(versionBaselineFile, {
    version: 1,
    fetchedAt: '2026-05-08T10:00:00.000Z',
    statsResetAt: '2026-05-08T08:00:00.000Z',
    pgServerVersion: 'PostgreSQL 15.7',
    templates: {
      db5_q302: {
        firstObservedAt: '2026-05-08T09:00:00.000Z',
        perUser: {
          '11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
        },
      },
    },
  });

  await stagePgStatStatementsTemplates({
    stagedDir: versionOutputDir,
    connectionId: 'conn_pg',
    queryClient: fakePgClient(),
    reader: fakeReader({
      pgServerVersion: 'PostgreSQL 16.4',
      rows: [
        row({
          queryid: '302',
          query: countOrdersSql,
          calls: 4,
          totalExecTime: 80,
          totalRows: 8,
        }),
      ],
    }),
    sqlAnalysis,
    pullConfig: postgresPullConfig(),
    baselinePath: versionBaselineFile,
    now: new Date(runAt),
  });

  const manifestAfterUpgrade = historicSqlManifestSchema.parse(await readJson(versionOutputDir, 'manifest.json'));
  expect(manifestAfterUpgrade.baselineFirstRun).toBe(true);
  expect(manifestAfterUpgrade.warnings).toContain('baseline_reset:pg_server_major changed from 15 to 16');
});
|
||||
|
||||
// Only one (template, user) counter went backwards; the other user's delta must
// still be computed against the stored baseline.
it('handles scoped counter regressions without forcing a global first-run baseline', async () => {
  const countOrdersSql = 'SELECT count(*) FROM analytics.orders WHERE status = $1';
  const unchangedStatsReset = '2026-05-08T08:00:00.000Z';
  const outputDir = await tempDir('pgss-stage-scoped-');
  const cacheRoot = await tempDir('pgss-baseline-scoped-');
  const baselineFile = pgssBaselinePath(cacheRoot, 'conn_pg');

  await writePgssBaselineAtomic(baselineFile, {
    version: 1,
    fetchedAt: '2026-05-08T10:00:00.000Z',
    statsResetAt: unchangedStatsReset,
    pgServerVersion: 'PostgreSQL 16.4',
    templates: {
      db5_q401: {
        firstObservedAt: '2026-05-08T09:00:00.000Z',
        perUser: {
          '11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
          '12': { calls: 50, totalExecTime: 500, totalRows: 250 },
        },
      },
    },
  });

  await stagePgStatStatementsTemplates({
    stagedDir: outputDir,
    connectionId: 'conn_pg',
    queryClient: fakePgClient(),
    reader: fakeReader({
      statsResetAt: unchangedStatsReset,
      rows: [
        // userid 11 regressed: 2 calls now versus 100 in the baseline.
        row({
          queryid: '401',
          userid: '11',
          username: 'analyst',
          query: countOrdersSql,
          calls: 2,
          totalExecTime: 30,
          totalRows: 6,
        }),
        // userid 12 advanced normally: 55 calls now versus 50 in the baseline.
        row({
          queryid: '401',
          userid: '12',
          username: 'svc_loader',
          query: countOrdersSql,
          calls: 55,
          totalExecTime: 650,
          totalRows: 275,
        }),
      ],
    }),
    sqlAnalysis,
    pullConfig: postgresPullConfig(),
    baselinePath: baselineFile,
    now: new Date('2026-05-08T12:00:00.000Z'),
  });

  const stagedManifest = historicSqlManifestSchema.parse(await readJson(outputDir, 'manifest.json'));
  expect(stagedManifest.baselineFirstRun).toBe(false);
  expect(stagedManifest.warnings).toContain('scoped_reset:dbid=5 queryid=401 userid=11');

  // executions = 2 + (55 - 50); runtime = (30 + 150) / 7; rows = 6 + (275 - 250).
  const stagedUsage = historicSqlUsageSchema.parse(await readJson(outputDir, 'templates/db5_q401/usage.json'));
  expect(stagedUsage.stats).toMatchObject({
    executions: 7,
    distinct_users: 2,
    mean_runtime_ms: 25.714285714285715,
    rows_produced: 31,
  });
});
|
||||
|
||||
// Three templates are eligible but the pull config allows two, so the two with
// the most calls survive and the manifest records the truncation.
it('ranks and caps selected PGSS templates after skip and analysis filtering', async () => {
  const ordersSql = 'SELECT count(*) FROM analytics.orders WHERE status = $1';
  const customersSql = 'SELECT count(*) FROM analytics.customers';
  const outputDir = await tempDir('pgss-stage-cap-');
  const cacheRoot = await tempDir('pgss-baseline-cap-');
  const baselineFile = pgssBaselinePath(cacheRoot, 'conn_pg');

  await stagePgStatStatementsTemplates({
    stagedDir: outputDir,
    connectionId: 'conn_pg',
    queryClient: fakePgClient(),
    reader: fakeReader({
      rows: [
        row({
          queryid: '501',
          username: 'analyst_a',
          query: ordersSql,
          calls: 2,
          totalExecTime: 20,
        }),
        row({
          queryid: '502',
          username: 'analyst_b',
          query: customersSql,
          calls: 20,
          totalExecTime: 200,
        }),
        row({
          queryid: '503',
          username: 'analyst_c',
          query: ordersSql,
          calls: 10,
          totalExecTime: 100,
        }),
      ],
    }),
    sqlAnalysis,
    pullConfig: postgresPullConfig(2),
    baselinePath: baselineFile,
    now: new Date('2026-05-08T12:00:00.000Z'),
  });

  const stagedManifest = historicSqlManifestSchema.parse(await readJson(outputDir, 'manifest.json'));
  expect(stagedManifest.capped).toBe(true);
  expect(stagedManifest.warnings).toContain('templates_truncated: kept 2 of 3 templates');
  const keptIds = stagedManifest.templates.map((entry) => entry.id);
  expect(keptIds).toEqual(['db5_q502', 'db5_q503']);
});
|
||||
});
|
||||
508
packages/context/src/ingest/adapters/historic-sql/stage-pgss.ts
Normal file
508
packages/context/src/ingest/adapters/historic-sql/stage-pgss.ts
Normal file
|
|
@ -0,0 +1,508 @@
|
|||
import { mkdir, readFile, rename, writeFile } from 'node:fs/promises';
|
||||
import { dirname, join } from 'node:path';
|
||||
import { z } from 'zod';
|
||||
import type { SqlAnalysisFingerprintResult, SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import {
|
||||
HISTORIC_SQL_OBJECT_TYPE,
|
||||
HISTORIC_SQL_SOURCE_KEY,
|
||||
historicSqlPullConfigSchema,
|
||||
type HistoricSqlManifest,
|
||||
type HistoricSqlMetadata,
|
||||
type HistoricSqlPullConfig,
|
||||
type HistoricSqlUsage,
|
||||
type KloPostgresQueryClient,
|
||||
type PostgresPgssAggregateRow,
|
||||
type PostgresPgssReader,
|
||||
type PostgresPgssRow,
|
||||
} from './types.js';
|
||||
|
||||
/** Version literal stamped into, and required of, every persisted baseline. */
const PGSS_BASELINE_VERSION = 1 as const;

/** Raw counter values stored for one user of one template. */
const pgssCounterSchema = z.object({
  calls: z.number().int().nonnegative(),
  totalExecTime: z.number().nonnegative(),
  totalRows: z.number().int().nonnegative(),
});

/** Baseline entry for one template: when it was first observed plus counters keyed by user id. */
const pgssTemplateBaselineSchema = z.object({
  firstObservedAt: z.string().datetime(),
  perUser: z.record(z.string(), pgssCounterSchema),
});

/** On-disk shape of the pg_stat_statements baseline; templates are keyed by template id. */
const pgssBaselineSchema = z.object({
  version: z.literal(PGSS_BASELINE_VERSION),
  fetchedAt: z.string().datetime(),
  statsResetAt: z.string().datetime().nullable(),
  pgServerVersion: z.string(),
  templates: z.record(z.string(), pgssTemplateBaselineSchema),
});

/** Parsed baseline document as produced by `pgssBaselineSchema`. */
export type PgssBaseline = z.infer<typeof pgssBaselineSchema>;
|
||||
|
||||
/** Arguments accepted by `stagePgStatStatementsTemplates`. */
export interface StagePgStatStatementsTemplatesInput {
  /** Directory that receives `manifest.json` and the per-template files. */
  stagedDir: string;
  /** Connection identifier copied into the manifest. */
  connectionId: string;
  /** Client handed to the reader for probing and snapshot reads. */
  queryClient: KloPostgresQueryClient;
  /** Source of the probe result and the pg_stat_statements snapshot. */
  reader: PostgresPgssReader;
  /** Fingerprinting port applied to each candidate query text. */
  sqlAnalysis: SqlAnalysisPort;
  /** Pull configuration; it is re-validated and must declare the `postgres` dialect. */
  pullConfig: HistoricSqlPullConfig;
  /** Location of the previously stored baseline file. */
  baselinePath: string;
  /** Clock override; the current time is used when omitted. */
  now?: Date;
}

/** Value returned by `stagePgStatStatementsTemplates`. */
export interface StagePgStatStatementsTemplatesResult {
  /** The `baselinePath` that was supplied in the input. */
  baselinePath: string;
  /** Baseline rebuilt from the snapshot that was just read. */
  baseline: PgssBaseline;
}
|
||||
|
||||
/** Counter triple compared between a snapshot row and its baseline entry. */
interface PgssBaselineCounter {
  calls: number;
  totalExecTime: number;
  totalRows: number;
}

/** Working accumulator for one template while snapshot rows are folded together. */
interface PgssAggregateMutable {
  /** Template id (`db<dbid>_q<queryid>`). */
  id: string;
  queryid: string;
  dbid: string;
  database: string | null;
  /** Query text taken from the first row seen for this template. */
  query: string;
  /** Sum of non-negative per-user call deltas. */
  deltaCalls: number;
  /** Sum of non-negative per-user execution-time deltas. */
  deltaExecTime: number;
  /** Sum of non-negative per-user row-count deltas. */
  deltaRows: number;
  /** User names that contributed at least one new call. */
  users: Set<string>;
  firstObservedAt: string;
}

/** An aggregate paired with its successful fingerprint analysis. */
interface AnalyzedPgssTemplate {
  aggregate: PostgresPgssAggregateRow;
  analysis: SqlAnalysisFingerprintResult;
}
|
||||
|
||||
/**
 * All-zero counter used as the "no usable baseline" value.
 *
 * The same object instance is returned by reference wherever a baseline is
 * missing or discarded, so it is frozen: an accidental write by any consumer
 * would otherwise silently skew every later delta computed against it.
 */
const ZERO_COUNTER: Readonly<PgssBaselineCounter> = Object.freeze({
  calls: 0,
  totalExecTime: 0,
  totalRows: 0,
});
|
||||
|
||||
/** `maxTemplates` value passed to the reader when taking a snapshot. */
const PGSS_SNAPSHOT_READ_LIMIT = 5_000;

/** Matches statements whose first keyword is a session, transaction or maintenance command. */
const PGSS_HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET|BEGIN|COMMIT|ROLLBACK|VACUUM|ANALYZE)\b/i;

/** Matches statements that mention catalog, TOAST, statistics or information-schema objects. */
const PGSS_HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|pg_catalog\.|pg_toast\.|pg_stat_)/i;
|
||||
|
||||
/**
 * Builds the template id for a snapshot row.
 *
 * @param row - Anything exposing the row's `dbid` and `queryid`.
 * @returns `db<dbid>_q<queryid>`, so equal query ids in different databases stay distinct.
 */
function pgssTemplateId(row: Pick<PostgresPgssRow, 'dbid' | 'queryid'>): string {
  const { dbid, queryid } = row;
  return 'db' + String(dbid) + '_q' + String(queryid);
}
|
||||
|
||||
/**
 * Resolves where the baseline file for a connection lives.
 *
 * @param rootDir - Cache root; when nullish, `.klo/cache/historic-sql` under the current working directory is used.
 * @param connectionId - Connection whose baseline is addressed; becomes a sub-directory.
 * @returns `<root>/<connectionId>/pgss-baseline.json`.
 */
export function pgssBaselinePath(rootDir: string | undefined, connectionId: string): string {
  const cacheRoot = rootDir != null ? rootDir : join(process.cwd(), '.klo/cache/historic-sql');
  return join(cacheRoot, connectionId, 'pgss-baseline.json');
}
|
||||
|
||||
/**
 * Loads and validates a stored baseline.
 *
 * @param path - Baseline file location.
 * @returns The parsed baseline, or `null` when the failure carries the error code `ENOENT`.
 * @throws Any other failure (read error, invalid JSON, schema mismatch) is rethrown unchanged.
 */
export async function readPgssBaseline(path: string): Promise<PgssBaseline | null> {
  try {
    const text = await readFile(path, 'utf-8');
    const document: unknown = JSON.parse(text);
    return pgssBaselineSchema.parse(document);
  } catch (error) {
    const isMissingFile =
      typeof error === 'object' && error !== null && 'code' in error && error.code === 'ENOENT';
    if (!isMissingFile) {
      throw error;
    }
    return null;
  }
}
|
||||
|
||||
/**
 * Validates a baseline and stores it at `path`.
 *
 * The JSON is first written to `<path>.tmp` and then renamed over `path`, so
 * the destination is only replaced once the write has finished.
 *
 * @param path - Final location of the baseline file; parent directories are created as needed.
 * @param baseline - Baseline to persist; it must satisfy the baseline schema.
 * @throws When validation fails or any filesystem step rejects.
 */
export async function writePgssBaselineAtomic(path: string, baseline: PgssBaseline): Promise<void> {
  const validated = pgssBaselineSchema.parse(baseline);
  const scratchPath = `${path}.tmp`;

  await mkdir(dirname(path), { recursive: true });
  const serialized = `${JSON.stringify(validated, null, 2)}\n`;
  await writeFile(scratchPath, serialized, 'utf-8');
  await rename(scratchPath, path);
}
|
||||
|
||||
/**
 * Stages templates derived from a pg_stat_statements snapshot.
 *
 * Steps, in order: validate the pull config, probe the server, read the stored
 * baseline and a fresh snapshot, decide whether the baseline is still usable,
 * turn cumulative counters into per-template deltas, drop hard-skipped SQL,
 * fingerprint what remains, rank and cap the result, then write the
 * per-template files followed by `manifest.json` under `input.stagedDir`.
 *
 * This function does not write the baseline file itself; the rebuilt baseline
 * is only returned.
 *
 * @param input - See {@link StagePgStatStatementsTemplatesInput}.
 * @returns The supplied baseline path together with the baseline rebuilt from this snapshot.
 * @throws When the pull config is invalid or its dialect is not `postgres`, and when any reader,
 *   analysis or filesystem call rejects.
 */
export async function stagePgStatStatementsTemplates(
  input: StagePgStatStatementsTemplatesInput,
): Promise<StagePgStatStatementsTemplatesResult> {
  const config = historicSqlPullConfigSchema.parse(input.pullConfig);
  if (config.dialect !== 'postgres') {
    throw new Error(`stagePgStatStatementsTemplates requires dialect postgres, got ${config.dialect}`);
  }

  const { reader, queryClient, sqlAnalysis, stagedDir, baselinePath } = input;
  const now = input.now ?? new Date();
  const fetchedAt = now.toISOString();

  const probe = await reader.probe(queryClient);
  // Copy so the probe's own warning list is never mutated.
  const warnings = [...probe.warnings];
  const storedBaseline = await readPgssBaseline(baselinePath);
  const snapshot = await reader.readSnapshot(queryClient, {
    minCalls: config.minCalls,
    maxTemplates: PGSS_SNAPSHOT_READ_LIMIT,
  });

  const { deallocCount, statsResetAt } = snapshot;
  if (deallocCount !== null && deallocCount > 0) {
    warnings.push(
      `pgss_dealloc_count:${deallocCount}; pg_stat_statements.max may be too low, causing template eviction churn`,
    );
  }

  const reset = detectBaselineReset({
    baseline: storedBaseline,
    snapshotStatsResetAt: statsResetAt,
    currentPgServerVersion: probe.pgServerVersion,
  });
  warnings.push(...reset.warnings);

  // aggregatePgssRows appends scoped-reset warnings to `warnings` as a side effect.
  const candidates = aggregatePgssRows({
    rows: snapshot.rows,
    baseline: storedBaseline,
    baselineFirstRun: reset.baselineFirstRun,
    fetchedAt,
    warnings,
  }).filter((aggregate) => !shouldSkipPgssSql(aggregate.query));

  // Analysed one at a time so warnings keep the candidates' order.
  const analyzed: AnalyzedPgssTemplate[] = [];
  for (const aggregate of candidates) {
    const analysis = await sqlAnalysis.analyzeForFingerprint(aggregate.query, 'postgres');
    if (!analysis.error && analysis.fingerprint && analysis.normalizedSql) {
      analyzed.push({ aggregate, analysis });
      continue;
    }
    warnings.push(`analysis_failed:${aggregate.id}`);
  }

  const selected = selectPgssTemplates(analyzed, config.maxTemplatesPerRun);
  const capped = selected.length < analyzed.length;
  if (capped) {
    warnings.push(`templates_truncated: kept ${selected.length} of ${analyzed.length} templates`);
  }

  await mkdir(stagedDir, { recursive: true });
  const manifestTemplates: HistoricSqlManifest['templates'] = [];
  for (const template of selected) {
    const { metadata, pageMarkdown, usage } = buildPgssStagedTemplate(template, config, now);
    const templateDir = `templates/${metadata.id}`;
    await writeJson(stagedDir, `${templateDir}/metadata.json`, metadata);
    await writeText(stagedDir, `${templateDir}/page.md`, pageMarkdown);
    await writeJson(stagedDir, `${templateDir}/usage.json`, usage);
    manifestTemplates.push({
      id: metadata.id,
      fingerprint: metadata.properties.fingerprint,
      subClusterId: metadata.properties.sub_cluster_id,
      path: metadata.path,
    });
  }

  const manifest = {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: input.connectionId,
    dialect: 'postgres',
    fetchedAt,
    // NOTE(review): the stored baseline's fetchedAt is used even when that baseline was
    // discarded by a reset — confirm this is the intended window start in that case.
    windowStart: storedBaseline?.fetchedAt ?? statsResetAt ?? fetchedAt,
    windowEnd: fetchedAt,
    nextSuccessfulCursor: fetchedAt,
    templateCount: selected.length,
    capped,
    warnings,
    degraded: true,
    statsResetAt,
    baselineFirstRun: reset.baselineFirstRun,
    pgServerVersion: probe.pgServerVersion,
    deallocCount,
    templates: manifestTemplates,
  } satisfies HistoricSqlManifest;
  await writeJson(stagedDir, 'manifest.json', manifest);

  const nextBaseline = buildNextBaseline({
    rows: snapshot.rows,
    fetchedAt,
    statsResetAt,
    pgServerVersion: probe.pgServerVersion,
    // A discarded baseline must not leak its firstObservedAt values forward.
    previousBaseline: reset.baselineFirstRun ? null : storedBaseline,
  });
  return { baselinePath, baseline: nextBaseline };
}
|
||||
|
||||
/**
 * Decides whether the stored baseline may be used for delta computation.
 *
 * The baseline is treated as unusable ("first run") when there is none, when
 * the snapshot's `stats_reset` instant is later than the one recorded in the
 * baseline, or when the PostgreSQL major version differs.
 *
 * @param input.baseline - Previously stored baseline, or `null` when none exists.
 * @param input.snapshotStatsResetAt - `stats_reset` reported with the current snapshot, if any.
 * @param input.currentPgServerVersion - Version string reported by the current probe.
 * @returns `baselineFirstRun` plus one warning per reason that triggered it.
 */
function detectBaselineReset(input: {
  baseline: PgssBaseline | null;
  snapshotStatsResetAt: string | null;
  currentPgServerVersion: string;
}): { baselineFirstRun: boolean; warnings: string[] } {
  const { baseline, snapshotStatsResetAt, currentPgServerVersion } = input;
  if (!baseline) {
    return { baselineFirstRun: true, warnings: ['baseline_first_run:no_previous_pgss_baseline'] };
  }

  const warnings: string[] = [];
  if (
    baseline.statsResetAt &&
    snapshotStatsResetAt &&
    isLaterInstant(snapshotStatsResetAt, baseline.statsResetAt)
  ) {
    warnings.push(
      `baseline_reset:stats_reset advanced from ${baseline.statsResetAt} to ${snapshotStatsResetAt}`,
    );
  }

  const previousMajor = postgresMajor(baseline.pgServerVersion);
  const currentMajor = postgresMajor(currentPgServerVersion);
  if (previousMajor && currentMajor && previousMajor !== currentMajor) {
    warnings.push(`baseline_reset:pg_server_major changed from ${previousMajor} to ${currentMajor}`);
  }

  return { baselineFirstRun: warnings.length > 0, warnings };
}

/**
 * Reports whether `candidate` denotes a strictly later instant than `reference`.
 *
 * Timestamps are compared as parsed instants, because the same moment can be
 * spelled with different fractional precision (`…00Z` vs `…00.000Z`) and plain
 * string comparison would then see a difference that does not exist. If either
 * value cannot be parsed, lexicographic comparison is used instead.
 */
function isLaterInstant(candidate: string, reference: string): boolean {
  const candidateMs = Date.parse(candidate);
  const referenceMs = Date.parse(reference);
  if (Number.isNaN(candidateMs) || Number.isNaN(referenceMs)) {
    return reference < candidate;
  }
  return candidateMs > referenceMs;
}

/**
 * Extracts the major version number from a server version string.
 *
 * @param version - Either a banner containing `PostgreSQL <n>` or a string starting with `<n>` or `<n>.`.
 * @returns The major version digits, or `null` when neither form matches.
 */
function postgresMajor(version: string): string | null {
  const fromBanner = version.match(/PostgreSQL\s+(\d+)/i)?.[1];
  if (fromBanner != null) {
    return fromBanner;
  }
  return version.match(/^(\d+)(?:\.|$)/)?.[1] ?? null;
}
|
||||
|
||||
/**
 * Folds per-user snapshot rows into one delta aggregate per template.
 *
 * Each row is diffed against its baseline counter (or against zero when no
 * usable counter exists). Negative deltas are clamped to zero, rows without
 * new calls are ignored unless this is a first run, and templates whose total
 * call delta is zero are dropped from the result.
 *
 * @param input.rows - Snapshot rows, one per (template, user).
 * @param input.baseline - Stored baseline, if any.
 * @param input.baselineFirstRun - When true the baseline is ignored entirely.
 * @param input.fetchedAt - Timestamp used as `firstObservedAt` for templates missing from the baseline.
 * @param input.warnings - Receives scoped-reset warnings (mutated in place).
 * @returns Aggregates in first-seen order, each with a sorted user list.
 */
function aggregatePgssRows(input: {
  rows: PostgresPgssRow[];
  baseline: PgssBaseline | null;
  baselineFirstRun: boolean;
  fetchedAt: string;
  warnings: string[];
}): PostgresPgssAggregateRow[] {
  const { rows, baseline, baselineFirstRun, fetchedAt, warnings } = input;
  const byTemplate = new Map<string, PgssAggregateMutable>();

  for (const row of rows) {
    const templateId = pgssTemplateId(row);
    const knownTemplate = baselineFirstRun ? undefined : baseline?.templates[templateId];
    const previous = scopedCounterBaseline(row, knownTemplate?.perUser[row.userid], baselineFirstRun, warnings);

    const callsDelta = row.calls - previous.calls;
    if (callsDelta === 0 && !baselineFirstRun) {
      // Nothing new from this user since the baseline.
      continue;
    }
    const execTimeDelta = row.totalExecTime - previous.totalExecTime;
    const rowsDelta = row.totalRows - previous.totalRows;

    let bucket = byTemplate.get(templateId);
    if (bucket == null) {
      bucket = {
        id: templateId,
        queryid: row.queryid,
        dbid: row.dbid,
        database: row.database,
        query: row.query,
        deltaCalls: 0,
        deltaExecTime: 0,
        deltaRows: 0,
        users: new Set<string>(),
        firstObservedAt: knownTemplate?.firstObservedAt ?? fetchedAt,
      };
      byTemplate.set(templateId, bucket);
    }

    bucket.deltaCalls += Math.max(0, callsDelta);
    bucket.deltaExecTime += Math.max(0, execTimeDelta);
    bucket.deltaRows += Math.max(0, rowsDelta);
    if (callsDelta > 0) {
      bucket.users.add(row.username ?? 'unknown');
    }
  }

  const result: PostgresPgssAggregateRow[] = [];
  for (const bucket of byTemplate.values()) {
    if (!(bucket.deltaCalls > 0)) {
      continue;
    }
    result.push({
      id: bucket.id,
      queryid: bucket.queryid,
      dbid: bucket.dbid,
      database: bucket.database,
      query: bucket.query,
      deltaCalls: bucket.deltaCalls,
      deltaExecTime: bucket.deltaExecTime,
      deltaRows: bucket.deltaRows,
      meanExecTime: bucket.deltaExecTime / Math.max(bucket.deltaCalls, 1),
      distinctUsersDelta: bucket.users.size,
      users: [...bucket.users].sort(),
      firstObservedAt: bucket.firstObservedAt,
    });
  }
  return result;
}
|
||||
|
||||
/**
 * Picks the counter a snapshot row should be diffed against.
 *
 * @param row - Current snapshot row.
 * @param baselineCounter - Stored counter for the same template and user, if any.
 * @param baselineFirstRun - When true the stored counter is ignored.
 * @param warnings - Receives a `scoped_reset:` entry when the stored counter exceeds the row (mutated in place).
 * @returns The stored counter when it is usable; otherwise the shared zero counter.
 */
function scopedCounterBaseline(
  row: PostgresPgssRow,
  baselineCounter: PgssBaselineCounter | undefined,
  baselineFirstRun: boolean,
  warnings: string[],
): PgssBaselineCounter {
  if (baselineFirstRun || !baselineCounter) {
    return ZERO_COUNTER;
  }

  // Any field going backwards means this one counter was restarted.
  const wentBackwards =
    row.calls < baselineCounter.calls ||
    row.totalExecTime < baselineCounter.totalExecTime ||
    row.totalRows < baselineCounter.totalRows;
  if (!wentBackwards) {
    return baselineCounter;
  }

  warnings.push(`scoped_reset:dbid=${row.dbid} queryid=${row.queryid} userid=${row.userid}`);
  return ZERO_COUNTER;
}
|
||||
|
||||
/**
 * Tells whether a query text is excluded from staging outright.
 *
 * @param sql - Query text taken from the snapshot.
 * @returns `true` when either the hard-skip prefix pattern or the hard-skip table pattern matches.
 */
function shouldSkipPgssSql(sql: string): boolean {
  if (PGSS_HARD_SKIP_PREFIX_RE.test(sql)) {
    return true;
  }
  return PGSS_HARD_SKIP_TABLE_RE.test(sql);
}
|
||||
|
||||
/**
 * Ranks analysed templates and keeps the best `maxTemplatesPerRun`.
 *
 * The score is `users.length * log1p(deltaCalls)`. Higher scores come first
 * and ties are ordered by `localeCompare` on the template id. The input array
 * is left untouched.
 *
 * @param templates - Candidates that passed analysis.
 * @param maxTemplatesPerRun - Upper bound on the number of templates returned.
 * @returns The top-ranked templates, best first.
 */
function selectPgssTemplates(templates: AnalyzedPgssTemplate[], maxTemplatesPerRun: number): AnalyzedPgssTemplate[] {
  const ranked = templates.map((template) => {
    const { users, deltaCalls } = template.aggregate;
    return { template, score: users.length * Math.log1p(deltaCalls) };
  });

  ranked.sort((left, right) => {
    const byScore = right.score - left.score;
    if (byScore) {
      return byScore;
    }
    return left.template.aggregate.id.localeCompare(right.template.aggregate.id);
  });

  const kept: AnalyzedPgssTemplate[] = [];
  for (const entry of ranked.slice(0, maxTemplatesPerRun)) {
    kept.push(entry.template);
  }
  return kept;
}
|
||||
|
||||
/**
 * Produces the three staged artefacts for one selected template.
 *
 * @param template - Aggregate plus its fingerprint analysis.
 * @param config - Pull configuration; supplies the service-account user patterns.
 * @param now - Staging time; reported as `last_seen` and used for the recency bucket.
 * @returns Metadata, the rendered markdown page, and usage statistics.
 */
function buildPgssStagedTemplate(
  template: AnalyzedPgssTemplate,
  config: HistoricSqlPullConfig,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const { aggregate, analysis } = template;
  const id = aggregate.id;
  const lastSeen = now.toISOString();
  const tablesTouched = Array.from(analysis.tablesTouched).sort();
  // The title names the alphabetically first table, or "query" when none was detected.
  const titleTable = tablesTouched[0] ?? 'query';

  const triageSignals = buildPgssTriageSignals({
    executions: aggregate.deltaCalls,
    distinctUsers: aggregate.distinctUsersDelta,
    firstSeen: aggregate.firstObservedAt,
    lastSeen,
    meanRuntimeMs: aggregate.meanExecTime,
    serviceAccountOnly: isServiceAccountOnly(aggregate.users, config.serviceAccountUserPatterns),
    now,
  });

  const metadata: HistoricSqlMetadata = {
    id,
    title: `postgres · ${titleTable} [${id.slice(0, 12)}]`,
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint: analysis.fingerprint,
      sub_cluster_id: null,
      dialect: 'postgres',
      tables_touched: tablesTouched,
      literal_slots: [],
      triage_signals: triageSignals,
    },
  };

  const pageMarkdown = renderTemplatePage(id, analysis.normalizedSql, tablesTouched);

  const usage: HistoricSqlUsage = {
    stats: {
      executions: aggregate.deltaCalls,
      distinct_users: aggregate.distinctUsersDelta,
      first_seen: aggregate.firstObservedAt,
      last_seen: lastSeen,
      p50_runtime_ms: null,
      p95_runtime_ms: null,
      mean_runtime_ms: aggregate.meanExecTime,
      error_rate: 0,
      rows_produced: aggregate.deltaRows,
    },
    literal_slots: [],
    samples: [],
  };

  return { metadata, pageMarkdown, usage };
}
|
||||
|
||||
/**
 * Maps usage numbers onto the coarse string buckets stored as triage signals.
 *
 * Buckets: executions `<3` low, `<50` mid, else high; distinct users `<=1`
 * solo, `<=5` team, else broad; the error-rate bucket is always `ok`.
 * `firstSeen` is accepted but not read.
 *
 * @param input - Usage figures for one template plus the reference time `now`.
 * @returns A record of bucket names keyed by signal name.
 */
function buildPgssTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  firstSeen: string;
  lastSeen: string;
  meanRuntimeMs: number;
  serviceAccountOnly: boolean;
  now: Date;
}): Record<string, string> {
  const { executions, distinctUsers, lastSeen, meanRuntimeMs, serviceAccountOnly, now } = input;

  let executionsBucket = 'high';
  if (executions < 3) {
    executionsBucket = 'low';
  } else if (executions < 50) {
    executionsBucket = 'mid';
  }

  let distinctUsersBucket = 'broad';
  if (distinctUsers <= 1) {
    distinctUsersBucket = 'solo';
  } else if (distinctUsers <= 5) {
    distinctUsersBucket = 'team';
  }

  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: distinctUsersBucket,
    error_rate_bucket: 'ok',
    recency_bucket: recencyBucket(lastSeen, now),
    service_account_only: String(serviceAccountOnly),
    runtime_bucket: runtimeBucket(meanRuntimeMs),
  };
}
|
||||
|
||||
/**
 * Classifies a mean runtime.
 *
 * @param meanRuntimeMs - Mean runtime in milliseconds.
 * @returns `fast` below 100, `moderate` below 1000, otherwise `slow`.
 */
function runtimeBucket(meanRuntimeMs: number): string {
  return meanRuntimeMs < 100 ? 'fast' : meanRuntimeMs < 1000 ? 'moderate' : 'slow';
}
|
||||
|
||||
/**
 * Classifies how long ago a template was last seen.
 *
 * @param lastSeen - Timestamp string of the last observation.
 * @param now - Reference time.
 * @returns `active` up to 14 days old (future timestamps count as age 0), `warm` up to 60 days, otherwise `cold`.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const msPerDay = 86400000;
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / msPerDay);

  if (ageDays <= 14) {
    return 'active';
  }
  return ageDays <= 60 ? 'warm' : 'cold';
}
|
||||
|
||||
/**
 * Checks whether every user of a template matches a service-account pattern.
 *
 * @param users - User names that ran the template.
 * @param patterns - Regular-expression sources; each is compiled with `new RegExp`.
 * @returns `false` when either list is empty; otherwise whether every user matches at least one pattern.
 * @throws SyntaxError when a pattern is not a valid regular expression.
 */
function isServiceAccountOnly(users: string[], patterns: string[]): boolean {
  if (!users.length || !patterns.length) {
    return false;
  }

  const matchers = patterns.map((source) => new RegExp(source));
  for (const user of users) {
    const matched = matchers.some((matcher) => matcher.test(user));
    if (!matched) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Renders the markdown page for a template.
 *
 * @param id - Template id, used as the page heading.
 * @param normalizedSql - SQL placed inside a fenced `sql` block.
 * @param tablesTouched - Tables listed as bullets, in the order given.
 * @returns Newline-joined markdown ending with a trailing newline.
 */
function renderTemplatePage(id: string, normalizedSql: string, tablesTouched: string[]): string {
  const lines: string[] = [];
  lines.push(`# ${id}`, '');
  lines.push('## Normalized SQL', '```sql', normalizedSql, '```', '');
  lines.push('## Tables touched');
  for (const table of tablesTouched) {
    lines.push(`- ${table}`);
  }
  // The empty last element yields the trailing newline.
  lines.push('');
  return lines.join('\n');
}
|
||||
|
||||
/**
 * Builds the baseline that describes the snapshot just read.
 *
 * Only templates present in `rows` are included. A template keeps the
 * `firstObservedAt` of the previous baseline when one is supplied and contains
 * it; otherwise `fetchedAt` is used. When several rows share a template and
 * user id, the last one wins.
 *
 * @param input.rows - Snapshot rows whose raw counters are recorded.
 * @param input.fetchedAt - Timestamp of this snapshot.
 * @param input.statsResetAt - `stats_reset` reported with the snapshot.
 * @param input.pgServerVersion - Server version reported by the probe.
 * @param input.previousBaseline - Baseline to inherit `firstObservedAt` from, or `null`.
 * @returns A new baseline document.
 */
function buildNextBaseline(input: {
  rows: PostgresPgssRow[];
  fetchedAt: string;
  statsResetAt: string | null;
  pgServerVersion: string;
  previousBaseline: PgssBaseline | null;
}): PgssBaseline {
  const { rows, fetchedAt, statsResetAt, pgServerVersion, previousBaseline } = input;
  const templates: PgssBaseline['templates'] = {};

  for (const row of rows) {
    const templateId = pgssTemplateId(row);
    let entry = templates[templateId];
    if (entry == null) {
      const inherited = previousBaseline?.templates[templateId];
      entry = { firstObservedAt: inherited?.firstObservedAt ?? fetchedAt, perUser: {} };
      templates[templateId] = entry;
    }
    const { calls, totalExecTime, totalRows } = row;
    entry.perUser[row.userid] = { calls, totalExecTime, totalRows };
  }

  return {
    version: PGSS_BASELINE_VERSION,
    fetchedAt,
    statsResetAt,
    pgServerVersion,
    templates,
  };
}
|
||||
|
||||
/**
 * Serialises `value` as two-space-indented JSON with a trailing newline and writes it below `root`.
 *
 * @param root - Base directory.
 * @param relPath - File path relative to `root`.
 * @param value - Data to serialise.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const serialized = JSON.stringify(value, null, 2);
  await writeText(root, relPath, serialized + '\n');
}
|
||||
|
||||
/**
 * Writes UTF-8 text to `root/relPath`, creating missing parent directories first.
 *
 * @param root - Base directory.
 * @param relPath - File path relative to `root`.
 * @param value - Text content.
 */
async function writeText(root: string, relPath: string, value: string): Promise<void> {
  const destination = join(root, relPath);
  const parentDir = dirname(destination);
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, value, 'utf-8');
}
|
||||
798
packages/context/src/ingest/adapters/historic-sql/stage.test.ts
Normal file
798
packages/context/src/ingest/adapters/historic-sql/stage.test.ts
Normal file
|
|
@ -0,0 +1,798 @@
|
|||
import { mkdtemp, readFile, readdir } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import { stageHistoricSqlTemplates } from './stage.js';
|
||||
import {
|
||||
historicSqlManifestSchema,
|
||||
historicSqlMetadataSchema,
|
||||
historicSqlUsageSchema,
|
||||
type HistoricSqlQueryHistoryReader,
|
||||
type HistoricSqlRawQueryRow,
|
||||
} from './types.js';
|
||||
|
||||
/** Creates a fresh directory under the OS temp dir, prefixed `historic-sql-stage-`, and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-stage-');
  return mkdtemp(prefix);
}
|
||||
|
||||
/**
 * Reads `root/relPath` as UTF-8 and parses it as JSON.
 * The result is cast to `T` without any runtime validation.
 */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const text = await readFile(join(root, relPath), 'utf-8');
  const parsed = JSON.parse(text) as T;
  return parsed;
}
|
||||
|
||||
/**
 * Query-history reader stub: `probe` resolves without doing anything and
 * `fetch` yields the supplied rows in order.
 */
function fakeReader(rows: HistoricSqlRawQueryRow[]): HistoricSqlQueryHistoryReader {
  const reader: HistoricSqlQueryHistoryReader = {
    async probe() {
      // Intentionally empty: probing always succeeds.
    },
    async *fetch() {
      yield* rows;
    },
  };
  return reader;
}
|
||||
|
||||
/**
 * Analysis stub with two canned answers: SQL containing `paid` maps to the
 * paid-orders fingerprint, everything else to the refunds fingerprint.
 * A fresh result object is built on every call.
 */
const fakeSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const mentionsPaid = sql.includes('paid');
    if (!mentionsPaid) {
      return {
        fingerprint: 'fp_refunds',
        normalizedSql: 'SELECT count(*) FROM analytics.refunds WHERE state = ?',
        tablesTouched: ['analytics.refunds'],
        literalSlots: [{ position: 1, type: 'string', exampleValue: 'complete' }],
      };
    }
    return {
      fingerprint: 'fp_paid_orders',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [
        { position: 1, type: 'string', exampleValue: 'paid' },
        { position: 2, type: 'date', exampleValue: '2026-04-01' },
      ],
    };
  },
};
|
||||
|
||||
/**
 * Analysis stub that gives every query the same fingerprint and differs only
 * in the example value of its single literal slot: `refunded` when the SQL
 * contains `'refunded'`, otherwise `paid`.
 */
const categoricalSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    let exampleValue = 'paid';
    if (sql.includes("'refunded'")) {
      exampleValue = 'refunded';
    }
    return {
      fingerprint: 'fp_order_status',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue }],
    };
  },
};
|
||||
|
||||
/**
 * Six successful query rows against `analytics.orders`: three with status `paid`
 * (ids `paid-1..3`) followed by three with status `refunded` (ids `refunded-1..3`),
 * each trio run by analyst-a/b/c. Rows are one minute apart starting at 10:00Z,
 * runtimes step 100..150 ms, and rowsProduced is 11..13 for paid and 21..23 for refunded.
 */
function categoricalRows(): HistoricSqlRawQueryRow[] {
  const users = ['analyst-a', 'analyst-b', 'analyst-c'];
  const statuses = ['paid', 'refunded'];

  return statuses.flatMap((status, statusIndex) =>
    users.map((user, userIndex) => {
      // Position of the row in the overall sequence (0..5); drives the minute and the runtime.
      const ordinal = statusIndex * users.length + userIndex;
      return {
        id: `${status}-${userIndex + 1}`,
        sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
        user,
        startedAt: `2026-05-04T10:0${ordinal}:00.000Z`,
        endedAt: null,
        runtimeMs: 100 + ordinal * 10,
        rowsProduced: (statusIndex + 1) * 10 + userIndex + 1,
        success: true,
        errorMessage: null,
      };
    }),
  );
}
|
||||
|
||||
/**
 * Canned analysis that always reports the `fp_diverse_samples` fingerprint. The single
 * string slot carries the value captured from `status = '<value>'` in the SQL, or
 * `unknown` when no such predicate is present.
 */
const diverseSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const statusMatch = /status = '([^']+)'/.exec(sql);
    const exampleValue = statusMatch?.[1] ?? 'unknown';

    return {
      fingerprint: 'fp_diverse_samples',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue }],
    };
  },
};
|
||||
|
||||
/**
 * Canned analysis for the classification-matrix test.
 *
 * SQL mentioning `stale_orders` maps to `fp_stale_date` with one fixed date slot.
 * Anything else maps to `fp_classification_matrix` with five slots (region, plan,
 * status, amount, created_at) whose example values are captured from the SQL text,
 * falling back to `unknown`, `0` and `2026-05-01` when a predicate is missing.
 */
const classificationMatrixSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    if (sql.includes('stale_orders')) {
      return {
        fingerprint: 'fp_stale_date',
        normalizedSql: 'SELECT count(*) FROM analytics.stale_orders WHERE created_at >= ?',
        tablesTouched: ['analytics.stale_orders'],
        literalSlots: [{ position: 1, type: 'date', exampleValue: '2026-04-01' }],
      };
    }

    // First capture group of `pattern` in the SQL, or `fallback` when it does not match.
    const capture = (pattern: RegExp, fallback: string): string => pattern.exec(sql)?.[1] ?? fallback;
    const quotedValueOf = (field: string): string => capture(new RegExp(`${field} = '([^']+)'`), 'unknown');

    const amount = capture(/amount >= (\d+)/, '0');
    const asOf = capture(/created_at >= '([^']+)'/, '2026-05-01');

    return {
      fingerprint: 'fp_classification_matrix',
      normalizedSql:
        'SELECT count(*) FROM analytics.orders WHERE region = ? AND plan = ? AND status = ? AND amount >= ? AND created_at >= ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [
        { position: 1, type: 'string', exampleValue: quotedValueOf('region') },
        { position: 2, type: 'string', exampleValue: quotedValueOf('plan') },
        { position: 3, type: 'string', exampleValue: quotedValueOf('status') },
        { position: 4, type: 'number', exampleValue: amount },
        { position: 5, type: 'date', exampleValue: asOf },
      ],
    };
  },
};
|
||||
|
||||
/**
 * Twenty `analytics.orders` rows (`matrix-1..20`) followed by one `analytics.stale_orders` row.
 *
 * Across the twenty rows: region is always `us`; plan is `enterprise` except for the last
 * row (`self_serve`); status is `paid` for the first ten and `refunded` for the rest;
 * amount is unique per row (100..119); the created_at literal changes every five rows
 * (2026-05-01..04); users cycle through analyst-1..4.
 */
function classificationMatrixRows(): HistoricSqlRawQueryRow[] {
  const result: HistoricSqlRawQueryRow[] = [];

  for (let index = 0; index < 20; index += 1) {
    const status = index < 10 ? 'paid' : 'refunded';
    const plan = index === 19 ? 'self_serve' : 'enterprise';
    const amount = 100 + index;
    const dayOfMonth = String(1 + Math.floor(index / 5)).padStart(2, '0');
    const asOf = `2026-05-${dayOfMonth}`;
    const minute = String(index).padStart(2, '0');

    result.push({
      id: `matrix-${index + 1}`,
      sql: `SELECT count(*) FROM analytics.orders WHERE region = 'us' AND plan = '${plan}' AND status = '${status}' AND amount >= ${amount} AND created_at >= '${asOf}'`,
      user: `analyst-${(index % 4) + 1}`,
      startedAt: `2026-05-04T10:${minute}:00.000Z`,
      endedAt: null,
      runtimeMs: 100 + index,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    });
  }

  result.push({
    id: 'stale-date-1',
    sql: "SELECT count(*) FROM analytics.stale_orders WHERE created_at >= '2026-04-01'",
    user: 'analyst-1',
    startedAt: '2026-05-04T11:00:00.000Z',
    endedAt: null,
    runtimeMs: 75,
    rowsProduced: 1,
    success: true,
    errorMessage: null,
  });

  return result;
}
|
||||
|
||||
describe('stageHistoricSqlTemplates', () => {
  type StageInput = Parameters<typeof stageHistoricSqlTemplates>[0];
  type PullConfig = StageInput['pullConfig'];

  /** Pull config used by most cases (snowflake, 90-day window, no cursor, no patterns, cap 5000), with per-test overrides. */
  function pullConfigWith(overrides: Partial<PullConfig> = {}): PullConfig {
    return {
      dialect: 'snowflake',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: [],
      maxTemplatesPerRun: 5000,
      minCalls: 5,
      ...overrides,
    };
  }

  /** Runs the stage into a fresh temp dir with a fixed clock (2026-05-04T12:00Z) and returns that dir. */
  async function stageInto(
    options: Pick<StageInput, 'reader' | 'sqlAnalysis' | 'pullConfig'> & { connectionId?: string },
  ): Promise<string> {
    const stagedDir = await tempDir();
    await stageHistoricSqlTemplates({
      stagedDir,
      connectionId: options.connectionId ?? 'conn_1',
      queryClient: {},
      reader: options.reader,
      sqlAnalysis: options.sqlAnalysis,
      pullConfig: options.pullConfig,
      now: new Date('2026-05-04T12:00:00.000Z'),
    });
    return stagedDir;
  }

  const readManifest = async (dir: string) => historicSqlManifestSchema.parse(await readJson(dir, 'manifest.json'));
  const readMetadata = async (dir: string, templateId: string) =>
    historicSqlMetadataSchema.parse(await readJson(dir, `templates/${templateId}/metadata.json`));
  const readUsage = async (dir: string, templateId: string) =>
    historicSqlUsageSchema.parse(await readJson(dir, `templates/${templateId}/usage.json`));

  it('compresses rows by fingerprint into document-shaped staged templates', async () => {
    const dir = await stageInto({
      reader: fakeReader([
        {
          id: 'q1',
          sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01' AND email = 'analyst@example.com'",
          user: 'analyst@example.com',
          startedAt: '2026-05-04T10:00:00.000Z',
          endedAt: '2026-05-04T10:00:01.000Z',
          runtimeMs: 100,
          rowsProduced: 1,
          success: true,
          errorMessage: null,
        },
        {
          id: 'q2',
          sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-05-01' AND email = 'analyst-2@example.com'",
          user: 'analyst-2@example.com',
          startedAt: '2026-05-04T11:00:00.000Z',
          endedAt: '2026-05-04T11:00:01.000Z',
          runtimeMs: 300,
          rowsProduced: 1,
          success: true,
          errorMessage: null,
        },
      ]),
      sqlAnalysis: fakeSqlAnalysis,
      pullConfig: pullConfigWith({
        serviceAccountUserPatterns: ['^svc_'],
        redactionPatterns: ['[\\w.+-]+@[\\w-]+\\.[\\w.-]+'],
      }),
    });

    const manifest = await readManifest(dir);
    expect(manifest).toMatchObject({
      source: 'historic-sql',
      connectionId: 'conn_1',
      dialect: 'snowflake',
      nextSuccessfulCursor: '2026-05-04T11:00:00.000Z',
      templateCount: 1,
      capped: false,
    });

    const templateFiles = await readdir(join(dir, 'templates', 'fp_paid_orders'));
    expect(templateFiles.sort()).toEqual(['metadata.json', 'page.md', 'usage.json']);

    const metadata = await readMetadata(dir, 'fp_paid_orders');
    expect(metadata).toEqual({
      id: 'fp_paid_orders',
      title: 'snowflake · analytics.orders [fp_pai]',
      path: 'templates/fp_paid_orders/page.md',
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint: 'fp_paid_orders',
        sub_cluster_id: null,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [
          { position: 1, type: 'string', classification: 'constant' },
          { position: 2, type: 'date', classification: 'runtime' },
        ],
        triage_signals: {
          executions_bucket: 'low',
          distinct_users_bucket: 'team',
          error_rate_bucket: 'ok',
          recency_bucket: 'active',
          service_account_only: 'false',
          slot_summary: '1 constant, 1 runtime',
        },
      },
    });

    const page = await readFile(join(dir, 'templates/fp_paid_orders/page.md'), 'utf-8');
    expect(page).toContain('## Normalized SQL');
    expect(page).toContain('SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?');
    expect(page).toContain('- analytics.orders');

    const usage = await readUsage(dir, 'fp_paid_orders');
    expect(usage.stats).toMatchObject({
      executions: 2,
      distinct_users: 2,
      first_seen: '2026-05-04T10:00:00.000Z',
      last_seen: '2026-05-04T11:00:00.000Z',
      p50_runtime_ms: 100,
      p95_runtime_ms: 300,
      error_rate: 0,
    });
    expect(usage.samples).toHaveLength(1);
    // The email literal must be redacted in the persisted sample.
    expect(usage.samples[0].bound_sql).toContain('<redacted>');
    expect(usage.samples[0].bound_sql).not.toContain('analyst@example.com');
    expect(usage.samples[0].bound_sql).not.toContain('analyst-2@example.com');
  });

  it('skips hard-noise SQL and caps templates deterministically', async () => {
    const dir = await stageInto({
      reader: fakeReader([
        {
          id: 'show-1',
          sql: 'SHOW TABLES',
          user: 'analyst',
          startedAt: '2026-05-04T10:00:00.000Z',
          endedAt: null,
          runtimeMs: null,
          success: true,
          errorMessage: null,
        },
        {
          id: 'q3',
          sql: "SELECT count(*) FROM analytics.refunds WHERE state = 'complete'",
          user: 'analyst',
          startedAt: '2026-05-04T11:00:00.000Z',
          endedAt: null,
          runtimeMs: 50,
          success: true,
          errorMessage: null,
        },
        {
          id: 'q4',
          sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01'",
          user: 'analyst',
          startedAt: '2026-05-04T11:30:00.000Z',
          endedAt: null,
          runtimeMs: 40,
          success: true,
          errorMessage: null,
        },
      ]),
      sqlAnalysis: fakeSqlAnalysis,
      pullConfig: pullConfigWith({
        dialect: 'bigquery',
        windowDays: 7,
        lastSuccessfulCursor: '2026-05-01T00:00:00.000Z',
        maxTemplatesPerRun: 1,
      }),
    });

    const manifest = await readManifest(dir);
    expect(manifest.templateCount).toBe(1);
    expect(manifest.capped).toBe(true);
    expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
    expect(manifest.templates.map((template) => template.id)).toEqual(['fp_paid_orders']);
  });

  it('splits categorical fingerprints into one document directory per dominant value', async () => {
    const dir = await stageInto({
      reader: fakeReader(categoricalRows()),
      sqlAnalysis: categoricalSqlAnalysis,
      pullConfig: pullConfigWith(),
    });

    const manifest = await readManifest(dir);
    const summaries = manifest.templates.map(({ id, fingerprint, subClusterId, path }) => ({
      id,
      fingerprint,
      subClusterId,
      path,
    }));
    summaries.sort((left, right) => left.id.localeCompare(right.id));

    expect(manifest.templateCount).toBe(2);
    expect(summaries).toEqual([
      {
        id: 'fp_order_status__cat_2b2ff2318877',
        fingerprint: 'fp_order_status',
        subClusterId: 'cat_2b2ff2318877',
        path: 'templates/fp_order_status__cat_2b2ff2318877/page.md',
      },
      {
        id: 'fp_order_status__cat_34f037ddcbfa',
        fingerprint: 'fp_order_status',
        subClusterId: 'cat_34f037ddcbfa',
        path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
      },
    ]);

    const paidMetadata = await readMetadata(dir, 'fp_order_status__cat_34f037ddcbfa');
    expect(paidMetadata).toMatchObject({
      id: 'fp_order_status__cat_34f037ddcbfa',
      title: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
      path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
      properties: {
        fingerprint: 'fp_order_status',
        sub_cluster_id: 'cat_34f037ddcbfa',
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
      },
    });

    const paidUsage = await readUsage(dir, 'fp_order_status__cat_34f037ddcbfa');
    expect(paidUsage.stats).toMatchObject({
      executions: 3,
      distinct_users: 3,
      first_seen: '2026-05-04T10:00:00.000Z',
      last_seen: '2026-05-04T10:02:00.000Z',
      rows_produced: 36,
    });
    expect(paidUsage.literal_slots).toEqual([{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }]);

    const refundedUsage = await readUsage(dir, 'fp_order_status__cat_2b2ff2318877');
    expect(refundedUsage.stats).toMatchObject({
      executions: 3,
      distinct_users: 3,
      first_seen: '2026-05-04T10:03:00.000Z',
      last_seen: '2026-05-04T10:05:00.000Z',
      rows_produced: 66,
    });
    expect(refundedUsage.literal_slots).toEqual([
      { position: 1, distinct_values: 1, top_values: [['refunded', 3]] },
    ]);
  });

  it('classifies literal slots across the spec matrix and stale-date demotion', async () => {
    const dir = await stageInto({
      reader: fakeReader(classificationMatrixRows()),
      sqlAnalysis: classificationMatrixSqlAnalysis,
      pullConfig: pullConfigWith(),
    });

    const manifest = await readManifest(dir);
    const matrixTemplates = manifest.templates.filter(
      (template) => template.fingerprint === 'fp_classification_matrix',
    );
    expect(matrixTemplates).toHaveLength(2);
    expect(matrixTemplates.every((template) => template.subClusterId?.startsWith('cat_'))).toBe(true);

    const [matrixTemplate] = matrixTemplates;
    if (!matrixTemplate) {
      throw new Error('expected classification matrix template');
    }
    const matrixMetadataPath = matrixTemplate.path.replace('/page.md', '/metadata.json');
    const matrixMetadata = historicSqlMetadataSchema.parse(await readJson(dir, matrixMetadataPath));
    expect(matrixMetadata.properties.literal_slots).toMatchInlineSnapshot(`
      [
        {
          "classification": "constant",
          "position": 1,
          "type": "string",
        },
        {
          "classification": "constant",
          "position": 2,
          "type": "string",
        },
        {
          "classification": "categorical",
          "position": 3,
          "type": "string",
        },
        {
          "classification": "runtime",
          "position": 4,
          "type": "number",
        },
        {
          "classification": "runtime",
          "position": 5,
          "type": "date",
        },
      ]
    `);
    expect(matrixMetadata.properties.triage_signals.slot_summary).toBe('2 constant, 2 runtime');

    const staleMetadata = await readMetadata(dir, 'fp_stale_date');
    expect(staleMetadata.properties.literal_slots).toMatchInlineSnapshot(`
      [
        {
          "classification": "runtime",
          "position": 1,
          "type": "date",
        },
      ]
    `);
    expect(staleMetadata.properties.triage_signals.slot_summary).toBe('0 constant, 1 runtime');
  });

  it('applies the templates-per-run cap after categorical expansion', async () => {
    const dir = await stageInto({
      reader: fakeReader(categoricalRows()),
      sqlAnalysis: categoricalSqlAnalysis,
      pullConfig: pullConfigWith({ maxTemplatesPerRun: 1 }),
    });

    const manifest = await readManifest(dir);
    expect(manifest.templateCount).toBe(1);
    expect(manifest.capped).toBe(true);
    expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
    expect(manifest.templates).toHaveLength(1);
    expect(manifest.templates[0].id).toMatch(/^fp_order_status__cat_/);
  });

  it('omits rows_produced for BigQuery templates when reader rows have no row counts', async () => {
    const dir = await stageInto({
      connectionId: 'conn_bq',
      reader: fakeReader([
        {
          id: 'bq-1',
          sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'",
          user: 'analyst-a@example.com',
          startedAt: '2026-05-04T10:00:00.000Z',
          endedAt: null,
          runtimeMs: 100,
          success: true,
          errorMessage: null,
        },
      ]),
      sqlAnalysis: fakeSqlAnalysis,
      pullConfig: pullConfigWith({ dialect: 'bigquery' }),
    });

    const usage = await readUsage(dir, 'fp_paid_orders');
    expect(usage.stats).not.toHaveProperty('rows_produced');
    expect(usage.samples[0]).not.toHaveProperty('rows_produced');
  });

  it('keeps at most five diverse samples, preferring recent successful representatives per literal tuple', async () => {
    const statuses = [
      'paid',
      'refunded',
      'pending',
      'failed',
      'trial',
      'cancelled',
      'draft',
      'returned',
      'review',
      'held',
      'archived',
    ];
    // Per status: an older failed run in the 10:xx hour, then a newer successful run in the 11:xx hour.
    const rows: HistoricSqlRawQueryRow[] = [];
    for (const [index, status] of statuses.entries()) {
      const minute = String(index).padStart(2, '0');
      const sql = `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`;
      rows.push(
        {
          id: `${status}-old`,
          sql,
          user: 'analyst-a',
          startedAt: `2026-05-04T10:${minute}:00.000Z`,
          endedAt: null,
          runtimeMs: 100,
          rowsProduced: 1,
          success: false,
          errorMessage: 'old failed sample',
        },
        {
          id: `${status}-new`,
          sql,
          user: 'analyst-a',
          startedAt: `2026-05-04T11:${minute}:00.000Z`,
          endedAt: null,
          runtimeMs: 90,
          rowsProduced: 2,
          success: true,
          errorMessage: null,
        },
      );
    }

    const dir = await stageInto({
      reader: fakeReader(rows),
      sqlAnalysis: diverseSqlAnalysis,
      pullConfig: pullConfigWith(),
    });

    const usage = await readUsage(dir, 'fp_diverse_samples');
    expect(usage.samples).toHaveLength(5);
    expect(usage.samples.every((sample) => sample.success)).toBe(true);
    const sampledStatuses = usage.samples.map((sample) => sample.bound_sql.match(/status = '([^']+)'/)?.[1]);
    expect(new Set(sampledStatuses).size).toBe(5);
    expect(usage.samples.map((sample) => sample.started_at)).toEqual([
      '2026-05-04T11:10:00.000Z',
      '2026-05-04T11:09:00.000Z',
      '2026-05-04T11:08:00.000Z',
      '2026-05-04T11:07:00.000Z',
      '2026-05-04T11:06:00.000Z',
    ]);
  });

  it('uses recency as a tie-breaker when the templates-per-run cap overflows', async () => {
    const sqlAnalysis: SqlAnalysisPort = {
      async analyzeForFingerprint(sql) {
        const table = sql.includes('fresh_orders') ? 'fresh_orders' : 'stale_orders';
        return {
          fingerprint: `fp_${table}`,
          normalizedSql: `SELECT count(*) FROM analytics.${table}`,
          tablesTouched: [`analytics.${table}`],
          literalSlots: [],
        };
      },
    };

    const dir = await stageInto({
      reader: fakeReader([
        {
          id: 'stale-1',
          sql: 'SELECT count(*) FROM analytics.stale_orders',
          user: 'analyst-a',
          startedAt: '2026-02-04T10:00:00.000Z',
          endedAt: null,
          runtimeMs: 100,
          rowsProduced: 1,
          success: true,
          errorMessage: null,
        },
        {
          id: 'fresh-1',
          sql: 'SELECT count(*) FROM analytics.fresh_orders',
          user: 'analyst-a',
          startedAt: '2026-05-04T10:00:00.000Z',
          endedAt: null,
          runtimeMs: 100,
          rowsProduced: 1,
          success: true,
          errorMessage: null,
        },
      ]),
      sqlAnalysis,
      pullConfig: pullConfigWith({ maxTemplatesPerRun: 1 }),
    });

    const manifest = await readManifest(dir);
    expect(manifest.templates.map((template) => template.id)).toEqual(['fp_fresh_orders']);
  });

  it('does not persist bound SQL samples when redaction patterns are invalid', async () => {
    const dir = await stageInto({
      reader: fakeReader([
        {
          id: 'q1',
          sql: "SELECT * FROM analytics.orders WHERE email = 'analyst@example.com'",
          user: 'analyst@example.com',
          startedAt: '2026-05-04T10:00:00.000Z',
          endedAt: null,
          runtimeMs: 100,
          rowsProduced: 1,
          success: true,
          errorMessage: null,
        },
      ]),
      sqlAnalysis: {
        async analyzeForFingerprint() {
          return {
            fingerprint: 'fp_redaction',
            normalizedSql: 'SELECT * FROM analytics.orders WHERE email = ?',
            tablesTouched: ['analytics.orders'],
            literalSlots: [{ position: 1, type: 'string', exampleValue: 'analyst@example.com' }],
          };
        },
      },
      // '[' is not a valid regular expression.
      pullConfig: pullConfigWith({ redactionPatterns: ['['] }),
    });

    const manifest = await readManifest(dir);
    const usage = await readUsage(dir, 'fp_redaction');
    const warnedAboutPattern = manifest.warnings.some((warning) =>
      warning.startsWith('redaction_skipped:invalid_redaction_pattern'),
    );
    expect(warnedAboutPattern).toBe(true);
    expect(usage.samples).toEqual([]);
  });
});
|
||||
630
packages/context/src/ingest/adapters/historic-sql/stage.ts
Normal file
630
packages/context/src/ingest/adapters/historic-sql/stage.ts
Normal file
|
|
@ -0,0 +1,630 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import { mkdir, writeFile } from 'node:fs/promises';
|
||||
import { dirname, join } from 'node:path';
|
||||
import type {
|
||||
SqlAnalysisFingerprintResult,
|
||||
SqlAnalysisLiteralSlot,
|
||||
SqlAnalysisLiteralSlotType,
|
||||
SqlAnalysisPort,
|
||||
} from '../../../sql-analysis/index.js';
|
||||
import {
|
||||
HISTORIC_SQL_OBJECT_TYPE,
|
||||
HISTORIC_SQL_SOURCE_KEY,
|
||||
historicSqlPullConfigSchema,
|
||||
historicSqlRawQueryRowSchema,
|
||||
type HistoricSqlLiteralSlotClassification,
|
||||
type HistoricSqlManifest,
|
||||
type HistoricSqlMetadata,
|
||||
type HistoricSqlPullConfig,
|
||||
type HistoricSqlQueryHistoryReader,
|
||||
type HistoricSqlRawQueryRow,
|
||||
type HistoricSqlUsage,
|
||||
} from './types.js';
|
||||
|
||||
/** Arguments accepted by `stageHistoricSqlTemplates`. */
interface StageHistoricSqlTemplatesInput {
  /** Directory that receives `manifest.json` and the `templates/` tree; created if missing. */
  stagedDir: string;
  /** Copied verbatim into the manifest. */
  connectionId: string;
  /** Opaque handle handed to the reader's `probe` and `fetch`; never inspected by the stage itself. */
  queryClient: unknown;
  /** Source of raw query-history rows. */
  reader: HistoricSqlQueryHistoryReader;
  /** Produces fingerprint, normalized SQL, touched tables and literal slots for each row. */
  sqlAnalysis: SqlAnalysisPort;
  /** Validated with `historicSqlPullConfigSchema` before use. */
  pullConfig: HistoricSqlPullConfig;
  /** Clock override; defaults to the current time. Used as the fetch-window end and `fetchedAt`. */
  now?: Date;
}

/** One occurrence of a literal value in a slot. */
interface SlotObservation {
  /** The slot's example value after redaction has been applied. */
  value: string;
  /** `startedAt` of the row the value came from. */
  rowStartedAt: string;
}

/** Everything recorded for a single literal-slot position within a template. */
interface SlotStats {
  position: number;
  type: SqlAnalysisLiteralSlotType;
  /** Occurrence count per (redacted) value. */
  values: Map<string, number>;
  /** Every recorded occurrence, in the order rows were processed. */
  observations: SlotObservation[];
}

/** All rows that share one SQL fingerprint, plus what has been aggregated from them. */
interface TemplateAccumulator {
  fingerprint: string;
  /** Normalized SQL reported for the first row seen with this fingerprint. */
  normalizedSql: string;
  /** Union of the tables reported across all rows of the group. */
  tablesTouched: Set<string>;
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>;
  /** Slot statistics keyed by slot position. */
  slotStats: Map<number, SlotStats>;
}

/** A literal slot together with the classification assigned to it. */
interface ClassifiedLiteralSlot {
  position: number;
  type: SqlAnalysisLiteralSlotType;
  classification: HistoricSqlLiteralSlotClassification;
}

/**
 * A template as it will be staged: the accumulated group data plus its identity and slot
 * classifications. A group without categorical slots yields a single variant whose `id`
 * is the fingerprint and whose `subClusterId` is `null`.
 */
interface TemplateVariant extends TemplateAccumulator {
  id: string;
  subClusterId: string | null;
  slotClassifications: ClassifiedLiteralSlot[];
}

/** One (slot position, value) pair of a categorical tuple. */
interface CategoricalTupleEntry {
  position: number;
  value: string;
}

/** Compiled redaction settings. */
interface RedactionPolicy {
  /** Patterns applied to text before it is persisted. */
  redactors: RegExp[];
  // NOTE(review): presumably false when a configured pattern failed to compile — confirm in compileRedactors.
  samplesAllowed: boolean;
}
|
||||
|
||||
// Matches statements whose first keyword (after optional leading whitespace) is
// SHOW, DESCRIBE, DESC, EXPLAIN, USE or SET; case-insensitive.
const HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i;
|
||||
// Matches SQL that mentions, anywhere in its text and case-insensitively, INFORMATION_SCHEMA,
// SNOWFLAKE.ACCOUNT_USAGE, a word starting with `pg_`, or a `system.` qualifier.
const HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i;
|
||||
|
||||
/**
 * Pulls query history through `input.reader`, groups the rows by SQL fingerprint and writes
 * the staged result under `input.stagedDir`: for every selected template a
 * `templates/<id>/{metadata.json,page.md,usage.json}` triple, followed by `manifest.json`.
 *
 * The fetch window ends at `now`. It starts at the configured `lastSuccessfulCursor` when
 * one is set, otherwise `windowDays` before `now`.
 *
 * Per row:
 * - the row is validated with `historicSqlRawQueryRowSchema`;
 * - the manifest cursor advances to the greatest `startedAt` seen, including rows that are
 *   subsequently skipped;
 * - SQL matching the hard-skip patterns is ignored;
 * - rows whose analysis reports an error or lacks a fingerprint / normalized SQL are dropped
 *   with an `analysis_failed:<row id>` warning.
 *
 * When selection keeps fewer templates than were produced, a `templates_truncated` warning is
 * recorded and the manifest is flagged `capped`.
 *
 * @param input Staging inputs; `pullConfig` is parsed with `historicSqlPullConfigSchema`.
 * @throws Whatever schema parsing, the reader, the analysis port or the file writes throw.
 */
export async function stageHistoricSqlTemplates(input: StageHistoricSqlTemplatesInput): Promise<void> {
  const { reader, sqlAnalysis, queryClient, stagedDir } = input;
  const config = historicSqlPullConfigSchema.parse(input.pullConfig);
  const now = input.now ?? new Date();

  let windowStart: Date;
  if (config.lastSuccessfulCursor) {
    windowStart = new Date(config.lastSuccessfulCursor);
  } else {
    windowStart = new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000);
  }

  const warnings: string[] = [];
  const redaction = compileRedactors(config.redactionPatterns, warnings);
  const groups = new Map<string, TemplateAccumulator>();
  let nextSuccessfulCursor: string | null = null;

  await reader.probe(queryClient);

  const history = reader.fetch(queryClient, { start: windowStart, end: now }, config.lastSuccessfulCursor);
  for await (const rawRow of history) {
    const row = historicSqlRawQueryRowSchema.parse(rawRow);

    // Track the latest start time before any filtering so skipped rows still move the cursor.
    const advancesCursor = !nextSuccessfulCursor || row.startedAt > nextSuccessfulCursor;
    if (advancesCursor) {
      nextSuccessfulCursor = row.startedAt;
    }

    if (shouldSkipSql(row.sql)) {
      continue;
    }

    const analysis = await sqlAnalysis.analyzeForFingerprint(row.sql, config.dialect);
    if (analysis.error || !analysis.fingerprint || !analysis.normalizedSql) {
      warnings.push(`analysis_failed:${row.id}`);
      continue;
    }

    let group = groups.get(analysis.fingerprint);
    if (group === undefined) {
      group = {
        fingerprint: analysis.fingerprint,
        normalizedSql: analysis.normalizedSql,
        tablesTouched: new Set<string>(),
        rows: [],
        slotStats: new Map<number, SlotStats>(),
      };
      groups.set(analysis.fingerprint, group);
    }

    for (const table of analysis.tablesTouched) {
      group.tablesTouched.add(table);
    }
    for (const slot of analysis.literalSlots) {
      recordSlot(group.slotStats, slot, redaction.redactors, row.startedAt);
    }
    group.rows.push({ row, analysis });
  }

  const expandedTemplates = expandCategoricalTemplates([...groups.values()], redaction.redactors);
  const selected = selectTemplates(expandedTemplates, config.maxTemplatesPerRun, now);
  const capped = selected.length < expandedTemplates.length;
  if (capped) {
    warnings.push(`templates_truncated: kept ${selected.length} of ${expandedTemplates.length} templates`);
  }

  await mkdir(stagedDir, { recursive: true });

  const templates: HistoricSqlManifest['templates'] = [];
  for (const template of selected) {
    const { metadata, pageMarkdown, usage } = buildStagedTemplate(template, config, redaction, now);
    const templateDir = `templates/${metadata.id}`;
    await writeJson(stagedDir, `${templateDir}/metadata.json`, metadata);
    await writeText(stagedDir, `${templateDir}/page.md`, pageMarkdown);
    await writeJson(stagedDir, `${templateDir}/usage.json`, usage);
    templates.push({
      id: metadata.id,
      fingerprint: metadata.properties.fingerprint,
      subClusterId: metadata.properties.sub_cluster_id,
      path: metadata.path,
    });
  }

  const manifest = {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: input.connectionId,
    dialect: config.dialect,
    fetchedAt: now.toISOString(),
    windowStart: windowStart.toISOString(),
    windowEnd: now.toISOString(),
    nextSuccessfulCursor,
    templateCount: selected.length,
    capped,
    warnings,
    degraded: false,
    statsResetAt: null,
    baselineFirstRun: false,
    pgServerVersion: null,
    deallocCount: null,
    templates,
  } satisfies HistoricSqlManifest;
  await writeJson(stagedDir, 'manifest.json', manifest);
}
|
||||
|
||||
/**
 * Reports whether a statement should be excluded from staging: true when it matches
 * either `HARD_SKIP_PREFIX_RE` or `HARD_SKIP_TABLE_RE`.
 */
function shouldSkipSql(sql: string): boolean {
  if (HARD_SKIP_PREFIX_RE.test(sql)) {
    return true;
  }
  return HARD_SKIP_TABLE_RE.test(sql);
}
|
||||
|
||||
/**
 * Folds one literal-slot observation into the per-position statistics map.
 *
 * The slot's example value is redacted before it is counted or stored, so
 * only redacted text is ever kept in `values` and `observations`.
 *
 * @param slotStats Accumulator keyed by slot position; mutated in place.
 * @param slot Literal slot reported by SQL analysis for one row.
 * @param redactors Patterns applied to the example value before persisting it.
 * @param rowStartedAt `startedAt` of the row the slot was observed in.
 */
function recordSlot(
  slotStats: Map<number, SlotStats>,
  slot: SqlAnalysisLiteralSlot,
  redactors: RegExp[],
  rowStartedAt: string,
): void {
  const redactedValue = redactText(slot.exampleValue, redactors);

  let stats = slotStats.get(slot.position);
  if (stats == null) {
    stats = {
      position: slot.position,
      type: slot.type,
      values: new Map<string, number>(),
      observations: [],
    };
    slotStats.set(slot.position, stats);
  }

  const seenSoFar = stats.values.get(redactedValue) ?? 0;
  stats.values.set(redactedValue, seenSoFar + 1);
  stats.observations.push({ value: redactedValue, rowStartedAt });
}
|
||||
|
||||
/**
 * Expands every fingerprint group into its template variants and returns
 * them as one flat list, preserving group order.
 */
function expandCategoricalTemplates(groups: TemplateAccumulator[], redactors: RegExp[]): TemplateVariant[] {
  const variants: TemplateVariant[] = [];
  for (const group of groups) {
    for (const variant of expandTemplateGroup(group, redactors)) {
      variants.push(variant);
    }
  }
  return variants;
}
|
||||
|
||||
/**
 * Splits one fingerprint group into template variants.
 *
 * Rows are ordered by `startedAt` and the group's literal slots are
 * classified. When no slot is categorical the whole group becomes a single
 * variant with `subClusterId: null`. Otherwise rows are partitioned by the
 * redacted values they carry in the categorical slots, and each partition
 * becomes its own variant with slot statistics recomputed from its rows.
 * Slot classifications are always those of the whole group.
 *
 * @returns Variants sorted by id, or an empty array when the group has no rows.
 */
function expandTemplateGroup(group: TemplateAccumulator, redactors: RegExp[]): TemplateVariant[] {
  type GroupRow = { row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult };

  const chronological = Array.from(group.rows);
  chronological.sort((a, b) => a.row.startedAt.localeCompare(b.row.startedAt));

  const firstSeen = chronological[0]?.row.startedAt;
  if (!firstSeen) {
    return [];
  }

  const slotClassifications = classifySlots(group.slotStats, chronological.length, firstSeen);

  const categoricalPositions: number[] = [];
  for (const slot of slotClassifications) {
    if (slot.classification === 'categorical') {
      categoricalPositions.push(slot.position);
    }
  }
  categoricalPositions.sort((a, b) => a - b);

  if (categoricalPositions.length === 0) {
    const whole: TemplateVariant = {
      id: group.fingerprint,
      fingerprint: group.fingerprint,
      subClusterId: null,
      normalizedSql: group.normalizedSql,
      tablesTouched: group.tablesTouched,
      rows: chronological,
      slotStats: group.slotStats,
      slotClassifications,
    };
    return [whole];
  }

  // Partition rows by the JSON form of their categorical value tuple.
  const partitions = new Map<string, { tuple: CategoricalTupleEntry[]; rows: GroupRow[] }>();
  for (const entry of chronological) {
    const tuple = categoricalTuple(entry.analysis.literalSlots, categoricalPositions, redactors);
    const key = JSON.stringify(tuple);
    let partition = partitions.get(key);
    if (partition == null) {
      partition = { tuple, rows: [] };
      partitions.set(key, partition);
    }
    partition.rows.push(entry);
  }

  const variants: TemplateVariant[] = [];
  for (const { tuple, rows: partitionRows } of partitions.values()) {
    const subClusterId = subClusterIdForTuple(tuple);
    variants.push({
      id: `${group.fingerprint}__${subClusterId}`,
      fingerprint: group.fingerprint,
      subClusterId,
      normalizedSql: group.normalizedSql,
      tablesTouched: group.tablesTouched,
      rows: partitionRows,
      slotStats: collectSlotStats(partitionRows, redactors),
      slotClassifications,
    });
  }
  variants.sort((a, b) => a.id.localeCompare(b.id));
  return variants;
}
|
||||
|
||||
/**
 * Classifies every tracked literal slot, returning the results ordered by
 * slot position.
 *
 * @param slotStats Per-position statistics to classify.
 * @param executions Number of rows the statistics were collected from.
 * @param firstSeen Earliest `startedAt` among those rows.
 */
function classifySlots(
  slotStats: Map<number, SlotStats>,
  executions: number,
  firstSeen: string,
): ClassifiedLiteralSlot[] {
  const byPosition = Array.from(slotStats.values());
  byPosition.sort((a, b) => a.position - b.position);

  const classified: ClassifiedLiteralSlot[] = [];
  for (const slot of byPosition) {
    classified.push({
      position: slot.position,
      type: slot.type,
      classification: classifySlot(slot, executions, firstSeen),
    });
  }
  return classified;
}
|
||||
|
||||
/**
 * Builds fresh per-position slot statistics from a set of rows by recording
 * every literal slot of every row.
 */
function collectSlotStats(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redactors: RegExp[],
): Map<number, SlotStats> {
  const collected = new Map<number, SlotStats>();
  rows.forEach(({ row, analysis }) => {
    analysis.literalSlots.forEach((slot) => {
      recordSlot(collected, slot, redactors, row.startedAt);
    });
  });
  return collected;
}
|
||||
|
||||
/**
 * Extracts the redacted values a row carries in the categorical slot
 * positions, in the order the positions are given.
 *
 * A position with no matching slot is reported with the value `<missing>`.
 * When several slots share a position, the last one wins.
 */
function categoricalTuple(
  literalSlots: SqlAnalysisLiteralSlot[],
  categoricalPositions: number[],
  redactors: RegExp[],
): CategoricalTupleEntry[] {
  const redactedByPosition = new Map<number, string>();
  for (const slot of literalSlots) {
    redactedByPosition.set(slot.position, redactText(slot.exampleValue, redactors));
  }

  const tuple: CategoricalTupleEntry[] = [];
  for (const position of categoricalPositions) {
    tuple.push({ position, value: redactedByPosition.get(position) ?? '<missing>' });
  }
  return tuple;
}
|
||||
|
||||
/**
 * Derives a deterministic sub-cluster id from a categorical tuple:
 * `cat_` followed by the first 12 hex characters of the SHA-256 digest of
 * the tuple's JSON serialization.
 */
function subClusterIdForTuple(tuple: CategoricalTupleEntry[]): string {
  const serialized = JSON.stringify(tuple);
  const digest = createHash('sha256').update(serialized).digest('hex');
  return `cat_${digest.slice(0, 12)}`;
}
|
||||
|
||||
/**
 * Produces the three staged artifacts for one template variant: the metadata
 * record, the markdown page, and the usage summary.
 *
 * Rows are processed in `startedAt` order; the first and last rows supply
 * `first_seen` and `last_seen`, so the template must contain at least one row.
 *
 * @param template Variant to stage.
 * @param config Pull configuration (dialect and service-account patterns are read).
 * @param redaction Redaction policy forwarded to sample selection.
 * @param now Reference time used for the recency triage signal.
 */
function buildStagedTemplate(
  template: TemplateVariant,
  config: HistoricSqlPullConfig,
  redaction: RedactionPolicy,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const rows = template.rows.map(({ row }) => row);
  rows.sort((a, b) => a.startedAt.localeCompare(b.startedAt));

  const executions = rows.length;
  const firstSeen = rows[0].startedAt;
  const lastSeen = rows[executions - 1].startedAt;

  // Single pass over the rows for users, failures and runtimes.
  const users = new Set<string>();
  const runtimes: number[] = [];
  let errorCount = 0;
  for (const row of rows) {
    if (row.user) {
      users.add(row.user);
    }
    if (!row.success) {
      errorCount += 1;
    }
    if (typeof row.runtimeMs === 'number') {
      runtimes.push(row.runtimeMs);
    }
  }
  runtimes.sort((a, b) => a - b);
  const distinctUsers = users.size;
  const errorRate = executions === 0 ? 0 : errorCount / executions;

  const triageSignals = buildTriageSignals({
    executions,
    distinctUsers,
    errorRate,
    lastSeen,
    now,
    serviceAccountOnly: isServiceAccountOnly(rows, config.serviceAccountUserPatterns),
    slotClassifications: template.slotClassifications.map((slot) => slot.classification),
  });

  const tablesTouched = Array.from(template.tablesTouched).sort();
  const firstTable = tablesTouched[0] ?? 'query';
  const id = template.id;
  const rowsProduced = sumRowsProduced(rows);

  const metadata: HistoricSqlMetadata = {
    id,
    title: buildTemplateTitle(config.dialect, firstTable, template.fingerprint, template.subClusterId),
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint: template.fingerprint,
      sub_cluster_id: template.subClusterId,
      dialect: config.dialect,
      tables_touched: tablesTouched,
      literal_slots: template.slotClassifications,
      triage_signals: triageSignals,
    },
  };

  const pageMarkdown = renderTemplatePage(id, template.normalizedSql, tablesTouched);

  const stats: HistoricSqlUsage['stats'] = {
    executions,
    distinct_users: distinctUsers,
    first_seen: firstSeen,
    last_seen: lastSeen,
    p50_runtime_ms: percentile(runtimes, 0.5),
    p95_runtime_ms: percentile(runtimes, 0.95),
    error_rate: errorRate,
  };
  // rows_produced is only emitted when at least one row reported a count.
  if (rowsProduced !== null) {
    stats.rows_produced = rowsProduced;
  }

  const slotsByPosition = Array.from(template.slotStats.values());
  slotsByPosition.sort((a, b) => a.position - b.position);
  const literalSlots = slotsByPosition.map((slot) => {
    const ranked = Array.from(slot.values.entries());
    // Most frequent first; ties broken by value text.
    ranked.sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
    return {
      position: slot.position,
      distinct_values: slot.values.size,
      top_values: ranked.slice(0, 10),
    };
  });

  return {
    metadata,
    pageMarkdown,
    usage: {
      stats,
      literal_slots: literalSlots,
      samples: selectSamples(template.rows, redaction),
    },
  };
}
|
||||
|
||||
/** Literal slot types treated as temporal when checking for moving (time-tracking) literals. */
const TEMPORAL_SLOT_TYPES = new Set<SqlAnalysisLiteralSlotType>(['date', 'timestamp']);
|
||||
|
||||
/**
 * Reports whether `value` is a parseable literal in a `date` slot that sorts
 * before the calendar day (first 10 characters) of `firstSeen`.
 *
 * The comparison is a plain string comparison against the `YYYY-MM-DD`
 * prefix of `firstSeen`.
 */
function isStaleDateConstant(slot: SlotStats, value: string, firstSeen: string): boolean {
  if (slot.type !== 'date') {
    return false;
  }
  if (parseTemporalSlotValue(value) === null) {
    return false;
  }
  const firstSeenDay = firstSeen.slice(0, 10);
  return value < firstSeenDay;
}
|
||||
|
||||
/**
 * Reports whether a temporal slot's literal moves consistently in one
 * direction as the rows progress in time.
 *
 * Returns false for non-temporal slots, for slots with fewer than two
 * distinct values, and whenever any observation has an unparseable row
 * timestamp or literal.
 */
function isMovingTemporalSlot(slot: SlotStats): boolean {
  const isTemporal = TEMPORAL_SLOT_TYPES.has(slot.type);
  if (!isTemporal || slot.values.size < 2) {
    return false;
  }

  // Pairs of [row start time, literal time], both as epoch milliseconds.
  const timeline: Array<[number, number]> = [];
  for (const { rowStartedAt, value } of slot.observations) {
    const startedAtMs = Date.parse(rowStartedAt);
    const literalMs = parseTemporalSlotValue(value);
    if (Number.isNaN(startedAtMs) || literalMs === null) {
      return false;
    }
    timeline.push([startedAtMs, literalMs]);
  }

  timeline.sort((a, b) => a[0] - b[0]);
  return isMonotonic(timeline.map(([, literalMs]) => literalMs));
}
|
||||
|
||||
/**
 * Parses a literal with `Date.parse`.
 *
 * @returns Epoch milliseconds, or `null` when the text is not a parseable date.
 */
function parseTemporalSlotValue(value: string): number | null {
  const epochMs = Date.parse(value);
  if (Number.isNaN(epochMs)) {
    return null;
  }
  return epochMs;
}
|
||||
|
||||
/**
 * Reports whether a sequence never changes direction, i.e. it is entirely
 * non-decreasing or entirely non-increasing.
 *
 * Sequences with fewer than two elements are not considered monotonic.
 */
function isMonotonic(values: number[]): boolean {
  if (values.length < 2) {
    return false;
  }

  let sawRise = false;
  let sawFall = false;
  for (let i = 1; i < values.length; i += 1) {
    const previous = values[i - 1];
    const current = values[i];
    if (current > previous) {
      sawRise = true;
    } else if (current < previous) {
      sawFall = true;
    }
    // A rise and a fall together rule out both directions.
    if (sawRise && sawFall) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Classifies a literal slot as `constant`, `categorical`, or `runtime`.
 *
 * Rules, in order:
 * 1. A slot with one distinct value, or whose most frequent value covers at
 *    least 95% of executions, is `constant` unless that value is a stale
 *    date constant.
 * 2. A moving temporal slot is `runtime`.
 * 3. A slot with 2 to 10 distinct values, each covering at least 5% of
 *    executions, is `categorical`.
 * 4. Everything else is `runtime`.
 */
function classifySlot(
  slot: SlotStats,
  executions: number,
  firstSeen: string,
): HistoricSqlLiteralSlotClassification {
  // Most frequent value; the earliest-inserted entry wins ties.
  let top: [string, number] | undefined;
  for (const entry of slot.values.entries()) {
    if (top === undefined || entry[1] > top[1]) {
      top = entry;
    }
  }
  const distinct = slot.values.size;
  const topValue = top?.[0] ?? '';
  const topCount = top?.[1] ?? 0;

  if (!isStaleDateConstant(slot, topValue, firstSeen)) {
    if (distinct === 1) {
      return 'constant';
    }
    if (executions > 0 && topCount / executions >= 0.95) {
      return 'constant';
    }
  }

  if (isMovingTemporalSlot(slot)) {
    return 'runtime';
  }

  const fewDistinctValues = distinct >= 2 && distinct <= 10;
  if (executions > 0 && fewDistinctValues) {
    const everyValueIsCommon = Array.from(slot.values.values()).every((count) => count / executions >= 0.05);
    if (everyValueIsCommon) {
      return 'categorical';
    }
  }
  return 'runtime';
}
|
||||
|
||||
/**
 * Summarizes a template's usage as coarse string buckets.
 *
 * Buckets: executions (`low` below 3, `mid` below 50, else `high`), distinct
 * users (`solo` up to 1, `team` up to 5, else `broad`), error rate (`ok` up
 * to 1%, `noisy` up to 10%, else `broken`), plus recency, a service-account
 * flag, and a count of constant and runtime slots.
 */
function buildTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  errorRate: number;
  lastSeen: string;
  now: Date;
  serviceAccountOnly: boolean;
  slotClassifications: HistoricSqlLiteralSlotClassification[];
}): Record<string, string> {
  let constantCount = 0;
  let runtimeCount = 0;
  for (const classification of input.slotClassifications) {
    if (classification === 'constant') {
      constantCount += 1;
    } else if (classification === 'runtime') {
      runtimeCount += 1;
    }
  }

  let executionsBucket = 'high';
  if (input.executions < 3) {
    executionsBucket = 'low';
  } else if (input.executions < 50) {
    executionsBucket = 'mid';
  }

  let distinctUsersBucket = 'broad';
  if (input.distinctUsers <= 1) {
    distinctUsersBucket = 'solo';
  } else if (input.distinctUsers <= 5) {
    distinctUsersBucket = 'team';
  }

  let errorRateBucket = 'broken';
  if (input.errorRate <= 0.01) {
    errorRateBucket = 'ok';
  } else if (input.errorRate <= 0.1) {
    errorRateBucket = 'noisy';
  }

  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: distinctUsersBucket,
    error_rate_bucket: errorRateBucket,
    recency_bucket: recencyBucket(input.lastSeen, input.now),
    service_account_only: String(input.serviceAccountOnly),
    slot_summary: `${constantCount} constant, ${runtimeCount} runtime`,
  };
}
|
||||
|
||||
/**
 * Buckets the age of `lastSeen` relative to `now`: `active` up to 14 days,
 * `warm` up to 60 days, otherwise `cold`. Timestamps in the future count as
 * age zero.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const msPerDay = 24 * 60 * 60 * 1000;
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / msPerDay);
  return ageDays <= 14 ? 'active' : ageDays <= 60 ? 'warm' : 'cold';
}
|
||||
|
||||
/**
 * Reports whether every known user across the rows matches at least one of
 * the service-account patterns.
 *
 * Rows without a user are ignored. Returns false when no row has a user or
 * when no patterns are configured.
 *
 * NOTE(review): patterns are compiled with `new RegExp` without a guard, so
 * an invalid pattern throws from here — confirm callers validate them.
 */
function isServiceAccountOnly(rows: HistoricSqlRawQueryRow[], patterns: string[]): boolean {
  const users: string[] = [];
  for (const row of rows) {
    if (row.user) {
      users.push(row.user);
    }
  }
  if (users.length === 0 || patterns.length === 0) {
    return false;
  }

  const matchers = patterns.map((pattern) => new RegExp(pattern));
  for (const user of users) {
    if (!matchers.some((matcher) => matcher.test(user))) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Formats a template title as `<dialect> · <table> [<fingerprint prefix>]`.
 * When a sub-cluster id is present, its last six characters are appended
 * after a colon inside the brackets.
 */
function buildTemplateTitle(
  dialect: HistoricSqlPullConfig['dialect'],
  firstTable: string,
  fingerprint: string,
  subClusterId: string | null,
): string {
  const shortFingerprint = fingerprint.slice(0, 6);
  const clusterSuffix = subClusterId ? `:${subClusterId.slice(-6)}` : '';
  return `${dialect} · ${firstTable} [${shortFingerprint}${clusterSuffix}]`;
}
|
||||
|
||||
/**
 * Renders the markdown page for a template: a heading, the normalized SQL in
 * a fenced `sql` block, and a bullet list of the tables touched. The output
 * ends with a trailing newline.
 *
 * @param fingerprint Text used verbatim as the page heading.
 */
function renderTemplatePage(fingerprint: string, normalizedSql: string, tablesTouched: string[]): string {
  const lines: string[] = [
    `# ${fingerprint}`,
    '',
    '## Normalized SQL',
    '```sql',
    normalizedSql,
    '```',
    '',
    '## Tables touched',
  ];
  for (const table of tablesTouched) {
    lines.push(`- ${table}`);
  }
  lines.push('');
  return lines.join('\n');
}
|
||||
|
||||
/**
 * Picks up to five representative executions to publish as samples.
 *
 * Returns no samples when the redaction policy disallows them. Otherwise one
 * row is kept per distinct tuple of literal values (preferring successful
 * rows, then the most recent), and the five most recent of those are
 * emitted with their SQL redacted.
 */
function selectSamples(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redaction: RedactionPolicy,
): HistoricSqlUsage['samples'] {
  if (!redaction.samplesAllowed) {
    return [];
  }

  type Entry = (typeof rows)[number];
  const newestFirst = (a: Entry, b: Entry): number => b.row.startedAt.localeCompare(a.row.startedAt);

  // Successful rows first, then newest first within each group.
  const candidates = rows.slice().sort((a, b) => {
    if (a.row.success === b.row.success) {
      return newestFirst(a, b);
    }
    return a.row.success ? -1 : 1;
  });

  // First candidate seen for each literal tuple becomes its representative.
  const representatives = new Map<string, Entry>();
  for (const candidate of candidates) {
    const tupleKey = candidate.analysis.literalSlots
      .slice()
      .sort((a, b) => a.position - b.position)
      .map((slot) => slot.exampleValue)
      .join('\u001f');
    if (representatives.has(tupleKey)) {
      continue;
    }
    representatives.set(tupleKey, candidate);
  }

  return Array.from(representatives.values())
    .sort(newestFirst)
    .slice(0, 5)
    .map(({ row }) => ({
      started_at: row.startedAt,
      user: row.user,
      bound_sql: redactText(row.sql, redaction.redactors),
      ...(row.rowsProduced === undefined ? {} : { rows_produced: row.rowsProduced }),
      runtime_ms: row.runtimeMs,
      success: row.success,
    }));
}
|
||||
|
||||
/**
 * Keeps the `maxTemplatesPerRun` highest-ranked templates. Ties on score are
 * broken by template id so the selection is deterministic.
 */
function selectTemplates(templates: TemplateVariant[], maxTemplatesPerRun: number, now: Date): TemplateVariant[] {
  const scored = templates.map((template) => ({ template, score: rankTemplate(template, now) }));
  scored.sort((a, b) => b.score - a.score || a.template.id.localeCompare(b.template.id));

  const kept: TemplateVariant[] = [];
  for (const { template } of scored.slice(0, maxTemplatesPerRun)) {
    kept.push(template);
  }
  return kept;
}
|
||||
|
||||
/**
 * Scores a template as `distinct users × log1p(executions) × recency weight`,
 * where the recency weight is `1 / (1 + ageDays / 30)` based on the most
 * recent row. A template with no rows is treated as 365 days old.
 */
function rankTemplate(template: TemplateVariant, now: Date): number {
  const users = new Set<string>();
  let latestStartedAt: string | null = null;
  for (const { row } of template.rows) {
    if (row.user) {
      users.add(row.user);
    }
    if (latestStartedAt === null || row.startedAt > latestStartedAt) {
      latestStartedAt = row.startedAt;
    }
  }

  let ageDays = 365;
  if (latestStartedAt !== null) {
    ageDays = Math.max(0, (now.getTime() - new Date(latestStartedAt).getTime()) / 86400000);
  }
  const recencyWeight = 1 / (1 + ageDays / 30);
  return users.size * Math.log1p(template.rows.length) * recencyWeight;
}
|
||||
|
||||
/**
 * Nearest-rank percentile over an ascending-sorted array.
 *
 * @param values Values sorted in ascending order.
 * @param percentileValue Fraction in the range 0..1 (e.g. 0.95).
 * @returns The selected value, or `null` for an empty array.
 */
function percentile(values: number[], percentileValue: number): number | null {
  const count = values.length;
  if (count === 0) {
    return null;
  }
  const rank = Math.ceil(count * percentileValue) - 1;
  // Clamp the rank into the valid index range.
  const index = rank < 0 ? 0 : rank > count - 1 ? count - 1 : rank;
  return values[index];
}
|
||||
|
||||
/**
 * Totals `rowsProduced` over the rows that report a numeric count.
 *
 * @returns The sum, or `null` when no row reports a count.
 */
function sumRowsProduced(rows: HistoricSqlRawQueryRow[]): number | null {
  let total = 0;
  let reported = false;
  for (const row of rows) {
    const produced = row.rowsProduced;
    if (typeof produced === 'number') {
      total += produced;
      reported = true;
    }
  }
  return reported ? total : null;
}
|
||||
|
||||
/**
 * Compiles redaction patterns into global regular expressions.
 *
 * A pattern that fails to compile is dropped, a warning describing the
 * failure is appended to `warnings`, and samples are disallowed for the run.
 *
 * @param patterns Regular-expression sources to compile.
 * @param warnings Warning list; mutated in place.
 */
function compileRedactors(patterns: string[], warnings: string[]): RedactionPolicy {
  const redactors: RegExp[] = [];
  let samplesAllowed = true;

  for (const pattern of patterns) {
    try {
      redactors.push(new RegExp(pattern, 'g'));
    } catch (error) {
      samplesAllowed = false;
      const reason = error instanceof Error ? error.message : String(error);
      warnings.push(`redaction_skipped:invalid_redaction_pattern:${pattern}:${reason}`);
    }
  }

  return { redactors, samplesAllowed };
}
|
||||
|
||||
/**
 * Applies each redactor in turn, replacing its matches with `<redacted>`.
 */
function redactText(value: string, redactors: RegExp[]): string {
  let redacted = value;
  for (const redactor of redactors) {
    redacted = redacted.replace(redactor, '<redacted>');
  }
  return redacted;
}
|
||||
|
||||
/**
 * Writes `value` as two-space-indented JSON followed by a newline, at
 * `relPath` under `stagedDir`.
 */
async function writeJson(stagedDir: string, relPath: string, value: unknown): Promise<void> {
  const serialized = JSON.stringify(value, null, 2);
  await writeText(stagedDir, relPath, `${serialized}\n`);
}
|
||||
|
||||
/**
 * Writes UTF-8 text to `relPath` under `stagedDir`, creating any missing
 * parent directories first.
 */
async function writeText(stagedDir: string, relPath: string, value: string): Promise<void> {
  const destination = join(stagedDir, relPath);
  const parentDir = dirname(destination);
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, value, 'utf-8');
}
|
||||
201
packages/context/src/ingest/adapters/historic-sql/types.ts
Normal file
201
packages/context/src/ingest/adapters/historic-sql/types.ts
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
import { z } from 'zod';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
|
||||
/** Source key identifying the historic-SQL ingest adapter. */
export const HISTORIC_SQL_SOURCE_KEY = 'historic-sql' as const;

/** Object type stamped on every staged historic-SQL template. */
export const HISTORIC_SQL_OBJECT_TYPE = 'historic_sql_template' as const;

/** SQL dialects the adapter accepts. */
const historicSqlDialectSchema = z.enum(['snowflake', 'bigquery', 'postgres']);

/** Union of the supported dialect names. */
export type HistoricSqlDialect = z.infer<typeof historicSqlDialectSchema>;
|
||||
|
||||
/**
 * Configuration for one historic-SQL pull.
 *
 * - `windowDays`: integer 1–365, default 90.
 * - `lastSuccessfulCursor`: ISO datetime or null, default null.
 * - `serviceAccountUserPatterns` / `redactionPatterns`: pattern strings, default empty.
 * - `maxTemplatesPerRun`: integer 1–5000, default 5000.
 * - `minCalls`: integer of at least 1, default 5.
 */
export const historicSqlPullConfigSchema = z.object({
  dialect: historicSqlDialectSchema,
  windowDays: z.number().int().min(1).max(365).default(90),
  lastSuccessfulCursor: z.string().datetime().nullable().default(null),
  serviceAccountUserPatterns: z.array(z.string()).default([]),
  redactionPatterns: z.array(z.string()).default([]),
  maxTemplatesPerRun: z.number().int().min(1).max(5000).default(5000),
  minCalls: z.number().int().min(1).default(5),
});

/** Parsed pull configuration (defaults applied). */
export type HistoricSqlPullConfig = z.infer<typeof historicSqlPullConfigSchema>;
|
||||
|
||||
/** Time range passed to a query-history reader's `fetch`. */
export interface HistoricSqlTimeWindow {
  /** Start of the window. */
  start: Date;
  /** End of the window. */
  end: Date;
}
|
||||
|
||||
/**
 * One executed statement as read from a warehouse's query history.
 *
 * `id`, `sql` and `startedAt` are required; `user`, `endedAt`, `runtimeMs`
 * and `errorMessage` default to null, `success` defaults to true, and
 * `rowsProduced` may be omitted entirely.
 */
export const historicSqlRawQueryRowSchema = z.object({
  id: z.string().min(1),
  sql: z.string().min(1),
  user: z.string().nullable().default(null),
  startedAt: z.string().datetime(),
  endedAt: z.string().datetime().nullable().default(null),
  runtimeMs: z.number().nonnegative().nullable().default(null),
  rowsProduced: z.number().int().nonnegative().nullable().optional(),
  success: z.boolean().default(true),
  errorMessage: z.string().nullable().default(null),
});

/** Parsed query-history row (defaults applied). */
export type HistoricSqlRawQueryRow = z.infer<typeof historicSqlRawQueryRowSchema>;
|
||||
|
||||
/** Reads raw query-history rows through an opaque, caller-supplied client. */
export interface HistoricSqlQueryHistoryReader {
  /** Checks the client before a pull; resolves with no value. */
  probe(client: unknown): Promise<void>;

  /**
   * Streams query-history rows for the given window.
   *
   * @param cursor Optional resume cursor; may be null or omitted.
   */
  fetch(
    client: unknown,
    window: HistoricSqlTimeWindow,
    cursor?: string | null,
  ): AsyncIterable<HistoricSqlRawQueryRow>;
}
|
||||
|
||||
/** Minimal Postgres client contract: run a query and get tabular results back. */
export interface KloPostgresQueryClient {
  /**
   * Executes `sql` with optional positional parameters.
   *
   * @returns Column headers, row arrays, and optionally a total row count.
   */
  executeQuery(sql: string, params?: unknown[]): Promise<{ headers: string[]; rows: unknown[][]; totalRows?: number }>;
}
|
||||
|
||||
/** Result of probing a Postgres server through a `PostgresPgssReader`. */
export interface PostgresPgssProbeResult {
  /** Server version string reported by the probe. */
  pgServerVersion: string;
  /** Non-fatal warnings raised while probing. */
  warnings: string[];
}

/** Point-in-time statistics snapshot returned by `PostgresPgssReader.readSnapshot`. */
export interface PostgresPgssSnapshot {
  /** When the statistics were last reset, or null if not available. */
  statsResetAt: string | null;
  /** Deallocation counter, or null if not available. */
  deallocCount: number | null;
  /** Per-statement statistics rows. */
  rows: PostgresPgssRow[];
}
|
||||
|
||||
/** Reads statement statistics from Postgres through a `KloPostgresQueryClient`. */
export interface PostgresPgssReader {
  /** Probes the server and reports its version plus any warnings. */
  probe(client: KloPostgresQueryClient): Promise<PostgresPgssProbeResult>;

  /**
   * Reads a statistics snapshot.
   *
   * @param options.minCalls Call-count threshold supplied by the caller.
   * @param options.maxTemplates Template limit supplied by the caller.
   */
  readSnapshot(
    client: KloPostgresQueryClient,
    options: { minCalls: number; maxTemplates: number },
  ): Promise<PostgresPgssSnapshot>;
}
|
||||
|
||||
/** One statement-statistics row within a `PostgresPgssSnapshot`. */
export interface PostgresPgssRow {
  queryid: string;
  userid: string;
  /** User name, or null when not resolved. */
  username: string | null;
  dbid: string;
  /** Database name, or null when not resolved. */
  database: string | null;
  /** Statement text. */
  query: string;
  calls: number;
  totalExecTime: number;
  meanExecTime: number;
  totalRows: number;
}
|
||||
|
||||
/**
 * Aggregated statement statistics carrying delta counters.
 *
 * NOTE(review): the `delta*` names suggest differences against a previous
 * baseline — confirm against the code that builds these rows.
 */
export interface PostgresPgssAggregateRow {
  id: string;
  queryid: string;
  dbid: string;
  /** Database name, or null when not resolved. */
  database: string | null;
  /** Statement text. */
  query: string;
  deltaCalls: number;
  deltaExecTime: number;
  deltaRows: number;
  meanExecTime: number;
  distinctUsersDelta: number;
  users: string[];
  firstObservedAt: string;
}
|
||||
|
||||
/** Dependencies injected into the historic-SQL source adapter. */
export interface HistoricSqlSourceAdapterDeps {
  /** SQL analysis port. */
  sqlAnalysis: SqlAnalysisPort;
  /** Query-history reader. */
  reader: HistoricSqlQueryHistoryReader;
  /** Opaque client handed to `reader`. */
  queryClient: unknown;
  /** Optional Postgres statistics reader. */
  postgresReader?: PostgresPgssReader;
  /** Optional client handed to `postgresReader`. */
  postgresQueryClient?: KloPostgresQueryClient;
  /** Optional root directory for Postgres baselines. */
  postgresBaselineRootDir?: string;
  /** Optional clock override. */
  now?: () => Date;
  /** Optional hook invoked with the details of a successful pull. */
  onPullSucceeded?: (ctx: {
    connectionId: string;
    sourceKey: string;
    syncId: string;
    trigger: import('../../types.js').IngestTrigger;
    completedAt: Date;
    stagedDir: string;
    nextSuccessfulCursor: string | null;
  }) => Promise<void>;
}
|
||||
|
||||
/** Possible classifications of a literal slot. */
const historicSqlLiteralSlotClassificationSchema = z.enum(['constant', 'runtime', 'categorical']);

/** Union of the literal-slot classification names. */
export type HistoricSqlLiteralSlotClassification = z.infer<typeof historicSqlLiteralSlotClassificationSchema>;
|
||||
|
||||
/**
 * Shape of a staged template's `metadata.json`.
 *
 * `lastEditedAt` is always null. Literal slot positions are integers
 * starting at 1, and triage signals are a string-to-string record.
 */
export const historicSqlMetadataSchema = z.object({
  id: z.string().min(1),
  title: z.string().min(1),
  path: z.string().min(1),
  objectType: z.literal(HISTORIC_SQL_OBJECT_TYPE),
  lastEditedAt: z.null(),
  properties: z.object({
    fingerprint: z.string().min(1),
    sub_cluster_id: z.string().nullable(),
    dialect: historicSqlDialectSchema,
    tables_touched: z.array(z.string()),
    literal_slots: z.array(
      z.object({
        position: z.number().int().min(1),
        type: z.enum(['string', 'number', 'timestamp', 'date', 'boolean', 'null', 'unknown']),
        classification: historicSqlLiteralSlotClassificationSchema,
      }),
    ),
    triage_signals: z.record(z.string(), z.string()),
  }),
});

/** Parsed template metadata. */
export type HistoricSqlMetadata = z.infer<typeof historicSqlMetadataSchema>;
|
||||
|
||||
/**
 * Shape of a staged template's `usage.json`: aggregate statistics, per-slot
 * value distributions, and sample executions.
 *
 * `mean_runtime_ms` and `rows_produced` are optional; each `top_values`
 * entry is a `[value, count]` pair.
 */
export const historicSqlUsageSchema = z.object({
  stats: z.object({
    executions: z.number().int().nonnegative(),
    distinct_users: z.number().int().nonnegative(),
    first_seen: z.string().datetime(),
    last_seen: z.string().datetime(),
    p50_runtime_ms: z.number().nonnegative().nullable(),
    p95_runtime_ms: z.number().nonnegative().nullable(),
    mean_runtime_ms: z.number().nonnegative().nullable().optional(),
    error_rate: z.number().min(0).max(1),
    rows_produced: z.number().int().nonnegative().nullable().optional(),
  }),
  literal_slots: z.array(
    z.object({
      position: z.number().int().min(1),
      distinct_values: z.number().int().nonnegative(),
      top_values: z.array(z.tuple([z.string(), z.number().int().nonnegative()])),
    }),
  ),
  samples: z.array(
    z.object({
      started_at: z.string().datetime(),
      user: z.string().nullable(),
      bound_sql: z.string(),
      rows_produced: z.number().int().nonnegative().nullable().optional(),
      runtime_ms: z.number().nonnegative().nullable(),
      success: z.boolean(),
    }),
  ),
});

/** Parsed template usage summary. */
export type HistoricSqlUsage = z.infer<typeof historicSqlUsageSchema>;
|
||||
|
||||
/**
 * Shape of the `manifest.json` written at the root of a staged pull.
 *
 * `degraded`, `statsResetAt`, `baselineFirstRun`, `pgServerVersion` and
 * `deallocCount` default to false/null when absent.
 */
export const historicSqlManifestSchema = z.object({
  source: z.literal(HISTORIC_SQL_SOURCE_KEY),
  connectionId: z.string().min(1),
  dialect: historicSqlDialectSchema,
  fetchedAt: z.string().datetime(),
  windowStart: z.string().datetime(),
  windowEnd: z.string().datetime(),
  nextSuccessfulCursor: z.string().datetime().nullable(),
  templateCount: z.number().int().nonnegative(),
  capped: z.boolean(),
  warnings: z.array(z.string()),
  degraded: z.boolean().default(false),
  statsResetAt: z.string().datetime().nullable().default(null),
  baselineFirstRun: z.boolean().default(false),
  pgServerVersion: z.string().nullable().default(null),
  deallocCount: z.number().int().nonnegative().nullable().default(null),
  templates: z.array(
    z.object({
      id: z.string().min(1),
      fingerprint: z.string().min(1),
      subClusterId: z.string().nullable(),
      path: z.string().min(1),
    }),
  ),
});

/** Parsed pull manifest (defaults applied). */
export type HistoricSqlManifest = z.infer<typeof historicSqlManifestSchema>;
|
||||
107
packages/context/src/ingest/adapters/live-database/chunk.test.ts
Normal file
107
packages/context/src/ingest/adapters/live-database/chunk.test.ts
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
import { mkdtemp } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import type { KloSchemaSnapshot } from '../../../scan/types.js';
|
||||
import { chunkLiveDatabaseStagedDir } from './chunk.js';
|
||||
import { liveDatabaseTablePath, writeLiveDatabaseSnapshot } from './stage.js';
|
||||
|
||||
/**
 * Builds a fresh two-table (`orders`, `customers`) Postgres schema snapshot
 * fixture. Both tables live in `public` and have a single integer
 * primary-key column `id`.
 */
function snapshot(): KloSchemaSnapshot {
  const tableWithIdColumn = (name: string): KloSchemaSnapshot['tables'][number] => ({
    name,
    catalog: null,
    db: 'public',
    kind: 'table',
    comment: null,
    estimatedRows: null,
    columns: [
      {
        name: 'id',
        nativeType: 'integer',
        normalizedType: 'integer',
        dimensionType: 'number',
        nullable: false,
        primaryKey: true,
        comment: null,
      },
    ],
    foreignKeys: [],
  });

  return {
    connectionId: 'conn-1',
    driver: 'postgres',
    extractedAt: '2026-04-27T00:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: {},
    tables: [tableWithIdColumn('orders'), tableWithIdColumn('customers')],
  };
}
|
||||
|
||||
// Exercises chunking of a staged live-database snapshot: the full first run,
// an incremental diff, and fan-out when a shared dependency file changes.
describe('chunkLiveDatabaseStagedDir', () => {
  it('emits one work unit per table on the first run', async () => {
    const stagedDir = await mkdtemp(join(tmpdir(), 'klo-live-db-chunk-'));
    await writeLiveDatabaseSnapshot(stagedDir, snapshot());

    const { workUnits } = await chunkLiveDatabaseStagedDir(stagedDir);
    const unitKeys = workUnits.map((unit) => unit.unitKey);
    expect(unitKeys).toEqual(['live-database-public-customers', 'live-database-public-orders']);

    const firstUnit = workUnits[0];
    expect(firstUnit?.dependencyPaths).toEqual(['connection.json', 'foreign-keys.json']);
    const ordersPath = liveDatabaseTablePath({ catalog: null, db: 'public', name: 'orders' });
    expect(firstUnit?.peerFileIndex).toContain(ordersPath);
  });

  it('keeps only changed tables during incremental syncs and records table evictions', async () => {
    const stagedDir = await mkdtemp(join(tmpdir(), 'klo-live-db-diff-'));
    await writeLiveDatabaseSnapshot(stagedDir, snapshot());
    const ordersPath = liveDatabaseTablePath({ catalog: null, db: 'public', name: 'orders' });
    const customersPath = liveDatabaseTablePath({ catalog: null, db: 'public', name: 'customers' });

    const chunked = await chunkLiveDatabaseStagedDir(stagedDir, {
      added: [],
      modified: [ordersPath],
      deleted: [customersPath],
      unchanged: ['connection.json', 'foreign-keys.json'],
    });

    const unitKeys = chunked.workUnits.map((unit) => unit.unitKey);
    expect(unitKeys).toEqual(['live-database-public-orders']);
    expect(chunked.eviction?.deletedRawPaths).toEqual([customersPath]);
  });

  it('fans out all table work units when the foreign-key index changes', async () => {
    const stagedDir = await mkdtemp(join(tmpdir(), 'klo-live-db-fk-'));
    await writeLiveDatabaseSnapshot(stagedDir, snapshot());

    const chunked = await chunkLiveDatabaseStagedDir(stagedDir, {
      added: [],
      modified: ['foreign-keys.json'],
      deleted: [],
      unchanged: [],
    });

    expect(chunked.workUnits).toHaveLength(2);
  });
});
|
||||
58
packages/context/src/ingest/adapters/live-database/chunk.ts
Normal file
58
packages/context/src/ingest/adapters/live-database/chunk.ts
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
import type { ChunkResult, DiffSet, WorkUnit } from '../../types.js';
|
||||
import type { KloSchemaTable } from '../../../scan/types.js';
|
||||
import { LIVE_DATABASE_FOREIGN_KEYS_FILE, LIVE_DATABASE_META_FILE, readLiveDatabaseTableFiles } from './stage.js';
|
||||
|
||||
/**
 * Builds the work-unit key for a table: `live-database-` followed by the
 * slugged catalog, db and name joined with dashes.
 *
 * Each part is lower-cased, runs of non-alphanumeric characters become a
 * single dash, and leading/trailing dashes are trimmed. Missing or empty
 * parts are skipped; if nothing remains the key is `live-database-table`.
 */
function unitKey(table: KloSchemaTable): string {
  const slugs: string[] = [];
  for (const part of [table.catalog, table.db, table.name]) {
    if (typeof part !== 'string' || part.length === 0) {
      continue;
    }
    const slug = part
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-+|-+$/g, '');
    if (slug) {
      slugs.push(slug);
    }
  }
  const suffix = slugs.join('-');
  return `live-database-${suffix || 'table'}`;
}
|
||||
|
||||
/**
 * Human-readable table name: the non-empty parts of catalog, db and name
 * joined with dots.
 */
function displayName(table: KloSchemaTable): string {
  const qualifiers = [table.catalog, table.db, table.name];
  const present = qualifiers.filter((part) => Boolean(part));
  return present.join('.');
}
|
||||
|
||||
/** Reports whether a staged path is a table file: under `tables/` and ending in `.json`. */
function isTablePath(path: string): boolean {
  if (!path.startsWith('tables/')) {
    return false;
  }
  return path.endsWith('.json');
}
|
||||
|
||||
/**
 * Splits a staged live-database snapshot into one work unit per table file.
 *
 * Without a diff set every table gets a work unit. With a diff set only
 * added or modified tables get one, unless the connection metadata file or
 * the foreign-key index was itself added or modified, in which case every
 * table is emitted again. Deleted table files are reported for eviction.
 *
 * @param stagedDir Directory holding the staged snapshot.
 * @param diffSet Optional change set from the previous sync.
 * @returns Work units in the order the table files were read, plus an
 *   `eviction` entry when table files were deleted.
 */
export async function chunkLiveDatabaseStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const tableFiles = await readLiveDatabaseTableFiles(stagedDir);

  // Sort once up front: filtering a sorted list keeps it sorted, so each
  // unit's peer index needs no sort of its own.
  const sortedTablePaths = tableFiles.map((file) => file.path).sort();

  const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;
  const globalTouched =
    touched !== null && (touched.has(LIVE_DATABASE_META_FILE) || touched.has(LIVE_DATABASE_FOREIGN_KEYS_FILE));

  const workUnits: WorkUnit[] = [];
  for (const file of tableFiles) {
    if (touched !== null && !globalTouched && !touched.has(file.path)) {
      continue;
    }
    const label = displayName(file.table);
    const columnCount = file.table.columns.length;
    workUnits.push({
      unitKey: unitKey(file.table),
      displayLabel: `Live database table ${label}`,
      rawFiles: [file.path],
      peerFileIndex: sortedTablePaths.filter((path) => path !== file.path),
      // Fresh array per unit so mutating one unit's dependencies cannot
      // leak into the others.
      dependencyPaths: [LIVE_DATABASE_META_FILE, LIVE_DATABASE_FOREIGN_KEYS_FILE],
      notes: `Database catalog snapshot for ${label} with ${columnCount} column${columnCount === 1 ? '' : 's'}.`,
    });
  }

  const deletedRawPaths = diffSet ? diffSet.deleted.filter(isTablePath).sort() : [];
  return {
    workUnits,
    ...(deletedRawPaths.length > 0 ? { eviction: { deletedRawPaths } } : {}),
  };
}
|
||||
|
|
@ -0,0 +1,224 @@
|
|||
import { once } from 'node:events';
|
||||
import { createServer } from 'node:http';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { createDaemonLiveDatabaseIntrospection } from './daemon-introspection.js';
|
||||
|
||||
// Canned daemon reply shared by the tests below: two tables in the "public"
// schema, with "orders" pointing at "customers" through a foreign key.
const daemonResponse = {
  connection_id: 'warehouse',
  extracted_at: '2026-04-28T10:00:00+00:00',
  metadata: { driver: 'postgres', schemas: ['public'] },
  tables: [
    {
      catalog: 'warehouse',
      db: 'public',
      name: 'customers',
      comment: null,
      columns: [{ name: 'id', type: 'integer', nullable: false, primary_key: true, comment: null }],
      foreign_keys: [],
    },
    {
      catalog: 'warehouse',
      db: 'public',
      name: 'orders',
      comment: 'Order facts',
      columns: [
        { name: 'id', type: 'integer', nullable: false, primary_key: true, comment: 'Order id' },
        { name: 'customer_id', type: 'integer', nullable: false, primary_key: false, comment: null },
      ],
      foreign_keys: [
        {
          from_column: 'customer_id',
          to_table: 'customers',
          to_column: 'id',
          constraint_name: 'orders_customer_id_fkey',
        },
      ],
    },
  ],
};
|
||||
|
||||
describe('createDaemonLiveDatabaseIntrospection', () => {
  // Request body expected on both transports for the "warehouse" connection
  // (schemas ['public'], default timeouts).
  const expectedPayload = {
    connection_id: 'warehouse',
    driver: 'postgres',
    url: 'postgres://localhost:5432/warehouse',
    schemas: ['public'],
    statement_timeout_ms: 30_000,
    connection_timeout_seconds: 5,
  };

  it('calls the database-introspect daemon command and maps the snapshot response', async () => {
    const runJson = vi.fn(async () => daemonResponse);
    const introspection = createDaemonLiveDatabaseIntrospection({
      connections: {
        warehouse: {
          driver: 'postgres',
          url: 'postgres://localhost:5432/warehouse',
          readonly: true,
        },
      },
      schemas: ['public'],
      runJson,
    });

    // Every column in the fixture is a non-nullable integer; only these three fields vary.
    const integerColumn = (name: string, primaryKey: boolean, comment: string | null) => ({
      name,
      nativeType: 'integer',
      normalizedType: 'integer',
      dimensionType: 'number',
      nullable: false,
      primaryKey,
      comment,
    });

    const snapshot = await introspection.extractSchema('warehouse');

    expect(snapshot).toEqual({
      connectionId: 'warehouse',
      driver: 'postgres',
      extractedAt: '2026-04-28T10:00:00+00:00',
      scope: { schemas: ['public'] },
      metadata: { driver: 'postgres', schemas: ['public'] },
      tables: [
        {
          catalog: 'warehouse',
          db: 'public',
          name: 'customers',
          kind: 'table',
          comment: null,
          estimatedRows: null,
          columns: [integerColumn('id', true, null)],
          foreignKeys: [],
        },
        {
          catalog: 'warehouse',
          db: 'public',
          name: 'orders',
          kind: 'table',
          comment: 'Order facts',
          estimatedRows: null,
          columns: [integerColumn('id', true, 'Order id'), integerColumn('customer_id', false, null)],
          foreignKeys: [
            {
              fromColumn: 'customer_id',
              toCatalog: null,
              toDb: null,
              toTable: 'customers',
              toColumn: 'id',
              constraintName: 'orders_customer_id_fkey',
            },
          ],
        },
      ],
    });

    expect(runJson).toHaveBeenCalledWith('database-introspect', expectedPayload);
  });

  it('calls a running daemon HTTP endpoint when baseUrl is configured', async () => {
    // Records every request the fake daemon receives so the test can assert on it.
    const requests: Array<{ url: string | undefined; body: unknown }> = [];
    const server = createServer((request, response) => {
      const chunks: Buffer[] = [];
      request.on('data', (chunk: Buffer) => chunks.push(chunk));
      request.on('end', () => {
        const rawBody = Buffer.concat(chunks).toString('utf8');
        requests.push({ url: request.url, body: JSON.parse(rawBody) });
        response.writeHead(200, { 'content-type': 'application/json' });
        response.end(JSON.stringify(daemonResponse));
      });
    });

    // Port 0 lets the OS pick a free port.
    server.listen(0, '127.0.0.1');
    await once(server, 'listening');
    try {
      const address = server.address();
      if (!address || typeof address === 'string') {
        throw new Error('expected TCP server address');
      }
      const introspection = createDaemonLiveDatabaseIntrospection({
        connections: {
          warehouse: {
            driver: 'postgresql',
            url: 'postgres://localhost:5432/warehouse',
            readonly: true,
          },
        },
        baseUrl: `http://127.0.0.1:${address.port}`,
      });

      const snapshot = await introspection.extractSchema('warehouse');

      expect(snapshot).toMatchObject({
        connectionId: 'warehouse',
        tables: [{ name: 'customers' }, { name: 'orders' }],
      });
      // The "postgresql" alias must reach the daemon as "postgres".
      expect(requests).toEqual([{ url: '/database/introspect', body: expectedPayload }]);
    } finally {
      server.close();
    }
  });

  it('requires a configured read-only postgres connection with a url', async () => {
    const introspection = createDaemonLiveDatabaseIntrospection({
      connections: {
        warehouse: {
          driver: 'postgres',
          url: 'postgres://localhost:5432/warehouse',
          readonly: false,
        },
      },
      runJson: vi.fn(async () => daemonResponse),
    });

    const attempt = introspection.extractSchema('warehouse');

    await expect(attempt).rejects.toThrow(
      'Local live-database ingest requires connections.warehouse.readonly: true.',
    );
  });

  it('rejects unsupported local connection drivers before calling the daemon', async () => {
    const runJson = vi.fn(async () => daemonResponse);
    const introspection = createDaemonLiveDatabaseIntrospection({
      connections: {
        warehouse: {
          driver: 'snowflake',
          url: 'snowflake://example',
          readonly: true,
        },
      },
      runJson,
    });

    const attempt = introspection.extractSchema('warehouse');

    await expect(attempt).rejects.toThrow('Local live-database ingest cannot run driver "snowflake".');
    expect(runJson).not.toHaveBeenCalled();
  });
});
|
||||
|
|
@ -0,0 +1,256 @@
|
|||
import { spawn } from 'node:child_process';
|
||||
import { request as httpRequest } from 'node:http';
|
||||
import { request as httpsRequest } from 'node:https';
|
||||
import { URL } from 'node:url';
|
||||
import type { KloProjectConnectionConfig } from '../../../project/config.js';
|
||||
import type { KloSchemaColumn, KloSchemaForeignKey, KloSchemaSnapshot, KloSchemaTable } from '../../../scan/types.js';
|
||||
import { inferKloDimensionType, normalizeKloNativeType } from '../../../scan/type-normalization.js';
|
||||
import type { LiveDatabaseIntrospectionPort } from './types.js';
|
||||
|
||||
/** Name of the klo-daemon subcommand used for database introspection. */
export type KloDaemonDatabaseIntrospectionCommand = 'database-introspect';
|
||||
|
||||
/**
 * Runs one daemon subcommand with a JSON payload and resolves with the JSON
 * object the daemon produced.
 */
export type KloDaemonDatabaseJsonRunner = (
  subcommand: KloDaemonDatabaseIntrospectionCommand,
  payload: Record<string, unknown>,
) => Promise<Record<string, unknown>>;
|
||||
|
||||
/**
 * Sends a JSON payload to a daemon HTTP path and resolves with the JSON object
 * from the response body.
 */
export type KloDaemonDatabaseHttpJsonRunner = (
  path: string,
  payload: Record<string, unknown>,
) => Promise<Record<string, unknown>>;
|
||||
|
||||
/** Configuration accepted by `createDaemonLiveDatabaseIntrospection`. */
export interface DaemonLiveDatabaseIntrospectionOptions {
  /** Project connections, keyed by connection id. */
  connections: Record<string, KloProjectConnectionConfig>;
  /** Schemas to introspect; `['public']` when omitted. */
  schemas?: string[];
  /** Sent to the daemon as `statement_timeout_ms`; 30 000 when omitted. */
  statementTimeoutMs?: number;
  /** Sent to the daemon as `connection_timeout_seconds`; 5 when omitted. */
  connectionTimeoutSeconds?: number;
  /** Executable used to start the daemon process; `python` when omitted. */
  command?: string;
  /** Arguments placed before the subcommand; `['-m', 'klo_daemon']` when omitted. */
  args?: string[];
  /** Working directory for the spawned daemon process. */
  cwd?: string;
  /** Extra environment variables, layered on top of `process.env`. */
  env?: NodeJS.ProcessEnv;
  /** When set (and `requestJson` is not), requests are POSTed to this daemon URL instead of spawning a process. */
  baseUrl?: string;
  /** Replaces the default process-spawning runner. */
  runJson?: KloDaemonDatabaseJsonRunner;
  /** Replaces the default HTTP runner; when present it is used instead of `runJson`. */
  requestJson?: KloDaemonDatabaseHttpJsonRunner;
  /** Clock used for `extractedAt` when the daemon response carries none. */
  now?: () => Date;
}
|
||||
|
||||
// Schemas introspected when the caller does not list any.
const DEFAULT_SCHEMAS = ['public'];
|
||||
|
||||
/**
 * Parses daemon output and guarantees the result is a plain JSON object.
 *
 * @param raw Text produced by the daemon (process stdout or HTTP body).
 * @param subcommand Subcommand or HTTP path, used only in error messages.
 * @returns The parsed object.
 * @throws SyntaxError when `raw` is not valid JSON (for example empty output).
 * @throws Error when the JSON is valid but is not an object (null, array, scalar).
 */
function parseJsonObject(raw: string, subcommand: string): Record<string, unknown> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    // Re-throw as SyntaxError so the failure names the daemon call instead of
    // surfacing a bare "Unexpected end of JSON input".
    const detail = error instanceof Error ? error.message : String(error);
    throw new SyntaxError(`klo-daemon ${subcommand} returned invalid JSON: ${detail}`);
  }
  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error(`klo-daemon ${subcommand} returned non-object JSON`);
  }
  return parsed as Record<string, unknown>;
}
|
||||
|
||||
/**
 * Builds a runner that executes one klo-daemon subcommand as a child process.
 *
 * The payload is written to the child's stdin as a single JSON line; the
 * child's stdout is parsed as a JSON object once the process closes.
 *
 * @param options Executable, leading arguments, and optional cwd / extra env.
 * @returns A runner that rejects when the process cannot be spawned, exits
 *   with a non-zero code (stderr is included in the message), or prints
 *   something that is not a JSON object.
 */
function runProcessJson(
  options: Required<Pick<DaemonLiveDatabaseIntrospectionOptions, 'command' | 'args'>> &
    Pick<DaemonLiveDatabaseIntrospectionOptions, 'cwd' | 'env'>,
): KloDaemonDatabaseJsonRunner {
  return async (subcommand, payload) =>
    new Promise((resolve, reject) => {
      const child = spawn(options.command, [...options.args, subcommand], {
        cwd: options.cwd,
        env: { ...process.env, ...options.env },
        stdio: ['pipe', 'pipe', 'pipe'],
      });
      const stdout: Buffer[] = [];
      const stderr: Buffer[] = [];

      child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk));
      child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk));
      child.on('error', reject);
      child.stdin.on('error', () => {
        // A child that fails to start or exits before reading its input makes
        // the stdin write fail (e.g. EPIPE). Without a listener that stream
        // error is thrown as an uncaught exception. It is deliberately not
        // turned into a rejection: the 'error' / 'close' handlers settle the
        // promise with the more useful spawn error or exit code plus stderr.
      });
      child.on('close', (code) => {
        const stdoutText = Buffer.concat(stdout).toString('utf8').trim();
        const stderrText = Buffer.concat(stderr).toString('utf8').trim();
        if (code !== 0) {
          reject(new Error(`klo-daemon ${subcommand} failed: ${stderrText || `exit code ${code}`}`));
          return;
        }
        try {
          resolve(parseJsonObject(stdoutText, subcommand));
        } catch (error) {
          reject(error);
        }
      });
      child.stdin.end(`${JSON.stringify(payload)}\n`);
    });
}
|
||||
|
||||
/**
 * Ensures the base URL ends with `/` so relative paths resolve beneath it
 * rather than replacing its last path segment.
 */
function normalizedBaseUrl(baseUrl: string): string {
  if (baseUrl.endsWith('/')) {
    return baseUrl;
  }
  return `${baseUrl}/`;
}
|
||||
|
||||
/**
 * Builds a runner that POSTs JSON payloads to a running klo-daemon.
 *
 * @param baseUrl Daemon base URL; `http:` and `https:` are both supported.
 * @returns A runner that resolves with the JSON object in the response body
 *   and rejects on transport errors, non-2xx status codes, or a body that is
 *   not a JSON object.
 */
function postJson(baseUrl: string): KloDaemonDatabaseHttpJsonRunner {
  return async (path, payload) =>
    new Promise((resolve, reject) => {
      // Drop the leading slash so the path resolves under the base URL's own
      // path instead of replacing it.
      const target = new URL(path.replace(/^\//, ''), normalizedBaseUrl(baseUrl));
      const body = JSON.stringify(payload);
      const client = target.protocol === 'https:' ? httpsRequest : httpRequest;
      const request = client(
        target,
        {
          method: 'POST',
          headers: {
            accept: 'application/json',
            'content-type': 'application/json',
            'content-length': Buffer.byteLength(body),
          },
        },
        (response) => {
          const chunks: Buffer[] = [];
          response.on('data', (chunk: Buffer) => chunks.push(chunk));
          // A connection dropped while the body is streaming never reaches
          // 'end'; surface it instead of leaving the promise pending.
          response.on('error', reject);
          response.on('end', () => {
            const text = Buffer.concat(chunks).toString('utf8');
            const statusCode = response.statusCode ?? 0;
            if (statusCode < 200 || statusCode >= 300) {
              reject(new Error(`klo-daemon HTTP ${path} failed with ${statusCode}: ${text}`));
              return;
            }
            try {
              resolve(parseJsonObject(text, path));
            } catch (error) {
              reject(error);
            }
          });
        },
      );
      request.on('error', reject);
      request.end(body);
    });
}
|
||||
|
||||
/**
 * Returns `value` when it is a non-null, non-array object; otherwise returns a
 * fresh empty object.
 */
function recordValue(value: unknown): Record<string, unknown> {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
    return {};
  }
  return value as Record<string, unknown>;
}
|
||||
|
||||
/**
 * Extracts the plain-object entries of an array.
 *
 * Non-array input yields an empty list; entries that are null, arrays, or
 * primitives are dropped.
 */
function recordArray(value: unknown): Array<Record<string, unknown>> {
  if (!Array.isArray(value)) {
    return [];
  }
  const records: Array<Record<string, unknown>> = [];
  for (const item of value) {
    if (item !== null && typeof item === 'object' && !Array.isArray(item)) {
      records.push(item as Record<string, unknown>);
    }
  }
  return records;
}
|
||||
|
||||
/**
 * Returns `value` when it is a non-empty string.
 *
 * @param field Path of the field inside the daemon response, for the error message.
 * @throws Error when the value is not a string or is empty.
 */
function requiredString(value: unknown, field: string): string {
  if (typeof value === 'string' && value.length > 0) {
    return value;
  }
  throw new Error(`klo-daemon database introspection response is missing string field ${field}`);
}
|
||||
|
||||
/** Passes strings through unchanged and maps every other value to `null`. */
function nullableString(value: unknown): string | null {
  if (typeof value !== 'string') {
    return null;
  }
  return value;
}
|
||||
|
||||
/** Passes strings through unchanged and maps every other value to `undefined`. */
function optionalString(value: unknown): string | undefined {
  if (typeof value !== 'string') {
    return undefined;
  }
  return value;
}
|
||||
|
||||
/**
 * Canonicalises a driver name: null/undefined become the empty string, the
 * value is stringified, trimmed, and lower-cased, and the `postgresql` alias
 * is folded into `postgres`.
 */
function normalizeDriver(driver: unknown): string {
  const text = driver === null || driver === undefined ? '' : String(driver);
  const lowered = text.trim().toLowerCase();
  if (lowered === 'postgresql') {
    return 'postgres';
  }
  return lowered;
}
|
||||
|
||||
/**
 * Looks up a connection and verifies it is usable for local ingest.
 *
 * Checks run in this order: the driver must normalise to `postgres`, the
 * connection must be flagged `readonly: true`, and it must carry a non-blank
 * `url`.
 *
 * @throws Error naming the first failed requirement; an unknown connection id
 *   is reported as driver "unknown".
 */
function requirePostgresConnection(
  connections: Record<string, KloProjectConnectionConfig>,
  connectionId: string,
): KloProjectConnectionConfig & { url: string } {
  const config = connections[connectionId];

  if (normalizeDriver(config?.driver) !== 'postgres') {
    throw new Error(`Local live-database ingest cannot run driver "${config?.driver ?? 'unknown'}".`);
  }

  if (config?.readonly !== true) {
    throw new Error(`Local live-database ingest requires connections.${connectionId}.readonly: true.`);
  }

  const hasUrl = typeof config.url === 'string' && config.url.trim().length > 0;
  if (!hasUrl) {
    throw new Error(`Local live-database ingest requires connections.${connectionId}.url.`);
  }

  return config as KloProjectConnectionConfig & { url: string };
}
|
||||
|
||||
/**
 * Converts one raw daemon column record into a `KloSchemaColumn`.
 *
 * `type` and `name` are required strings. A column is treated as nullable
 * unless the daemon explicitly reports `nullable: false`, and as a primary key
 * only when it explicitly reports `primary_key: true`.
 */
function mapColumn(raw: Record<string, unknown>): KloSchemaColumn {
  // Validated in the same order as the fields are read: type first, then name.
  const nativeType = requiredString(raw.type, 'tables[].columns[].type');
  const name = requiredString(raw.name, 'tables[].columns[].name');
  const normalizedType = normalizeKloNativeType(nativeType);
  const dimensionType = inferKloDimensionType(nativeType);
  return {
    name,
    nativeType,
    normalizedType,
    dimensionType,
    nullable: raw.nullable !== false,
    primaryKey: raw.primary_key === true,
    comment: nullableString(raw.comment),
  };
}
|
||||
|
||||
/**
 * Converts one raw daemon foreign-key record into a `KloSchemaForeignKey`.
 *
 * The target catalog and db are always set to `null`; only the target table
 * and column are read from the record.
 */
function mapForeignKey(raw: Record<string, unknown>): KloSchemaForeignKey {
  const fromColumn = requiredString(raw.from_column, 'tables[].foreign_keys[].from_column');
  const toTable = requiredString(raw.to_table, 'tables[].foreign_keys[].to_table');
  const toColumn = requiredString(raw.to_column, 'tables[].foreign_keys[].to_column');
  return {
    fromColumn,
    toCatalog: null,
    toDb: null,
    toTable,
    toColumn,
    constraintName: nullableString(raw.constraint_name),
  };
}
|
||||
|
||||
/**
 * Converts one raw daemon table record into a `KloSchemaTable`.
 *
 * Every entry is reported with kind `table` and no row estimate; columns and
 * foreign keys that are not plain objects are ignored.
 */
function mapTable(raw: Record<string, unknown>): KloSchemaTable {
  const name = requiredString(raw.name, 'tables[].name');
  const columns = recordArray(raw.columns).map((column) => mapColumn(column));
  const foreignKeys = recordArray(raw.foreign_keys).map((foreignKey) => mapForeignKey(foreignKey));
  return {
    catalog: nullableString(raw.catalog),
    db: nullableString(raw.db),
    name,
    kind: 'table',
    comment: nullableString(raw.comment),
    estimatedRows: null,
    columns,
    foreignKeys,
  };
}
|
||||
|
||||
/**
 * Converts the raw daemon response into a `KloSchemaSnapshot`.
 *
 * @param raw Parsed daemon response.
 * @param input Values known on the caller side: the requested connection id,
 *   a fallback extraction timestamp, and the schemas that were requested.
 * @returns The snapshot. `connection_id` and `extracted_at` are taken from the
 *   response when present and fall back to `input` otherwise.
 */
function mapDaemonSnapshot(
  raw: Record<string, unknown>,
  input: { connectionId: string; extractedAt: string; schemas: string[] },
): KloSchemaSnapshot {
  return {
    // `||` (not `??`) so an empty string from the daemon also falls back.
    // The previous `requiredString(...) || input.connectionId` threw before the
    // fallback could ever apply.
    connectionId: optionalString(raw.connection_id) || input.connectionId,
    driver: 'postgres',
    extractedAt: optionalString(raw.extracted_at) ?? input.extractedAt,
    scope: { schemas: input.schemas },
    metadata: recordValue(raw.metadata),
    tables: recordArray(raw.tables).map(mapTable),
  };
}
|
||||
|
||||
/**
 * Creates an introspection port backed by klo-daemon.
 *
 * Transport selection: an explicit `requestJson` wins; otherwise a `baseUrl`
 * selects HTTP POST to `/database/introspect`; otherwise the daemon is run as
 * a child process (`runJson`, defaulting to `python -m klo_daemon`).
 *
 * @param options Connections, scope, timeouts, and transport overrides.
 * @returns A port whose `extractSchema` validates the connection (read-only
 *   postgres with a url), calls the daemon, and maps the response.
 */
export function createDaemonLiveDatabaseIntrospection(
  options: DaemonLiveDatabaseIntrospectionOptions,
): LiveDatabaseIntrospectionPort {
  const schemas = options.schemas ?? DEFAULT_SCHEMAS;
  const now = options.now ?? (() => new Date());

  const runJson =
    options.runJson ??
    runProcessJson({
      command: options.command ?? 'python',
      args: options.args ?? ['-m', 'klo_daemon'],
      cwd: options.cwd,
      env: options.env,
    });
  const requestJson = options.requestJson ?? (options.baseUrl ? postJson(options.baseUrl) : undefined);

  // Sends the payload over HTTP when an HTTP runner exists, else via the process runner.
  const callDaemon = (payload: Record<string, unknown>): Promise<Record<string, unknown>> => {
    if (requestJson) {
      return requestJson('/database/introspect', payload);
    }
    return runJson('database-introspect', payload);
  };

  return {
    async extractSchema(connectionId: string): Promise<KloSchemaSnapshot> {
      const connection = requirePostgresConnection(options.connections, connectionId);
      const raw = await callDaemon({
        connection_id: connectionId,
        driver: normalizeDriver(connection.driver),
        url: connection.url,
        schemas,
        statement_timeout_ms: options.statementTimeoutMs ?? 30_000,
        connection_timeout_seconds: options.connectionTimeoutSeconds ?? 5,
      });
      // The clock is read only after the daemon has answered.
      const extractedAt = now().toISOString();
      return mapDaemonSnapshot(raw, { connectionId, extractedAt, schemas });
    },
  };
}
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import type { KloSchemaSnapshot } from '../../../scan/types.js';
|
||||
import { buildLiveDatabaseTableNaturalKey, kloSchemaSnapshotToExtractedSchema } from './extracted-schema.js';
|
||||
|
||||
/**
 * Fixture: a two-table postgres snapshot in which `orders.customer_id`
 * references `customers.id`.
 */
function snapshot(): KloSchemaSnapshot {
  type Column = KloSchemaSnapshot['tables'][number]['columns'][number];

  // Every fixture column is a non-nullable integer; only these three fields vary.
  const integerColumn = (name: string, primaryKey: boolean, comment: string | null): Column => ({
    name,
    nativeType: 'integer',
    normalizedType: 'integer',
    dimensionType: 'number',
    nullable: false,
    primaryKey,
    comment,
  });

  return {
    connectionId: 'conn-1',
    driver: 'postgres',
    extractedAt: '2026-04-27T00:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: { driver: 'postgres' },
    tables: [
      {
        name: 'orders',
        catalog: null,
        db: 'public',
        kind: 'table',
        comment: 'Orders placed by customers',
        estimatedRows: null,
        columns: [integerColumn('id', true, 'Primary key'), integerColumn('customer_id', false, null)],
        foreignKeys: [
          {
            fromColumn: 'customer_id',
            toCatalog: null,
            toDb: 'public',
            toTable: 'customers',
            toColumn: 'id',
            constraintName: 'orders_customer_id_fkey',
          },
        ],
      },
      {
        name: 'customers',
        catalog: null,
        db: 'public',
        kind: 'table',
        comment: null,
        estimatedRows: null,
        columns: [integerColumn('id', true, null)],
        foreignKeys: [],
      },
    ],
  };
}
|
||||
|
||||
describe('kloSchemaSnapshotToExtractedSchema', () => {
  it('preserves structural table, column, comment, and key metadata', () => {
    const { tables } = kloSchemaSnapshotToExtractedSchema(snapshot());

    // Expected column shape: native type under `type`, comment under `dbComment`.
    const column = (name: string, primaryKey: boolean, dbComment: string | null) => ({
      name,
      type: 'integer',
      nullable: false,
      primaryKey,
      dbComment,
    });

    expect(tables).toEqual([
      {
        name: 'orders',
        catalog: null,
        db: 'public',
        dbComment: 'Orders placed by customers',
        columns: [column('id', true, 'Primary key'), column('customer_id', false, null)],
        foreignKeys: [
          {
            fromTable: 'orders',
            fromColumn: 'customer_id',
            toTable: 'customers',
            toColumn: 'id',
            constraintName: 'orders_customer_id_fkey',
          },
        ],
      },
      {
        name: 'customers',
        catalog: null,
        db: 'public',
        dbComment: null,
        columns: [column('id', true, null)],
        foreignKeys: [],
      },
    ]);
  });

  it('builds the same natural key shape used by schema sync', () => {
    // A missing catalog contributes an empty leading segment.
    const withoutCatalog = buildLiveDatabaseTableNaturalKey({ catalog: null, db: 'public', name: 'orders' });
    const withCatalog = buildLiveDatabaseTableNaturalKey({ catalog: 'warehouse', db: 'analytics', name: 'events' });

    expect(withoutCatalog).toBe('|public|orders');
    expect(withCatalog).toBe('warehouse|analytics|events');
  });
});
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
import type { KloSchemaSnapshot, KloSchemaTable } from '../../../scan/types.js';
|
||||
|
||||
/** A single-column foreign key, identified by table and column names only. */
export interface LiveDatabaseExtractedForeignKey {
  /** Name of the table that owns the referencing column. */
  fromTable: string;
  fromColumn: string;
  /** Name of the referenced table. */
  toTable: string;
  toColumn: string;
  /** Constraint name; omitted when the snapshot has none. */
  constraintName?: string;
}
|
||||
|
||||
/** Structural description of one column in an extracted table. */
export interface LiveDatabaseExtractedColumn {
  name: string;
  /** The column's native database type, as reported by the snapshot. */
  type: string;
  nullable: boolean;
  primaryKey: boolean;
  /** Column comment taken from the database, or `null` when there is none. */
  dbComment: string | null;
}
|
||||
|
||||
/** Structural description of one table in an extracted schema. */
export interface LiveDatabaseExtractedTable {
  name: string;
  catalog: string | null;
  db: string | null;
  /** Table comment taken from the database, or `null` when there is none. */
  dbComment: string | null;
  columns: LiveDatabaseExtractedColumn[];
  /** Foreign keys whose referencing column belongs to this table. */
  foreignKeys: LiveDatabaseExtractedForeignKey[];
}
|
||||
|
||||
/** Result of flattening a schema snapshot: the tables plus the originating connection id. */
export interface LiveDatabaseExtractedSchema {
  connectionId?: string;
  tables: LiveDatabaseExtractedTable[];
}
|
||||
|
||||
/**
 * Builds the `catalog|db|name` natural key of a table.
 *
 * A null or undefined catalog or db contributes an empty segment, so the key
 * always contains exactly two separators from this function.
 */
export function buildLiveDatabaseTableNaturalKey(table: Pick<KloSchemaTable, 'catalog' | 'db' | 'name'>): string {
  const catalogPart = table.catalog ?? '';
  const dbPart = table.db ?? '';
  return `${catalogPart}|${dbPart}|${table.name}`;
}
|
||||
|
||||
/**
 * Flattens a schema snapshot into the extracted-schema shape.
 *
 * Table and column comments are carried over as `dbComment`, each column's
 * native type becomes `type`, and every foreign key gains the owning table's
 * name as `fromTable`. `constraintName` is included only when it is truthy.
 */
export function kloSchemaSnapshotToExtractedSchema(snapshot: KloSchemaSnapshot): LiveDatabaseExtractedSchema {
  const tables = snapshot.tables.map((table) => {
    const columns = table.columns.map(({ name, nativeType, nullable, primaryKey, comment }) => ({
      name,
      type: nativeType,
      nullable,
      primaryKey,
      dbComment: comment ?? null,
    }));

    const foreignKeys = table.foreignKeys.map((foreignKey) => {
      const link = {
        fromTable: table.name,
        fromColumn: foreignKey.fromColumn,
        toTable: foreignKey.toTable,
        toColumn: foreignKey.toColumn,
      };
      // Leave the key off entirely rather than emitting a null or empty name.
      return foreignKey.constraintName ? { ...link, constraintName: foreignKey.constraintName } : link;
    });

    return {
      name: table.name,
      catalog: table.catalog ?? null,
      db: table.db ?? null,
      dbComment: table.comment ?? null,
      columns,
      foreignKeys,
    };
  });

  return { connectionId: snapshot.connectionId, tables };
}
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
import { mkdtemp } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { LiveDatabaseSourceAdapter } from './live-database.adapter.js';
|
||||
|
||||
describe('LiveDatabaseSourceAdapter', () => {
  it('fetches a schema snapshot through the introspection port', async () => {
    // Single-table snapshot the fake introspection port hands back.
    const ordersSnapshot = {
      connectionId: 'conn-1',
      driver: 'postgres',
      extractedAt: '2026-04-27T00:00:00.000Z',
      scope: { schemas: ['public'] },
      metadata: {},
      tables: [
        {
          name: 'orders',
          catalog: null,
          db: 'public',
          kind: 'table',
          comment: null,
          estimatedRows: null,
          columns: [
            {
              name: 'id',
              nativeType: 'integer',
              normalizedType: 'integer',
              dimensionType: 'number',
              nullable: false,
              primaryKey: true,
              comment: null,
            },
          ],
          foreignKeys: [],
        },
      ],
    };
    const extractSchema = vi.fn().mockResolvedValue(ordersSnapshot);
    const adapter = new LiveDatabaseSourceAdapter({
      introspection: { extractSchema },
      now: () => new Date('2026-04-27T00:00:00.000Z'),
    });
    const dir = await mkdtemp(join(tmpdir(), 'klo-live-db-adapter-'));

    await adapter.fetch(undefined, dir, { connectionId: 'conn-1', sourceKey: 'live-database' });

    expect(extractSchema).toHaveBeenCalledWith('conn-1');
    // The staged directory must be recognised and chunk into one unit per table.
    await expect(adapter.detect(dir)).resolves.toBe(true);
    const { workUnits } = await adapter.chunk(dir);
    const unitKeys = workUnits.map((wu) => wu.unitKey);
    expect(unitKeys).toEqual(['live-database-public-orders']);
  });

  it('declares the live database source and skill', () => {
    const adapter = new LiveDatabaseSourceAdapter({
      introspection: { extractSchema: vi.fn() },
    });

    expect(adapter.source).toBe('live-database');
    expect(adapter.skillNames).toEqual(['live_database_ingest']);
  });
});
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
import type { ChunkResult, DiffSet, FetchContext, SourceAdapter } from '../../types.js';
|
||||
import { chunkLiveDatabaseStagedDir } from './chunk.js';
|
||||
import { detectLiveDatabaseStagedDir, writeLiveDatabaseSnapshot } from './stage.js';
|
||||
import type { LiveDatabaseSourceAdapterDeps } from './types.js';
|
||||
|
||||
/**
 * Source adapter for the `live-database` source.
 *
 * Fetching pulls a schema snapshot through the injected introspection port and
 * writes it to the staged directory; detection and chunking are delegated to
 * the staged-directory helpers.
 */
export class LiveDatabaseSourceAdapter implements SourceAdapter {
  readonly source = 'live-database';
  readonly skillNames = ['live_database_ingest'];

  constructor(private readonly deps: LiveDatabaseSourceAdapterDeps) {}

  /** Reports whether `stagedDir` holds a staged live-database snapshot. */
  detect(stagedDir: string): Promise<boolean> {
    return detectLiveDatabaseStagedDir(stagedDir);
  }

  /**
   * Extracts the schema for `ctx.connectionId` and stages it in `stagedDir`.
   *
   * The staged snapshot always carries the connection id from `ctx`. If the
   * snapshot has no `extractedAt`, the current time from `deps.now` (or the
   * system clock) is used.
   */
  async fetch(_pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
    const { connectionId } = ctx;
    const snapshot = await this.deps.introspection.extractSchema(connectionId);
    const clock = this.deps.now ?? (() => new Date());
    const extractedAt = snapshot.extractedAt ?? clock().toISOString();
    await writeLiveDatabaseSnapshot(stagedDir, { ...snapshot, connectionId, extractedAt });
  }

  /** Splits the staged snapshot into work units, optionally narrowed by `diffSet`. */
  chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    return chunkLiveDatabaseStagedDir(stagedDir, diffSet);
  }
}
|
||||
|
|
@ -0,0 +1,252 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
buildLiveDatabaseManifestShards,
|
||||
type LiveDatabaseManifestExistingDescriptions,
|
||||
type LiveDatabaseManifestJoinEntry,
|
||||
type LiveDatabaseManifestShard,
|
||||
} from './manifest.js';
|
||||
|
||||
/**
 * Converts the shard map into a plain object with keys in `localeCompare`
 * order, so assertions do not depend on map insertion order.
 */
function shardObject(shards: Map<string, LiveDatabaseManifestShard>): Record<string, LiveDatabaseManifestShard> {
  const ordered = Array.from(shards.entries());
  ordered.sort(([left], [right]) => left.localeCompare(right));
  return Object.fromEntries(ordered);
}
|
||||
|
||||
describe('buildLiveDatabaseManifestShards', () => {
  it('builds shard objects with generated joins and preserved external descriptions', () => {
    // Existing descriptions: the `user` entries must survive, the `db` entries must be refreshed.
    const existingDescriptions = new Map<string, LiveDatabaseManifestExistingDescriptions>([
      [
        'orders',
        {
          table: { user: 'Pinned analyst description', db: 'Old db description' },
          columns: new Map([['id', { user: 'Pinned id description', db: 'Old id description' }]]),
        },
      ],
    ]);

    // Manual joins: the one to an existing table is kept, the one to a missing table is dropped.
    const preservedJoins = new Map<string, LiveDatabaseManifestJoinEntry[]>([
      [
        'orders',
        [
          {
            to: 'customers',
            on: 'orders.account_id = customers.id',
            relationship: 'many_to_one',
            source: 'manual',
          },
          {
            to: 'missing_accounts',
            on: 'orders.account_id = missing_accounts.id',
            relationship: 'many_to_one',
            source: 'manual',
          },
        ],
      ],
    ]);

    const result = buildLiveDatabaseManifestShards({
      connectionType: 'POSTGRESQL',
      mapColumnType: (nativeType) => nativeType.toLowerCase(),
      existingDescriptions,
      existingPreservedJoins: preservedJoins,
      tables: [
        {
          name: 'orders',
          catalog: null,
          db: 'public',
          descriptions: { db: 'Fresh db description', ai: 'Generated AI description' },
          columns: [
            {
              name: 'id',
              type: 'INTEGER',
              pk: true,
              nullable: false,
              descriptions: { db: 'Fresh id description' },
            },
            { name: 'customer_id', type: 'INTEGER' },
          ],
        },
        {
          name: 'customers',
          catalog: null,
          db: 'public',
          columns: [{ name: 'id', type: 'INTEGER', pk: true, nullable: false }],
        },
      ],
      joins: [
        {
          fromTable: 'orders',
          fromColumns: ['customer_id'],
          toTable: 'customers',
          toColumns: ['id'],
          relationship: 'MANY_TO_ONE',
          source: 'formal',
        },
      ],
    });

    expect(result.tablesProcessed).toBe(2);

    const shards = shardObject(result.shards);
    expect(shards).toEqual({
      public: {
        tables: {
          orders: {
            table: 'public.orders',
            descriptions: {
              user: 'Pinned analyst description',
              db: 'Fresh db description',
              ai: 'Generated AI description',
            },
            columns: [
              {
                name: 'id',
                type: 'integer',
                pk: true,
                nullable: false,
                descriptions: {
                  user: 'Pinned id description',
                  db: 'Fresh id description',
                },
              },
              { name: 'customer_id', type: 'integer' },
            ],
            joins: [
              {
                to: 'customers',
                on: 'orders.customer_id = customers.id',
                relationship: 'many_to_one',
                source: 'formal',
              },
              {
                to: 'customers',
                on: 'orders.account_id = customers.id',
                relationship: 'many_to_one',
                source: 'manual',
              },
            ],
          },
          customers: {
            table: 'public.customers',
            columns: [{ name: 'id', type: 'integer', pk: true, nullable: false }],
            joins: [
              {
                to: 'orders',
                on: 'customers.id = orders.customer_id',
                relationship: 'one_to_many',
                source: 'formal',
              },
            ],
          },
        },
      },
    });
  });

  it('uses warehouse and schema shard keys for snowflake-style connections', () => {
    const result = buildLiveDatabaseManifestShards({
      connectionType: 'SNOWFLAKE',
      mapColumnType: (nativeType) => nativeType.toLowerCase(),
      tables: [
        {
          name: 'accounts',
          catalog: 'ANALYTICS',
          db: 'CORE',
          columns: [{ name: 'id', type: 'NUMBER' }],
        },
      ],
      joins: [],
    });

    const shards = shardObject(result.shards);
    expect(shards).toEqual({
      'ANALYTICS.CORE': {
        tables: {
          accounts: {
            table: 'ANALYTICS.CORE.accounts',
            columns: [{ name: 'id', type: 'number' }],
          },
        },
      },
    });
  });

  it('renders ordered multi-column joins in both directions', () => {
    const result = buildLiveDatabaseManifestShards({
      connectionType: 'POSTGRESQL',
      mapColumnType: (nativeType) => nativeType,
      tables: [
        {
          name: 'order_lines',
          catalog: null,
          db: 'public',
          columns: [
            { name: 'order_id', type: 'integer' },
            { name: 'line_number', type: 'integer' },
          ],
        },
        {
          name: 'order_line_allocations',
          catalog: null,
          db: 'public',
          columns: [
            { name: 'order_id', type: 'integer' },
            { name: 'line_number', type: 'integer' },
          ],
        },
      ],
      joins: [
        {
          fromTable: 'order_line_allocations',
          fromColumns: ['order_id', 'line_number'],
          toTable: 'order_lines',
          toColumns: ['order_id', 'line_number'],
          relationship: 'many_to_one',
          source: 'inferred',
        },
      ],
    });

    // Column pairs must keep their declared order on both sides of the join.
    const shards = shardObject(result.shards);
    expect(shards).toMatchObject({
      public: {
        tables: {
          order_line_allocations: {
            joins: [
              {
                to: 'order_lines',
                on: 'order_line_allocations.order_id = order_lines.order_id AND order_line_allocations.line_number = order_lines.line_number',
                relationship: 'many_to_one',
                source: 'inferred',
              },
            ],
          },
          order_lines: {
            joins: [
              {
                to: 'order_line_allocations',
                on: 'order_lines.order_id = order_line_allocations.order_id AND order_lines.line_number = order_line_allocations.line_number',
                relationship: 'one_to_many',
                source: 'inferred',
              },
            ],
          },
        },
      },
    });
  });
});
|
||||
270
packages/context/src/ingest/adapters/live-database/manifest.ts
Normal file
270
packages/context/src/ingest/adapters/live-database/manifest.ts
Normal file
|
|
@ -0,0 +1,270 @@
|
|||
/**
 * Upper-case relationship identifiers mapped to the lower-case spelling written
 * into manifest join entries. Values without an entry are passed through as-is
 * by the join builder.
 */
const RELATIONSHIP_MAP: Record<string, string> = {
  ONE_TO_ONE: 'one_to_one',
  ONE_TO_MANY: 'one_to_many',
  MANY_TO_ONE: 'many_to_one',
};

/** Relationship of the reverse join, keyed by the (lower-case) forward relationship. */
const RELATIONSHIP_INVERSE: Record<string, string> = {
  one_to_one: 'one_to_one',
  one_to_many: 'many_to_one',
  many_to_one: 'one_to_many',
};

/**
 * Description keys that are dropped from previously stored descriptions when
 * merging; any other key of an existing description record is carried over.
 */
const SCAN_MANAGED_DESCRIPTION_KEYS = new Set<string>(['db', 'ai']);
|
||||
|
||||
/** A column as it appears inside a manifest table entry. */
export interface LiveDatabaseManifestColumn {
  name: string;
  /** Column type after it has been passed through `mapColumnType`. */
  type: string;
  /** Emitted (as `true`) only when the source column is flagged as a primary key. */
  pk?: boolean;
  /** Emitted (as `false`) only when the source column is explicitly non-nullable. */
  nullable?: boolean;
  /** Description texts keyed by their origin (for example `db` or `ai`). */
  descriptions?: Record<string, string>;
}

/** One join listed on a manifest table entry. */
export interface LiveDatabaseManifestJoinEntry {
  /** Name of the table on the other side of the join. */
  to: string;
  /** Join condition, e.g. `a.x = b.x AND a.y = b.y`. */
  on: string;
  relationship: string;
  source: string;
}

/** Manifest representation of a single table. */
export interface LiveDatabaseManifestTableEntry {
  /** Dotted table reference built from catalog, db and table name. */
  table: string;
  descriptions?: Record<string, string>;
  columns: LiveDatabaseManifestColumn[];
  joins?: LiveDatabaseManifestJoinEntry[];
}

/** A manifest shard: a group of table entries keyed by table name. */
export interface LiveDatabaseManifestShard {
  tables: Record<string, LiveDatabaseManifestTableEntry>;
}

/** Table description supplied by the caller when building shards. */
export interface LiveDatabaseManifestTableData {
  name: string;
  catalog: string | null;
  db: string | null;
  descriptions?: Record<string, string>;
  columns: {
    name: string;
    /** Type as supplied by the caller; it is passed through `mapColumnType` when building. */
    type: string;
    pk?: boolean;
    nullable?: boolean;
    descriptions?: Record<string, string>;
  }[];
}

/** Relationship between two tables, expressed as positionally paired column lists. */
export interface LiveDatabaseManifestJoinData {
  fromTable: string;
  fromColumns: string[];
  toTable: string;
  toColumns: string[];
  relationship: string;
  source: 'formal' | 'inferred' | 'manual';
}

/** Previously stored descriptions for one table and its columns (keyed by column name). */
export interface LiveDatabaseManifestExistingDescriptions {
  table?: Record<string, string>;
  columns: Map<string, Record<string, string>>;
}

/** Everything `buildLiveDatabaseManifestShards` needs to produce shards. */
export interface BuildLiveDatabaseManifestShardsInput {
  /** Connection type; compared case-insensitively when choosing the shard key. */
  connectionType: string;
  tables: LiveDatabaseManifestTableData[];
  joins: LiveDatabaseManifestJoinData[];
  /** Converts a column's `type` into the type string written to the manifest. */
  mapColumnType: (nativeType: string) => string;
  /** Join entries to carry over, keyed by the owning table's name. */
  existingPreservedJoins?: Map<string, LiveDatabaseManifestJoinEntry[]>;
  /** Stored descriptions to merge in, keyed by table name. */
  existingDescriptions?: Map<string, LiveDatabaseManifestExistingDescriptions>;
}

/** Output of `buildLiveDatabaseManifestShards`. */
export interface BuildLiveDatabaseManifestShardsResult {
  /** Shards keyed by shard key. */
  shards: Map<string, LiveDatabaseManifestShard>;
  /** Number of input tables. */
  tablesProcessed: number;
}
|
||||
|
||||
/**
 * Merges stored descriptions with freshly supplied ones.
 *
 * Entries of `existing` whose key is in `SCAN_MANAGED_DESCRIPTION_KEYS` are
 * discarded; all other existing entries are kept. Every entry of `incoming`
 * is then applied on top, overwriting on key collision.
 *
 * @param existing - Previously stored descriptions, if any.
 * @param incoming - Descriptions supplied for this build, if any.
 * @returns The merged record, or `undefined` when nothing remains (which
 *   includes the case where both inputs are missing).
 */
function mergeDescriptionsPreservingExternal(
  existing: Record<string, string> | undefined,
  incoming: Record<string, string> | undefined,
): Record<string, string> | undefined {
  const merged: Record<string, string> = {};

  if (existing) {
    Object.entries(existing)
      .filter(([key]) => !SCAN_MANAGED_DESCRIPTION_KEYS.has(key))
      .forEach(([key, text]) => {
        merged[key] = text;
      });
  }

  if (incoming) {
    Object.assign(merged, incoming);
  }

  // An empty merge is reported as "no descriptions" rather than as `{}`.
  return Object.keys(merged).length === 0 ? undefined : merged;
}
|
||||
|
||||
/**
 * Chooses the shard key for a table.
 *
 * The connection type is compared case-insensitively:
 * - `SNOWFLAKE` / `DATABRICKS`: `<catalog>.<db>`, with `default` and `public`
 *   standing in for missing parts;
 * - `BIGQUERY` / `MYSQL` / `CLICKHOUSE`: `db`, else `catalog`, else `default`;
 * - anything else: `db`, else `public`.
 *
 * @param connectionType - Connection type name.
 * @param catalog - Table catalog, or `null`.
 * @param db - Table db/schema, or `null`.
 * @returns The shard key.
 */
function getShardKey(connectionType: string, catalog: string | null, db: string | null): string {
  const dialect = connectionType.toUpperCase();

  if (dialect === 'SNOWFLAKE' || dialect === 'DATABRICKS') {
    return `${catalog ?? 'default'}.${db ?? 'public'}`;
  }

  if (dialect === 'BIGQUERY' || dialect === 'MYSQL' || dialect === 'CLICKHOUSE') {
    return db ?? catalog ?? 'default';
  }

  return db ?? 'public';
}
|
||||
|
||||
/**
 * Builds a dotted table reference of the form `[catalog.][db.]name`.
 *
 * @param name - Table name; always included.
 * @param catalog - Included only when truthy (so `null` and `''` are skipped).
 * @param db - Included only when truthy (so `null` and `''` are skipped).
 * @returns The parts joined with `.`.
 */
function buildTableRef(name: string, catalog: string | null, db: string | null): string {
  const qualifiers = [catalog, db].filter((part): part is string => Boolean(part));
  return [...qualifiers, name].join('.');
}
|
||||
|
||||
/**
 * Appends `join` to the list stored for `tableName`, unless that list already
 * holds an entry with the same `to` and `on`. The list is created (and
 * registered in the map) on first use, even if nothing ends up being appended.
 *
 * @param joinsByTable - Map that is mutated in place.
 * @param tableName - Table the join belongs to.
 * @param join - Join entry to add.
 */
function addJoinOnce(
  joinsByTable: Map<string, LiveDatabaseManifestJoinEntry[]>,
  tableName: string,
  join: LiveDatabaseManifestJoinEntry,
): void {
  let bucket = joinsByTable.get(tableName);
  if (!bucket) {
    bucket = [];
    joinsByTable.set(tableName, bucket);
  }

  // Duplicates are identified by target + condition; relationship and source are not compared.
  const isNew = bucket.every((known) => known.to !== join.to || known.on !== join.on);
  if (isNew) {
    bucket.push(join);
  }
}
|
||||
|
||||
/**
 * Renders a join condition by pairing the columns of both sides positionally,
 * e.g. `left.a = right.x AND left.b = right.y`.
 *
 * @param leftTable - Table name used to qualify the left columns.
 * @param leftColumns - Left-hand columns, in order.
 * @param rightTable - Table name used to qualify the right columns.
 * @param rightColumns - Right-hand columns, in the same order.
 * @returns The equality clauses joined with ` AND `.
 * @throws Error when `leftColumns` is empty, when the two lists differ in
 *   length, or when a right-hand column is falsy (for example `''`).
 */
function joinCondition(
  leftTable: string,
  leftColumns: readonly string[],
  rightTable: string,
  rightColumns: readonly string[],
): string {
  const width = leftColumns.length;
  if (width === 0 || width !== rightColumns.length) {
    throw new Error(`Invalid relationship join from ${leftTable} to ${rightTable}: column tuple widths differ`);
  }

  const clauses: string[] = [];
  for (let position = 0; position < width; position += 1) {
    const rightColumn = rightColumns[position];
    if (!rightColumn) {
      throw new Error(`Invalid relationship join from ${leftTable} to ${rightTable}: missing target column`);
    }
    clauses.push(`${leftTable}.${leftColumns[position]} = ${rightTable}.${rightColumn}`);
  }
  return clauses.join(' AND ');
}
|
||||
|
||||
/**
 * Groups join entries by the table that owns them.
 *
 * Each input join whose two tables are both in `tableNames` yields two
 * entries: one on `fromTable` and a mirrored one on `toTable` with the column
 * lists swapped and the relationship inverted. Relationships are first mapped
 * through `RELATIONSHIP_MAP` (unmapped values pass through unchanged); a
 * relationship with no entry in `RELATIONSHIP_INVERSE` is mirrored as
 * `one_to_many`. Preserved joins are merged afterwards, so an entry produced
 * from `joins` wins over a preserved entry with the same `to` and `on`.
 *
 * @param tableNames - Names of the tables taking part in this build.
 * @param joins - Relationships to render.
 * @param preservedJoins - Existing join entries to carry over, keyed by table name.
 * @returns Join entries keyed by owning table name.
 * @throws Error when a join's column lists are rejected by `joinCondition`.
 */
function buildJoinsByTable(
  tableNames: Set<string>,
  joins: LiveDatabaseManifestJoinData[],
  preservedJoins: Map<string, LiveDatabaseManifestJoinEntry[]>,
): Map<string, LiveDatabaseManifestJoinEntry[]> {
  const byTable = new Map<string, LiveDatabaseManifestJoinEntry[]>();
  const isKnownTable = (name: string): boolean => tableNames.has(name);

  for (const { fromTable, fromColumns, toTable, toColumns, relationship: rawRelationship, source } of joins) {
    // Joins that touch a table outside this build are dropped.
    if (!(isKnownTable(fromTable) && isKnownTable(toTable))) {
      continue;
    }

    const forward = RELATIONSHIP_MAP[rawRelationship] ?? rawRelationship;
    addJoinOnce(byTable, fromTable, {
      to: toTable,
      on: joinCondition(fromTable, fromColumns, toTable, toColumns),
      relationship: forward,
      source,
    });

    const backward = RELATIONSHIP_INVERSE[forward] ?? 'one_to_many';
    addJoinOnce(byTable, toTable, {
      to: fromTable,
      on: joinCondition(toTable, toColumns, fromTable, fromColumns),
      relationship: backward,
      source,
    });
  }

  for (const [owner, ownerJoins] of preservedJoins) {
    if (!isKnownTable(owner)) {
      continue;
    }
    ownerJoins
      .filter((preserved) => isKnownTable(preserved.to))
      .forEach((preserved) => addJoinOnce(byTable, owner, preserved));
  }

  return byTable;
}
|
||||
|
||||
/**
 * Builds manifest shards from table and relationship data.
 *
 * Every table is converted to a manifest entry (mapped column types, merged
 * descriptions, joins) and stored under its name in the shard selected by
 * `getShardKey`. Joins and stored descriptions are looked up by bare table
 * name.
 *
 * @param input - Tables, joins, the column type mapper and optional existing
 *   joins/descriptions to carry over.
 * @returns The shards keyed by shard key, plus the number of input tables.
 * @throws Error when a join has invalid column lists (see `joinCondition`).
 */
export function buildLiveDatabaseManifestShards(
  input: BuildLiveDatabaseManifestShardsInput,
): BuildLiveDatabaseManifestShardsResult {
  const knownTables = new Set<string>();
  for (const { name } of input.tables) {
    knownTables.add(name);
  }
  const joinsByTable = buildJoinsByTable(knownTables, input.joins, input.existingPreservedJoins ?? new Map());
  const shards = new Map<string, LiveDatabaseManifestShard>();

  for (const table of input.tables) {
    const shardKey = getShardKey(input.connectionType, table.catalog, table.db);
    const stored = input.existingDescriptions?.get(table.name);

    // Optional properties are only added when they carry information; the
    // property order (name, type, pk, nullable, descriptions) is kept stable.
    const columns = table.columns.map((column): LiveDatabaseManifestColumn => {
      const type = input.mapColumnType(column.type);
      const descriptions = mergeDescriptionsPreservingExternal(stored?.columns.get(column.name), column.descriptions);
      return {
        name: column.name,
        type,
        ...(column.pk ? { pk: true } : {}),
        ...(column.nullable === false ? { nullable: false } : {}),
        ...(descriptions ? { descriptions } : {}),
      };
    });

    const tableDescriptions = mergeDescriptionsPreservingExternal(stored?.table, table.descriptions);
    const tableJoins = joinsByTable.get(table.name);
    const entry: LiveDatabaseManifestTableEntry = {
      table: buildTableRef(table.name, table.catalog, table.db),
      columns,
      ...(tableDescriptions ? { descriptions: tableDescriptions } : {}),
      ...(tableJoins?.length ? { joins: tableJoins } : {}),
    };

    let shard = shards.get(shardKey);
    if (!shard) {
      shard = { tables: {} };
      shards.set(shardKey, shard);
    }
    shard.tables[table.name] = entry;
  }

  return { shards, tablesProcessed: input.tables.length };
}
|
||||
152
packages/context/src/ingest/adapters/live-database/stage.test.ts
Normal file
152
packages/context/src/ingest/adapters/live-database/stage.test.ts
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
import { mkdtemp, readFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
detectLiveDatabaseStagedDir,
|
||||
LIVE_DATABASE_FOREIGN_KEYS_FILE,
|
||||
LIVE_DATABASE_META_FILE,
|
||||
liveDatabaseTablePath,
|
||||
readLiveDatabaseTableFiles,
|
||||
writeLiveDatabaseSnapshot,
|
||||
} from './stage.js';
|
||||
import type { KloSchemaSnapshot } from '../../../scan/types.js';
|
||||
|
||||
/**
 * Builds the fixture snapshot used by these tests: a `public.orders` table
 * (three columns, one foreign key to `customers.id`) and a `public.customers`
 * table (one column). A new object graph is created on every call.
 */
function snapshot(): KloSchemaSnapshot {
  // All fixture columns are non-nullable, uncommented number dimensions whose
  // native and normalized types coincide.
  const numberColumn = (name: string, sqlType: 'integer' | 'numeric', primaryKey: boolean) => ({
    name,
    nativeType: sqlType,
    normalizedType: sqlType,
    dimensionType: 'number' as const,
    nullable: false,
    primaryKey,
    comment: null,
  });

  return {
    connectionId: 'conn-1',
    driver: 'postgres',
    extractedAt: '2026-04-27T00:00:00.000Z',
    scope: { schemas: ['public'] },
    metadata: { dialect: 'postgres' },
    tables: [
      {
        name: 'orders',
        catalog: null,
        db: 'public',
        kind: 'table',
        comment: 'Orders placed by customers',
        estimatedRows: 200,
        columns: [
          numberColumn('id', 'integer', true),
          numberColumn('customer_id', 'integer', false),
          numberColumn('total', 'numeric', false),
        ],
        foreignKeys: [
          {
            fromColumn: 'customer_id',
            toCatalog: null,
            toDb: 'public',
            toTable: 'customers',
            toColumn: 'id',
            constraintName: null,
          },
        ],
      },
      {
        name: 'customers',
        catalog: null,
        db: 'public',
        kind: 'table',
        comment: null,
        estimatedRows: 50,
        columns: [numberColumn('id', 'integer', true)],
        foreignKeys: [],
      },
    ],
  };
}
|
||||
|
||||
describe('live-database staged snapshot files', () => {
  /** Creates a fresh temporary staging directory whose name starts with `prefix`. */
  const makeStagedDir = (prefix: string): Promise<string> => mkdtemp(join(tmpdir(), prefix));

  /** Reads a staged file, given relative to `dir`, as UTF-8 text. */
  const readStaged = (dir: string, relativePath: string): Promise<string> =>
    readFile(join(dir, relativePath), 'utf8');

  it('writes deterministic metadata, table, and foreign-key files', async () => {
    const dir = await makeStagedDir('klo-live-db-stage-');
    await writeLiveDatabaseSnapshot(dir, snapshot());

    // Connection metadata file.
    const connectionJson = await readStaged(dir, LIVE_DATABASE_META_FILE);
    expect(connectionJson).toContain('"connectionId": "conn-1"');
    expect(connectionJson).toContain('"driver": "postgres"');
    expect(connectionJson).toContain('"schemas"');

    // Foreign-key index file.
    const foreignKeysJson = await readStaged(dir, LIVE_DATABASE_FOREIGN_KEYS_FILE);
    expect(foreignKeysJson).toContain('"fromTable": "orders"');

    // Per-table files live under tables/ with three encoded, dot-separated name parts.
    const ordersPath = liveDatabaseTablePath({ catalog: null, db: 'public', name: 'orders' });
    const customersPath = liveDatabaseTablePath({ catalog: null, db: 'public', name: 'customers' });
    expect(ordersPath).toMatch(/^tables\/[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.json$/);

    const ordersJson = await readStaged(dir, ordersPath);
    const expectedOrderFragments = [
      '"name": "orders"',
      '"kind": "table"',
      '"estimatedRows": 200',
      '"nativeType": "integer"',
      '"normalizedType": "integer"',
    ];
    for (const fragment of expectedOrderFragments) {
      expect(ordersJson).toContain(fragment);
    }
    expect(ordersJson).not.toContain('"type": "integer"');
    expect(await readStaged(dir, customersPath)).toContain('"name": "customers"');

    // Reading back yields the tables sorted by name, and the directory is detected as staged.
    const tableFiles = await readLiveDatabaseTableFiles(dir);
    expect(tableFiles.map(({ table }) => table.name)).toEqual(['customers', 'orders']);
    await expect(detectLiveDatabaseStagedDir(dir)).resolves.toBe(true);
  });

  it('redacts sensitive snapshot metadata before writing connection metadata', async () => {
    const dir = await makeStagedDir('klo-live-db-redacted-stage-');
    const sensitiveSnapshot = {
      ...snapshot(),
      metadata: {
        dialect: 'postgres',
        url: 'postgres://reader:secret@example.test/db', // pragma: allowlist secret
        serviceAccountJson: {
          client_email: 'reader@example.test',
          private_key: 'pem-value', // pragma: allowlist secret
        },
      },
    };
    await writeLiveDatabaseSnapshot(dir, sensitiveSnapshot);

    const connectionJson = await readStaged(dir, LIVE_DATABASE_META_FILE);

    // Harmless values survive unchanged.
    expect(connectionJson).toContain('"dialect": "postgres"');
    expect(connectionJson).toContain('"client_email": "reader@example.test"');
    // Sensitive values are replaced by a placeholder and never appear verbatim.
    expect(connectionJson).toContain('"url": "<redacted>"');
    expect(connectionJson).toContain('"private_key": "<redacted>"');
    expect(connectionJson).not.toContain('postgres://reader:secret@example.test/db'); // pragma: allowlist secret
    expect(connectionJson).not.toContain('pem-value');
  });

  it('returns false for a directory that is missing live database metadata', async () => {
    const emptyDir = await makeStagedDir('klo-live-db-empty-');
    await expect(detectLiveDatabaseStagedDir(emptyDir)).resolves.toBe(false);
  });
});
|
||||
138
packages/context/src/ingest/adapters/live-database/stage.ts
Normal file
138
packages/context/src/ingest/adapters/live-database/stage.ts
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
import { Buffer } from 'node:buffer';
|
||||
import type { Dirent } from 'node:fs';
|
||||
import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises';
|
||||
import { join, relative } from 'node:path';
|
||||
import { redactKloSensitiveMetadata } from '../../../core/redaction.js';
|
||||
import type { KloSchemaSnapshot, KloSchemaTable, KloTableRef } from '../../../scan/types.js';
|
||||
|
||||
/** File name, relative to the staged directory, of the connection metadata JSON. */
export const LIVE_DATABASE_META_FILE = 'connection.json';

/** File name, relative to the staged directory, of the foreign-key index JSON. */
export const LIVE_DATABASE_FOREIGN_KEYS_FILE = 'foreign-keys.json';

/** Sub-directory of the staged directory that holds one JSON file per table. */
const LIVE_DATABASE_TABLES_DIR = 'tables';

/** A table file read back from a staged directory. */
interface LiveDatabaseTableFile {
  /** Path relative to the staged directory, using `/` separators. */
  path: string;
  /** Parsed content of the file. */
  table: KloSchemaTable;
}

/** One row of the foreign-key index file. */
interface ForeignKeyIndexEntry {
  /** Name of the table declaring the foreign key. */
  fromTable: string;
  /** Staged path of that table's JSON file. */
  fromTablePath: string;
  fromColumn: string;
  toCatalog: string | null;
  toDb: string | null;
  toTable: string;
  toColumn: string;
  constraintName: string | null;
}
|
||||
|
||||
/**
 * Encodes one component of a table file name as base64url of its UTF-8 bytes.
 *
 * @param value - Component to encode; `null` and `undefined` are encoded as
 *   if they were the string `_`.
 * @returns The base64url text (alphabet `A-Z a-z 0-9 - _`, no padding).
 */
function encodePathPart(value: string | null | undefined): string {
  const text = value ?? '_';
  const bytes = Buffer.from(text, 'utf8');
  return bytes.toString('base64url');
}
|
||||
|
||||
/**
 * Builds the key used to order tables: catalog, db and name separated by NUL
 * characters, with a missing catalog or db contributing an empty string.
 *
 * @param table - Table reference.
 * @returns The sort key.
 */
function tableSortKey(table: KloTableRef): string {
  const separator = '\u0000';
  const { catalog, db, name } = table;
  return (catalog ?? '') + separator + (db ?? '') + separator + name;
}
|
||||
|
||||
/**
 * Returns the staged path of a table's JSON file:
 * `tables/<catalog>.<db>.<name>.json`, where each of the three parts is
 * encoded with `encodePathPart`.
 *
 * @param table - Table reference.
 * @returns Path relative to the staged directory, using `/` as separator.
 */
export function liveDatabaseTablePath(table: KloTableRef): string {
  const encodedParts = [table.catalog, table.db, table.name].map((part) => encodePathPart(part));
  return `${LIVE_DATABASE_TABLES_DIR}/${encodedParts.join('.')}.json`;
}
|
||||
|
||||
/**
 * Recursively lists the regular files below `dir`.
 *
 * @param root - Directory the returned paths are relative to.
 * @param dir - Directory to list; defaults to `root`.
 * @returns Paths relative to `root` with `/` separators, sorted with the
 *   default `Array.prototype.sort` order. A directory that cannot be read
 *   (for any reason) contributes no entries instead of raising an error.
 *   Entries that are neither directories nor regular files are ignored.
 */
async function walkFiles(root: string, dir = root): Promise<string[]> {
  let listing: Dirent[];
  try {
    listing = await readdir(dir, { withFileTypes: true });
  } catch {
    // Best effort: unreadable or missing directories are treated as empty.
    return [];
  }

  const collected: string[] = [];
  for (const item of listing) {
    const fullPath = join(dir, item.name);
    if (item.isFile()) {
      // Normalize Windows separators so callers always see forward slashes.
      collected.push(relative(root, fullPath).replace(/\\/g, '/'));
      continue;
    }
    if (item.isDirectory()) {
      const nested = await walkFiles(root, fullPath);
      collected.push(...nested);
    }
  }

  collected.sort();
  return collected;
}
|
||||
|
||||
/**
 * Serializes a value as JSON with two-space indentation and a trailing
 * newline. Property order is whatever `JSON.stringify` produces for the
 * value; keys are not re-sorted here.
 *
 * @param value - Value to serialize.
 * @returns The JSON text followed by `\n`.
 */
function stableJson(value: unknown): string {
  const pretty = JSON.stringify(value, null, 2);
  return pretty + '\n';
}
|
||||
|
||||
/**
 * Flattens the foreign keys of every table in the snapshot into one list.
 *
 * @param snapshot - Snapshot whose tables are scanned.
 * @returns One entry per foreign key, ordered by source table name, source
 *   column, target table name and target column (each compared with
 *   `localeCompare`).
 */
function foreignKeyIndex(snapshot: KloSchemaSnapshot): ForeignKeyIndexEntry[] {
  const entries = snapshot.tables.flatMap((table) => {
    const fromTablePath = liveDatabaseTablePath(table);
    return table.foreignKeys.map(
      (fk): ForeignKeyIndexEntry => ({
        fromTable: table.name,
        fromTablePath,
        fromColumn: fk.fromColumn,
        toCatalog: fk.toCatalog,
        toDb: fk.toDb,
        toTable: fk.toTable,
        toColumn: fk.toColumn,
        constraintName: fk.constraintName,
      }),
    );
  });

  // Compare field by field; the first field that differs decides the order.
  const sortFields = ['fromTable', 'fromColumn', 'toTable', 'toColumn'] as const;
  entries.sort((left, right) => {
    for (const field of sortFields) {
      const difference = left[field].localeCompare(right[field]);
      if (difference !== 0) {
        return difference;
      }
    }
    return 0;
  });
  return entries;
}
|
||||
|
||||
/**
 * Writes a schema snapshot into `stagedDir` as a set of JSON files:
 * the connection metadata file, the foreign-key index file and one file per
 * table under the tables directory (created if missing). Files are written
 * one after another in that order.
 *
 * The snapshot's `metadata` is passed through `redactKloSensitiveMetadata`
 * before it is written.
 *
 * @param stagedDir - Target directory.
 * @param snapshot - Snapshot to persist.
 */
export async function writeLiveDatabaseSnapshot(stagedDir: string, snapshot: KloSchemaSnapshot): Promise<void> {
  const tablesDir = join(stagedDir, LIVE_DATABASE_TABLES_DIR);
  await mkdir(tablesDir, { recursive: true });

  // Sort a copy so the caller's table array is left untouched.
  const orderedTables = snapshot.tables
    .slice()
    .sort((left, right) => tableSortKey(left).localeCompare(tableSortKey(right)));

  const connectionMeta = {
    connectionId: snapshot.connectionId,
    driver: snapshot.driver,
    extractedAt: snapshot.extractedAt,
    scope: snapshot.scope,
    metadata: redactKloSensitiveMetadata(snapshot.metadata),
    tableCount: orderedTables.length,
  };
  const metaPath = join(stagedDir, LIVE_DATABASE_META_FILE);
  await writeFile(metaPath, stableJson(connectionMeta));

  const foreignKeysPath = join(stagedDir, LIVE_DATABASE_FOREIGN_KEYS_FILE);
  const foreignKeysDocument = { foreignKeys: foreignKeyIndex(snapshot) };
  await writeFile(foreignKeysPath, stableJson(foreignKeysDocument));

  for (const table of orderedTables) {
    const tablePath = join(stagedDir, liveDatabaseTablePath(table));
    await writeFile(tablePath, stableJson(table));
  }
}
|
||||
|
||||
/**
 * Reads every `.json` file below the tables directory of `stagedDir`.
 *
 * A file is kept only when its parsed content has a string `name` and an
 * array `columns`; other content is skipped silently. No further validation
 * of the parsed table is performed here.
 *
 * @param stagedDir - Staged directory to read from.
 * @returns The table files ordered by `tableSortKey` (compared with
 *   `localeCompare`); empty when the tables directory cannot be listed.
 * @throws If a file cannot be read or does not contain valid JSON.
 */
export async function readLiveDatabaseTableFiles(stagedDir: string): Promise<LiveDatabaseTableFile[]> {
  const tablesRoot = join(stagedDir, LIVE_DATABASE_TABLES_DIR);
  const jsonFiles = (await walkFiles(tablesRoot)).filter((name) => name.endsWith('.json'));

  const tableFiles: LiveDatabaseTableFile[] = [];
  for (const file of jsonFiles) {
    const path = `${LIVE_DATABASE_TABLES_DIR}/${file}`;
    const content = await readFile(join(stagedDir, path), 'utf8');
    const parsed = JSON.parse(content) as KloSchemaTable;

    const looksLikeTable = parsed && typeof parsed.name === 'string' && Array.isArray(parsed.columns);
    if (!looksLikeTable) {
      continue;
    }
    tableFiles.push({ path, table: parsed });
  }

  return tableFiles.sort((left, right) => tableSortKey(left.table).localeCompare(tableSortKey(right.table)));
}
|
||||
|
||||
/**
 * Reports whether `stagedDir` looks like a staged live-database snapshot:
 * its connection metadata file must parse to a JSON object (not an array or
 * primitive) and at least one valid table file must be present.
 *
 * @param stagedDir - Directory to inspect.
 * @returns `true` when both conditions hold; `false` otherwise, including
 *   when reading or parsing any of the files fails.
 */
export async function detectLiveDatabaseStagedDir(stagedDir: string): Promise<boolean> {
  try {
    const rawMeta = await readFile(join(stagedDir, LIVE_DATABASE_META_FILE), 'utf8');
    const meta: unknown = JSON.parse(rawMeta);

    const isJsonObject = typeof meta === 'object' && meta !== null && !Array.isArray(meta);
    if (!isJsonObject) {
      return false;
    }

    const tableFiles = await readLiveDatabaseTableFiles(stagedDir);
    return tableFiles.length > 0;
  } catch {
    // Missing or unreadable files simply mean "not a staged directory".
    return false;
  }
}
|
||||
|
|
@ -0,0 +1,428 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { type LiveDatabaseSyncedSchema, planLiveDatabaseStructuralSync } from './structural-sync.js';
|
||||
|
||||
/**
 * Creates a deterministic id generator for tests. Each returned generator
 * has its own counter and yields `id-1`, `id-2`, ... on successive calls.
 */
function idFactory(): () => string {
  let issued = 0;
  return () => {
    issued += 1;
    return `id-${issued}`;
  };
}
|
||||
|
||||
describe('planLiveDatabaseStructuralSync', () => {
  // Fixture types are derived from the public API so the helpers stay in sync with it.
  type SyncedTable = LiveDatabaseSyncedSchema['tables'][number];
  type SyncedColumn = SyncedTable['columns'][number];
  type SyncedLink = LiveDatabaseSyncedSchema['links'][number];
  type ExtractedSchema = Parameters<typeof planLiveDatabaseStructuralSync>[0]['extracted'];
  type ExtractedTable = ExtractedSchema['tables'][number];
  type ExtractedColumn = ExtractedTable['columns'][number];

  /**
   * Already-synced column. Defaults: non-nullable, non-key `number` column with no parent,
   * descriptions, embedding, sample values or cardinality; `overrides` replaces any of these.
   */
  const syncedColumn = (id: string, name: string, overrides: Partial<SyncedColumn> = {}): SyncedColumn => ({
    id,
    name,
    type: 'number',
    nullable: false,
    primaryKey: false,
    parentColumnId: null,
    descriptions: {},
    embedding: null,
    sampleValues: null,
    cardinality: null,
    ...overrides,
  });

  /** Already-synced, enabled table in the `public` db with no catalog. */
  const syncedTable = (
    id: string,
    name: string,
    columns: SyncedColumn[],
    descriptions: SyncedTable['descriptions'] = {},
  ): SyncedTable => ({
    id,
    name,
    catalog: null,
    db: 'public',
    enabled: true,
    descriptions,
    columns,
  });

  /** MANY_TO_ONE link that references a primary key. */
  const manyToOneLink = (
    id: string,
    source: SyncedLink['source'],
    confidence: number,
    from: { tableId: string; columnId: string },
    to: { tableId: string; columnId: string },
  ): SyncedLink => ({
    id,
    fromTableId: from.tableId,
    fromColumnId: from.columnId,
    toTableId: to.tableId,
    toColumnId: to.columnId,
    source,
    confidence,
    relationshipType: 'MANY_TO_ONE',
    isPrimaryKeyReference: true,
  });

  /** Freshly extracted, non-nullable column. */
  const extractedColumn = (
    name: string,
    type: ExtractedColumn['type'],
    primaryKey: boolean,
    dbComment: ExtractedColumn['dbComment'] = null,
  ): ExtractedColumn => ({ name, type, nullable: false, primaryKey, dbComment });

  /** Freshly extracted table in the `public` db with no catalog. */
  const extractedTable = (
    name: string,
    dbComment: ExtractedTable['dbComment'],
    columns: ExtractedColumn[],
    foreignKeys: ExtractedTable['foreignKeys'] = [],
  ): ExtractedTable => ({ name, catalog: null, db: 'public', dbComment, columns, foreignKeys });

  it('plans table and column creates, updates, deletes, and metadata invalidation', () => {
    const current: LiveDatabaseSyncedSchema = {
      connectionId: 'conn-1',
      tables: [
        syncedTable(
          'tbl-orders',
          'orders',
          [
            syncedColumn('col-order-id', 'id', {
              primaryKey: true,
              descriptions: { db: 'Order id' },
              embedding: [1, 2, 3],
            }),
            syncedColumn('col-order-total', 'total', {
              nullable: true,
              descriptions: { ai: 'Old AI total text', db: 'Old total text' },
              embedding: [4, 5, 6],
              sampleValues: ['10'],
              cardinality: 12,
            }),
            syncedColumn('col-order-removed', 'removed', { type: 'string', nullable: true }),
          ],
          { ai: 'Old AI order text', db: 'Old DB order text' },
        ),
        syncedTable('tbl-removed', 'removed_table', [syncedColumn('col-removed-id', 'id', { primaryKey: true })]),
      ],
      links: [
        manyToOneLink(
          'inferred-total-link',
          'inferred',
          0.7,
          { tableId: 'tbl-orders', columnId: 'col-order-total' },
          { tableId: 'tbl-orders', columnId: 'col-order-id' },
        ),
      ],
    };

    // Compared with `current`: orders.total changes type/nullability/comment, orders.removed and
    // removed_table disappear, orders.created_at and the customers table are new.
    const plan = planLiveDatabaseStructuralSync({
      connectionId: 'conn-1',
      current,
      extracted: {
        connectionId: 'conn-1',
        tables: [
          extractedTable('orders', 'Fresh DB order text', [
            extractedColumn('id', 'number', true, 'Order id'),
            extractedColumn('total', 'string', false, 'Fresh total text'),
            extractedColumn('created_at', 'time', false, 'Creation timestamp'),
          ]),
          extractedTable('customers', 'Customer table', [extractedColumn('id', 'number', true)]),
        ],
      },
      idFactory: idFactory(),
    });

    expect(plan.stats).toEqual({
      tablesCreated: 1,
      tablesDeleted: 1,
      columnsCreated: 2,
      columnsDeleted: 2,
      columnsModified: 1,
      formalLinksCreated: 0,
      formalLinksDeleted: 0,
    });

    const { operations } = plan;
    expect(operations.deleteTableIds).toEqual(['tbl-removed']);
    expect(operations.deleteColumnIds).toEqual(['col-order-removed']);
    expect(operations.insertTables).toEqual([
      { id: 'id-2', connectionId: 'conn-1', name: 'customers', catalog: null, db: 'public', enabled: true },
    ]);
    expect(operations.insertColumns).toEqual([
      { id: 'id-1', tableId: 'tbl-orders', name: 'created_at', parentColumnId: null },
      { id: 'id-3', tableId: 'id-2', name: 'id', parentColumnId: null },
    ]);
    expect(operations.touchColumnIds).toEqual(['col-order-total']);
    expect(operations.invalidateColumnEmbeddingIds).toEqual(['col-order-total']);

    expect(plan.inferredLinksToValidate).toEqual(['inferred-total-link']);
    expect(plan.changes).toEqual({
      newTableIds: ['id-2'],
      newColumnIds: ['id-1', 'id-3'],
      tablesWithStructuralChanges: ['tbl-orders', 'id-2'],
      columnsWithTypeChange: ['col-order-total'],
      columnsWithDescriptionChange: ['col-order-total'],
      tablesWithDescriptionChange: ['tbl-orders'],
    });

    const orders = plan.schema.tables.find(({ name }) => name === 'orders');
    expect(orders?.descriptions).toEqual({ db: 'Fresh DB order text' });
    expect(orders?.columns.map(({ name }) => name)).toEqual(['id', 'total', 'created_at']);
    expect(orders?.columns.find(({ name }) => name === 'total')).toMatchObject({
      id: 'col-order-total',
      type: 'string',
      nullable: false,
      primaryKey: false,
      descriptions: { db: 'Fresh total text' },
      embedding: null,
      sampleValues: ['10'],
      cardinality: 12,
    });
  });

  it('builds formal links from extracted foreign keys and preserves valid inferred links', () => {
    const ordersCustomer = { tableId: 'tbl-orders', columnId: 'col-orders-customer' };
    const ordersId = { tableId: 'tbl-orders', columnId: 'col-orders-id' };
    const customersId = { tableId: 'tbl-customers', columnId: 'col-customers-id' };

    const formalLink = manyToOneLink('formal-existing', 'formal', 1, ordersCustomer, customersId);
    const inferredLink = manyToOneLink('inferred-existing', 'inferred', 0.6, ordersId, customersId);

    const current: LiveDatabaseSyncedSchema = {
      connectionId: 'conn-1',
      tables: [
        syncedTable('tbl-orders', 'orders', [
          syncedColumn('col-orders-id', 'id', { primaryKey: true }),
          syncedColumn('col-orders-customer', 'customer_id'),
        ]),
        syncedTable('tbl-customers', 'customers', [syncedColumn('col-customers-id', 'id', { primaryKey: true })]),
      ],
      links: [formalLink, inferredLink],
    };

    /** The orders.customer_id -> customers.id foreign key, as reported by extraction. */
    const customerForeignKey = (): ExtractedTable['foreignKeys'] => [
      { fromTable: 'orders', fromColumn: 'customer_id', toTable: 'customers', toColumn: 'id' },
    ];

    /** Extracted schema matching `current` structurally; only the orders foreign keys vary. */
    const extractedWith = (orderForeignKeys: ExtractedTable['foreignKeys']): ExtractedSchema => ({
      connectionId: 'conn-1',
      tables: [
        extractedTable(
          'orders',
          null,
          [extractedColumn('id', 'number', true), extractedColumn('customer_id', 'number', false)],
          orderForeignKeys,
        ),
        extractedTable('customers', null, [extractedColumn('id', 'number', true)]),
      ],
    });

    // Foreign key still present: nothing is created or deleted, both links survive.
    const plan = planLiveDatabaseStructuralSync({
      connectionId: 'conn-1',
      current,
      extracted: extractedWith(customerForeignKey()),
      idFactory: idFactory(),
    });

    expect(plan.stats.formalLinksCreated).toBe(0);
    expect(plan.stats.formalLinksDeleted).toBe(0);
    expect(plan.schema.links.map(({ id }) => id)).toEqual(['formal-existing', 'inferred-existing']);

    // Foreign key gone: the formal link is deleted, the inferred link is kept.
    const planAfterForeignKeyRemoval = planLiveDatabaseStructuralSync({
      connectionId: 'conn-1',
      current,
      extracted: extractedWith([]),
      idFactory: idFactory(),
    });

    expect(planAfterForeignKeyRemoval.stats.formalLinksDeleted).toBe(1);
    expect(planAfterForeignKeyRemoval.schema.links.map(({ id }) => id)).toEqual(['inferred-existing']);

    // Foreign key newly present (only the inferred link exists so far): a formal link is created.
    const planAfterForeignKeyCreation = planLiveDatabaseStructuralSync({
      connectionId: 'conn-1',
      current: { ...current, links: [inferredLink] },
      extracted: extractedWith(customerForeignKey()),
      idFactory: idFactory(),
    });

    expect(planAfterForeignKeyCreation.stats.formalLinksCreated).toBe(1);
    expect(planAfterForeignKeyCreation.schema.links[0]).toMatchObject({
      id: 'id-1',
      fromTableId: 'tbl-orders',
      fromColumnId: 'col-orders-customer',
      toTableId: 'tbl-customers',
      toColumnId: 'col-customers-id',
      source: 'formal',
      confidence: 1,
      relationshipType: 'MANY_TO_ONE',
      isPrimaryKeyReference: true,
    });
  });
});
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue