ktx/packages/cli/test/context/daemon/semantic-layer-compute.test.ts

344 lines
10 KiB
TypeScript
Raw Permalink Normal View History

2026-05-10 23:12:26 +02:00
import { once } from 'node:events';
import { createServer } from 'node:http';
import { describe, expect, it, vi } from 'vitest';
test: split cli tests from source tree (#216) * feat(cli): define full warehouse dialect contract * test(cli): keep dialect edge tests focused * fix(cli): stabilize dialect contract foundation * refactor(connectors): own read-only query preparation * refactor(connectors): resolve dialects through registry * refactor(connectors): keep concrete dialect classes internal * chore(workspace): enforce dialect import boundary * refactor(cli): resolve relationship dialect at scan boundary * refactor(cli): use dialect display parsing for entity details * refactor(cli): use dialect display parsing for warehouse catalog * refactor(cli): use dialect SQL in relationship workflows * test(cli): verify solid dialect scan workflow closure * test: split cli tests from source tree * refactor(cli): standardize BigQuery scope listing * feat(sqlite): implement connector scope listing * test(connectors): cover required table listing * feat(cli): add warehouse driver registry * refactor(setup): route scope discovery through driver registry * refactor(cli): route local query execution through driver registry * refactor(historic-sql): route dialect support through driver registry * refactor(cli): test warehouse connections through driver registry * fix(cli): close driver registry type export gaps * Improve setup daemon diagnostics * refactor(setup): centralize rail-prefixed diagnostics + query-history fallback Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput into clack.ts so the setup wizard, managed daemons, and embedding/agent steps share one rail-formatted writer. setup-databases.ts also adds a "disable query history and retry" option when the schema-context build fails and query history is the likely culprit, surfaced via a new failed-query-history-unavailable status. * fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match The setup picker's KtxTableListEntry was a 2-level { schema, name }, so qualifiedTableId always wrote db.name into enabled_tables. When BigQuery, Snowflake, or SQL Server later ran fast ingest, their introspect step filtered the scope set with scopedTableNames(scope, { catalog: projectId|database, db }) — catalog was non-null on the introspect side but null in the scope refs, so every entry was rejected, the live-database adapter staged zero table files, and detect() failed with 'Adapter "live-database" did not recognize fetched source output'. Align the picker boundary with the canonical 3-level KtxTableRef: - Add catalog: string | null to KtxTableListEntry. - BigQuery/Snowflake/SQL Server listTables populate catalog from the resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null. - qualifiedTableId emits catalog.schema.name when catalog is non-null (resolveEnabledTables already accepts the 3-part shape) and schemasFromEnabledTables now goes through parseDottedTableEntry so it recovers the schema correctly from both 2-part and 3-part entries. - Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker reuse. Update listTables expectations in all seven connector tests and the setup / picker test fixtures. Add a picker regression test that covers the catalog-bearing round-trip (save + refine). * fix(cli): allow debug telemetry under opt-out env
2026-05-26 08:49:05 +02:00
import { createHttpSemanticLayerComputePort, createPythonSemanticLayerComputePort } from '../../../src/context/daemon/semantic-layer-compute.js';
2026-05-10 23:12:26 +02:00
const source = {
name: 'orders',
table: 'public.orders',
grain: ['id'],
columns: [{ name: 'id', type: 'number' }],
joins: [],
measures: [{ name: 'order_count', expr: 'count(*)' }],
};
const sourceGenerationInput = {
tables: [
{
name: 'orders',
db: 'public',
comment: 'Orders table',
columns: [
{ name: 'id', type: 'integer', primaryKey: true, nullable: false, comment: 'Order ID' },
{ name: 'customer_id', type: 'integer' },
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
],
},
{
name: 'customers',
db: 'public',
columns: [
{ name: 'id', type: 'integer', primaryKey: true },
{ name: 'email', type: 'varchar' },
],
},
],
links: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
relationshipType: 'MANY_TO_ONE',
},
],
dialect: 'postgres',
};
const sourceGenerationDaemonPayload = {
tables: [
{
name: 'orders',
db: 'public',
comment: 'Orders table',
columns: [
{ name: 'id', type: 'integer', primary_key: true, nullable: false, comment: 'Order ID' },
{ name: 'customer_id', type: 'integer' },
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
],
},
{
name: 'customers',
db: 'public',
columns: [
{ name: 'id', type: 'integer', primary_key: true },
{ name: 'email', type: 'varchar' },
],
},
],
links: [
{
from_table: 'orders',
from_column: 'customer_id',
to_table: 'customers',
to_column: 'id',
relationship_type: 'MANY_TO_ONE',
},
],
dialect: 'postgres',
};
const sourceGenerationDaemonResponse = {
source_count: 2,
sources: [
{
name: 'orders',
table: 'public.orders',
grain: ['id'],
columns: [{ name: 'id', type: 'number' }],
joins: [
{
to: 'customers',
on: 'customer_id = customers.id',
relationship: 'many_to_one',
},
],
measures: [{ name: 'record_count', expr: 'count(id)' }],
},
],
};
describe('createPythonSemanticLayerComputePort', () => {
it('calls the semantic-query stdio command', async () => {
const runJson = vi.fn(async () => ({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
}));
feat(telemetry): anonymous posthog usage telemetry across node cli and python daemon (#205) * feat: add telemetry phase 1 * feat: add node telemetry event catalog * feat: add telemetry event helpers * feat: emit setup and connection telemetry * feat: emit connection and stack telemetry * feat: emit ingest and scan telemetry * feat: emit query telemetry * feat: emit sampled mcp telemetry * docs: expand telemetry event catalog * feat: add telemetry schema sync artifact * feat: pass telemetry project id to semantic daemon * feat: add daemon telemetry foundation * feat: emit semantic daemon telemetry * feat: emit daemon lifecycle telemetry * docs: document full telemetry event catalog * feat(telemetry): dim first-run notice * feat(telemetry): show first-run notice before command output * feat(telemetry): wire ktx PostHog project for live ingestion * docs(telemetry): drop posthog project name and host from storage section * docs(telemetry): trim to general overview and disclaimer * docs(agents): add short telemetry guidelines * feat(telemetry): enable posthog geoip enrichment * docs(telemetry): drop ip-geoip note from public overview * refactor(telemetry): drop no-op groupIdentify, rely on capture groups field * fix(telemetry): respect CI kill switch in python daemon identity * fix(sql): route table-count analysis to existing analyze-batch endpoint * fix(telemetry): emit install_first_run from notice path and derive flagsPresent from commander * fix(telemetry): read package info via getKtxCliPackageInfo to satisfy boundary check * fix(telemetry): make python identity env={} bypass os.environ and unset CI in tests * fix(telemetry): unset CI kill switch in cli-program-telemetry tests
2026-05-22 18:18:47 +02:00
const port = createPythonSemanticLayerComputePort({
runJson,
projectId: 'hashed-project-id',
});
2026-05-10 23:12:26 +02:00
await expect(
port.query({
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
}),
).resolves.toEqual({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
});
expect(runJson).toHaveBeenCalledWith('semantic-query', {
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
feat(telemetry): anonymous posthog usage telemetry across node cli and python daemon (#205) * feat: add telemetry phase 1 * feat: add node telemetry event catalog * feat: add telemetry event helpers * feat: emit setup and connection telemetry * feat: emit connection and stack telemetry * feat: emit ingest and scan telemetry * feat: emit query telemetry * feat: emit sampled mcp telemetry * docs: expand telemetry event catalog * feat: add telemetry schema sync artifact * feat: pass telemetry project id to semantic daemon * feat: add daemon telemetry foundation * feat: emit semantic daemon telemetry * feat: emit daemon lifecycle telemetry * docs: document full telemetry event catalog * feat(telemetry): dim first-run notice * feat(telemetry): show first-run notice before command output * feat(telemetry): wire ktx PostHog project for live ingestion * docs(telemetry): drop posthog project name and host from storage section * docs(telemetry): trim to general overview and disclaimer * docs(agents): add short telemetry guidelines * feat(telemetry): enable posthog geoip enrichment * docs(telemetry): drop ip-geoip note from public overview * refactor(telemetry): drop no-op groupIdentify, rely on capture groups field * fix(telemetry): respect CI kill switch in python daemon identity * fix(sql): route table-count analysis to existing analyze-batch endpoint * fix(telemetry): emit install_first_run from notice path and derive flagsPresent from commander * fix(telemetry): read package info via getKtxCliPackageInfo to satisfy boundary check * fix(telemetry): make python identity env={} bypass os.environ and unset CI in tests * fix(telemetry): unset CI kill switch in cli-program-telemetry tests
2026-05-22 18:18:47 +02:00
projectId: 'hashed-project-id',
2026-05-10 23:12:26 +02:00
});
});
it('calls the semantic-validate stdio command', async () => {
const runJson = vi.fn(async () => ({
valid: true,
errors: [],
warnings: [],
per_source_warnings: {},
}));
const port = createPythonSemanticLayerComputePort({ runJson });
await expect(
port.validateSources({
sources: [source],
dialect: 'postgres',
recentlyTouched: ['orders'],
}),
).resolves.toEqual({
valid: true,
errors: [],
warnings: [],
perSourceWarnings: {},
});
expect(runJson).toHaveBeenCalledWith('semantic-validate', {
sources: [source],
dialect: 'postgres',
recently_touched: ['orders'],
});
});
it('calls the semantic-generate-sources stdio command', async () => {
const runJson = vi.fn(async () => sourceGenerationDaemonResponse);
const port = createPythonSemanticLayerComputePort({ runJson });
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
sourceCount: 2,
sources: sourceGenerationDaemonResponse.sources,
});
expect(runJson).toHaveBeenCalledWith('semantic-generate-sources', sourceGenerationDaemonPayload);
});
});
describe('createHttpSemanticLayerComputePort', () => {
it('calls semantic query and validate HTTP endpoints through an injected runner', async () => {
const requestJson = vi.fn(async (path: string) => {
if (path === '/semantic-layer/query') {
return {
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
};
}
return {
valid: true,
errors: [],
warnings: [],
per_source_warnings: {},
};
});
const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });
await expect(
port.query({
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
}),
).resolves.toEqual({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
});
await expect(
port.validateSources({
sources: [source],
dialect: 'postgres',
recentlyTouched: ['orders'],
}),
).resolves.toEqual({
valid: true,
errors: [],
warnings: [],
perSourceWarnings: {},
});
expect(requestJson).toHaveBeenNthCalledWith(1, '/semantic-layer/query', {
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
});
expect(requestJson).toHaveBeenNthCalledWith(2, '/semantic-layer/validate', {
sources: [source],
dialect: 'postgres',
recently_touched: ['orders'],
});
});
it('calls the semantic source-generation HTTP endpoint through an injected runner', async () => {
const requestJson = vi.fn(async () => sourceGenerationDaemonResponse);
const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
sourceCount: 2,
sources: sourceGenerationDaemonResponse.sources,
});
expect(requestJson).toHaveBeenCalledWith('/semantic-layer/generate-sources', sourceGenerationDaemonPayload);
});
it('posts JSON to a running HTTP daemon endpoint', async () => {
const requests: Array<{ url: string | undefined; body: unknown }> = [];
const server = createServer((request, response) => {
const chunks: Buffer[] = [];
request.on('data', (chunk: Buffer) => chunks.push(chunk));
request.on('end', () => {
requests.push({
url: request.url,
body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
});
response.writeHead(200, { 'content-type': 'application/json' });
response.end(
JSON.stringify({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
}),
);
});
});
server.listen(0, '127.0.0.1');
await once(server, 'listening');
try {
const address = server.address();
if (!address || typeof address === 'string') {
throw new Error('expected TCP server address');
}
const port = createHttpSemanticLayerComputePort({ baseUrl: `http://127.0.0.1:${address.port}` });
await expect(
port.query({
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
}),
).resolves.toMatchObject({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
});
expect(requests).toEqual([
{
url: '/semantic-layer/query',
body: {
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
},
},
]);
} finally {
server.close();
}
});
it('posts source-generation JSON to a running HTTP daemon endpoint', async () => {
const requests: Array<{ url: string | undefined; body: unknown }> = [];
const server = createServer((request, response) => {
const chunks: Buffer[] = [];
request.on('data', (chunk: Buffer) => chunks.push(chunk));
request.on('end', () => {
requests.push({
url: request.url,
body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
});
response.writeHead(200, { 'content-type': 'application/json' });
response.end(JSON.stringify(sourceGenerationDaemonResponse));
});
});
server.listen(0, '127.0.0.1');
await once(server, 'listening');
try {
const address = server.address();
if (!address || typeof address === 'string') {
throw new Error('expected TCP server address');
}
const port = createHttpSemanticLayerComputePort({ baseUrl: `http://127.0.0.1:${address.port}` });
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
sourceCount: 2,
sources: sourceGenerationDaemonResponse.sources,
});
expect(requests).toEqual([
{
url: '/semantic-layer/generate-sources',
body: sourceGenerationDaemonPayload,
},
]);
} finally {
server.close();
}
});
});