ktx/packages/cli/test/telemetry/project-snapshot.test.ts

import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';

import { buildProjectStackSnapshotFields } from '../../src/telemetry/project-snapshot.js';

/**
 * Tests for `buildProjectStackSnapshotFields`.
 *
 * Every test gets its own temporary project directory, created in `beforeEach`
 * and removed in `afterEach`.
 */
describe('buildProjectStackSnapshotFields', () => {
  let projectDir: string;

  beforeEach(async () => {
    // A fresh scratch directory per test keeps on-disk fixtures isolated.
    projectDir = await mkdtemp(join(tmpdir(), 'ktx-stack-snapshot-'));
  });

  afterEach(async () => {
    // `force: true` keeps cleanup from throwing if the directory is already gone.
    await rm(projectDir, { recursive: true, force: true });
  });

  it('summarizes connectors and project capabilities without names or paths', async () => {
    // On-disk fixtures: a semantic-layer file, a wiki page and an MCP config.
    // NOTE(review): presumably these drive hasSl / hasWiki / hasMcp — confirm
    // against the implementation.
    await mkdir(join(projectDir, 'semantic-layer', 'warehouse'), { recursive: true });
    await mkdir(join(projectDir, 'wiki', 'global'), { recursive: true });
    await writeFile(join(projectDir, 'semantic-layer', 'warehouse', 'orders.yaml'), 'name: orders\n');
    await writeFile(join(projectDir, 'wiki', 'global', 'revenue.md'), '# Revenue\n');
    await writeFile(join(projectDir, '.mcp.json'), '{"mcpServers":{"ktx":{}}}\n');

    const fields = await buildProjectStackSnapshotFields({
      projectDir,
      config: {
        connections: {
          orbit_demo: { driver: 'sqlite', path: join(projectDir, 'demo.db') },
          warehouse: { driver: 'postgres', readonly: true },
        },
        ingest: {
          adapters: [],
          embeddings: { backend: 'sentence-transformers', dimensions: 384 },
          workUnits: { stepBudget: 40, maxConcurrency: 1, failureMode: 'continue' },
        },
        llm: { provider: { backend: 'none' }, models: {}, promptCaching: {} },
        scan: {
          enrichment: { mode: 'none' },
          relationships: {
            enabled: true,
            llmProposals: true,
            validationRequiredForManifest: true,
            acceptThreshold: 0.85,
            reviewThreshold: 0.55,
            maxLlmTablesPerBatch: 40,
            maxCandidatesPerColumn: 25,
            profileSampleRows: 10000,
            profileConcurrency: 4,
            validationConcurrency: 4,
          },
        },
        storage: {
          state: 'sqlite',
          search: 'sqlite-fts5',
          git: { auto_commit: true, author: 'ktx <ktx@example.com>' },
        },
        agent: { run_research: { enabled: false, max_iterations: 20, default_toolset: [] } },
        memory: { auto_commit: true },
      },
    });

    // The snapshot must match this shape exactly: driver kinds, a demo flag,
    // a count and capability booleans — nothing else.
    expect(fields).toEqual({
      connectors: [
        { driver: 'sqlite', isDemo: true },
        { driver: 'postgres', isDemo: false },
      ],
      connectionCount: 2,
      hasSl: true,
      hasWiki: true,
      hasMcp: true,
      hasManagedRuntime: true,
    });

    // Explicit leak checks: no filesystem path, no configured connection name
    // and no database file name may appear anywhere in the serialized payload.
    const serialized = JSON.stringify(fields);
    expect(serialized).not.toContain(projectDir);
    for (const connectionName of ['orbit_demo', 'warehouse']) {
      expect(serialized).not.toContain(connectionName);
    }
    expect(serialized).not.toContain('demo.db');
  });
});
feat(telemetry): anonymous posthog usage telemetry across node cli and python daemon (#205) * feat: add telemetry phase 1 * feat: add node telemetry event catalog * feat: add telemetry event helpers * feat: emit setup and connection telemetry * feat: emit connection and stack telemetry * feat: emit ingest and scan telemetry * feat: emit query telemetry * feat: emit sampled mcp telemetry * docs: expand telemetry event catalog * feat: add telemetry schema sync artifact * feat: pass telemetry project id to semantic daemon * feat: add daemon telemetry foundation * feat: emit semantic daemon telemetry * feat: emit daemon lifecycle telemetry * docs: document full telemetry event catalog * feat(telemetry): dim first-run notice * feat(telemetry): show first-run notice before command output * feat(telemetry): wire ktx PostHog project for live ingestion * docs(telemetry): drop posthog project name and host from storage section * docs(telemetry): trim to general overview and disclaimer * docs(agents): add short telemetry guidelines * feat(telemetry): enable posthog geoip enrichment * docs(telemetry): drop ip-geoip note from public overview * refactor(telemetry): drop no-op groupIdentify, rely on capture groups field * fix(telemetry): respect CI kill switch in python daemon identity * fix(sql): route table-count analysis to existing analyze-batch endpoint * fix(telemetry): emit install_first_run from notice path and derive flagsPresent from commander * fix(telemetry): read package info via getKtxCliPackageInfo to satisfy boundary check * fix(telemetry): make python identity env={} bypass os.environ and unset CI in tests * fix(telemetry): unset CI kill switch in cli-program-telemetry tests 2026-05-22 18:18:47 +02:00			`import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';`
			`import { tmpdir } from 'node:os';`
			`import { join } from 'node:path';`
			`import { afterEach, beforeEach, describe, expect, it } from 'vitest';`

test: split cli tests from source tree (#216) * feat(cli): define full warehouse dialect contract * test(cli): keep dialect edge tests focused * fix(cli): stabilize dialect contract foundation * refactor(connectors): own read-only query preparation * refactor(connectors): resolve dialects through registry * refactor(connectors): keep concrete dialect classes internal * chore(workspace): enforce dialect import boundary * refactor(cli): resolve relationship dialect at scan boundary * refactor(cli): use dialect display parsing for entity details * refactor(cli): use dialect display parsing for warehouse catalog * refactor(cli): use dialect SQL in relationship workflows * test(cli): verify solid dialect scan workflow closure * test: split cli tests from source tree * refactor(cli): standardize BigQuery scope listing * feat(sqlite): implement connector scope listing * test(connectors): cover required table listing * feat(cli): add warehouse driver registry * refactor(setup): route scope discovery through driver registry * refactor(cli): route local query execution through driver registry * refactor(historic-sql): route dialect support through driver registry * refactor(cli): test warehouse connections through driver registry * fix(cli): close driver registry type export gaps * Improve setup daemon diagnostics * refactor(setup): centralize rail-prefixed diagnostics + query-history fallback Extract errorMessage, writePrefixedLines, and flushPrefixedBufferedCommandOutput into clack.ts so the setup wizard, managed daemons, and embedding/agent steps share one rail-formatted writer. setup-databases.ts also adds a "disable query history and retry" option when the schema-context build fails and query history is the likely culprit, surfaced via a new failed-query-history-unavailable status. 
* fix(cli): carry catalog through the picker so BigQuery/Snowflake/SQL Server scope filters match The setup picker's KtxTableListEntry was a 2-level { schema, name }, so qualifiedTableId always wrote db.name into enabled_tables. When BigQuery, Snowflake, or SQL Server later ran fast ingest, their introspect step filtered the scope set with scopedTableNames(scope, { catalog: projectId\|database, db }) — catalog was non-null on the introspect side but null in the scope refs, so every entry was rejected, the live-database adapter staged zero table files, and detect() failed with 'Adapter "live-database" did not recognize fetched source output'. Align the picker boundary with the canonical 3-level KtxTableRef: - Add catalog: string \| null to KtxTableListEntry. - BigQuery/Snowflake/SQL Server listTables populate catalog from the resolved projectId / database; Postgres/MySQL/ClickHouse/SQLite set null. - qualifiedTableId emits catalog.schema.name when catalog is non-null (resolveEnabledTables already accepts the 3-part shape) and schemasFromEnabledTables now goes through parseDottedTableEntry so it recovers the schema correctly from both 2-part and 3-part entries. - Export parseDottedTableEntry from enabled-tables.ts (@internal) for picker reuse. Update listTables expectations in all seven connector tests and the setup / picker test fixtures. Add a picker regression test that covers the catalog-bearing round-trip (save + refine). * fix(cli): allow debug telemetry under opt-out env 2026-05-26 08:49:05 +02:00			`import { buildProjectStackSnapshotFields } from '../../src/telemetry/project-snapshot.js';`
feat(telemetry): anonymous posthog usage telemetry across node cli and python daemon (#205) * feat: add telemetry phase 1 * feat: add node telemetry event catalog * feat: add telemetry event helpers * feat: emit setup and connection telemetry * feat: emit connection and stack telemetry * feat: emit ingest and scan telemetry * feat: emit query telemetry * feat: emit sampled mcp telemetry * docs: expand telemetry event catalog * feat: add telemetry schema sync artifact * feat: pass telemetry project id to semantic daemon * feat: add daemon telemetry foundation * feat: emit semantic daemon telemetry * feat: emit daemon lifecycle telemetry * docs: document full telemetry event catalog * feat(telemetry): dim first-run notice * feat(telemetry): show first-run notice before command output * feat(telemetry): wire ktx PostHog project for live ingestion * docs(telemetry): drop posthog project name and host from storage section * docs(telemetry): trim to general overview and disclaimer * docs(agents): add short telemetry guidelines * feat(telemetry): enable posthog geoip enrichment * docs(telemetry): drop ip-geoip note from public overview * refactor(telemetry): drop no-op groupIdentify, rely on capture groups field * fix(telemetry): respect CI kill switch in python daemon identity * fix(sql): route table-count analysis to existing analyze-batch endpoint * fix(telemetry): emit install_first_run from notice path and derive flagsPresent from commander * fix(telemetry): read package info via getKtxCliPackageInfo to satisfy boundary check * fix(telemetry): make python identity env={} bypass os.environ and unset CI in tests * fix(telemetry): unset CI kill switch in cli-program-telemetry tests 2026-05-22 18:18:47 +02:00
			`describe('buildProjectStackSnapshotFields', () => {`
			`let projectDir: string;`

			`beforeEach(async () => {`
			`projectDir = await mkdtemp(join(tmpdir(), 'ktx-stack-snapshot-'));`
			`});`

			`afterEach(async () => {`
			`await rm(projectDir, { recursive: true, force: true });`
			`});`

			`it('summarizes connectors and project capabilities without names or paths', async () => {`
			`await mkdir(join(projectDir, 'semantic-layer', 'warehouse'), { recursive: true });`
			`await mkdir(join(projectDir, 'wiki', 'global'), { recursive: true });`
			`await writeFile(join(projectDir, 'semantic-layer', 'warehouse', 'orders.yaml'), 'name: orders\n');`
			`await writeFile(join(projectDir, 'wiki', 'global', 'revenue.md'), '# Revenue\n');`
			`await writeFile(join(projectDir, '.mcp.json'), '{"mcpServers":{"ktx":{}}}\n');`

			`const fields = await buildProjectStackSnapshotFields({`
			`projectDir,`
			`config: {`
			`connections: {`
			`orbit_demo: { driver: 'sqlite', path: join(projectDir, 'demo.db') },`
			`warehouse: { driver: 'postgres', readonly: true },`
			`},`
			`ingest: {`
			`adapters: [],`
			`embeddings: { backend: 'sentence-transformers', dimensions: 384 },`
			`workUnits: { stepBudget: 40, maxConcurrency: 1, failureMode: 'continue' },`
			`},`
			`llm: { provider: { backend: 'none' }, models: {}, promptCaching: {} },`
			`scan: {`
			`enrichment: { mode: 'none' },`
			`relationships: {`
			`enabled: true,`
			`llmProposals: true,`
			`validationRequiredForManifest: true,`
			`acceptThreshold: 0.85,`
			`reviewThreshold: 0.55,`
			`maxLlmTablesPerBatch: 40,`
			`maxCandidatesPerColumn: 25,`
			`profileSampleRows: 10000,`
fix(snowflake): unblock multi-schema ingest and relationship discovery (#204) * feat(setup): drop redundant Snowflake schema prompt; fall back to free-text on listSchemas failure Snowflake setup previously asked for a single schema as free text, then ran a multiselect against the discovered schemas — two schema questions back-to-back, with the first being only a session bootstrap. The SDK's `schema` is optional, so the bootstrap step is unnecessary. - Remove the free-text Snowflake schema prompt; only pass `schema` to snowflake-sdk when one is configured. - When `listSchemas()` fails (e.g. role lacks SHOW SCHEMAS), prompt the user for a comma-separated list, persist it as `schema_names`, and use it as both the table-list filter and the multiselect default. Applies to every driver with a scope-discovery spec, not just Snowflake. - Update docs to lead with `schema_names`; keep `schema_name` as a documented single-schema shorthand. * fix(snowflake): keep introspecting when primary-key discovery is denied The PK query joins INFORMATION_SCHEMA.TABLE_CONSTRAINTS and INFORMATION_SCHEMA.KEY_COLUMN_USAGE, which require grants the connection role may not have. Previously a 'SQL compilation error: Object ANALYTICS.INFORMATION_SCHEMA.KEY_COLUMN_USAGE does not exist or not authorized' aborted the entire introspect — schemas, columns, and row counts were all discarded over a missing nice-to-have. Wrap the constraint query in try/catch, log a one-line warning per schema, and return an empty PK map. Columns end up with primaryKey=false; relationship inference still has FK and profiling to fall back on. * fix(scan): unblock relationship discovery on Snowflake Two adjacent bugs prevented the scan's relationship pipeline from producing any joins on a Snowflake warehouse: - relationship-profiling.ts fell through to a default `GROUP_CONCAT` branch for unknown drivers. Snowflake has no GROUP_CONCAT, so every per-table profile query failed with "Unknown function GROUP_CONCAT". 
Add an explicit Snowflake branch that uses LISTAGG with a literal '\x1f' delimiter (Snowflake requires the delimiter to be a constant, so CHR(31) is rejected). - description-generation.ts destructured `connector.sampleTable` and `connector.sampleColumn` into bare locals, losing the `this` binding when the class-method connectors (Snowflake, Postgres, MySQL) were invoked. Every sample call threw "Cannot read properties of undefined (reading 'assertConnection')" and degraded LLM descriptions to metadata-only prompts. Call the methods through the connector instead. Without these, even after the primary-key probe is allowed to fail softly, the scan ends up with 0 validated relationships and an empty `joins:` block in every shard YAML. * test(scan): cover table-ref helpers * feat(scan): plumb tableScope through live-database introspection port * feat(scan): apply tableScope during metadata fetch * feat(scan): enforce table scope at fetch boundary * feat(scan): pool Snowflake sessions and batch enrichment for faster ingest (#206) * feat(cli): add RSA key-pair auth option to Snowflake setup wizard Extends the interactive Snowflake setup flow with an authentication-method prompt (password vs RSA/JWT key-pair). The RSA branch collects a private-key path (env/file/absolute) and an optional passphrase; the resulting connection config records `authMethod: 'rsa'` with `privateKey` and `passphrase` instead of `password`. 
* feat(scan): pool Snowflake sessions * fix(scan): reuse structural snapshots and cleanup connectors * feat(scan): parallelize relationship profiling * feat(scan): batch table description generation * docs: document Snowflake ingest concurrency knobs * fix(scan): close Snowflake ingest perf verification gaps * fix(scan): keep batched description failure bounded * feat(scan): dispatch query-history probes by connection driver Extract historic-sql dialect resolution into a shared helper so the status-project readiness check and the local ingest factory agree on which connections enable query history and which probe to run. The status command now picks the postgres/snowflake/bigquery probe based on the connection's driver instead of always reporting against postgres, which previously caused snowflake connections with queryHistory.enabled to surface a misleading "driver is snowflake" failure. Also drops a noisy console.warn from Snowflake primary-key discovery — INFORMATION_SCHEMA.KEY_COLUMN_USAGE is commonly ungranted for read-only roles and the FK + profiling paths handle the empty PK map already. * fix(llm): allow StructuredOutput tool and raise maxTurns for generateObject The Claude Code agent SDK announces an internal pseudo-tool named StructuredOutput in the system/init message whenever outputFormat is set to { type: 'json_schema' }. The runtime's isolation check built its allowedToolIds set only from MCP tool ids and treated StructuredOutput as an unexpected host-injected tool, so every generateObject call threw "Claude Code runtime isolation failed: tools=StructuredOutput ..." and the table-descriptions and relationship-LLM-proposal enrichment stages recorded null output across the board. Whitelist StructuredOutput specifically in generateObject's allowedToolIds — the check also enforces missing_tools symmetry, so generateText and runAgentLoop, which do not see StructuredOutput, must not require it. 
generateObject also ran with maxTurns: 1, which the model intermittently breached when it emitted thinking text before the structured response. Raised to 5 to give the schema-bound call enough headroom without allowing unbounded loops. The existing tests now exercise the path with an init message that announces StructuredOutput so the regression cannot slip back in. * chore(scripts): add ktx-reset.sh project-cleanup helper Convenience script for repeatable ingest testing: takes a project directory and prunes everything except ktx.yaml and .ktx/secrets/, so the next ktx setup or ktx ingest run starts from a known-clean state. 2026-05-23 10:41:30 +02:00			`profileConcurrency: 4,`
feat(telemetry): anonymous posthog usage telemetry across node cli and python daemon (#205) * feat: add telemetry phase 1 * feat: add node telemetry event catalog * feat: add telemetry event helpers * feat: emit setup and connection telemetry * feat: emit connection and stack telemetry * feat: emit ingest and scan telemetry * feat: emit query telemetry * feat: emit sampled mcp telemetry * docs: expand telemetry event catalog * feat: add telemetry schema sync artifact * feat: pass telemetry project id to semantic daemon * feat: add daemon telemetry foundation * feat: emit semantic daemon telemetry * feat: emit daemon lifecycle telemetry * docs: document full telemetry event catalog * feat(telemetry): dim first-run notice * feat(telemetry): show first-run notice before command output * feat(telemetry): wire ktx PostHog project for live ingestion * docs(telemetry): drop posthog project name and host from storage section * docs(telemetry): trim to general overview and disclaimer * docs(agents): add short telemetry guidelines * feat(telemetry): enable posthog geoip enrichment * docs(telemetry): drop ip-geoip note from public overview * refactor(telemetry): drop no-op groupIdentify, rely on capture groups field * fix(telemetry): respect CI kill switch in python daemon identity * fix(sql): route table-count analysis to existing analyze-batch endpoint * fix(telemetry): emit install_first_run from notice path and derive flagsPresent from commander * fix(telemetry): read package info via getKtxCliPackageInfo to satisfy boundary check * fix(telemetry): make python identity env={} bypass os.environ and unset CI in tests * fix(telemetry): unset CI kill switch in cli-program-telemetry tests 2026-05-22 18:18:47 +02:00			`validationConcurrency: 4,`
			`},`
			`},`
			`storage: {`
			`state: 'sqlite',`
			`search: 'sqlite-fts5',`
			`git: { auto_commit: true, author: 'ktx <ktx@example.com>' },`
			`},`
			`agent: { run_research: { enabled: false, max_iterations: 20, default_toolset: [] } },`
			`memory: { auto_commit: true },`
			`},`
			`});`

			`expect(fields).toEqual({`
			`connectors: [`
			`{ driver: 'sqlite', isDemo: true },`
			`{ driver: 'postgres', isDemo: false },`
			`],`
			`connectionCount: 2,`
			`hasSl: true,`
			`hasWiki: true,`
			`hasMcp: true,`
			`hasManagedRuntime: true,`
			`});`
			`expect(JSON.stringify(fields)).not.toContain(projectDir);`
			`expect(JSON.stringify(fields)).not.toContain('warehouse');`
			`});`
			`});`