diff --git a/docs/superpowers/plans/2026-05-14-research-agent-mcp-dictionary-search.md b/docs/superpowers/plans/2026-05-14-research-agent-mcp-dictionary-search.md new file mode 100644 index 00000000..63663937 --- /dev/null +++ b/docs/superpowers/plans/2026-05-14-research-agent-mcp-dictionary-search.md @@ -0,0 +1,939 @@ +# Research Agent MCP Dictionary Search Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add the MCP-shaped `dictionary_search` tool so external research agents can resolve user-mentioned literal values to profile-sampled warehouse columns. + +**Architecture:** Reuse the existing relationship-profile dictionary extraction as the source of truth, add a focused local dictionary-search service that reports coverage and non-authoritative misses per connection, then register the service through the MCP context tool surface and local project ports. The service re-reads the latest profile artifact on each call instead of keeping a long-lived cache, so scan freshness is correct for the MCP daemon v1. + +**Tech Stack:** TypeScript, Vitest, Zod, KTX local file store, relationship-profile artifacts, KTX MCP context ports. + +--- + +## Current Audit + +Original spec: `docs/superpowers/specs/2026-05-14-research-agent-mcp-tools-design.md` + +Implemented v1 slices: + +- `docs/superpowers/plans/2026-05-14-research-agent-mcp-sql-execution-foundation.md` is implemented. Current source has sqlglot read-only validation in `python/ktx-daemon/src/ktx_daemon/sql_analysis.py`, `SqlAnalysisPort.validateReadOnly()` in `packages/context/src/sql-analysis/ports.ts`, MCP `sql_execution` registration in `packages/context/src/mcp/context-tools.ts`, and local connector execution gated by validation in `packages/context/src/mcp/local-project-ports.ts`. 
+- `docs/superpowers/plans/2026-05-14-research-agent-mcp-entity-details.md` is implemented. Current source has `packages/context/src/scan/entity-details.ts`, MCP `entity_details` registration in `packages/context/src/mcp/context-tools.ts`, and local project wiring in `packages/context/src/mcp/local-project-ports.ts`. + +V1-blocking gaps remaining against the original spec: + +- `dictionary_search` is not registered on the MCP surface and `KtxMcpContextPorts` has no dictionary-search port. +- `discover_data` is not registered on the MCP surface and the unified ranked result shape is not implemented. +- The ingest-side warehouse-verification tools still use `connectionName` / `targets` / `rowLimit` contracts and have not been fully converged with shared MCP-shaped services. +- `ktx mcp start|stop|status|logs` and the HTTP Streamable MCP daemon do not exist. +- `ktx setup-agents` does not install MCP client config entries or the `ktx-research` skill. + +This plan covers only the next focused blocker: MCP `dictionary_search`. Later plans still need to cover `discover_data`, ingest contract convergence, the HTTP daemon, and setup-agent/research-skill installation. + +Non-blocking or explicitly out-of-scope gaps: + +- Python code execution over MCP. +- Stdio MCP transport. +- OS-level auto-start. +- Native TLS, audit logging, rate limiting, per-tool authorization, and multi-project daemon routing. +- Streaming SQL results. + +## File Structure + +Create: + +- `packages/context/src/sl/dictionary-search.ts` + - Reads the latest `relationship-profile.json` per searched connection. + - Uses `loadLatestSlDictionaryEntries()` for dictionary entries. + - Returns spec-shaped `searched` coverage records, matches, and per-value miss reasons. + - Re-reads artifacts per call rather than caching, satisfying MCP freshness for v1. 
+- `packages/context/src/sl/dictionary-search.test.ts` + - Covers matches, non-authoritative misses, missing profile artifacts, no candidate columns, case-insensitive substring matching, and connection scoping. + +Modify: + +- `packages/context/src/sl/index.ts` + - Export the new service and response types. +- `packages/context/src/mcp/types.ts` + - Add `KtxDictionarySearchMcpPort` and include `dictionarySearch` in `KtxMcpContextPorts`. +- `packages/context/src/mcp/context-tools.ts` + - Add the `dictionary_search` Zod schema and registration. +- `packages/context/src/mcp/server.test.ts` + - Assert MCP registration and structured output for `dictionary_search`. +- `packages/context/src/mcp/local-project-ports.ts` + - Wire local project dictionary search to the new service. +- `packages/context/src/mcp/local-project-ports.test.ts` + - Cover local-port `dictionary_search` success and missing-profile behavior. +- `packages/context/src/mcp/index.ts` + - Export the new MCP port type if it is not already covered by existing barrel exports. 
+ +## Task 1: Add The Dictionary Search Service + +**Files:** +- Create: `packages/context/src/sl/dictionary-search.test.ts` +- Create: `packages/context/src/sl/dictionary-search.ts` +- Modify: `packages/context/src/sl/index.ts` + +- [ ] **Step 1: Write failing service tests** + +Create `packages/context/src/sl/dictionary-search.test.ts`: + +```typescript +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { initKtxProject, type KtxLocalProject } from '../project/index.js'; +import { createKtxDictionarySearchService } from './dictionary-search.js'; + +describe('createKtxDictionarySearchService', () => { + let tempDir: string; + let project: KtxLocalProject; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-dictionary-search-')); + project = await initKtxProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' }); + project.config.connections.warehouse = { driver: 'postgres', url: 'env:DATABASE_URL' }; + project.config.connections.billing = { driver: 'postgres', url: 'env:BILLING_DATABASE_URL' }; + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + async function seedProfile(input: { + connectionId: string; + syncId: string; + columns: Record<string, unknown>; + }): Promise<void> { + await project.fileStore.writeFile( + `raw-sources/${input.connectionId}/live-database/${input.syncId}/enrichment/relationship-profile.json`, + `${JSON.stringify( + { + connectionId: input.connectionId, + driver: 'postgres', + sqlAvailable: true, + queryCount: 4, + tables: [], + columns: input.columns, + warnings: [], + }, + null, + 2, + )}\n`, + 'ktx', + 'ktx@example.com', + 'Seed relationship profile', + ); + } + + it('returns matches and non-authoritative misses across configured connections', async () => { + await seedProfile({ + connectionId: 'warehouse', + syncId: 'sync-1', + 
columns: { + 'orders.status': { + table: { catalog: null, db: 'public', name: 'orders' }, + column: 'status', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 3, + sampleValues: ['paid', 'refunded', 'pending'], + }, + }, + }); + await seedProfile({ + connectionId: 'billing', + syncId: 'sync-2', + columns: { + 'customers.name': { + table: { catalog: null, db: 'public', name: 'customers' }, + column: 'name', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 4, + sampleValues: ['Acme Corp', 'Globex'], + }, + }, + }); + const service = createKtxDictionarySearchService(project); + + await expect(service.search({ values: ['PAID', 'missing'] })).resolves.toEqual({ + searched: [ + { + connectionId: 'billing', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 1, + syncId: 'sync-2', + profiledAt: null, + }, + status: 'ready', + }, + { + connectionId: 'warehouse', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 1, + syncId: 'sync-1', + profiledAt: null, + }, + status: 'ready', + }, + ], + results: [ + { + value: 'PAID', + matches: [ + { + connectionId: 'warehouse', + sourceName: 'orders', + columnName: 'status', + matchedValue: 'paid', + cardinality: 3, + }, + ], + misses: [{ connectionId: 'billing', reason: 'value_not_in_sample' }], + }, + { + value: 'missing', + matches: [], + misses: [ + { connectionId: 'billing', reason: 'value_not_in_sample' }, + { connectionId: 'warehouse', reason: 'value_not_in_sample' }, + ], + }, + ], + }); + }); + + it('distinguishes missing profile artifacts from profiles with no candidate columns', async () => { + await seedProfile({ + connectionId: 'billing', + syncId: 'sync-empty', + columns: { + 'events.id': { + table: { catalog: null, db: 'public', name: 'events' }, + column: 'id', + nativeType: 'integer', + normalizedType: 'integer', + distinctCount: 100, + sampleValues: [1, 2, 3], + }, + }, + }); + const service = 
createKtxDictionarySearchService(project); + + await expect(service.search({ values: ['Acme'] })).resolves.toEqual({ + searched: [ + { + connectionId: 'billing', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 0, + syncId: 'sync-empty', + profiledAt: null, + }, + status: 'no_candidate_columns', + }, + { + connectionId: 'warehouse', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 0, + syncId: null, + profiledAt: null, + }, + status: 'no_profile_artifact', + }, + ], + results: [ + { + value: 'Acme', + matches: [], + misses: [ + { connectionId: 'billing', reason: 'no_candidate_columns' }, + { connectionId: 'warehouse', reason: 'no_profile_artifact' }, + ], + }, + ], + }); + }); + + it('scopes search to the requested connection', async () => { + await seedProfile({ + connectionId: 'warehouse', + syncId: 'sync-1', + columns: { + 'orders.status': { + table: { catalog: null, db: 'public', name: 'orders' }, + column: 'status', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 3, + sampleValues: ['paid'], + }, + }, + }); + await seedProfile({ + connectionId: 'billing', + syncId: 'sync-2', + columns: { + 'invoices.status': { + table: { catalog: null, db: 'public', name: 'invoices' }, + column: 'status', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 2, + sampleValues: ['paid'], + }, + }, + }); + const service = createKtxDictionarySearchService(project); + + await expect(service.search({ connectionId: 'billing', values: ['paid'] })).resolves.toMatchObject({ + searched: [{ connectionId: 'billing', status: 'ready' }], + results: [ + { + value: 'paid', + matches: [{ connectionId: 'billing', sourceName: 'invoices', columnName: 'status', matchedValue: 'paid' }], + misses: [], + }, + ], + }); + }); +}); +``` + +- [ ] **Step 2: Run service tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/dictionary-search.test.ts +``` + +Expected: FAIL 
with `Cannot find module './dictionary-search.js'`. + +- [ ] **Step 3: Implement the dictionary search service** + +Create `packages/context/src/sl/dictionary-search.ts`: + +```typescript +import type { KtxLocalProject } from '../project/index.js'; +import { loadLatestSlDictionaryEntries, type SlDictionaryEntry } from './sl-dictionary-profile.js'; + +export type KtxDictionarySearchStatus = 'ready' | 'no_profile_artifact' | 'no_candidate_columns'; +export type KtxDictionarySearchMissReason = 'no_profile_artifact' | 'no_candidate_columns' | 'value_not_in_sample'; + +export interface KtxDictionarySearchInput { + values: string[]; + connectionId?: string; +} + +export interface KtxDictionarySearchCoverage { + sampledRows: number | null; + valuesPerColumn: number | null; + profiledColumns: number; + syncId: string | null; + profiledAt: string | null; +} + +export interface KtxDictionarySearchSearchedConnection { + connectionId: string; + coverage: KtxDictionarySearchCoverage; + status: KtxDictionarySearchStatus; +} + +export interface KtxDictionarySearchMatch { + connectionId: string; + sourceName: string; + columnName: string; + matchedValue: string; + cardinality: number | null; +} + +export interface KtxDictionarySearchMiss { + connectionId: string; + reason: KtxDictionarySearchMissReason; +} + +export interface KtxDictionarySearchValueResult { + value: string; + matches: KtxDictionarySearchMatch[]; + misses: KtxDictionarySearchMiss[]; +} + +export interface KtxDictionarySearchResponse { + searched: KtxDictionarySearchSearchedConnection[]; + results: KtxDictionarySearchValueResult[]; +} + +interface RelationshipProfileArtifact { + connectionId?: string; + profileSampleRows?: unknown; + sampleValuesPerColumn?: unknown; + profiledAt?: unknown; + extractedAt?: unknown; +} + +function uniqueSorted(values: Iterable<string>): string[] { + return [...new Set([...values].filter((value) => value.trim().length > 0))].sort((left, right) => + left.localeCompare(right), + ); +} + 
+function latestProfileSyncId(path: string): string | null { + const parts = path.split('/'); + return parts.at(-3) ?? null; +} + +function optionalNumber(value: unknown): number | null { + return typeof value === 'number' && Number.isFinite(value) ? value : null; +} + +function optionalString(value: unknown): string | null { + return typeof value === 'string' && value.trim().length > 0 ? value : null; +} + +async function latestProfilePath(project: KtxLocalProject, connectionId: string): Promise<string | null> { + const root = `raw-sources/${connectionId}/live-database`; + let files: string[]; + try { + files = (await project.fileStore.listFiles(root)).files; + } catch { + return null; + } + return files + .filter((path) => path.endsWith('/enrichment/relationship-profile.json')) + .sort((left, right) => left.localeCompare(right)) + .at(-1) ?? null; +} + +async function readProfile(project: KtxLocalProject, path: string): Promise<RelationshipProfileArtifact> { + const raw = await project.fileStore.readFile(path); + const parsed = JSON.parse(raw.content) as unknown; + return typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed) + ? 
(parsed as RelationshipProfileArtifact) + : {}; +} + +function profiledColumnCount(entries: readonly SlDictionaryEntry[]): number { + return new Set(entries.map((entry) => `${entry.sourceName}\u001f${entry.columnName}`)).size; +} + +async function searchedConnection( + project: KtxLocalProject, + connectionId: string, + entries: readonly SlDictionaryEntry[], +): Promise<KtxDictionarySearchSearchedConnection> { + const path = await latestProfilePath(project, connectionId); + if (!path) { + return { + connectionId, + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 0, + syncId: null, + profiledAt: null, + }, + status: 'no_profile_artifact', + }; + } + + const profile = await readProfile(project, path); + const count = profiledColumnCount(entries); + return { + connectionId, + coverage: { + sampledRows: optionalNumber(profile.profileSampleRows), + valuesPerColumn: optionalNumber(profile.sampleValuesPerColumn), + profiledColumns: count, + syncId: latestProfileSyncId(path), + profiledAt: optionalString(profile.profiledAt) ?? optionalString(profile.extractedAt), + }, + status: count > 0 ? 
'ready' : 'no_candidate_columns', + }; +} + +function entryMatchesValue(entry: SlDictionaryEntry, value: string): boolean { + return entry.value.toLowerCase().includes(value.toLowerCase()); +} + +function toMatch(entry: SlDictionaryEntry): KtxDictionarySearchMatch { + return { + connectionId: entry.connectionId, + sourceName: entry.sourceName, + columnName: entry.columnName, + matchedValue: entry.value, + cardinality: entry.cardinality, + }; +} + +function sortMatches(matches: KtxDictionarySearchMatch[]): KtxDictionarySearchMatch[] { + return matches.sort( + (left, right) => + left.connectionId.localeCompare(right.connectionId) || + left.sourceName.localeCompare(right.sourceName) || + left.columnName.localeCompare(right.columnName) || + left.matchedValue.localeCompare(right.matchedValue), + ); +} + +function missReason(status: KtxDictionarySearchStatus): KtxDictionarySearchMissReason { + return status === 'ready' ? 'value_not_in_sample' : status; +} + +export function createKtxDictionarySearchService(project: KtxLocalProject) { + return { + async search(input: KtxDictionarySearchInput): Promise<KtxDictionarySearchResponse> { + const connectionIds = input.connectionId ? [input.connectionId] : uniqueSorted(Object.keys(project.config.connections)); + const entries = await loadLatestSlDictionaryEntries(project, connectionIds); + const entriesByConnection = new Map<string, SlDictionaryEntry[]>(); + for (const connectionId of connectionIds) { + entriesByConnection.set( + connectionId, + entries.filter((entry) => entry.connectionId === connectionId), + ); + } + + const searched = ( + await Promise.all( + connectionIds.map((connectionId) => + searchedConnection(project, connectionId, entriesByConnection.get(connectionId) ?? 
[]), + ), + ) + ).sort((left, right) => left.connectionId.localeCompare(right.connectionId)); + const searchedByConnection = new Map(searched.map((connection) => [connection.connectionId, connection])); + + return { + searched, + results: input.values.map((value) => { + const matches = sortMatches(entries.filter((entry) => entryMatchesValue(entry, value)).map(toMatch)); + const matchedConnections = new Set(matches.map((match) => match.connectionId)); + return { + value, + matches, + misses: searched + .filter((connection) => !matchedConnections.has(connection.connectionId)) + .map((connection) => ({ + connectionId: connection.connectionId, + reason: missReason(searchedByConnection.get(connection.connectionId)?.status ?? 'no_profile_artifact'), + })), + }; + }), + }; + }, + }; +} +``` + +- [ ] **Step 4: Export the service** + +In `packages/context/src/sl/index.ts`, add: + +```typescript +export { + createKtxDictionarySearchService, +} from './dictionary-search.js'; +export type { + KtxDictionarySearchCoverage, + KtxDictionarySearchInput, + KtxDictionarySearchMatch, + KtxDictionarySearchMiss, + KtxDictionarySearchMissReason, + KtxDictionarySearchResponse, + KtxDictionarySearchSearchedConnection, + KtxDictionarySearchStatus, + KtxDictionarySearchValueResult, +} from './dictionary-search.js'; +``` + +- [ ] **Step 5: Run service tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/dictionary-search.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit the service slice** + +Run: + +```bash +git add packages/context/src/sl/dictionary-search.ts packages/context/src/sl/dictionary-search.test.ts packages/context/src/sl/index.ts +git commit -m "feat(context): add dictionary search service" +``` + +## Task 2: Register The MCP `dictionary_search` Tool + +**Files:** +- Modify: `packages/context/src/mcp/types.ts` +- Modify: `packages/context/src/mcp/context-tools.ts` +- Modify: `packages/context/src/mcp/server.test.ts` +- Modify: `packages/context/src/mcp/index.ts` + +- [ ] **Step 1: Add MCP port types** + +In `packages/context/src/mcp/types.ts`, extend the imports: + +```typescript +import type { KtxDictionarySearchInput, KtxDictionarySearchResponse } from '../sl/index.js'; +``` + +Add this interface near the other MCP port interfaces: + +```typescript +export interface KtxDictionarySearchMcpPort { + search(input: KtxDictionarySearchInput): Promise<KtxDictionarySearchResponse>; +} +``` + +Add the new optional port to `KtxMcpContextPorts`: + +```typescript +export interface KtxMcpContextPorts { + connections?: KtxConnectionsMcpPort; + knowledge?: KtxKnowledgeMcpPort; + semanticLayer?: KtxSemanticLayerMcpPort; + entityDetails?: KtxEntityDetailsMcpPort; + dictionarySearch?: KtxDictionarySearchMcpPort; + sqlExecution?: KtxSqlExecutionMcpPort; + ingest?: KtxIngestMcpPort; + scan?: KtxScanMcpPort; +} +``` + +- [ ] **Step 2: Write failing MCP registration test** + +In `packages/context/src/mcp/server.test.ts`, update the type import list to include: + +```typescript +KtxDictionarySearchMcpPort, +``` + +Add this test after the `entity_details` registration test: + +```typescript + it('registers dictionary_search when the host provides a dictionary-search port', async () => { + const fake = makeFakeServer(); + const dictionarySearch: KtxDictionarySearchMcpPort = { + search: vi.fn().mockResolvedValue({ + searched: [ + { + connectionId: 'warehouse', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 1, + 
syncId: 'sync-1', + profiledAt: null, + }, + status: 'ready', + }, + ], + results: [ + { + value: 'paid', + matches: [ + { + connectionId: 'warehouse', + sourceName: 'orders', + columnName: 'status', + matchedValue: 'paid', + cardinality: 3, + }, + ], + misses: [], + }, + ], + }), + }; + + createKtxMcpServer({ + server: fake.server, + userContext: { userId: 'local-user' }, + contextTools: { dictionarySearch }, + }); + + expect(fake.tools.map((tool) => tool.name)).toEqual(['dictionary_search']); + await expect( + getTool(fake.tools, 'dictionary_search').handler({ + connectionId: 'warehouse', + values: ['paid'], + }), + ).resolves.toMatchObject({ + structuredContent: { + searched: [{ connectionId: 'warehouse', status: 'ready' }], + results: [ + { + value: 'paid', + matches: [{ connectionId: 'warehouse', sourceName: 'orders', columnName: 'status' }], + misses: [], + }, + ], + }, + }); + expect(dictionarySearch.search).toHaveBeenCalledWith({ + connectionId: 'warehouse', + values: ['paid'], + }); + }); +``` + +- [ ] **Step 3: Run failing MCP registration test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/server.test.ts -t "dictionary_search" +``` + +Expected: FAIL because `dictionary_search` is not registered. 
+ +- [ ] **Step 4: Add the MCP schema and registration** + +In `packages/context/src/mcp/context-tools.ts`, add the input schema near the other research schemas: + +```typescript +const dictionarySearchSchema = z.object({ + values: z.array(z.string().min(1)).min(1).max(20), + connectionId: connectionIdSchema.optional(), +}); +``` + +Add this registration block after `entity_details` and before `sql_execution`: + +```typescript + if (ports.dictionarySearch) { + const dictionarySearch = ports.dictionarySearch; + registerParsedTool( + server, + 'dictionary_search', + { + title: 'Dictionary Search', + description: + 'Search profile-sampled warehouse values and report matching connection/source/column locations plus non-authoritative miss reasons.', + inputSchema: dictionarySearchSchema.shape, + }, + dictionarySearchSchema, + async (input) => jsonToolResult(await dictionarySearch.search(input)), + ); + } +``` + +- [ ] **Step 5: Confirm MCP barrel exports** + +Open `packages/context/src/mcp/index.ts`. If it exports from `./types.js`, no change is needed. If it lists named type exports, add `KtxDictionarySearchMcpPort` to that list. + +- [ ] **Step 6: Run MCP registration test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/server.test.ts -t "dictionary_search" +``` + +Expected: PASS. 
+ +- [ ] **Step 7: Commit MCP registration** + +Run: + +```bash +git add packages/context/src/mcp/types.ts packages/context/src/mcp/context-tools.ts packages/context/src/mcp/server.test.ts packages/context/src/mcp/index.ts +git commit -m "feat(context): register MCP dictionary search tool" +``` + +## Task 3: Wire Local Project MCP Ports + +**Files:** +- Modify: `packages/context/src/mcp/local-project-ports.ts` +- Modify: `packages/context/src/mcp/local-project-ports.test.ts` + +- [ ] **Step 1: Write failing local-port tests** + +In `packages/context/src/mcp/local-project-ports.test.ts`, add this test after the entity-details local-port tests: + +```typescript + it('exposes local dictionary search through MCP ports', async () => { + const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + await project.fileStore.writeFile( + 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json', + `${JSON.stringify( + { + connectionId: 'warehouse', + driver: 'postgres', + sqlAvailable: true, + queryCount: 4, + tables: [], + columns: { + 'orders.status': { + table: { catalog: null, db: 'public', name: 'orders' }, + column: 'status', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 2, + sampleValues: ['paid', 'refunded'], + }, + }, + warnings: [], + }, + null, + 2, + )}\n`, + 'ktx', + 'ktx@example.com', + 'Seed dictionary profile', + ); + + const ports = createLocalProjectMcpContextPorts(project); + + await expect(ports.dictionarySearch?.search({ values: ['paid'] })).resolves.toMatchObject({ + searched: [{ connectionId: 'warehouse', status: 'ready' }], + results: [ + { + value: 'paid', + matches: [{ connectionId: 'warehouse', sourceName: 'orders', columnName: 'status', matchedValue: 'paid' }], + misses: [], + }, + ], + }); + }); + + it('reports missing local dictionary profiles through MCP ports', async () => { 
+ const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + + const ports = createLocalProjectMcpContextPorts(project); + + await expect(ports.dictionarySearch?.search({ values: ['paid'] })).resolves.toEqual({ + searched: [ + { + connectionId: 'warehouse', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 0, + syncId: null, + profiledAt: null, + }, + status: 'no_profile_artifact', + }, + ], + results: [ + { + value: 'paid', + matches: [], + misses: [{ connectionId: 'warehouse', reason: 'no_profile_artifact' }], + }, + ], + }); + }); +``` + +- [ ] **Step 2: Run failing local-port tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts -t "dictionary" +``` + +Expected: FAIL because `ports.dictionarySearch` is undefined. + +- [ ] **Step 3: Wire the local port** + +In `packages/context/src/mcp/local-project-ports.ts`, update the SL import block to include: + +```typescript +createKtxDictionarySearchService, +``` + +Add this port to the `ports` object returned by `createLocalProjectMcpContextPorts()` near `entityDetails`: + +```typescript + dictionarySearch: { + async search(input) { + return createKtxDictionarySearchService(project).search(input); + }, + }, +``` + +- [ ] **Step 4: Run local-port tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts -t "dictionary" +``` + +Expected: PASS. + +- [ ] **Step 5: Commit local-port wiring** + +Run: + +```bash +git add packages/context/src/mcp/local-project-ports.ts packages/context/src/mcp/local-project-ports.test.ts +git commit -m "feat(context): expose local MCP dictionary search" +``` + +## Task 4: Final Verification + +**Files:** +- Verify all files changed in Tasks 1-3. 
+ +- [ ] **Step 1: Run focused tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sl/dictionary-search.test.ts src/mcp/server.test.ts src/mcp/local-project-ports.test.ts +``` + +Expected: PASS for dictionary-search service, MCP registration, and local-port coverage. + +- [ ] **Step 2: Run context type-check** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS. + +- [ ] **Step 3: Inspect diff** + +Run: + +```bash +git status --short +git diff --stat HEAD +``` + +Expected: only the dictionary-search service, MCP type/registration, tests, and exports changed. + +- [ ] **Step 4: Commit verification note if needed** + +If the previous tasks already committed all source changes, do not create an empty commit. If a small follow-up fix was required during verification, commit only those files: + +```bash +git add packages/context/src/sl/dictionary-search.ts packages/context/src/sl/dictionary-search.test.ts packages/context/src/sl/index.ts packages/context/src/mcp/types.ts packages/context/src/mcp/context-tools.ts packages/context/src/mcp/server.test.ts packages/context/src/mcp/index.ts packages/context/src/mcp/local-project-ports.ts packages/context/src/mcp/local-project-ports.test.ts +git commit -m "test(context): cover MCP dictionary search" +``` diff --git a/docs/superpowers/plans/2026-05-14-research-agent-mcp-discover-data.md b/docs/superpowers/plans/2026-05-14-research-agent-mcp-discover-data.md new file mode 100644 index 00000000..a917eb72 --- /dev/null +++ b/docs/superpowers/plans/2026-05-14-research-agent-mcp-discover-data.md @@ -0,0 +1,1315 @@ +# Research Agent MCP Discover Data Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Add the MCP-shaped `discover_data` tool so external research agents get one ranked discovery view across wiki pages, semantic-layer sources/measures/dimensions, and raw warehouse schema. + +**Architecture:** Create a focused local discovery service in `packages/context/src/search/discover.ts` that builds deterministic per-kind refs from existing wiki, semantic-layer, and latest scan artifacts, fuses the wiki/SL/raw sub-searches with the existing RRF core, and re-reads local artifacts on every call for MCP daemon freshness. Register the service through the MCP context port and local project MCP ports without changing the existing ingest-only `discover_data` adapter yet. + +**Tech Stack:** TypeScript, Vitest, Zod, KTX local file store, KTX wiki/SL/scan services, KTX MCP context ports, existing `HybridSearchCore`/RRF search utilities. + +--- + +## Audit Summary + +Original spec: `docs/superpowers/specs/2026-05-14-research-agent-mcp-tools-design.md` + +Implemented v1 slices confirmed in current source: + +- Existing in-process MCP semantic runtime exists in `packages/context/src/mcp/server.ts`, `packages/context/src/mcp/context-tools.ts`, and `packages/context/src/mcp/local-project-ports.ts`. +- Ingest-only warehouse verification tools exist under `packages/context/src/ingest/tools/warehouse-verification/`. +- MCP `sql_execution` is implemented and parser-gated: `python/ktx-daemon/src/ktx_daemon/sql_analysis.py` has `validate_read_only_sql_response`, `python/ktx-daemon/src/ktx_daemon/app.py` exposes `POST /sql/validate-read-only`, `packages/context/src/sql-analysis/ports.ts` has `validateReadOnly()`, and `packages/context/src/mcp/context-tools.ts` registers `sql_execution`. +- MCP `entity_details` is implemented: `packages/context/src/scan/entity-details.ts`, `KtxEntityDetailsMcpPort`, context-tool registration, and local project wiring all exist. 
+- MCP `dictionary_search` is implemented: `packages/context/src/sl/dictionary-search.ts`, `KtxDictionarySearchMcpPort`, context-tool registration, and local project wiring all exist. + +V1-blocking gaps still open: + +- `discover_data` is not implemented on the MCP surface. There is no `packages/context/src/search/discover.ts`, no `KtxDiscoverDataMcpPort`, no `ports.discover`, no MCP registration, and no local project wiring. +- `ktx mcp start|stop|status|logs` and the HTTP Streamable MCP daemon do not exist. There is no `packages/cli/src/commands/mcp-commands.ts`, no `packages/cli/src/managed-mcp-daemon.ts`, and `packages/cli/src/cli-program.ts` does not register an `mcp` command subtree. +- `ktx setup-agents` does not install `ktx-research`, write Claude Code/Cursor MCP JSON entries, or print Codex/opencode snippets. `plannedKtxAgentFiles()` still installs only the existing `ktx` skill/rule files. +- Ingest-side warehouse verification tools still use `connectionName`, `targets`, and `rowLimit` contracts. The original spec says these should converge on `connectionId` naming, but that cleanup can be planned after the MCP research surface is complete because this plan adds a separate MCP adapter with the required shape. + +Non-blocking or explicitly out-of-scope gaps: + +- Python code execution via MCP. +- Stdio MCP transport. +- OS-level auto-start. +- Native TLS, audit logging, rate limiting, per-tool authorization, and multi-project daemon routing. +- Streaming SQL results. + +This plan covers only the next dependency-ordered v1 blocker: MCP `discover_data`. Later v1 plans still need to cover the HTTP daemon and setup-agent/research-skill installation. + +## File Structure + +Create: + +- `packages/context/src/search/discover.ts` + - Defines MCP-shaped `discover_data` input, ref, and response types. + - Searches wiki pages through `searchLocalKnowledgePages()` and `readLocalKnowledgePage()`. 
+ - Searches semantic-layer records through `loadLocalSlSourceRecords()`. + - Searches raw schema by reading the latest `raw-sources/<connectionId>/live-database/<syncId>/` scan artifacts directly. + - Fuses wiki, SL, and raw-schema candidates with `HybridSearchCore` using equal lane weights and normalizes final scores to `0..1`. + - Re-reads artifacts on every call; no long-lived cache. +- `packages/context/src/search/discover.test.ts` + - Covers unified result shape, kind filtering, connection scoping, score normalization, snippet cap, raw table refs, and freshness after a newer scan appears. + +Modify: + +- `packages/context/src/search/index.ts` + - Export `createKtxDiscoverDataService` and discover types. +- `packages/context/src/mcp/types.ts` + - Add `KtxDiscoverDataMcpPort` and `discover?: KtxDiscoverDataMcpPort` to `KtxMcpContextPorts`. +- `packages/context/src/mcp/context-tools.ts` + - Add the `discover_data` Zod schema and tool registration. +- `packages/context/src/mcp/server.test.ts` + - Assert `discover_data` registration and structured array output. +- `packages/context/src/mcp/local-project-ports.ts` + - Wire local project `discover.search()` to `createKtxDiscoverDataService()`. +- `packages/context/src/mcp/local-project-ports.test.ts` + - Cover local-port `discover_data` across wiki, SL, and raw schema. +- `packages/context/src/mcp/index.ts` + - Export the new MCP port type if it is not already covered by existing barrel exports. 
+ +## Task 1: Add The Local Discover Data Service + +**Files:** +- Create: `packages/context/src/search/discover.test.ts` +- Create: `packages/context/src/search/discover.ts` +- Modify: `packages/context/src/search/index.ts` + +- [ ] **Step 1: Write failing service tests** + +Create `packages/context/src/search/discover.test.ts`: + +```typescript +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { initKtxProject, type KtxLocalProject } from '../project/index.js'; +import { writeLocalKnowledgePage } from '../wiki/local-knowledge.js'; +import { createKtxDiscoverDataService } from './discover.js'; + +describe('createKtxDiscoverDataService', () => { + let tempDir: string; + let project: KtxLocalProject; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-discover-data-')); + project = await initKtxProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' }); + project.config.connections.warehouse = { driver: 'postgres', url: 'env:DATABASE_URL' }; + project.config.connections.billing = { driver: 'postgres', url: 'env:BILLING_DATABASE_URL' }; + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + async function seedWiki(): Promise<void> { + await writeLocalKnowledgePage(project, { + key: 'orders-playbook', + scope: 'GLOBAL', + summary: 'Paid order operations', + content: 'Use paid orders and order_count to inspect monthly customer activity for Acme Corp.', + tags: ['orders'], + }); + } + + async function seedSl(): Promise<void> { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/orders.yaml', + [ + 'name: orders', + 'descriptions:', + ' user: Paid order facts', + 'table: public.orders', + 'grain: [id]', + 'columns:', + ' - name: status', + ' type: string', + ' descriptions:', + ' user: Payment status for the order', + ' - name: ordered_at', + 
' type: time', + 'measures:', + ' - name: order_count', + ' expr: count(*)', + ' description: Number of paid orders', + '', + ].join('\n'), + 'ktx', + 'ktx@example.com', + 'seed sl source', + ); + } + + async function seedScan(input: { + connectionId?: string; + syncId: string; + tableName?: string; + comment?: string; + sampleValues?: string[]; + }): Promise<void> { + const connectionId = input.connectionId ?? 'warehouse'; + const root = `raw-sources/${connectionId}/live-database/${input.syncId}`; + const tableName = input.tableName ?? 'orders'; + await project.fileStore.writeFile( + `${root}/connection.json`, + JSON.stringify( + { + connectionId, + driver: 'postgres', + extractedAt: `2026-05-14T09:00:00.000Z`, + scope: { schemas: ['public'] }, + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed scan connection', + ); + await project.fileStore.writeFile( + `${root}/tables/public-${tableName}.json`, + JSON.stringify( + { + catalog: null, + db: 'public', + name: tableName, + kind: 'table', + comment: input.comment ?? 'Orders table from warehouse', + estimatedRows: 123, + descriptions: { db: input.comment ?? 'Orders table from warehouse' }, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: 'Order id', + }, + { + name: 'status', + nativeType: 'text', + normalizedType: 'text', + dimensionType: 'string', + nullable: false, + primaryKey: false, + comment: 'Order status', + sampleValues: input.sampleValues ?? 
['paid', 'pending'], + }, + ], + foreignKeys: [], + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed table', + ); + await project.fileStore.writeFile( + `${root}/scan-report.json`, + JSON.stringify( + { + connectionId, + driver: 'postgres', + syncId: input.syncId, + runId: `scan-${input.syncId}`, + trigger: 'mcp', + mode: 'enriched', + dryRun: false, + artifactPaths: { + rawSourcesDir: root, + reportPath: `${root}/scan-report.json`, + manifestShards: [], + enrichmentArtifacts: [], + }, + diffSummary: { + tablesAdded: 1, + tablesModified: 0, + tablesDeleted: 0, + tablesUnchanged: 0, + columnsAdded: 0, + columnsModified: 0, + columnsDeleted: 0, + }, + manifestShardsWritten: 0, + structuralSyncStats: { + tablesCreated: 0, + tablesUpdated: 0, + tablesDeleted: 0, + columnsCreated: 0, + columnsUpdated: 0, + columnsDeleted: 0, + }, + enrichment: { + dataDictionary: 'completed', + tableDescriptions: 'completed', + columnDescriptions: 'completed', + embeddings: 'skipped', + deterministicRelationships: 'skipped', + llmRelationshipValidation: 'skipped', + statisticalValidation: 'skipped', + }, + capabilityGaps: [], + warnings: [], + relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 }, + enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] }, + createdAt: '2026-05-14T09:00:00.000Z', + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed scan report', + ); + } + + it('returns unified ranked refs across wiki, semantic-layer, and raw schema', async () => { + await seedWiki(); + await seedSl(); + await seedScan({ syncId: 'sync-1', sampleValues: ['paid', 'refunded'] }); + const service = createKtxDiscoverDataService(project, { userId: 'local-user' }); + + const results = await service.search({ query: 'paid orders', connectionId: 'warehouse', limit: 10 }); + + expect(results.map((result) => result.kind)).toEqual( + expect.arrayContaining(['wiki', 'sl_source', 'sl_measure', 'sl_dimension', 'table', 'column']), + ); + 
expect(results.every((result) => result.score >= 0 && result.score <= 1)).toBe(true); + expect(results.every((result) => result.snippet === null || result.snippet.length <= 200)).toBe(true); + expect(results).toContainEqual( + expect.objectContaining({ + kind: 'table', + id: 'public.orders', + connectionId: 'warehouse', + tableRef: { catalog: null, db: 'public', name: 'orders' }, + matchedOn: expect.stringMatching(/name|description|comment|display/), + }), + ); + expect(results).toContainEqual( + expect.objectContaining({ + kind: 'column', + id: 'public.orders.status', + connectionId: 'warehouse', + columnName: 'status', + matchedOn: expect.stringMatching(/name|comment|description|sample_value/), + }), + ); + expect(results).toContainEqual( + expect.objectContaining({ + kind: 'sl_measure', + id: 'orders.order_count', + connectionId: 'warehouse', + summary: 'Number of paid orders', + snippet: 'count(*)', + matchedOn: expect.stringMatching(/name|description|expr/), + }), + ); + }); + + it('honors kind filters and connection scope', async () => { + await seedWiki(); + await seedSl(); + await seedScan({ syncId: 'sync-1', connectionId: 'warehouse', tableName: 'orders' }); + await seedScan({ syncId: 'sync-2', connectionId: 'billing', tableName: 'invoices', comment: 'Billing invoices' }); + const service = createKtxDiscoverDataService(project); + + const results = await service.search({ + query: 'orders', + connectionId: 'warehouse', + kinds: ['table', 'column'], + limit: 10, + }); + + expect(results.every((result) => result.kind === 'table' || result.kind === 'column')).toBe(true); + expect(results.every((result) => result.connectionId === 'warehouse')).toBe(true); + expect(results.some((result) => result.id.includes('invoices'))).toBe(false); + expect(results.some((result) => result.kind === 'wiki')).toBe(false); + }); + + it('re-reads the latest scan artifacts on each call', async () => { + await seedScan({ syncId: 'sync-1', tableName: 'orders', comment: 'Old orders 
table' }); + const service = createKtxDiscoverDataService(project); + await expect(service.search({ query: 'orders', connectionId: 'warehouse', kinds: ['table'], limit: 10 })).resolves.toEqual( + expect.arrayContaining([expect.objectContaining({ id: 'public.orders' })]), + ); + + await seedScan({ syncId: 'sync-2', tableName: 'invoices', comment: 'Invoice facts' }); + const fresh = await service.search({ query: 'invoice', connectionId: 'warehouse', kinds: ['table'], limit: 10 }); + + expect(fresh).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'public.invoices' })])); + expect(fresh.some((result) => result.id === 'public.orders')).toBe(false); + }); +}); +``` + +- [ ] **Step 2: Run the failing service tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/search/discover.test.ts +``` + +Expected: FAIL with `Cannot find module './discover.js'`. + +- [ ] **Step 3: Implement the discover service** + +Create `packages/context/src/search/discover.ts`: + +```typescript +import type { KtxEmbeddingPort } from '../core/index.js'; +import type { KtxLocalProject } from '../project/index.js'; +import type { KtxScanReport, KtxSchemaColumn, KtxSchemaTable, KtxTableRef } from '../scan/index.js'; +import { DEFAULT_PRIORITY, loadLocalSlSourceRecords, resolveDescription } from '../sl/index.js'; +import type { SemanticLayerSource } from '../sl/index.js'; +import { readLocalKnowledgePage, searchLocalKnowledgePages } from '../wiki/local-knowledge.js'; +import { HybridSearchCore, type FusedSearchCandidate, type SearchCandidateGenerator } from './index.js'; + +export type KtxDiscoverDataKind = 'wiki' | 'sl_source' | 'sl_measure' | 'sl_dimension' | 'table' | 'column'; +export type KtxDiscoverDataMatchedOn = + | 'name' + | 'display' + | 'description' + | 'comment' + | 'expr' + | 'sample_value' + | 'body'; + +export interface KtxDiscoverDataInput { + query: string; + connectionId?: string; + kinds?: KtxDiscoverDataKind[]; + limit?: number; +} + +export 
interface KtxDiscoverDataRef { + kind: KtxDiscoverDataKind; + id: string; + score: number; + summary: string | null; + snippet: string | null; + matchedOn: KtxDiscoverDataMatchedOn; + connectionId?: string; + tableRef?: KtxTableRef; + columnName?: string; +} + +export type KtxDiscoverDataResponse = KtxDiscoverDataRef[]; + +export interface KtxDiscoverDataServiceOptions { + userId?: string; + embeddingService?: KtxEmbeddingPort | null; +} + +interface CandidateRecord { + ref: Omit<KtxDiscoverDataRef, 'score'>; + rankScore: number; +} + +type RawTable = KtxSchemaTable & { + descriptions?: Record<string, string>; + columns: Array<KtxSchemaColumn & { descriptions?: Record<string, string>; sampleValues?: unknown[] }>; +}; + +interface LatestScan { + report: KtxScanReport; + rawSourcesDir: string; + tables: RawTable[]; +} + +const ALL_KINDS: KtxDiscoverDataKind[] = ['wiki', 'sl_source', 'sl_measure', 'sl_dimension', 'table', 'column']; + +function normalize(value: string | null | undefined): string { + return (value ?? '').toLowerCase(); +} + +function queryTerms(query: string): string[] { + return query + .toLowerCase() + .split(/[^a-z0-9_]+/u) + .map((term) => term.trim()) + .filter(Boolean); +} + +function hasKind(kinds: ReadonlySet<KtxDiscoverDataKind>, kind: KtxDiscoverDataKind): boolean { + return kinds.has(kind); +} + +function cap200(value: string | null | undefined): string | null { + if (!value) { + return null; + } + const compact = value.replace(/\s+/g, ' ').trim(); + return compact.length > 200 ? compact.slice(0, 200) : compact; +} + +function snippetAround(text: string | null | undefined, terms: readonly string[]): string | null { + if (!text) { + return null; + } + const lower = text.toLowerCase(); + const index = terms.map((term) => lower.indexOf(term)).filter((position) => position >= 0).sort((a, b) => a - b)[0] ?? 
0; + return cap200(text.slice(Math.max(0, index - 60), index + 140)); +} + +function textScore(value: string | null | undefined, terms: readonly string[]): number { + const haystack = normalize(value); + if (!haystack || terms.length === 0) { + return 0; + } + const matched = terms.filter((term) => haystack.includes(term)).length; + return matched / terms.length; +} + +function bestField( + fields: Array<{ matchedOn: KtxDiscoverDataMatchedOn; text: string | null | undefined; weight: number }>, + terms: readonly string[], +): { matchedOn: KtxDiscoverDataMatchedOn; score: number; text: string | null } | null { + const scored = fields + .map((field) => ({ + matchedOn: field.matchedOn, + score: textScore(field.text, terms) * field.weight, + text: field.text ?? null, + })) + .filter((field) => field.score > 0) + .sort((left, right) => right.score - left.score || left.matchedOn.localeCompare(right.matchedOn)); + return scored[0] ?? null; +} + +function displayForTable(table: KtxTableRef): string { + return [table.catalog, table.db, table.name].filter((part): part is string => Boolean(part)).join('.'); +} + +function tableRef(table: KtxSchemaTable): KtxTableRef { + return { catalog: table.catalog, db: table.db, name: table.name }; +} + +async function readJson<T>(project: KtxLocalProject, path: string): Promise<T> { + return JSON.parse((await project.fileStore.readFile(path)).content) as T; +} + +async function latestScan(project: KtxLocalProject, connectionId: string): Promise<LatestScan | null> { + const root = `raw-sources/${connectionId}/live-database`; + let files: string[]; + try { + files = (await project.fileStore.listFiles(root)).files; + } catch { + return null; + } + + const reportPath = files.filter((path) => path.endsWith('/scan-report.json')).sort().at(-1); + if (!reportPath) { + return null; + } + const report = await readJson<KtxScanReport>(project, reportPath); + const rawSourcesDir = report.artifactPaths.rawSourcesDir ?? 
reportPath.slice(0, -'/scan-report.json'.length); + const listedTables = await project.fileStore.listFiles(`${rawSourcesDir}/tables`); + const tables: RawTable[] = []; + for (const path of listedTables.files.filter((file) => file.endsWith('.json')).sort()) { + tables.push(await readJson<RawTable>(project, path)); + } + return { report, rawSourcesDir, tables }; +} + +function configuredConnectionIds(project: KtxLocalProject, connectionId?: string): string[] { + return connectionId ? [connectionId] : Object.keys(project.config.connections).sort(); +} + +async function wikiCandidates( + project: KtxLocalProject, + input: KtxDiscoverDataInput, + options: KtxDiscoverDataServiceOptions, + terms: readonly string[], +): Promise<CandidateRecord[]> { + const searchResults = await searchLocalKnowledgePages(project, { + query: input.query, + userId: options.userId, + embeddingService: options.embeddingService ?? null, + limit: Math.max(input.limit ?? 15, 25), + }); + const records: CandidateRecord[] = []; + for (const result of searchResults) { + const page = await readLocalKnowledgePage(project, { key: result.key, userId: options.userId }); + const content = page?.content ?? ''; + const matched = bestField( + [ + { matchedOn: 'name', text: result.key, weight: 1.1 }, + { matchedOn: 'description', text: result.summary, weight: 1 }, + { matchedOn: 'body', text: content, weight: 0.8 }, + ], + terms, + ); + records.push({ + rankScore: result.score + (matched?.score ?? 0), + ref: { + kind: 'wiki', + id: result.key, + summary: result.summary || null, + snippet: snippetAround(content, terms), + matchedOn: matched?.matchedOn ?? 
'body', + }, + }); + } + return records.sort((left, right) => right.rankScore - left.rankScore || left.ref.id.localeCompare(right.ref.id)); +} + +async function slCandidates( + project: KtxLocalProject, + input: KtxDiscoverDataInput, + kinds: ReadonlySet<KtxDiscoverDataKind>, + terms: readonly string[], +): Promise<CandidateRecord[]> { + const records: CandidateRecord[] = []; + for (const connectionId of configuredConnectionIds(project, input.connectionId)) { + const sources = await loadLocalSlSourceRecords(project, { connectionId }).catch(() => []); + for (const sourceRecord of sources) { + const source = sourceRecord.source; + if (hasKind(kinds, 'sl_source')) { + const description = resolveDescription(source.descriptions, { priority: DEFAULT_PRIORITY }); + const matched = bestField( + [ + { matchedOn: 'name', text: source.name, weight: 1.2 }, + { matchedOn: 'description', text: description, weight: 1 }, + { matchedOn: 'display', text: source.table ?? source.sql ?? null, weight: 0.8 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'sl_source', + id: source.name, + connectionId, + summary: description, + snippet: + matched.matchedOn === 'description' + ? snippetAround(description, terms) + : cap200(`${source.name}: ${[...source.measures.map((measure) => measure.name), ...source.columns.map((column) => column.name)].slice(0, 3).join(', ')}`), + matchedOn: matched.matchedOn, + }, + }); + } + } + + if (hasKind(kinds, 'sl_measure')) { + for (const measure of source.measures) { + const matched = bestField( + [ + { matchedOn: 'name', text: measure.name, weight: 1.2 }, + { matchedOn: 'description', text: measure.description, weight: 1 }, + { matchedOn: 'expr', text: measure.expr, weight: 0.9 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'sl_measure', + id: `${source.name}.${measure.name}`, + connectionId, + summary: measure.description ?? 
null, + snippet: cap200(measure.expr), + matchedOn: matched.matchedOn, + }, + }); + } + } + } + + if (hasKind(kinds, 'sl_dimension')) { + for (const column of source.columns) { + const description = resolveDescription(column.descriptions, { priority: DEFAULT_PRIORITY }); + const matched = bestField( + [ + { matchedOn: 'name', text: column.name, weight: 1.2 }, + { matchedOn: 'description', text: description, weight: 1 }, + { matchedOn: 'expr', text: column.expr, weight: 0.9 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'sl_dimension', + id: `${source.name}.${column.name}`, + connectionId, + summary: description, + snippet: cap200(`${column.name} (${column.type})`), + matchedOn: matched.matchedOn, + }, + }); + } + } + } + } + } + return records.sort((left, right) => right.rankScore - left.rankScore || left.ref.id.localeCompare(right.ref.id)); +} + +async function rawCandidates( + project: KtxLocalProject, + input: KtxDiscoverDataInput, + kinds: ReadonlySet<KtxDiscoverDataKind>, + terms: readonly string[], +): Promise<CandidateRecord[]> { + const records: CandidateRecord[] = []; + for (const connectionId of configuredConnectionIds(project, input.connectionId)) { + const scan = await latestScan(project, connectionId); + if (!scan) { + continue; + } + for (const table of scan.tables) { + const ref = tableRef(table); + const display = displayForTable(ref); + const tableDescription = resolveDescription(table.descriptions, { priority: DEFAULT_PRIORITY }) ?? 
table.comment; + if (hasKind(kinds, 'table')) { + const matched = bestField( + [ + { matchedOn: 'name', text: table.name, weight: 1.2 }, + { matchedOn: 'display', text: display, weight: 1.1 }, + { matchedOn: 'description', text: tableDescription, weight: 1 }, + { matchedOn: 'comment', text: table.comment, weight: 1 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'table', + id: display, + connectionId, + tableRef: ref, + summary: tableDescription, + snippet: + matched.matchedOn === 'description' || matched.matchedOn === 'comment' + ? snippetAround(matched.text, terms) + : cap200(table.columns.slice(0, 5).map((column) => column.name).join(', ')), + matchedOn: matched.matchedOn, + }, + }); + } + } + + if (hasKind(kinds, 'column')) { + for (const column of table.columns) { + const columnDescription = resolveDescription(column.descriptions, { priority: DEFAULT_PRIORITY }) ?? column.comment; + const samples = (column.sampleValues ?? []).map((value) => String(value)).slice(0, 5); + const matched = bestField( + [ + { matchedOn: 'name', text: column.name, weight: 1.2 }, + { matchedOn: 'display', text: `${display}.${column.name}`, weight: 1.1 }, + { matchedOn: 'description', text: columnDescription, weight: 1 }, + { matchedOn: 'comment', text: column.comment, weight: 1 }, + { matchedOn: 'sample_value', text: samples.join(' '), weight: 0.9 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'column', + id: `${display}.${column.name}`, + connectionId, + tableRef: ref, + columnName: column.name, + summary: columnDescription, + snippet: + matched.matchedOn === 'sample_value' + ? cap200(`${column.nativeType} - samples: ${samples.join(', ')}`) + : matched.matchedOn === 'description' || matched.matchedOn === 'comment' + ? 
snippetAround(matched.text, terms) + : cap200(column.nativeType), + matchedOn: matched.matchedOn, + }, + }); + } + } + } + } + } + return records.sort((left, right) => right.rankScore - left.rankScore || left.ref.id.localeCompare(right.ref.id)); +} + +function generator(name: string, candidates: CandidateRecord[], refsByKey: Map<string, Omit<KtxDiscoverDataRef, 'score'>>): SearchCandidateGenerator { + candidates.forEach((candidate) => refsByKey.set(`${candidate.ref.kind}:${candidate.ref.connectionId ?? ''}:${candidate.ref.id}`, candidate.ref)); + return { + lane: name, + weight: 1, + async generate() { + return { + candidates: candidates.map((candidate, index) => ({ + id: `${candidate.ref.kind}:${candidate.ref.connectionId ?? ''}:${candidate.ref.id}`, + rank: index + 1, + rawScore: candidate.rankScore, + })), + }; + }, + }; +} + +function hydrate(fused: FusedSearchCandidate[], refsByKey: Map<string, Omit<KtxDiscoverDataRef, 'score'>>): KtxDiscoverDataRef[] { + const maxScore = Math.max(...fused.map((candidate) => candidate.score), 0); + return fused + .map((candidate) => { + const ref = refsByKey.get(candidate.id); + if (!ref) { + return null; + } + return { + ...ref, + score: maxScore > 0 ? Number((candidate.score / maxScore).toFixed(6)) : 0, + }; + }) + .filter((result): result is KtxDiscoverDataRef => result !== null); +} + +export function createKtxDiscoverDataService( + project: KtxLocalProject, + options: KtxDiscoverDataServiceOptions = {}, +): { search(input: KtxDiscoverDataInput): Promise<KtxDiscoverDataResponse> } { + return { + async search(input) { + const limit = Math.max(1, Math.min(input.limit ?? 15, 50)); + const query = input.query.trim(); + if (!query) { + return []; + } + const kinds = new Set(input.kinds ?? 
ALL_KINDS); + const terms = queryTerms(query); + const refsByKey = new Map<string, Omit<KtxDiscoverDataRef, 'score'>>(); + const generators: SearchCandidateGenerator[] = []; + + if (hasKind(kinds, 'wiki')) { + generators.push(generator('wiki', await wikiCandidates(project, { ...input, limit }, options, terms), refsByKey)); + } + if (hasKind(kinds, 'sl_source') || hasKind(kinds, 'sl_measure') || hasKind(kinds, 'sl_dimension')) { + generators.push(generator('semantic_layer', await slCandidates(project, { ...input, limit }, kinds, terms), refsByKey)); + } + if (hasKind(kinds, 'table') || hasKind(kinds, 'column')) { + generators.push(generator('raw_schema', await rawCandidates(project, { ...input, limit }, kinds, terms), refsByKey)); + } + if (generators.length === 0) { + return []; + } + + const result = await new HybridSearchCore().search({ + queryText: query, + limit, + generators, + laneWeights: { wiki: 1, semantic_layer: 1, raw_schema: 1 }, + }); + return hydrate(result.results, refsByKey); + }, + }; +} +``` + +- [ ] **Step 4: Export the service** + +In `packages/context/src/search/index.ts`, add: + +```typescript +export { createKtxDiscoverDataService } from './discover.js'; +export type { + KtxDiscoverDataInput, + KtxDiscoverDataKind, + KtxDiscoverDataMatchedOn, + KtxDiscoverDataRef, + KtxDiscoverDataResponse, + KtxDiscoverDataServiceOptions, +} from './discover.js'; +``` + +- [ ] **Step 5: Run service tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/search/discover.test.ts +``` + +Expected: PASS. + +- [ ] **Step 6: Commit the service** + +Run: + +```bash +git add packages/context/src/search/discover.ts packages/context/src/search/discover.test.ts packages/context/src/search/index.ts +git commit -m "feat: add MCP discover data service" +``` + +Expected: commit succeeds. 
+ +## Task 2: Register `discover_data` In The MCP Tool Surface + +**Files:** +- Modify: `packages/context/src/mcp/types.ts` +- Modify: `packages/context/src/mcp/context-tools.ts` +- Modify: `packages/context/src/mcp/server.test.ts` +- Modify: `packages/context/src/mcp/index.ts` + +- [ ] **Step 1: Write failing MCP registration test** + +In `packages/context/src/mcp/server.test.ts`, extend the import from `./types.js` to include: + +```typescript + KtxDiscoverDataMcpPort, +``` + +Add this test after the `dictionary_search` registration test: + +```typescript + it('registers discover_data when the host provides a discover port', async () => { + const fake = makeFakeServer(); + const discover: KtxDiscoverDataMcpPort = { + search: vi.fn().mockResolvedValue([ + { + kind: 'table', + id: 'public.orders', + score: 1, + summary: 'Orders table', + snippet: 'id, status', + matchedOn: 'name', + connectionId: 'warehouse', + tableRef: { catalog: null, db: 'public', name: 'orders' }, + }, + ]), + }; + + createKtxMcpServer({ + server: fake.server, + userContext: { userId: 'local-user' }, + contextTools: { discover }, + }); + + expect(fake.tools.map((tool) => tool.name)).toEqual(['discover_data']); + await expect( + getTool(fake.tools, 'discover_data').handler({ + query: 'orders', + connectionId: 'warehouse', + kinds: ['table'], + limit: 5, + }), + ).resolves.toMatchObject({ + structuredContent: [ + { + kind: 'table', + id: 'public.orders', + connectionId: 'warehouse', + tableRef: { catalog: null, db: 'public', name: 'orders' }, + }, + ], + }); + expect(discover.search).toHaveBeenCalledWith({ + query: 'orders', + connectionId: 'warehouse', + kinds: ['table'], + limit: 5, + }); + }); +``` + +- [ ] **Step 2: Run the failing MCP registration test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/server.test.ts -t "discover_data" +``` + +Expected: FAIL with an import or type error for `KtxDiscoverDataMcpPort`. 
+ +- [ ] **Step 3: Add MCP discover port types** + +In `packages/context/src/mcp/types.ts`, add this import near the other search/scan imports: + +```typescript +import type { KtxDiscoverDataInput, KtxDiscoverDataResponse } from '../search/index.js'; +``` + +Add this interface after `KtxDictionarySearchMcpPort`: + +```typescript +export interface KtxDiscoverDataMcpPort { + search(input: KtxDiscoverDataInput): Promise<KtxDiscoverDataResponse>; +} +``` + +Add this optional port to `KtxMcpContextPorts`: + +```typescript + discover?: KtxDiscoverDataMcpPort; +``` + +- [ ] **Step 4: Add the Zod schema and registration** + +In `packages/context/src/mcp/context-tools.ts`, add this schema after `dictionarySearchSchema`: + +```typescript +const discoverDataKindSchema = z.enum(['wiki', 'sl_source', 'sl_measure', 'sl_dimension', 'table', 'column']); + +const discoverDataSchema = z.object({ + query: z.string().min(1), + connectionId: connectionIdSchema.optional(), + kinds: z.array(discoverDataKindSchema).optional(), + limit: z.number().int().min(1).max(50).default(15).optional(), +}); +``` + +Add this registration block after the `dictionary_search` registration block and before `sql_execution`: + +```typescript + if (ports.discover) { + const discover = ports.discover; + registerParsedTool( + server, + 'discover_data', + { + title: 'Discover Data', + description: + 'Search across KTX wiki pages, semantic-layer sources/measures/dimensions, and raw warehouse schema refs.', + inputSchema: discoverDataSchema.shape, + }, + discoverDataSchema, + async (input) => jsonToolResult(await discover.search(input)), + ); + } +``` + +- [ ] **Step 5: Export MCP port types** + +Check `packages/context/src/mcp/index.ts`. If it already exports all types from `./types.js`, leave it unchanged. 
If it lists individual type exports, add: + +```typescript +export type { KtxDiscoverDataMcpPort } from './types.js'; +``` + +- [ ] **Step 6: Run MCP registration tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/server.test.ts -t "discover_data" +``` + +Expected: PASS. + +- [ ] **Step 7: Commit MCP registration** + +Run: + +```bash +git add packages/context/src/mcp/types.ts packages/context/src/mcp/context-tools.ts packages/context/src/mcp/server.test.ts packages/context/src/mcp/index.ts +git commit -m "feat: expose discover data MCP tool" +``` + +Expected: commit succeeds. + +## Task 3: Wire Local Project MCP Ports + +**Files:** +- Modify: `packages/context/src/mcp/local-project-ports.ts` +- Modify: `packages/context/src/mcp/local-project-ports.test.ts` + +- [ ] **Step 1: Write failing local-port test** + +In `packages/context/src/mcp/local-project-ports.test.ts`, add this test inside the existing `describe('createLocalProjectMcpContextPorts', ...)` block: + +```typescript + it('exposes local project discover_data across wiki, semantic-layer, and raw schema', async () => { + await project.fileStore.writeFile( + 'wiki/global/orders-playbook.md', + [ + '---', + 'summary: Paid order operations', + 'tags: [orders]', + 'refs: []', + 'sl_refs: []', + 'usage_mode: auto', + '---', + '', + 'Paid orders are used for customer activity analysis.', + '', + ].join('\n'), + 'ktx', + 'ktx@example.com', + 'seed wiki', + ); + await project.fileStore.writeFile( + 'semantic-layer/warehouse/orders.yaml', + [ + 'name: orders', + 'descriptions:', + ' user: Paid order facts', + 'table: public.orders', + 'grain: [id]', + 'columns:', + ' - name: status', + ' type: string', + ' descriptions:', + ' user: Payment status', + 'measures:', + ' - name: order_count', + ' expr: count(*)', + ' description: Number of paid orders', + '', + ].join('\n'), + 'ktx', + 'ktx@example.com', + 'seed sl', + ); + await project.fileStore.writeFile( + 
'raw-sources/warehouse/live-database/sync-1/connection.json', + JSON.stringify({ connectionId: 'warehouse', driver: 'postgres', extractedAt: '2026-05-14T09:00:00.000Z' }, null, 2), + 'ktx', + 'ktx@example.com', + 'seed connection', + ); + await project.fileStore.writeFile( + 'raw-sources/warehouse/live-database/sync-1/tables/public-orders.json', + JSON.stringify( + { + catalog: null, + db: 'public', + name: 'orders', + kind: 'table', + comment: 'Orders table', + estimatedRows: 10, + columns: [ + { + name: 'status', + nativeType: 'text', + normalizedType: 'text', + dimensionType: 'string', + nullable: false, + primaryKey: false, + comment: 'Order status', + sampleValues: ['paid'], + }, + ], + foreignKeys: [], + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed table', + ); + await project.fileStore.writeFile( + 'raw-sources/warehouse/live-database/sync-1/scan-report.json', + JSON.stringify( + { + connectionId: 'warehouse', + driver: 'postgres', + syncId: 'sync-1', + runId: 'scan-1', + trigger: 'mcp', + mode: 'enriched', + dryRun: false, + artifactPaths: { + rawSourcesDir: 'raw-sources/warehouse/live-database/sync-1', + reportPath: 'raw-sources/warehouse/live-database/sync-1/scan-report.json', + manifestShards: [], + enrichmentArtifacts: [], + }, + diffSummary: { + tablesAdded: 1, + tablesModified: 0, + tablesDeleted: 0, + tablesUnchanged: 0, + columnsAdded: 0, + columnsModified: 0, + columnsDeleted: 0, + }, + manifestShardsWritten: 0, + structuralSyncStats: { + tablesCreated: 0, + tablesUpdated: 0, + tablesDeleted: 0, + columnsCreated: 0, + columnsUpdated: 0, + columnsDeleted: 0, + }, + enrichment: { + dataDictionary: 'completed', + tableDescriptions: 'completed', + columnDescriptions: 'completed', + embeddings: 'skipped', + deterministicRelationships: 'skipped', + llmRelationshipValidation: 'skipped', + statisticalValidation: 'skipped', + }, + capabilityGaps: [], + warnings: [], + relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 }, + 
enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] }, + createdAt: '2026-05-14T09:00:00.000Z', + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed scan report', + ); + + const ports = createLocalProjectMcpContextPorts(project); + const results = await ports.discover?.search({ query: 'paid orders', connectionId: 'warehouse', limit: 10 }); + + expect(results).toEqual( + expect.arrayContaining([ + expect.objectContaining({ kind: 'wiki', id: 'orders-playbook' }), + expect.objectContaining({ kind: 'sl_source', id: 'orders', connectionId: 'warehouse' }), + expect.objectContaining({ kind: 'table', id: 'public.orders', connectionId: 'warehouse' }), + ]), + ); + }); +``` + +- [ ] **Step 2: Run the failing local-port test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts -t "discover_data" +``` + +Expected: FAIL because `ports.discover` is undefined. + +- [ ] **Step 3: Wire the local port** + +In `packages/context/src/mcp/local-project-ports.ts`, add `createKtxDiscoverDataService` to the search import block: + +```typescript +import { createKtxDiscoverDataService } from '../search/index.js'; +``` + +Add this port in the `ports` object after `dictionarySearch`: + +```typescript + discover: { + async search(input) { + return createKtxDiscoverDataService(project, { userId: 'local', embeddingService }).search(input); + }, + }, +``` + +- [ ] **Step 4: Run local-port test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts -t "discover_data" +``` + +Expected: PASS. + +- [ ] **Step 5: Commit local-port wiring** + +Run: + +```bash +git add packages/context/src/mcp/local-project-ports.ts packages/context/src/mcp/local-project-ports.test.ts +git commit -m "feat: wire local discover data MCP port" +``` + +Expected: commit succeeds. 
+ +## Task 4: Verify The Discover Slice + +**Files:** +- Verify: `packages/context/src/search/discover.ts` +- Verify: `packages/context/src/mcp/context-tools.ts` +- Verify: `packages/context/src/mcp/local-project-ports.ts` + +- [ ] **Step 1: Run focused tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/search/discover.test.ts src/mcp/server.test.ts src/mcp/local-project-ports.test.ts +``` + +Expected: PASS. + +- [ ] **Step 2: Run context type-check** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS. + +- [ ] **Step 3: Run context test suite** + +Run: + +```bash +pnpm --filter @ktx/context run test +``` + +Expected: PASS. + +- [ ] **Step 4: Check diff hygiene** + +Run: + +```bash +git diff --check +``` + +Expected: no output and exit code 0. + +- [ ] **Step 5: Document remaining v1 blockers in handoff** + +Run: + +```bash +test -e packages/context/src/search/discover.ts; printf 'discover:%s\n' "$?" +test -e packages/cli/src/commands/mcp-commands.ts; printf 'mcp-commands:%s\n' "$?" +test -e packages/cli/src/managed-mcp-daemon.ts; printf 'managed-mcp:%s\n' "$?" +test -e packages/cli/src/skills/research/SKILL.md; printf 'research-skill:%s\n' "$?" 
+``` + +Expected after this plan is implemented: + +```text +discover:0 +mcp-commands:1 +managed-mcp:1 +research-skill:1 +``` + +- [ ] **Step 6: Commit verification notes if code changed during verification** + +If verification required code fixes, run: + +```bash +git status --short +git add packages/context/src/search/discover.ts packages/context/src/search/discover.test.ts packages/context/src/search/index.ts packages/context/src/mcp/types.ts packages/context/src/mcp/context-tools.ts packages/context/src/mcp/server.test.ts packages/context/src/mcp/local-project-ports.ts packages/context/src/mcp/local-project-ports.test.ts packages/context/src/mcp/index.ts +git commit -m "test: verify MCP discover data" +``` + +Expected: commit succeeds only when there are verification fixes to commit. If `git status --short` is empty, skip this commit. + +## Self-Review + +- Spec coverage: this plan covers the MCP-shaped `discover_data` input/output contract, kind filtering, optional `connectionId`, RRF fusion across wiki/SL/raw lanes, deterministic summary/snippet provenance, raw `tableRef` and `columnName`, score normalization, local project MCP registration, and freshness by re-reading artifacts on every call. +- Remaining v1-blocking spec coverage after this slice: HTTP Streamable MCP daemon, `ktx mcp` CLI lifecycle commands, setup-agent MCP config writers/snippet printers, `ktx-research` skill installation, and ingest-side `connectionName` contract convergence. +- Placeholder scan: no placeholder or deferred-work wording remains in this plan. +- Type consistency: `KtxDiscoverDataInput`, `KtxDiscoverDataRef`, `KtxDiscoverDataResponse`, and `KtxDiscoverDataMcpPort` are defined before use and match the MCP/local-port registration snippets. 
diff --git a/docs/superpowers/plans/2026-05-14-research-agent-mcp-entity-details.md b/docs/superpowers/plans/2026-05-14-research-agent-mcp-entity-details.md new file mode 100644 index 00000000..db165c02 --- /dev/null +++ b/docs/superpowers/plans/2026-05-14-research-agent-mcp-entity-details.md @@ -0,0 +1,1175 @@ +# Research Agent MCP Entity Details Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add the MCP-shaped `entity_details` tool so external research agents can inspect raw table and column metadata from the latest scan snapshot. + +**Architecture:** Build a focused scan service over persisted `raw-sources/<connectionId>/live-database/<syncId>` artifacts, using `scan-report.json` as the latest scan identity and `readLocalScanStructuralSnapshot()` as the schema reader. Register `entity_details` as an MCP context tool with pure structured output, then expose it through local project MCP ports. + +**Tech Stack:** TypeScript, Vitest, Zod, KTX local file store, KTX scan artifacts, KTX MCP context ports. + +--- + +## Current Audit + +Original spec: `docs/superpowers/specs/2026-05-14-research-agent-mcp-tools-design.md` + +Implemented v1 slice: + +- `docs/superpowers/plans/2026-05-14-research-agent-mcp-sql-execution-foundation.md` is implemented. Evidence in current source: + - Python sqlglot validation exists at `python/ktx-daemon/src/ktx_daemon/sql_analysis.py`. + - `POST /sql/validate-read-only` exists at `python/ktx-daemon/src/ktx_daemon/app.py`. + - `SqlAnalysisPort.validateReadOnly()` exists at `packages/context/src/sql-analysis/ports.ts`. + - MCP `sql_execution` registration exists at `packages/context/src/mcp/context-tools.ts`. + - Local MCP SQL execution validates through `SqlAnalysisPort` before connector execution in `packages/context/src/mcp/local-project-ports.ts`. 
+ +V1-blocking gaps after that slice: + +- `entity_details` is not registered on the MCP surface. +- `discover_data` is not registered on the MCP surface. +- `dictionary_search` is not registered on the MCP surface. +- `ktx mcp start|stop|status|logs` and the HTTP Streamable MCP daemon do not exist. +- `ktx setup-agents` does not install MCP client config or a `ktx-research` skill. +- Ingest-side warehouse verification still uses `connectionName` contracts in places; the MCP surface must use `connectionId`. + +This plan covers only the next dependency-aware v1 blocker: MCP `entity_details`. Later plans still need to cover `dictionary_search`, `discover_data`, the HTTP daemon, and setup-agent/research-skill installation. + +## File Structure + +Create: + +- `packages/context/src/scan/entity-details.ts` + - Reads latest live-database scan artifacts for a connection. + - Resolves driver display strings or structured table refs. + - Returns structured table/column metadata and structured per-entity errors. +- `packages/context/src/scan/entity-details.test.ts` + - Covers latest-scan selection, display-string resolution, structured refs, column filtering, ambiguity, missing scan, and missing columns. + +Modify: + +- `packages/context/src/scan/index.ts` + - Export the new service and types. +- `packages/context/src/mcp/types.ts` + - Add `KtxEntityDetailsMcpPort` and response types to `KtxMcpContextPorts`. +- `packages/context/src/mcp/context-tools.ts` + - Add the `entity_details` input schema and registration. +- `packages/context/src/mcp/server.test.ts` + - Assert the MCP tool registration and structured output. +- `packages/context/src/mcp/local-project-ports.ts` + - Wire the local project port to the scan entity-details service. +- `packages/context/src/mcp/local-project-ports.test.ts` + - Cover local-port `entity_details` success and missing-scan behavior. +- `packages/context/src/mcp/index.ts` + - Export the new MCP port/response types. 
+ +## Task 1: Add The Scan Entity Details Service + +**Files:** +- Create: `packages/context/src/scan/entity-details.test.ts` +- Create: `packages/context/src/scan/entity-details.ts` +- Modify: `packages/context/src/scan/index.ts` + +- [ ] **Step 1: Write failing service tests** + +Create `packages/context/src/scan/entity-details.test.ts`: + +```typescript +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { initKtxProject, type KtxLocalProject } from '../project/index.js'; +import { createKtxEntityDetailsService } from './entity-details.js'; +import type { KtxConnectionDriver, KtxScanReport, KtxSchemaTable } from './types.js'; + +describe('createKtxEntityDetailsService', () => { + let tempDir: string; + let project: KtxLocalProject; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-entity-details-service-')); + project = await initKtxProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' }); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + function scanReport(input: { + connectionId: string; + syncId: string; + runId: string; + driver?: KtxConnectionDriver; + createdAt?: string; + }): KtxScanReport { + const rawSourcesDir = `raw-sources/${input.connectionId}/live-database/${input.syncId}`; + return { + connectionId: input.connectionId, + driver: input.driver ?? 
'postgres', + syncId: input.syncId, + runId: input.runId, + trigger: 'mcp', + mode: 'structural', + dryRun: false, + artifactPaths: { + rawSourcesDir, + reportPath: `${rawSourcesDir}/scan-report.json`, + manifestShards: [], + enrichmentArtifacts: [], + }, + diffSummary: { added: 0, modified: 0, deleted: 0, unchanged: 1 }, + manifestShardsWritten: 0, + structuralSyncStats: { tablesWritten: 1, tablesDeleted: 0, foreignKeysWritten: 0 }, + enrichment: { + dataDictionary: 'skipped', + tableDescriptions: 'skipped', + columnDescriptions: 'skipped', + embeddings: 'skipped', + deterministicRelationships: 'skipped', + llmRelationshipValidation: 'skipped', + statisticalValidation: 'skipped', + }, + capabilityGaps: [], + warnings: [], + relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 }, + enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] }, + createdAt: input.createdAt ?? '2026-05-14T09:00:00.000Z', + }; + } + + function ordersTable(input: { db?: string | null; estimatedRows?: number | null } = {}): KtxSchemaTable { + return { + catalog: null, + db: input.db ?? 'public', + name: 'orders', + kind: 'table', + comment: 'Customer orders', + estimatedRows: input.estimatedRows ?? 
12, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: 'Order id', + }, + { + name: 'status', + nativeType: 'text', + normalizedType: 'text', + dimensionType: 'string', + nullable: false, + primaryKey: false, + comment: 'Order status', + }, + ], + foreignKeys: [ + { + fromColumn: 'customer_id', + toCatalog: null, + toDb: 'public', + toTable: 'customers', + toColumn: 'id', + constraintName: 'orders_customer_id_fkey', + }, + ], + }; + } + + async function seedScan(input: { + connectionId?: string; + syncId: string; + runId: string; + driver?: KtxConnectionDriver; + extractedAt?: string; + tables?: KtxSchemaTable[]; + }): Promise { + const connectionId = input.connectionId ?? 'warehouse'; + const report = scanReport({ + connectionId, + syncId: input.syncId, + runId: input.runId, + driver: input.driver, + createdAt: input.extractedAt, + }); + const root = report.artifactPaths.rawSourcesDir; + await project.fileStore.writeFile( + `${root}/connection.json`, + JSON.stringify( + { + connectionId, + driver: report.driver, + extractedAt: input.extractedAt ?? report.createdAt, + scope: { schemas: ['public'] }, + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed connection', + ); + for (const table of input.tables ?? [ordersTable()]) { + await project.fileStore.writeFile( + `${root}/tables/${table.db ?? 
'default'}-${table.name}.json`, + JSON.stringify(table, null, 2), + 'ktx', + 'ktx@example.com', + `seed ${table.name}`, + ); + } + await project.fileStore.writeFile( + `${root}/scan-report.json`, + JSON.stringify(report, null, 2), + 'ktx', + 'ktx@example.com', + 'seed scan report', + ); + } + + it('returns the latest scan snapshot table details for a display string', async () => { + await seedScan({ syncId: 'sync-1', runId: 'scan-old', extractedAt: '2026-05-14T08:00:00.000Z' }); + await seedScan({ + syncId: 'sync-2', + runId: 'scan-new', + extractedAt: '2026-05-14T09:00:00.000Z', + tables: [ordersTable({ estimatedRows: 99 })], + }); + const service = createKtxEntityDetailsService(project); + + const result = await service.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders' }], + }); + + expect(result.results).toHaveLength(1); + expect(result.results[0]).toMatchObject({ + ok: true, + connectionId: 'warehouse', + display: 'public.orders', + estimatedRows: 99, + snapshot: { + syncId: 'sync-2', + scanRunId: 'scan-new', + extractedAt: '2026-05-14T09:00:00.000Z', + }, + columns: [ + { name: 'id', nativeType: 'integer', primaryKey: true }, + { name: 'status', nativeType: 'text', nullable: false }, + ], + }); + }); + + it('filters requested columns while keeping full-table foreign keys', async () => { + await seedScan({ syncId: 'sync-1', runId: 'scan-1' }); + const service = createKtxEntityDetailsService(project); + + const result = await service.read({ + connectionId: 'warehouse', + entities: [{ table: { catalog: null, db: 'public', name: 'orders' }, columns: ['status'] }], + }); + + expect(result.results[0]).toMatchObject({ + ok: true, + columns: [{ name: 'status' }], + foreignKeys: [ + { + fromColumn: 'customer_id', + toDb: 'public', + toTable: 'customers', + toColumn: 'id', + }, + ], + }); + }); + + it('returns a structured missing-scan error', async () => { + const service = createKtxEntityDetailsService(project); + + const result = await 
service.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders' }], + }); + + expect(result.results).toEqual([ + { + ok: false, + connectionId: 'warehouse', + table: 'public.orders', + error: { + code: 'scan_missing', + message: 'No live-database scan found for connection "warehouse"; run `ktx ingest warehouse` or `ktx scan warehouse`.', + }, + }, + ]); + }); + + it('reports ambiguous bare table names across schemas', async () => { + await seedScan({ + syncId: 'sync-1', + runId: 'scan-1', + tables: [ordersTable({ db: 'public' }), ordersTable({ db: 'archive' })], + }); + const service = createKtxEntityDetailsService(project); + + const result = await service.read({ + connectionId: 'warehouse', + entities: [{ table: 'orders' }], + }); + + expect(result.results[0]).toMatchObject({ + ok: false, + error: { + code: 'ambiguous_table', + candidates: [ + { tableRef: { catalog: null, db: 'archive', name: 'orders' }, display: 'archive.orders' }, + { tableRef: { catalog: null, db: 'public', name: 'orders' }, display: 'public.orders' }, + ], + }, + }); + }); + + it('reports missing requested columns with available column candidates', async () => { + await seedScan({ syncId: 'sync-1', runId: 'scan-1' }); + const service = createKtxEntityDetailsService(project); + + const result = await service.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders', columns: ['status', 'plan_tier'] }], + }); + + expect(result.results[0]).toMatchObject({ + ok: false, + error: { + code: 'column_not_found', + message: 'Column(s) not found on public.orders: plan_tier', + candidates: ['id', 'status'], + }, + }); + }); +}); +``` + +- [ ] **Step 2: Run failing service tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/scan/entity-details.test.ts +``` + +Expected: FAIL because `packages/context/src/scan/entity-details.ts` does not exist. 
+ +- [ ] **Step 3: Implement the service** + +Create `packages/context/src/scan/entity-details.ts`: + +```typescript +import type { KtxLocalProject } from '../project/index.js'; +import { readLocalScanStructuralSnapshot } from './local-structural-artifacts.js'; +import type { + KtxConnectionDriver, + KtxScanReport, + KtxSchemaColumn, + KtxSchemaSnapshot, + KtxSchemaTable, + KtxTableRef, +} from './types.js'; + +export type KtxEntityDetailsTableInput = string | KtxTableRef; + +export interface KtxEntityDetailsInput { + connectionId: string; + entities: Array<{ + table: KtxEntityDetailsTableInput; + columns?: string[]; + }>; +} + +export interface KtxEntityDetailsSnapshotInfo { + syncId: string; + extractedAt: string; + scanRunId: string | null; +} + +export interface KtxEntityDetailsColumn { + name: string; + nativeType: string; + normalizedType: string; + dimensionType: KtxSchemaColumn['dimensionType']; + nullable: boolean; + primaryKey: boolean; + comment: string | null; +} + +export interface KtxEntityDetailsRecord { + ok: true; + connectionId: string; + tableRef: KtxTableRef; + display: string; + kind: KtxSchemaTable['kind']; + comment: string | null; + estimatedRows: number | null; + columns: KtxEntityDetailsColumn[]; + foreignKeys: KtxSchemaTable['foreignKeys']; + snapshot: KtxEntityDetailsSnapshotInfo; +} + +export type KtxEntityDetailsErrorCode = 'scan_missing' | 'table_not_found' | 'ambiguous_table' | 'column_not_found'; + +export interface KtxEntityDetailsErrorResult { + ok: false; + connectionId: string; + table: KtxEntityDetailsTableInput; + snapshot?: KtxEntityDetailsSnapshotInfo; + error: { + code: KtxEntityDetailsErrorCode; + message: string; + candidates?: Array<{ tableRef: KtxTableRef; display: string }> | string[]; + }; +} + +export interface KtxEntityDetailsResponse { + results: Array; +} + +interface LatestScan { + report: KtxScanReport; + snapshot: KtxSchemaSnapshot; +} + +interface ResolveResult { + table: KtxSchemaTable | null; + error?: Omit 
& { message: string }; +} + +function normalize(value: string | null | undefined): string { + return (value ?? '').toLowerCase(); +} + +function refsEqual(left: KtxTableRef, right: KtxTableRef): boolean { + return ( + normalize(left.catalog) === normalize(right.catalog) && + normalize(left.db) === normalize(right.db) && + normalize(left.name) === normalize(right.name) + ); +} + +function cleanIdentifierPart(part: string): string { + return part.trim().replace(/^["'`\[]|["'`\]]$/g, ''); +} + +function splitDisplay(display: string): string[] { + return display + .trim() + .split('.') + .map(cleanIdentifierPart) + .filter(Boolean); +} + +function displayForTable(driver: KtxConnectionDriver, table: KtxTableRef): string { + if (driver === 'sqlite') { + return table.name; + } + return [table.catalog, table.db, table.name].filter((part): part is string => Boolean(part)).join('.'); +} + +function tableRef(table: KtxSchemaTable): KtxTableRef { + return { catalog: table.catalog, db: table.db, name: table.name }; +} + +function candidateList(driver: KtxConnectionDriver, tables: KtxSchemaTable[]): Array<{ tableRef: KtxTableRef; display: string }> { + return tables + .map((table) => ({ + tableRef: tableRef(table), + display: displayForTable(driver, table), + })) + .sort((left, right) => left.display.localeCompare(right.display)); +} + +function parseDisplayRef(driver: KtxConnectionDriver, display: string): KtxTableRef | null { + const parts = splitDisplay(display); + if (driver === 'sqlite') { + return parts.length === 1 ? { catalog: null, db: null, name: parts[0]! } : null; + } + if (driver === 'bigquery' || driver === 'snowflake' || driver === 'sqlserver') { + return parts.length === 3 ? { catalog: parts[0]!, db: parts[1]!, name: parts[2]! } : null; + } + if (parts.length === 2) { + return { catalog: null, db: parts[0]!, name: parts[1]! }; + } + if (parts.length === 3) { + return { catalog: parts[0]!, db: parts[1]!, name: parts[2]! 
}; + } + return null; +} + +function resolveTable(snapshot: KtxSchemaSnapshot, input: KtxEntityDetailsTableInput): ResolveResult { + if (typeof input !== 'string') { + const table = snapshot.tables.find((candidate) => refsEqual(candidate, input)) ?? null; + return table + ? { table } + : { + table: null, + error: { + code: 'table_not_found', + message: `Table not found in latest scan: ${displayForTable(snapshot.driver, input)}`, + candidates: candidateList(snapshot.driver, snapshot.tables), + }, + }; + } + + const parsed = parseDisplayRef(snapshot.driver, input); + if (parsed) { + const table = snapshot.tables.find((candidate) => refsEqual(candidate, parsed)) ?? null; + return table + ? { table } + : { + table: null, + error: { + code: 'table_not_found', + message: `Table not found in latest scan: ${input}`, + candidates: candidateList(snapshot.driver, snapshot.tables), + }, + }; + } + + const byName = snapshot.tables.filter((candidate) => normalize(candidate.name) === normalize(input)); + if (byName.length === 1) { + return { table: byName[0]! 
}; + } + if (byName.length > 1) { + return { + table: null, + error: { + code: 'ambiguous_table', + message: `Table name "${input}" is ambiguous across schemas/catalogs; pass a structured table ref.`, + candidates: candidateList(snapshot.driver, byName), + }, + }; + } + return { + table: null, + error: { + code: 'table_not_found', + message: `Table not found in latest scan: ${input}`, + candidates: candidateList(snapshot.driver, snapshot.tables), + }, + }; +} + +function toColumn(column: KtxSchemaColumn): KtxEntityDetailsColumn { + return { + name: column.name, + nativeType: column.nativeType, + normalizedType: column.normalizedType, + dimensionType: column.dimensionType, + nullable: column.nullable, + primaryKey: column.primaryKey, + comment: column.comment, + }; +} + +function snapshotInfo(report: KtxScanReport, snapshot: KtxSchemaSnapshot): KtxEntityDetailsSnapshotInfo { + return { + syncId: report.syncId, + extractedAt: snapshot.extractedAt, + scanRunId: report.runId ?? null, + }; +} + +async function readJson<T>(project: KtxLocalProject, path: string): Promise<T> { + return JSON.parse((await project.fileStore.readFile(path)).content) as T; +} + +async function latestScan(project: KtxLocalProject, connectionId: string): Promise<LatestScan | null> { + const root = `raw-sources/${connectionId}/live-database`; + let listed; + try { + listed = await project.fileStore.listFiles(root); + } catch { + return null; + } + const reportPath = listed.files.filter((path) => path.endsWith('/scan-report.json')).sort().at(-1); + if (!reportPath) { + return null; + } + const report = await readJson<KtxScanReport>(project, reportPath); + const rawSourcesDir = report.artifactPaths.rawSourcesDir ?? 
reportPath.slice(0, -'/scan-report.json'.length); + const snapshot = await readLocalScanStructuralSnapshot({ + project, + connectionId, + driver: report.driver, + rawSourcesDir, + extractedAtFallback: report.createdAt, + }); + return { report, snapshot }; +} + +export function createKtxEntityDetailsService(project: KtxLocalProject) { + return { + async read(input: KtxEntityDetailsInput): Promise<KtxEntityDetailsResponse> { + const scan = await latestScan(project, input.connectionId); + if (!scan) { + return { + results: input.entities.map((entity) => ({ + ok: false, + connectionId: input.connectionId, + table: entity.table, + error: { + code: 'scan_missing', + message: `No live-database scan found for connection "${input.connectionId}"; run \`ktx ingest ${input.connectionId}\` or \`ktx scan ${input.connectionId}\`.`, + }, + })), + }; + } + + const info = snapshotInfo(scan.report, scan.snapshot); + const results: KtxEntityDetailsResponse['results'] = []; + for (const entity of input.entities) { + const resolved = resolveTable(scan.snapshot, entity.table); + if (!resolved.table) { + results.push({ + ok: false, + connectionId: input.connectionId, + table: entity.table, + snapshot: info, + error: resolved.error!, + }); + continue; + } + + const requested = new Set((entity.columns ?? []).map((column) => normalize(column))); + const columns = requested.size + ? 
resolved.table.columns.filter((column) => requested.has(normalize(column.name))) + : resolved.table.columns; + if (requested.size && columns.length !== requested.size) { + const found = new Set(columns.map((column) => normalize(column.name))); + const missing = [...requested].filter((column) => !found.has(column)); + results.push({ + ok: false, + connectionId: input.connectionId, + table: entity.table, + snapshot: info, + error: { + code: 'column_not_found', + message: `Column(s) not found on ${displayForTable(scan.snapshot.driver, resolved.table)}: ${missing.join(', ')}`, + candidates: resolved.table.columns.map((column) => column.name), + }, + }); + continue; + } + + results.push({ + ok: true, + connectionId: input.connectionId, + tableRef: tableRef(resolved.table), + display: displayForTable(scan.snapshot.driver, resolved.table), + kind: resolved.table.kind, + comment: resolved.table.comment, + estimatedRows: resolved.table.estimatedRows, + columns: columns.map(toColumn), + foreignKeys: resolved.table.foreignKeys, + snapshot: info, + }); + } + return { results }; + }, + }; +} +``` + +In `packages/context/src/scan/index.ts`, add these exports near the other scan-service exports: + +```typescript +export type { + KtxEntityDetailsColumn, + KtxEntityDetailsErrorCode, + KtxEntityDetailsErrorResult, + KtxEntityDetailsInput, + KtxEntityDetailsRecord, + KtxEntityDetailsResponse, + KtxEntityDetailsSnapshotInfo, + KtxEntityDetailsTableInput, +} from './entity-details.js'; +export { createKtxEntityDetailsService } from './entity-details.js'; +``` + +- [ ] **Step 4: Run service tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/scan/entity-details.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit the scan service** + +Run: + +```bash +git add packages/context/src/scan/entity-details.ts packages/context/src/scan/entity-details.test.ts packages/context/src/scan/index.ts +git commit -m "feat(context): add scan-backed entity details service" +``` + +## Task 2: Register The MCP `entity_details` Tool + +**Files:** +- Modify: `packages/context/src/mcp/types.ts` +- Modify: `packages/context/src/mcp/context-tools.ts` +- Modify: `packages/context/src/mcp/server.test.ts` +- Modify: `packages/context/src/mcp/index.ts` + +- [ ] **Step 1: Add MCP port types** + +In `packages/context/src/mcp/types.ts`, add this import near the other type imports: + +```typescript +import type { KtxEntityDetailsInput, KtxEntityDetailsResponse } from '../scan/entity-details.js'; +``` + +Add this interface immediately before `KtxSqlExecutionResponse`: + +```typescript +export interface KtxEntityDetailsMcpPort { + read(input: KtxEntityDetailsInput): Promise<KtxEntityDetailsResponse>; +} +``` + +Add this optional port to `KtxMcpContextPorts`: + +```typescript + entityDetails?: KtxEntityDetailsMcpPort; +``` + +- [ ] **Step 2: Write failing MCP registration test** + +In `packages/context/src/mcp/server.test.ts`, update the `./types.js` import to include `KtxEntityDetailsMcpPort`. 
+ +Add this test after the `sql_execution` registration test: + +```typescript + it('registers entity_details when the host provides an entity-details port', async () => { + const fake = makeFakeServer(); + const entityDetails: KtxEntityDetailsMcpPort = { + read: vi.fn().mockResolvedValue({ + results: [ + { + ok: true, + connectionId: 'warehouse', + tableRef: { catalog: null, db: 'public', name: 'orders' }, + display: 'public.orders', + kind: 'table', + comment: 'Customer orders', + estimatedRows: 12, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: null, + }, + ], + foreignKeys: [], + snapshot: { + syncId: 'sync-1', + extractedAt: '2026-05-14T09:00:00.000Z', + scanRunId: 'scan-1', + }, + }, + ], + }), + }; + + createKtxMcpServer({ + server: fake.server, + userContext: { userId: 'local-user' }, + contextTools: { entityDetails }, + }); + + expect(fake.tools.map((tool) => tool.name)).toEqual(['entity_details']); + await expect( + getTool(fake.tools, 'entity_details').handler({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders', columns: ['id'] }], + }), + ).resolves.toMatchObject({ + structuredContent: { + results: [ + { + ok: true, + connectionId: 'warehouse', + display: 'public.orders', + columns: [{ name: 'id' }], + }, + ], + }, + }); + expect(entityDetails.read).toHaveBeenCalledWith({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders', columns: ['id'] }], + }); + }); +``` + +- [ ] **Step 3: Run failing MCP registration test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/server.test.ts -t entity_details +``` + +Expected: FAIL because `entity_details` is not registered. 
+ +- [ ] **Step 4: Add schema and registration** + +In `packages/context/src/mcp/context-tools.ts`, add this schema after `scanArtifactReadSchema` and before `sqlExecutionSchema`: + +```typescript +const entityDetailsTableRefSchema = z.object({ + catalog: z.string().nullable(), + db: z.string().nullable(), + name: z.string().min(1), +}); + +const entityDetailsSchema = z.object({ + connectionId: connectionIdSchema, + entities: z + .array( + z.object({ + table: z.union([z.string().min(1), entityDetailsTableRefSchema]), + columns: z.array(z.string().min(1)).optional(), + }), + ) + .min(1) + .max(20), +}); +``` + +Add this registration block in `registerKtxContextTools`, after the semantic-layer block and before the `sqlExecution` block: + +```typescript + if (ports.entityDetails) { + const entityDetails = ports.entityDetails; + registerParsedTool( + server, + 'entity_details', + { + title: 'Entity Details', + description: + 'Read raw table and column metadata from the latest KTX live-database scan snapshot.', + inputSchema: entityDetailsSchema.shape, + }, + entityDetailsSchema, + async (input) => jsonToolResult(await entityDetails.read(input)), + ); + } +``` + +In `packages/context/src/mcp/index.ts`, add `KtxEntityDetailsMcpPort` to the exported type list. + +- [ ] **Step 5: Run MCP registration test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/server.test.ts -t entity_details +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit MCP registration** + +Run: + +```bash +git add packages/context/src/mcp/types.ts packages/context/src/mcp/context-tools.ts packages/context/src/mcp/server.test.ts packages/context/src/mcp/index.ts +git commit -m "feat(context): register MCP entity details tool" +``` + +## Task 3: Wire Local Project MCP Ports + +**Files:** +- Modify: `packages/context/src/mcp/local-project-ports.ts` +- Modify: `packages/context/src/mcp/local-project-ports.test.ts` + +- [ ] **Step 1: Write failing local-port tests** + +In `packages/context/src/mcp/local-project-ports.test.ts`, add this helper after `testConnector`: + +```typescript + async function seedScanReport(projectDir: string, syncId = 'sync-1'): Promise { + const root = `raw-sources/warehouse/live-database/${syncId}`; + await mkdir(join(projectDir, root, 'tables'), { recursive: true }); + await writeFile( + join(projectDir, root, 'connection.json'), + JSON.stringify( + { + connectionId: 'warehouse', + driver: 'postgres', + extractedAt: '2026-05-14T09:00:00.000Z', + scope: { schemas: ['public'] }, + }, + null, + 2, + ), + 'utf-8', + ); + await writeFile( + join(projectDir, root, 'tables', 'orders.json'), + JSON.stringify( + { + catalog: null, + db: 'public', + name: 'orders', + kind: 'table', + comment: 'Customer orders', + estimatedRows: 12, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: null, + }, + ], + foreignKeys: [], + }, + null, + 2, + ), + 'utf-8', + ); + await writeFile( + join(projectDir, root, 'scan-report.json'), + JSON.stringify( + { + connectionId: 'warehouse', + driver: 'postgres', + syncId, + runId: 'scan-1', + trigger: 'mcp', + mode: 'structural', + dryRun: false, + artifactPaths: { + rawSourcesDir: root, + reportPath: `${root}/scan-report.json`, + manifestShards: [], + enrichmentArtifacts: [], + }, + diffSummary: { added: 0, modified: 0, deleted: 0, unchanged: 1 }, + 
manifestShardsWritten: 0, + structuralSyncStats: { tablesWritten: 1, tablesDeleted: 0, foreignKeysWritten: 0 }, + enrichment: { + dataDictionary: 'skipped', + tableDescriptions: 'skipped', + columnDescriptions: 'skipped', + embeddings: 'skipped', + deterministicRelationships: 'skipped', + llmRelationshipValidation: 'skipped', + statisticalValidation: 'skipped', + }, + capabilityGaps: [], + warnings: [], + relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 }, + enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] }, + createdAt: '2026-05-14T09:00:00.000Z', + }, + null, + 2, + ), + 'utf-8', + ); + } +``` + +Add these tests after the MCP SQL tests: + +```typescript + it('exposes local scan entity details through MCP ports', async () => { + const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + await seedScanReport(project.projectDir); + const ports = createLocalProjectMcpContextPorts(project); + + await expect( + ports.entityDetails?.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders', columns: ['id'] }], + }), + ).resolves.toMatchObject({ + results: [ + { + ok: true, + connectionId: 'warehouse', + display: 'public.orders', + columns: [{ name: 'id', nativeType: 'integer' }], + snapshot: { syncId: 'sync-1', scanRunId: 'scan-1' }, + }, + ], + }); + }); + + it('returns a structured local entity-details error when no scan exists', async () => { + const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + const ports = createLocalProjectMcpContextPorts(project); + + await expect( + ports.entityDetails?.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders' }], + }), + ).resolves.toMatchObject({ + results: [ + { + ok: false, + connectionId: 
'warehouse', + error: { code: 'scan_missing' }, + }, + ], + }); + }); +``` + +- [ ] **Step 2: Run failing local-port tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts -t "entity details" +``` + +Expected: FAIL because `ports.entityDetails` is undefined. + +- [ ] **Step 3: Wire the service into local ports** + +In `packages/context/src/mcp/local-project-ports.ts`, update the scan import block to include `createKtxEntityDetailsService`: + +```typescript + createKtxEntityDetailsService, +``` + +In the initial `ports` object returned by `createLocalProjectMcpContextPorts`, add this sibling after `semanticLayer` and before the closing `};`: + +```typescript + entityDetails: { + async read(input) { + return createKtxEntityDetailsService(project).read(input); + }, + }, +``` + +- [ ] **Step 4: Run local-port tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts -t "entity details" +``` + +Expected: PASS. + +- [ ] **Step 5: Commit local-port wiring** + +Run: + +```bash +git add packages/context/src/mcp/local-project-ports.ts packages/context/src/mcp/local-project-ports.test.ts +git commit -m "feat(context): expose local MCP entity details" +``` + +## Task 4: Verification + +**Files:** +- Verify: all files changed in Tasks 1-3 + +- [ ] **Step 1: Run focused context tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/scan/entity-details.test.ts src/mcp/server.test.ts src/mcp/local-project-ports.test.ts +``` + +Expected: PASS. + +- [ ] **Step 2: Run context type-check** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS. + +- [ ] **Step 3: Run dead-code check for new exports** + +Run: + +```bash +pnpm run dead-code +``` + +Expected: PASS. If Knip reports unrelated pre-existing findings, record the exact unrelated findings and do not broaden this entity-details slice. 
+ +- [ ] **Step 4: Confirm remaining v1 blockers still need later plans** + +Run: + +```bash +test -e packages/context/src/sl/dictionary-search.ts; printf 'dictionary-search:%s\n' "$?" +test -e packages/context/src/search/discover.ts; printf 'discover:%s\n' "$?" +test -e packages/cli/src/commands/mcp-commands.ts; printf 'mcp-commands:%s\n' "$?" +test -e packages/cli/src/skills/research/SKILL.md; printf 'research-skill:%s\n' "$?" +``` + +Expected: + +```text +dictionary-search:1 +discover:1 +mcp-commands:1 +research-skill:1 +``` + +These markers mean this plan landed `entity_details` only and did not claim the remaining research-agent v1 work. + +- [ ] **Step 5: Commit verification-only doc changes if any** + +Run: + +```bash +git status --short +``` + +Expected: no uncommitted source changes after the task commits. If verification updates this plan document, commit only the plan document with: + +```bash +git add docs/superpowers/plans/2026-05-14-research-agent-mcp-entity-details.md +git commit -m "docs: record research MCP entity details plan" +``` + +## Self-Review + +- Spec coverage for this slice: covers MCP `entity_details`, latest scan freshness by reading `scan-report.json` on each call, structured table refs, driver display strings, column filtering, FK preservation, snapshot freshness, and structured errors. +- Remaining spec coverage after this slice: `dictionary_search`, `discover_data`, `ktx mcp` HTTP daemon, setup-agent MCP config, and `ktx-research` skill are still v1-blocking and need later plans. +- Type consistency: `KtxEntityDetailsInput` is reused by the scan service, MCP port, schema parser, and local project port. 
diff --git a/docs/superpowers/plans/2026-05-14-research-agent-mcp-http-daemon.md b/docs/superpowers/plans/2026-05-14-research-agent-mcp-http-daemon.md new file mode 100644 index 00000000..00645dec --- /dev/null +++ b/docs/superpowers/plans/2026-05-14-research-agent-mcp-http-daemon.md @@ -0,0 +1,1561 @@ +# Research Agent MCP HTTP Daemon Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add the HTTP-only `ktx mcp start|stop|status|logs` daemon so external MCP clients can reach the already implemented KTX research tools. + +**Architecture:** Keep the MCP tool contracts in `@ktx/context` and add CLI-owned HTTP hosting/lifecycle code. The public `ktx mcp start` command either runs a foreground HTTP server or spawns a hidden foreground child command, persists daemon state to `.ktx/mcp.json`, and writes logs to `.ktx/logs/mcp.log`; the HTTP server uses stateful `StreamableHTTPServerTransport` sessions with explicit host/origin/token checks. + +**Tech Stack:** TypeScript, Node 22 `node:http`, Commander, `@modelcontextprotocol/sdk@1.29.0`, Zod, Vitest, KTX managed Python daemon helpers. + +--- + +## Audit Summary + +Original spec: `docs/superpowers/specs/2026-05-14-research-agent-mcp-tools-design.md` + +Implemented v1 slices confirmed in current source: + +- MCP `sql_execution` is implemented and parser-gated: `python/ktx-daemon/src/ktx_daemon/sql_analysis.py` validates SQL with sqlglot, `python/ktx-daemon/src/ktx_daemon/app.py` exposes `/sql/validate-read-only`, `packages/context/src/mcp/context-tools.ts` registers `sql_execution`, and `packages/context/src/mcp/local-project-ports.ts` only exposes it when both SQL analysis and local scan connector creation are available. 
+- MCP `entity_details` is implemented: `packages/context/src/scan/entity-details.ts`, `KtxEntityDetailsMcpPort`, context-tool registration, and local project wiring all exist. +- MCP `dictionary_search` is implemented: `packages/context/src/sl/dictionary-search.ts`, `KtxDictionarySearchMcpPort`, context-tool registration, and local project wiring all exist. +- MCP `discover_data` is implemented: `packages/context/src/search/discover.ts`, `KtxDiscoverDataMcpPort`, context-tool registration, and local project wiring all exist. + +Remaining v1-blocking gaps: + +- `ktx mcp start|stop|status|logs` and the HTTP Streamable MCP daemon are missing. There is no `packages/cli/src/commands/mcp-commands.ts`, no `packages/cli/src/managed-mcp-daemon.ts`, and `packages/cli/src/cli-program.ts` does not register an `mcp` subtree. +- `ktx setup-agents` does not install MCP client config entries or the `ktx-research` skill. `plannedKtxAgentFiles()` still installs only the existing `ktx` skill/rules. +- Ingest-side warehouse verification tools still use `connectionName`, not the spec-required `connectionId`, and `WarehouseCatalogService` still exposes `connectionName` in its service contract. + +Non-blocking gaps: + +- TLS, audit logging, rate limiting, per-tool authorization, OS-level autostart, stdio MCP transport, and multi-project switching remain explicitly out of scope for v1. + +This plan covers only the next dependency-aware blocker: the HTTP Streamable MCP daemon and `ktx mcp` lifecycle command subtree. After this plan lands, the remaining v1 plans are setup-agent/research-skill installation and ingest warehouse-verification contract convergence. + +## Documentation Notes + +- Context7 was checked for current MCP TypeScript SDK Streamable HTTP examples. +- The local `@modelcontextprotocol/sdk@1.29.0` package metadata was checked with `pnpm view`; its exported import path supports `@modelcontextprotocol/sdk/server/streamableHttp.js`. 
+- The 1.29.0 tarball types show `StreamableHTTPServerTransport` accepts `sessionIdGenerator`, `onsessioninitialized`, `onsessionclosed`, `allowedHosts`, `allowedOrigins`, and `enableDnsRebindingProtection`, and exposes `handleRequest(req, res, parsedBody?)`. + +## File Structure + +- Create `packages/cli/src/mcp-http-server.ts` + - Owns the foreground HTTP server. + - Validates Host, Origin, and bearer token policy before handing requests to the MCP SDK transport. + - Hosts `/health` and stateful `/mcp` `POST`/`GET`/`DELETE`. + - Builds a fresh `McpServer` per session with `createDefaultKtxMcpServer()`. +- Create `packages/cli/src/mcp-http-server.test.ts` + - Unit tests for host normalization, origin validation, token enforcement, `/health`, initialize session creation, unknown-session rejection, and DELETE cleanup. +- Create `packages/cli/src/managed-mcp-daemon.ts` + - Owns `.ktx/mcp.json`, `.ktx/logs/mcp.log`, background spawning, status probes, stop, and log reading. +- Create `packages/cli/src/managed-mcp-daemon.test.ts` + - Unit tests for state paths, start spawn arguments, token redaction from state/argv, status, stale state, stop, and log tailing. +- Create `packages/cli/src/commands/mcp-commands.ts` + - Registers public `start|stop|status|logs` and hidden `serve-internal`. +- Create `packages/cli/src/commands/mcp-commands.test.ts` + - Command-level tests for option parsing, non-loopback token requirement, state output, and hidden server command wiring. +- Modify `packages/cli/src/cli-program.ts` + - Add `mcp` to project-aware root commands. + - Register the MCP command subtree. +- Modify `packages/cli/package.json` + - Add `@modelcontextprotocol/sdk` as a direct dependency of `@ktx/cli`, because the CLI package will import the Streamable HTTP transport directly. 
+ +## Task 1: Add MCP HTTP Security Helper Tests + +**Files:** +- Create: `packages/cli/src/mcp-http-server.test.ts` +- Create later: `packages/cli/src/mcp-http-server.ts` + +- [ ] **Step 1: Write the failing security helper tests** + +Create `packages/cli/src/mcp-http-server.test.ts` with: + +```typescript +import { describe, expect, it } from 'vitest'; +import { + buildMcpSecurityConfig, + isMcpRequestAuthorized, + normalizeHostHeader, +} from './mcp-http-server.js'; + +describe('normalizeHostHeader', () => { + it('normalizes host headers before allow-list comparison', () => { + expect(normalizeHostHeader('LOCALHOST:7878')).toBe('localhost'); + expect(normalizeHostHeader('127.0.0.1:7878')).toBe('127.0.0.1'); + expect(normalizeHostHeader('[::1]:7878')).toBe('::1'); + expect(normalizeHostHeader(' Example.COM ')).toBe('example.com'); + }); +}); + +describe('buildMcpSecurityConfig', () => { + it('allows loopback hosts without a token', () => { + const config = buildMcpSecurityConfig({ + host: '127.0.0.1', + port: 7878, + token: undefined, + allowedHosts: [], + allowedOrigins: [], + }); + + expect(config.token).toBeUndefined(); + expect(config.allowedHosts).toEqual(['localhost', '127.0.0.1', '::1']); + }); + + it('requires a token for non-loopback binding', () => { + expect(() => + buildMcpSecurityConfig({ + host: '0.0.0.0', + port: 7878, + token: undefined, + allowedHosts: [], + allowedOrigins: [], + }), + ).toThrow('Binding KTX MCP to 0.0.0.0 requires --token or KTX_MCP_TOKEN'); + }); + + it('validates allowed origins as full origins', () => { + expect(() => + buildMcpSecurityConfig({ + host: '127.0.0.1', + port: 7878, + token: undefined, + allowedHosts: [], + allowedOrigins: ['localhost:7878'], + }), + ).toThrow('Allowed origin must be a full origin URL'); + }); +}); + +describe('isMcpRequestAuthorized', () => { + const config = buildMcpSecurityConfig({ + host: '0.0.0.0', + port: 7878, + token: 'secret-token', + allowedHosts: ['mcp.example.test'], + allowedOrigins: 
['https://mcp.example.test'], + }); + + it('accepts a valid host, origin, and bearer token', () => { + expect( + isMcpRequestAuthorized( + { + path: '/mcp', + headers: { + host: 'mcp.example.test:7878', + origin: 'https://mcp.example.test', + authorization: 'Bearer secret-token', + }, + }, + config, + ), + ).toEqual({ ok: true }); + }); + + it('rejects bad host headers before MCP handling', () => { + expect( + isMcpRequestAuthorized( + { path: '/health', headers: { host: 'evil.example.test' } }, + config, + ), + ).toEqual({ ok: false, status: 403, message: 'Host header is not allowed for KTX MCP.' }); + }); + + it('rejects browser origins unless explicitly allowed', () => { + expect( + isMcpRequestAuthorized( + { + path: '/health', + headers: { host: 'mcp.example.test', origin: 'https://evil.example.test' }, + }, + config, + ), + ).toEqual({ ok: false, status: 403, message: 'Origin header is not allowed for KTX MCP.' }); + }); + + it('requires bearer auth on /mcp when token auth is enabled', () => { + expect( + isMcpRequestAuthorized( + { path: '/mcp', headers: { host: 'mcp.example.test', authorization: 'Bearer wrong' } }, + config, + ), + ).toEqual({ ok: false, status: 401, message: 'Missing or invalid KTX MCP bearer token.' }); + }); + + it('does not require bearer auth on /health', () => { + expect(isMcpRequestAuthorized({ path: '/health', headers: { host: 'mcp.example.test' } }, config)).toEqual({ + ok: true, + }); + }); +}); +``` + +- [ ] **Step 2: Run the new tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/mcp-http-server.test.ts +``` + +Expected: FAIL because `./mcp-http-server.js` does not exist. 
+ +- [ ] **Step 3: Implement the security helpers** + +Create `packages/cli/src/mcp-http-server.ts` with the helper surface first: + +```typescript +import type { IncomingHttpHeaders } from 'node:http'; + +const DEFAULT_ALLOWED_HOSTS = ['localhost', '127.0.0.1', '::1'] as const; + +export interface McpSecurityConfigInput { + host: string; + port: number; + token?: string; + allowedHosts: string[]; + allowedOrigins: string[]; +} + +export interface McpSecurityConfig { + host: string; + port: number; + token?: string; + allowedHosts: string[]; + allowedOrigins: string[]; +} + +export type McpAuthorizationResult = + | { ok: true } + | { ok: false; status: 401 | 403; message: string }; + +function isLoopbackHost(host: string): boolean { + const normalized = normalizeHostHeader(host); + return normalized === 'localhost' || normalized === '127.0.0.1' || normalized === '::1'; +} + +export function normalizeHostHeader(value: string): string { + const trimmed = value.trim().toLowerCase(); + if (trimmed.startsWith('[')) { + const close = trimmed.indexOf(']'); + return close >= 0 ? 
trimmed.slice(1, close) : trimmed.replace(/^\[/, ''); + } + const colon = trimmed.lastIndexOf(':'); + if (colon > -1 && trimmed.indexOf(':') === colon) { + return trimmed.slice(0, colon); + } + return trimmed; +} + +function fullOrigin(value: string): string { + let parsed: URL; + try { + parsed = new URL(value); + } catch { + throw new Error(`Allowed origin must be a full origin URL: ${value}`); + } + if (!parsed.protocol || !parsed.host || parsed.pathname !== '/' || parsed.search || parsed.hash) { + throw new Error(`Allowed origin must be a full origin URL: ${value}`); + } + return parsed.origin; +} + +export function buildMcpSecurityConfig(input: McpSecurityConfigInput): McpSecurityConfig { + if (!isLoopbackHost(input.host) && !input.token) { + throw new Error(`Binding KTX MCP to ${input.host} requires --token or KTX_MCP_TOKEN`); + } + const allowedHostSet = new Set<string>(DEFAULT_ALLOWED_HOSTS); + if (!isLoopbackHost(input.host)) { + allowedHostSet.add(normalizeHostHeader(input.host)); + } + for (const host of input.allowedHosts) { + allowedHostSet.add(normalizeHostHeader(host)); + } + return { + host: input.host, + port: input.port, + ...(input.token ? { token: input.token } : {}), + allowedHosts: [...allowedHostSet], + allowedOrigins: input.allowedOrigins.map(fullOrigin), + }; +} + +function headerValue(headers: IncomingHttpHeaders | Record<string, string | undefined>, name: string): string | undefined { + const value = headers[name.toLowerCase()]; + return Array.isArray(value) ? value[0] : value; +} + +export function isMcpRequestAuthorized( + request: { path: string; headers: IncomingHttpHeaders | Record<string, string | undefined> }, + config: McpSecurityConfig, +): McpAuthorizationResult { + const host = headerValue(request.headers, 'host'); + if (!host || !config.allowedHosts.includes(normalizeHostHeader(host))) { + return { ok: false, status: 403, message: 'Host header is not allowed for KTX MCP.' 
}; + } + const origin = headerValue(request.headers, 'origin'); + if (origin && !config.allowedOrigins.includes(origin)) { + return { ok: false, status: 403, message: 'Origin header is not allowed for KTX MCP.' }; + } + if (request.path === '/mcp' && config.token) { + const auth = headerValue(request.headers, 'authorization'); + if (auth !== `Bearer ${config.token}`) { + return { ok: false, status: 401, message: 'Missing or invalid KTX MCP bearer token.' }; + } + } + return { ok: true }; +} +``` + +- [ ] **Step 4: Run the security helper tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/mcp-http-server.test.ts +``` + +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add packages/cli/src/mcp-http-server.ts packages/cli/src/mcp-http-server.test.ts +git commit -m "feat(cli): add mcp http security helpers" +``` + +## Task 2: Add Foreground MCP HTTP Server + +**Files:** +- Modify: `packages/cli/src/mcp-http-server.ts` +- Modify: `packages/cli/src/mcp-http-server.test.ts` +- Modify: `packages/cli/package.json` + +- [ ] **Step 1: Add the direct SDK dependency to the CLI package** + +In `packages/cli/package.json`, add this dependency inside `"dependencies"`: + +```json +"@modelcontextprotocol/sdk": "^1.29.0" +``` + +Keep the dependency list alphabetized by package name. 
+ +- [ ] **Step 2: Write failing HTTP server behavior tests** + +Append these imports to `packages/cli/src/mcp-http-server.test.ts`: + +```typescript +import { request } from 'node:http'; +import { AddressInfo } from 'node:net'; +import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'; +import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js'; +import { runKtxMcpHttpServer } from './mcp-http-server.js'; +``` + +Append these helpers and tests: + +```typescript +function postJson(port: number, path: string, body: unknown, headers: Record<string, string> = {}) { + return new Promise<{ status: number; headers: Record<string, string | string[] | undefined>; body: string }>( + (resolve, reject) => { + const payload = JSON.stringify(body); + const req = request( + { + host: '127.0.0.1', + port, + path, + method: 'POST', + headers: { + host: `127.0.0.1:${port}`, + 'content-type': 'application/json', + 'content-length': Buffer.byteLength(payload), + ...headers, + }, + }, + (res) => { + const chunks: Buffer[] = []; + res.on('data', (chunk: Buffer) => chunks.push(chunk)); + res.on('end', () => + resolve({ + status: res.statusCode ?? 0, + headers: res.headers, + body: Buffer.concat(chunks).toString('utf8'), + }), + ); + }, + ); + req.on('error', reject); + req.end(payload); + }, + ); +} + +function get(port: number, path: string, headers: Record<string, string> = {}) { + return new Promise<{ status: number; headers: Record<string, string | string[] | undefined>; body: string }>( + (resolve, reject) => { + const req = request( + { + host: '127.0.0.1', + port, + path, + method: 'GET', + headers: { host: `127.0.0.1:${port}`, ...headers }, + }, + (res) => { + const chunks: Buffer[] = []; + res.on('data', (chunk: Buffer) => chunks.push(chunk)); + res.on('end', () => + resolve({ + status: res.statusCode ?? 
0, + headers: res.headers, + body: Buffer.concat(chunks).toString('utf8'), + }), + ); + }, + ); + req.on('error', reject); + req.end(); + }, + ); +} + +function createTestMcpServer() { + return () => { + const server = new McpServer({ name: 'ktx-test', version: '0.0.0-test' }); + server.registerTool('ping', { inputSchema: {} }, async () => ({ + content: [{ type: 'text', text: 'pong' }], + })); + return server; + }; +} + +describe('runKtxMcpHttpServer', () => { + it('serves /health with project metadata', async () => { + const handle = await runKtxMcpHttpServer({ + projectDir: '/tmp/ktx-project', + host: '127.0.0.1', + port: 0, + allowedHosts: [], + allowedOrigins: [], + createMcpServer: createTestMcpServer(), + }); + try { + const port = (handle.server.address() as AddressInfo).port; + const response = await get(port, '/health'); + expect(response.status).toBe(200); + expect(JSON.parse(response.body)).toEqual({ + status: 'ok', + projectDir: '/tmp/ktx-project', + port, + }); + } finally { + await handle.close(); + } + }); + + it('allocates a stateful MCP session on initialize', async () => { + const handle = await runKtxMcpHttpServer({ + projectDir: '/tmp/ktx-project', + host: '127.0.0.1', + port: 0, + allowedHosts: [], + allowedOrigins: [], + createMcpServer: createTestMcpServer(), + }); + try { + const port = (handle.server.address() as AddressInfo).port; + const response = await postJson(port, '/mcp', { + jsonrpc: '2.0', + id: 1, + method: 'initialize', + params: { + protocolVersion: '2025-06-18', + capabilities: {}, + clientInfo: { name: 'vitest', version: '0.0.0' }, + }, + }); + + expect(response.status).toBe(200); + expect(response.headers['mcp-session-id']).toBeTruthy(); + } finally { + await handle.close(); + } + }); + + it('rejects unknown session ids with 404', async () => { + const handle = await runKtxMcpHttpServer({ + projectDir: '/tmp/ktx-project', + host: '127.0.0.1', + port: 0, + allowedHosts: [], + allowedOrigins: [], + createMcpServer: 
createTestMcpServer(), + }); + try { + const port = (handle.server.address() as AddressInfo).port; + const response = await postJson( + port, + '/mcp', + { jsonrpc: '2.0', id: 2, method: 'tools/list', params: {} }, + { 'mcp-session-id': 'missing-session' }, + ); + + expect(response.status).toBe(404); + expect(response.body).toContain('Unknown MCP session'); + } finally { + await handle.close(); + } + }); +}); +``` + +- [ ] **Step 3: Run the HTTP server tests to verify the new cases fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/mcp-http-server.test.ts +``` + +Expected: FAIL because `runKtxMcpHttpServer` is not implemented. + +- [ ] **Step 4: Implement the foreground server** + +Extend `packages/cli/src/mcp-http-server.ts` with: + +```typescript +import { randomUUID } from 'node:crypto'; +import { createServer, type IncomingMessage, type Server, type ServerResponse } from 'node:http'; +import { createDefaultKtxMcpServer } from '@ktx/context/mcp'; +import { createLocalProjectMcpContextPorts } from '@ktx/context/mcp'; +import { createLocalProjectMemoryCapture } from '@ktx/context/memory'; +import { loadKtxProject, type KtxLocalProject } from '@ktx/context/project'; +import { isInitializeRequest } from '@modelcontextprotocol/sdk/types.js'; +import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js'; +import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'; +import type { KtxCliIo } from './cli-runtime.js'; +import { createKtxCliIngestQueryExecutor } from './ingest-query-executor.js'; +import { createKtxCliScanConnector } from './local-scan-connectors.js'; +import { createManagedPythonSemanticLayerComputePort } from './managed-python-command.js'; +import { createManagedDaemonSqlAnalysisPort } from './managed-python-http.js'; + +export interface KtxMcpHttpServerHandle { + server: Server; + close(): Promise<void>; +} + +export interface RunKtxMcpHttpServerOptions extends McpSecurityConfigInput { + 
projectDir: string; + cliVersion?: string; + io?: KtxCliIo; + createMcpServer?: () => McpServer; + loadProject?: typeof loadKtxProject; +} + +function writeJson(res: ServerResponse, status: number, body: object): void { + const payload = `${JSON.stringify(body)}\n`; + res.writeHead(status, { + 'content-type': 'application/json', + 'content-length': Buffer.byteLength(payload), + }); + res.end(payload); +} + +function writeText(res: ServerResponse, status: number, body: string): void { + res.writeHead(status, { 'content-type': 'text/plain; charset=utf-8' }); + res.end(body); +} + +function requestPath(req: IncomingMessage): string { + const url = new URL(req.url ?? '/', 'http://127.0.0.1'); + return url.pathname; +} + +async function readJsonBody(req: IncomingMessage): Promise<unknown> { + const chunks: Buffer[] = []; + for await (const chunk of req) { + chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + } + const raw = Buffer.concat(chunks).toString('utf8'); + return raw.trim().length === 0 ? undefined : (JSON.parse(raw) as unknown); +} + +async function defaultMcpServerFactory(input: { + project: KtxLocalProject; + projectDir: string; + cliVersion: string; + io?: KtxCliIo; +}): Promise<() => McpServer> { + const queryExecutor = createKtxCliIngestQueryExecutor(input.project); + const semanticLayerCompute = await createManagedPythonSemanticLayerComputePort({ + cliVersion: input.cliVersion, + installPolicy: 'auto', + io: input.io ?? { + stdout: { write() {} }, + stderr: { write() {} }, + }, + }); + const sqlAnalysis = createManagedDaemonSqlAnalysisPort({ + cliVersion: input.cliVersion, + projectDir: input.projectDir, + installPolicy: 'auto', + io: input.io ?? 
{ + stdout: { write() {} }, + stderr: { write() {} }, + }, + }); + const contextTools = createLocalProjectMcpContextPorts(input.project, { + semanticLayerCompute, + queryExecutor, + sqlAnalysis, + localScan: { + createConnector: async (connectionId) => createKtxCliScanConnector(input.project, connectionId), + }, + localIngest: { + semanticLayerCompute, + queryExecutor, + }, + }); + let memoryCapture; + try { + memoryCapture = createLocalProjectMemoryCapture(input.project, { semanticLayerCompute, queryExecutor }); + } catch (error) { + input.io?.stderr.write(`KTX MCP memory_capture disabled: ${error instanceof Error ? error.message : String(error)}\n`); + } + + return () => + createDefaultKtxMcpServer({ + name: 'ktx', + version: input.cliVersion, + userContext: { userId: 'local' }, + contextTools, + memoryCapture, + }); +} + +export async function runKtxMcpHttpServer(options: RunKtxMcpHttpServerOptions): Promise<KtxMcpHttpServerHandle> { + const config = buildMcpSecurityConfig(options); + const project = + options.createMcpServer === undefined + ? await (options.loadProject ?? loadKtxProject)({ projectDir: options.projectDir }) + : undefined; + const createMcpServer = + options.createMcpServer ?? + (await defaultMcpServerFactory({ + project: project!, + projectDir: options.projectDir, + cliVersion: options.cliVersion ?? 
'0.0.0-private', + io: options.io, + })); + const sessions = new Map<string, StreamableHTTPServerTransport>(); + + async function newTransport(): Promise<StreamableHTTPServerTransport> { + let transport: StreamableHTTPServerTransport; + transport = new StreamableHTTPServerTransport({ + sessionIdGenerator: () => randomUUID(), + onsessioninitialized: (sessionId) => { + sessions.set(sessionId, transport); + }, + onsessionclosed: (sessionId) => { + sessions.delete(sessionId); + }, + allowedHosts: config.allowedHosts, + allowedOrigins: config.allowedOrigins, + enableDnsRebindingProtection: true, + }); + transport.onclose = () => { + if (transport.sessionId) { + sessions.delete(transport.sessionId); + } + }; + await createMcpServer().connect(transport); + return transport; + } + + const server = createServer(async (req, res) => { + const path = requestPath(req); + const auth = isMcpRequestAuthorized({ path, headers: req.headers }, config); + if (!auth.ok) { + writeText(res, auth.status, auth.message); + return; + } + + if (path === '/health' && req.method === 'GET') { + const address = server.address(); + const port = typeof address === 'object' && address ? address.port : config.port; + writeJson(res, 200, { status: 'ok', projectDir: options.projectDir, port }); + return; + } + + if (path !== '/mcp' || !['POST', 'GET', 'DELETE'].includes(req.method ?? '')) { + writeText(res, 404, 'Not found'); + return; + } + + const sessionId = req.headers['mcp-session-id']; + const normalizedSessionId = Array.isArray(sessionId) ? sessionId[0] : sessionId; + + if (req.method === 'POST') { + let body: unknown; + try { + body = await readJsonBody(req); + } catch (error) { + writeText(res, 400, `Invalid JSON body: ${error instanceof Error ? error.message : String(error)}`); + return; + } + const existing = normalizedSessionId ? 
sessions.get(normalizedSessionId) : undefined; + if (existing) { + await existing.handleRequest(req, res, body); + return; + } + if (normalizedSessionId) { + writeText(res, 404, `Unknown MCP session: ${normalizedSessionId}`); + return; + } + if (!isInitializeRequest(body)) { + writeText(res, 400, 'MCP initialize request is required before session traffic.'); + return; + } + await (await newTransport()).handleRequest(req, res, body); + return; + } + + if (!normalizedSessionId || !sessions.has(normalizedSessionId)) { + writeText(res, 404, normalizedSessionId ? `Unknown MCP session: ${normalizedSessionId}` : 'Missing MCP session id.'); + return; + } + await sessions.get(normalizedSessionId)!.handleRequest(req, res); + }); + + await new Promise<void>((resolve, reject) => { + server.once('error', reject); + server.listen(config.port, config.host, () => { + server.off('error', reject); + resolve(); + }); + }); + + return { + server, + async close() { + for (const transport of sessions.values()) { + await transport.close(); + } + await new Promise<void>((resolve, reject) => { + server.close((error) => (error ? reject(error) : resolve())); + }); + }, + }; +} +``` + +- [ ] **Step 5: Run the HTTP server tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/mcp-http-server.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit** + +```bash +git add packages/cli/package.json packages/cli/src/mcp-http-server.ts packages/cli/src/mcp-http-server.test.ts +git commit -m "feat(cli): host mcp over streamable http" +``` + +## Task 3: Add Managed MCP Daemon Lifecycle + +**Files:** +- Create: `packages/cli/src/managed-mcp-daemon.ts` +- Create: `packages/cli/src/managed-mcp-daemon.test.ts` + +- [ ] **Step 1: Write failing daemon lifecycle tests** + +Create `packages/cli/src/managed-mcp-daemon.test.ts` with: + +```typescript +import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { + mcpDaemonLayout, + readKtxMcpDaemonStatus, + startKtxMcpDaemon, + stopKtxMcpDaemon, + type KtxMcpDaemonChild, + type KtxMcpDaemonState, +} from './managed-mcp-daemon.js'; + +function child(pid = 4242): KtxMcpDaemonChild { + return { pid, unref: vi.fn() }; +} + +function state(projectDir: string, overrides: Partial<KtxMcpDaemonState> = {}): KtxMcpDaemonState { + return { + schemaVersion: 1, + pid: 4242, + host: '127.0.0.1', + port: 7878, + tokenAuth: false, + projectDir, + startedAt: '2026-05-14T00:00:00.000Z', + logPath: join(projectDir, '.ktx/logs/mcp.log'), + ...overrides, + }; +} + +describe('managed MCP daemon lifecycle', () => { + let tempDir: string; + let projectDir: string; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-mcp-daemon-')); + projectDir = join(tempDir, 'project'); + await mkdir(projectDir, { recursive: true }); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + it('uses the spec state and log paths', () => { + expect(mcpDaemonLayout(projectDir)).toEqual({ + statePath: join(projectDir, '.ktx/mcp.json'), + logPath: join(projectDir, '.ktx/logs/mcp.log'), + }); + }); + + it('starts a detached child and writes state without the token value', async () => 
{ + const spawnDaemon = vi.fn(() => child(5555)); + await startKtxMcpDaemon({ + projectDir, + cliVersion: '0.0.0-test', + host: '0.0.0.0', + port: 7879, + token: 'secret-token', + allowedHosts: ['mcp.example.test'], + allowedOrigins: ['https://mcp.example.test'], + binPath: '/repo/packages/cli/dist/bin.js', + spawnDaemon, + processAlive: vi.fn(() => false), + portAvailable: vi.fn(async () => true), + now: () => new Date('2026-05-14T00:00:00.000Z'), + }); + + expect(spawnDaemon).toHaveBeenCalledWith( + process.execPath, + [ + '/repo/packages/cli/dist/bin.js', + '--project-dir', + projectDir, + 'mcp', + 'serve-internal', + '--host', + '0.0.0.0', + '--port', + '7879', + '--allowed-host', + 'mcp.example.test', + '--allowed-origin', + 'https://mcp.example.test', + ], + expect.objectContaining({ + detached: true, + env: expect.objectContaining({ KTX_MCP_TOKEN: 'secret-token' }), + }), + ); + expect(JSON.stringify(JSON.parse(await readFile(join(projectDir, '.ktx/mcp.json'), 'utf8')))).not.toContain( + 'secret-token', + ); + }); + + it('reports running when the process is alive and health passes', async () => { + await mkdir(join(projectDir, '.ktx'), { recursive: true }); + await writeFile(join(projectDir, '.ktx/mcp.json'), `${JSON.stringify(state(projectDir), null, 2)}\n`); + + const status = await readKtxMcpDaemonStatus({ + projectDir, + processAlive: vi.fn(() => true), + fetchHealth: vi.fn(async () => ({ ok: true, body: { status: 'ok', projectDir, port: 7878 } })), + }); + + expect(status.kind).toBe('running'); + expect(status.url).toBe('http://127.0.0.1:7878/mcp'); + }); + + it('stops a recorded daemon and removes state', async () => { + await mkdir(join(projectDir, '.ktx'), { recursive: true }); + await writeFile(join(projectDir, '.ktx/mcp.json'), `${JSON.stringify(state(projectDir), null, 2)}\n`); + const alive = new Set([4242]); + const killProcess = vi.fn((pid: number) => alive.delete(pid)); + + await expect( + stopKtxMcpDaemon({ + projectDir, + processAlive: 
vi.fn((pid) => alive.has(pid)), + killProcess, + stopGraceMs: 1, + pollIntervalMs: 1, + }), + ).resolves.toEqual({ status: 'stopped' }); + + expect(killProcess).toHaveBeenCalledWith(4242, 'SIGTERM'); + await expect(readFile(join(projectDir, '.ktx/mcp.json'), 'utf8')).rejects.toThrow(); + }); +}); +``` + +- [ ] **Step 2: Run the lifecycle tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/managed-mcp-daemon.test.ts +``` + +Expected: FAIL because `./managed-mcp-daemon.js` does not exist. + +- [ ] **Step 3: Implement lifecycle state, start, status, and stop** + +Create `packages/cli/src/managed-mcp-daemon.ts` with: + +```typescript +import { spawn } from 'node:child_process'; +import { mkdir, open, readFile, rm, writeFile } from 'node:fs/promises'; +import { createServer } from 'node:net'; +import { dirname, join } from 'node:path'; +import { setTimeout as delay } from 'node:timers/promises'; +import { z } from 'zod'; + +export interface KtxMcpDaemonState { + schemaVersion: 1; + pid: number; + host: string; + port: number; + tokenAuth: boolean; + projectDir: string; + startedAt: string; + logPath: string; +} + +export interface KtxMcpDaemonChild { + pid?: number; + unref(): void; +} + +export type KtxMcpDaemonStatus = + | { kind: 'stopped'; detail: string } + | { kind: 'running'; detail: string; state: KtxMcpDaemonState; url: string } + | { kind: 'stale'; detail: string; state?: KtxMcpDaemonState }; + +const stateSchema = z.object({ + schemaVersion: z.literal(1), + pid: z.number().int().positive(), + host: z.string().min(1), + port: z.number().int().min(1).max(65535), + tokenAuth: z.boolean(), + projectDir: z.string().min(1), + startedAt: z.string().min(1), + logPath: z.string().min(1), +}); + +export function mcpDaemonLayout(projectDir: string): { statePath: string; logPath: string } { + return { + statePath: join(projectDir, '.ktx/mcp.json'), + logPath: join(projectDir, '.ktx/logs/mcp.log'), + }; +} + +function 
defaultProcessAlive(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch { + return false; + } +} + +function defaultKillProcess(pid: number, signal: NodeJS.Signals): void { + try { + process.kill(pid, signal); + } catch (error) { + if ((error as { code?: unknown }).code !== 'ESRCH') { + throw error; + } + } +} + +async function readState(projectDir: string): Promise<KtxMcpDaemonState | undefined> { + try { + return stateSchema.parse(JSON.parse(await readFile(mcpDaemonLayout(projectDir).statePath, 'utf8')) as unknown); + } catch (error) { + if ((error as { code?: unknown }).code === 'ENOENT') { + return undefined; + } + throw error; + } +} + +async function writeState(projectDir: string, state: KtxMcpDaemonState): Promise<void> { + const { statePath } = mcpDaemonLayout(projectDir); + await mkdir(dirname(statePath), { recursive: true }); + await writeFile(statePath, `${JSON.stringify(state, null, 2)}\n`, 'utf8'); +} + +async function defaultPortAvailable(host: string, port: number): Promise<boolean> { + return await new Promise<boolean>((resolve) => { + const server = createServer(); + server.once('error', () => resolve(false)); + server.listen(port, host, () => server.close(() => resolve(true))); + }); +} + +function defaultSpawnDaemon( + command: string, + args: string[], + options: { detached: boolean; stdio: ['ignore', number, number]; env: NodeJS.ProcessEnv }, +): KtxMcpDaemonChild { + return spawn(command, args, options); +} + +async function defaultFetchHealth(state: KtxMcpDaemonState): Promise<{ ok: boolean; body: unknown; detail?: string }> { + try { + const response = await fetch(`http://${state.host}:${state.port}/health`, { + headers: { host: `${state.host}:${state.port}` }, + }); + const body = await response.json(); + return { ok: response.ok, body, detail: response.ok ? undefined : `HTTP ${response.status}` }; + } catch (error) { + return { ok: false, body: null, detail: error instanceof Error ? 
error.message : String(error) }; + } +} + +export async function startKtxMcpDaemon(options: { + projectDir: string; + cliVersion: string; + host: string; + port: number; + token?: string; + allowedHosts: string[]; + allowedOrigins: string[]; + binPath: string; + processAlive?: (pid: number) => boolean; + portAvailable?: (host: string, port: number) => Promise<boolean>; + spawnDaemon?: typeof defaultSpawnDaemon; + now?: () => Date; +}): Promise<{ status: 'started'; state: KtxMcpDaemonState; url: string }> { + const existing = await readState(options.projectDir).catch(() => undefined); + const processAlive = options.processAlive ?? defaultProcessAlive; + if (existing && processAlive(existing.pid)) { + throw new Error(`KTX MCP daemon is already recorded at http://${existing.host}:${existing.port}/mcp`); + } + const portAvailable = options.portAvailable ?? defaultPortAvailable; + if (!(await portAvailable(options.host, options.port))) { + throw new Error(`Port ${options.port} is already in use. Choose another port with --port <port>.`); + } + + const { logPath } = mcpDaemonLayout(options.projectDir); + await mkdir(dirname(logPath), { recursive: true }); + const log = await open(logPath, 'a'); + const args = [ + options.binPath, + '--project-dir', + options.projectDir, + 'mcp', + 'serve-internal', + '--host', + options.host, + '--port', + String(options.port), + ...options.allowedHosts.flatMap((host) => ['--allowed-host', host]), + ...options.allowedOrigins.flatMap((origin) => ['--allowed-origin', origin]), + ]; + const child = (options.spawnDaemon ?? defaultSpawnDaemon)(process.execPath, args, { + detached: true, + stdio: ['ignore', log.fd, log.fd], + env: { + ...process.env, + KTX_CLI_VERSION: options.cliVersion, + ...(options.token ? 
{ KTX_MCP_TOKEN: options.token } : {}), + }, + }); + if (!child.pid) { + throw new Error('Failed to start KTX MCP daemon: child process pid was not available.'); + } + child.unref(); + const state: KtxMcpDaemonState = { + schemaVersion: 1, + pid: child.pid, + host: options.host, + port: options.port, + tokenAuth: Boolean(options.token), + projectDir: options.projectDir, + startedAt: (options.now ?? (() => new Date()))().toISOString(), + logPath, + }; + await writeState(options.projectDir, state); + return { status: 'started', state, url: `http://${state.host}:${state.port}/mcp` }; +} + +export async function readKtxMcpDaemonStatus(options: { + projectDir: string; + processAlive?: (pid: number) => boolean; + fetchHealth?: (state: KtxMcpDaemonState) => Promise<{ ok: boolean; body: unknown; detail?: string }>; +}): Promise<KtxMcpDaemonStatus> { + let state: KtxMcpDaemonState | undefined; + try { + state = await readState(options.projectDir); + } catch (error) { + return { kind: 'stale', detail: `MCP daemon state is invalid: ${error instanceof Error ? error.message : String(error)}` }; + } + if (!state) { + return { kind: 'stopped', detail: `No MCP daemon state at ${mcpDaemonLayout(options.projectDir).statePath}` }; + } + const processAlive = options.processAlive ?? defaultProcessAlive; + if (!processAlive(state.pid)) { + return { kind: 'stale', detail: `MCP daemon process ${state.pid} is not running`, state }; + } + const health = await (options.fetchHealth ?? defaultFetchHealth)(state); + if (!health.ok) { + return { kind: 'stale', detail: health.detail ?? 
'MCP daemon health check failed', state }; + } + return { + kind: 'running', + detail: `KTX MCP daemon running at http://${state.host}:${state.port}/mcp`, + state, + url: `http://${state.host}:${state.port}/mcp`, + }; +} + +export async function stopKtxMcpDaemon(options: { + projectDir: string; + processAlive?: (pid: number) => boolean; + killProcess?: (pid: number, signal: NodeJS.Signals) => void; + stopGraceMs?: number; + pollIntervalMs?: number; +}): Promise<{ status: 'stopped' | 'already-stopped' }> { + const state = await readState(options.projectDir); + const { statePath } = mcpDaemonLayout(options.projectDir); + if (!state) { + return { status: 'already-stopped' }; + } + const processAlive = options.processAlive ?? defaultProcessAlive; + const killProcess = options.killProcess ?? defaultKillProcess; + if (processAlive(state.pid)) { + killProcess(state.pid, 'SIGTERM'); + const deadline = Date.now() + (options.stopGraceMs ?? 10_000); + while (Date.now() <= deadline && processAlive(state.pid)) { + await delay(options.pollIntervalMs ?? 100); + } + if (processAlive(state.pid)) { + killProcess(state.pid, 'SIGKILL'); + } + } + await rm(statePath, { force: true }); + return { status: 'stopped' }; +} +``` + +- [ ] **Step 4: Run the daemon lifecycle tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/managed-mcp-daemon.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add packages/cli/src/managed-mcp-daemon.ts packages/cli/src/managed-mcp-daemon.test.ts +git commit -m "feat(cli): manage mcp daemon lifecycle" +``` + +## Task 4: Register `ktx mcp` Commands + +**Files:** +- Create: `packages/cli/src/commands/mcp-commands.ts` +- Create: `packages/cli/src/commands/mcp-commands.test.ts` +- Modify: `packages/cli/src/cli-program.ts` + +- [ ] **Step 1: Write failing command tests** + +Create `packages/cli/src/commands/mcp-commands.test.ts` with: + +```typescript +import { Command } from '@commander-js/extra-typings'; +import { describe, expect, it, vi } from 'vitest'; +import type { KtxCliCommandContext } from '../cli-program.js'; +import { registerMcpCommands } from './mcp-commands.js'; + +function makeContext(overrides: Partial<KtxCliCommandContext> = {}): KtxCliCommandContext { + let exitCode = 0; + return { + io: { + stdout: { write: vi.fn() }, + stderr: { write: vi.fn() }, + }, + deps: {}, + packageInfo: { name: '@ktx/cli', version: '0.0.0-test' }, + setExitCode: (code) => { + exitCode = code; + }, + runInit: vi.fn(), + writeDebug: vi.fn(), + ...overrides, + get exitCode() { + return exitCode; + }, + } as KtxCliCommandContext; +} + +describe('registerMcpCommands', () => { + it('registers the public mcp lifecycle commands', () => { + const program = new Command().exitOverride(); + registerMcpCommands(program, makeContext()); + const mcp = program.commands.find((command) => command.name() === 'mcp'); + + expect(mcp?.commands.map((command) => command.name()).sort()).toEqual([ + 'logs', + 'serve-internal', + 'start', + 'status', + 'stop', + ]); + expect(mcp?.commands.find((command) => command.name() === 'serve-internal')?.hidden).toBe(true); + }); + + it('rejects non-loopback start without token before spawning', async () => { + const program = new Command().exitOverride(); + const startDaemon = vi.fn(); + const context = makeContext({ deps: { mcp: { startDaemon } } } as Partial<KtxCliCommandContext>); + registerMcpCommands(program, 
context); + + await expect(program.parseAsync(['mcp', 'start', '--host', '0.0.0.0'], { from: 'user' })).rejects.toThrow( + 'Binding KTX MCP to 0.0.0.0 requires --token or KTX_MCP_TOKEN', + ); + expect(startDaemon).not.toHaveBeenCalled(); + }); +}); +``` + +If `KtxCliDeps` does not yet include `mcp`, add this test helper shape in the test file: + +```typescript +type TestDeps = KtxCliCommandContext['deps'] & { + mcp?: { + startDaemon?: unknown; + stopDaemon?: unknown; + readStatus?: unknown; + runServer?: unknown; + }; +}; +``` + +Then cast `deps: { mcp: { startDaemon } } as TestDeps`. + +- [ ] **Step 2: Run the command tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/commands/mcp-commands.test.ts +``` + +Expected: FAIL because `./mcp-commands.js` does not exist. + +- [ ] **Step 3: Add MCP command dependency hooks** + +Find `KtxCliDeps` in `packages/cli/src/cli-runtime.ts` and add: + +```typescript + mcp?: { + startDaemon?: typeof import('./managed-mcp-daemon.js').startKtxMcpDaemon; + stopDaemon?: typeof import('./managed-mcp-daemon.js').stopKtxMcpDaemon; + readStatus?: typeof import('./managed-mcp-daemon.js').readKtxMcpDaemonStatus; + runServer?: typeof import('./mcp-http-server.js').runKtxMcpHttpServer; + }; +``` + +- [ ] **Step 4: Implement the MCP command subtree** + +Create `packages/cli/src/commands/mcp-commands.ts` with: + +```typescript +import { spawn } from 'node:child_process'; +import { readFile } from 'node:fs/promises'; +import { fileURLToPath } from 'node:url'; +import { Command } from '@commander-js/extra-typings'; +import { + buildMcpSecurityConfig, + runKtxMcpHttpServer, +} from '../mcp-http-server.js'; +import { + mcpDaemonLayout, + readKtxMcpDaemonStatus, + startKtxMcpDaemon, + stopKtxMcpDaemon, +} from '../managed-mcp-daemon.js'; +import { + collectOption, + parsePositiveIntegerOption, + resolveCommandProjectDir, + type KtxCliCommandContext, +} from '../cli-program.js'; + +function tokenFromOption(value: 
string | undefined): string | undefined { + return value ?? process.env.KTX_MCP_TOKEN; +} + +function binPath(): string { + return fileURLToPath(new URL('../bin.js', import.meta.url)); +} + +export function registerMcpCommands(program: Command, context: KtxCliCommandContext): void { + const mcp = program.command('mcp').description('Run the KTX MCP HTTP server'); + + mcp + .command('start') + .description('Start the KTX MCP HTTP server') + .option('--host <host>', 'Host to bind', '127.0.0.1') + .option('--port <port>', 'Port to bind', parsePositiveIntegerOption, 7878) + .option('--token <token>', 'Bearer token required for non-loopback binding') + .option('--foreground', 'Run in the foreground', false) + .option('--allowed-host <host>', 'Additional allowed Host header', collectOption, []) + .option('--allowed-origin <origin>', 'Allowed browser Origin header', collectOption, []) + .action(async (options, command) => { + const projectDir = resolveCommandProjectDir(command); + const token = tokenFromOption(options.token); + buildMcpSecurityConfig({ + host: options.host, + port: options.port, + token, + allowedHosts: options.allowedHost, + allowedOrigins: options.allowedOrigin, + }); + if (options.foreground) { + await (context.deps.mcp?.runServer ?? runKtxMcpHttpServer)({ + projectDir, + cliVersion: context.packageInfo.version, + host: options.host, + port: options.port, + token, + allowedHosts: options.allowedHost, + allowedOrigins: options.allowedOrigin, + io: context.io, + }); + context.io.stdout.write(`KTX MCP server listening at http://${options.host}:${options.port}/mcp\n`); + return; + } + const result = await (context.deps.mcp?.startDaemon ?? 
startKtxMcpDaemon)({ + projectDir, + cliVersion: context.packageInfo.version, + host: options.host, + port: options.port, + token, + allowedHosts: options.allowedHost, + allowedOrigins: options.allowedOrigin, + binPath: binPath(), + }); + context.io.stdout.write(`KTX MCP daemon started: ${result.url}\n`); + }); + + mcp.command('stop').description('Stop the KTX MCP daemon').action(async (_options, command) => { + const result = await (context.deps.mcp?.stopDaemon ?? stopKtxMcpDaemon)({ + projectDir: resolveCommandProjectDir(command), + }); + context.io.stdout.write(result.status === 'stopped' ? 'KTX MCP daemon stopped.\n' : 'KTX MCP daemon is not running.\n'); + }); + + mcp.command('status').description('Show KTX MCP daemon status').action(async (_options, command) => { + const status = await (context.deps.mcp?.readStatus ?? readKtxMcpDaemonStatus)({ + projectDir: resolveCommandProjectDir(command), + }); + context.io.stdout.write(`${status.detail}\n`); + if (status.kind === 'running') { + context.io.stdout.write(`URL: ${status.url}\n`); + context.io.stdout.write(`PID: ${status.state.pid}\n`); + context.io.stdout.write(`Token auth: ${status.state.tokenAuth ? 
'enabled' : 'disabled'}\n`); + context.io.stdout.write(`Project: ${status.state.projectDir}\n`); + } + }); + + mcp.command('logs').description('Print the KTX MCP daemon log').option('--follow', 'Follow log output', false).action(async (options, command) => { + const logPath = mcpDaemonLayout(resolveCommandProjectDir(command)).logPath; + if (options.follow) { + const child = spawn('tail', ['-f', logPath], { stdio: ['ignore', 'pipe', 'pipe'] }); + child.stdout?.on('data', (chunk: Buffer) => context.io.stdout.write(chunk.toString('utf8'))); + child.stderr?.on('data', (chunk: Buffer) => context.io.stderr.write(chunk.toString('utf8'))); + await new Promise((resolve) => child.on('close', resolve)); + return; + } + context.io.stdout.write(await readFile(logPath, 'utf8')); + }); + + mcp + .command('serve-internal', { hidden: true }) + .option('--host <host>', 'Host to bind', '127.0.0.1') + .requiredOption('--port <port>', 'Port to bind', parsePositiveIntegerOption) + .option('--allowed-host <host>', 'Additional allowed Host header', collectOption, []) + .option('--allowed-origin <origin>', 'Allowed browser Origin header', collectOption, []) + .action(async (options, command) => { + await (context.deps.mcp?.runServer ?? 
runKtxMcpHttpServer)({ + projectDir: resolveCommandProjectDir(command), + cliVersion: context.packageInfo.version, + host: options.host, + port: options.port, + token: process.env.KTX_MCP_TOKEN, + allowedHosts: options.allowedHost, + allowedOrigins: options.allowedOrigin, + io: context.io, + }); + }); +} +``` + +- [ ] **Step 5: Wire the command into the root CLI** + +In `packages/cli/src/cli-program.ts`: + +Add the import: + +```typescript +import { registerMcpCommands } from './commands/mcp-commands.js'; +``` + +Change: + +```typescript +const PROJECT_AWARE_ROOT_COMMANDS = new Set(['setup', 'connection', 'ingest', 'wiki', 'sl', 'status']); +``` + +to: + +```typescript +const PROJECT_AWARE_ROOT_COMMANDS = new Set(['setup', 'connection', 'ingest', 'wiki', 'sl', 'status', 'mcp']); +``` + +Add registration after `registerStatusCommands(program, context);`: + +```typescript + registerMcpCommands(program, context); +``` + +- [ ] **Step 6: Run command tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/commands/mcp-commands.test.ts src/cli-program.test.ts +``` + +Expected: PASS. + +- [ ] **Step 7: Commit** + +```bash +git add \ + packages/cli/src/commands/mcp-commands.ts \ + packages/cli/src/commands/mcp-commands.test.ts \ + packages/cli/src/cli-program.ts \ + packages/cli/src/cli-runtime.ts +git commit -m "feat(cli): add ktx mcp commands" +``` + +## Task 5: Final Verification And Handoff + +**Files:** +- Verify: `packages/cli/src/mcp-http-server.ts` +- Verify: `packages/cli/src/managed-mcp-daemon.ts` +- Verify: `packages/cli/src/commands/mcp-commands.ts` +- Verify: `packages/cli/package.json` + +- [ ] **Step 1: Run focused CLI tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run \ + src/mcp-http-server.test.ts \ + src/managed-mcp-daemon.test.ts \ + src/commands/mcp-commands.test.ts \ + src/cli-program.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 2: Run CLI type-check** + +Run: + +```bash +pnpm --filter @ktx/cli run type-check +``` + +Expected: PASS. + +- [ ] **Step 3: Run CLI package tests** + +Run: + +```bash +pnpm --filter @ktx/cli run test +``` + +Expected: PASS. + +- [ ] **Step 4: Run workspace type-check** + +Run: + +```bash +pnpm run type-check +``` + +Expected: PASS. + +- [ ] **Step 5: Confirm remaining v1 blockers** + +Run: + +```bash +test -e packages/cli/src/skills/research/SKILL.md; printf 'research-skill:%s\n' "$?" +rg -n "connectionName" packages/context/src/ingest/tools/warehouse-verification +rg -n "mcpServers|mcp_servers|opencode|KTX_MCP_TOKEN" packages/cli/src/setup-agents.ts packages/cli/src/setup-agents.test.ts +``` + +Expected after this plan is implemented: + +```text +research-skill:1 +``` + +Expected `rg "connectionName"`: matches remain under `packages/context/src/ingest/tools/warehouse-verification`, proving ingest contract convergence still needs a later v1 plan. + +Expected setup-agent `rg`: no complete MCP client config writer/snippet matrix yet, proving setup-agent/research-skill installation still needs a later v1 plan. + +- [ ] **Step 6: Commit final fixes if verification required any** + +If verification required changes, commit them: + +```bash +git add packages/cli/src packages/cli/package.json pnpm-lock.yaml +git commit -m "fix(cli): stabilize mcp daemon verification" +``` + +If no verification changes were needed, do not create an empty commit. + +## Self-Review + +- Spec coverage in this plan: covers `ktx mcp start|stop|status|logs`, foreground/background lifecycle, `.ktx/mcp.json`, `.ktx/logs/mcp.log`, HTTP-only `/mcp`, `/health`, stateful sessions, Host/Origin validation, non-loopback token requirement, and bearer checks on `/mcp`. 
+- Remaining v1-blocking spec coverage after this plan: setup-agent MCP client config installation, `ktx-research` skill installation, and ingest-side warehouse-verification `connectionName` to `connectionId` contract convergence. +- Placeholder scan: the plan contains no deferred work markers or vague implementation instructions. +- Type consistency: public names are consistent across tasks: `runKtxMcpHttpServer`, `buildMcpSecurityConfig`, `isMcpRequestAuthorized`, `mcpDaemonLayout`, `startKtxMcpDaemon`, `readKtxMcpDaemonStatus`, `stopKtxMcpDaemon`, and `registerMcpCommands`. diff --git a/docs/superpowers/plans/2026-05-14-research-agent-mcp-ingest-contract-convergence.md b/docs/superpowers/plans/2026-05-14-research-agent-mcp-ingest-contract-convergence.md new file mode 100644 index 00000000..b57d72b9 --- /dev/null +++ b/docs/superpowers/plans/2026-05-14-research-agent-mcp-ingest-contract-convergence.md @@ -0,0 +1,804 @@ +# Research Agent MCP Ingest Contract Convergence Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Finish the v1 research-agent MCP spec by converging the existing ingest warehouse-verification tools on `connectionId` terminology and a shared raw-schema catalog service. + +**Architecture:** Move the existing warehouse catalog reader out of the ingest-only tool folder into `packages/context/src/scan/warehouse-catalog.ts`, rename its public contract from `connectionName` to `connectionId`, and make the ingest adapters consume that shared service. Keep the ingest tools' ingest-specific output shape (`markdown` plus `structured`) and their existing `targets` / `rowLimit` controls; the v1 blocker is the divergent connection parameter and stale prompt guidance, not changing ingest output into the MCP pure-structured shape. 
+ +**Tech Stack:** TypeScript, Zod, Vitest, existing KTX local file-store scan artifacts, existing ingest BaseTool framework. + +--- + +## Audit Summary + +Implemented and no longer v1-blocking: + +- MCP `sql_execution`, `entity_details`, `dictionary_search`, and `discover_data` are registered in `packages/context/src/mcp/context-tools.ts` and wired through local project ports. +- `sql_execution` is parser-gated through the Python sqlglot validator before reaching local scan connectors. +- The HTTP-only `ktx mcp` daemon exists with Streamable HTTP `POST`, `GET`, and `DELETE` handling, session tracking, host/origin checks, token checks for `/mcp`, lifecycle state, and CLI commands. +- `ktx setup-agents` installs the `ktx-research` skill, writes Claude/Cursor JSON MCP config entries, and prints Codex/opencode snippets. + +Remaining v1 blocker: + +- The ingest warehouse-verification tools still expose and teach `connectionName` while the spec requires `connectionId` across `warehouse-verification/*.tool.ts`, `WarehouseCatalogService`, callers, tests, and prompt assets. + +Non-blocking follow-ups not covered here: + +- `ktx mcp status` does not print `startedAt` as a separate line, although the state file records it. +- `ktx setup-agents` writes safe `${KTX_MCP_TOKEN}` references for shared project configs, but it does not offer the spec's optional skip prompt when token auth is active. +- `discover_data` sample-value snippets use ASCII `" - samples: "` instead of the spec prose's middle-dot separator. + +## File Structure + +- Move: `packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.ts` to `packages/context/src/scan/warehouse-catalog.ts` + - Shared live-database scan catalog reader, display resolver, raw schema search, and table detail source of truth. +- Modify: `packages/context/src/scan/index.ts` + - Export the shared warehouse catalog service and public types. 
+- Modify: `packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.ts` + - Accept `connectionId`, call shared catalog service, and emit connectionId-shaped markdown and structured output. +- Modify: `packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.ts` + - Accept optional `connectionId`, search raw schema via shared catalog service, and teach follow-up calls with `connectionId`. +- Modify: `packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.ts` + - Accept `connectionId`, keep `rowLimit`, and pass `connectionId` to `SlConnectionCatalogPort.executeQuery`. +- Modify tests: + - `packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.test.ts` + - `packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.test.ts` + - `packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts` + - `packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.test.ts` + - Rename the service test file to `packages/context/src/scan/warehouse-catalog.test.ts`. +- Modify prompt assets: + - `packages/context/skills/_shared/identifier-verification.md` + - `packages/context/skills/dbt_ingest/SKILL.md` + - `packages/context/skills/historic_sql_patterns/SKILL.md` + - `packages/context/skills/historic_sql_table_digest/SKILL.md` + - `packages/context/skills/live_database_ingest/SKILL.md` + - `packages/context/skills/looker_ingest/SKILL.md` + - `packages/context/skills/lookml_ingest/SKILL.md` + - `packages/context/skills/metabase_ingest/SKILL.md` + - `packages/context/skills/metricflow_ingest/SKILL.md` + - `packages/context/skills/notion_synthesize/SKILL.md` + - `packages/context/skills/sl_capture/SKILL.md` + - `packages/context/skills/wiki_capture/SKILL.md` + - Preserve Looker/LookML prose where `connectionName` refers to a Looker runtime field, not a KTX tool parameter. 
+ +## Task 1: Add Failing Contract Tests + +**Files:** +- Modify: `packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.test.ts` +- Modify: `packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.test.ts` +- Modify: `packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts` +- Modify: `packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.test.ts` +- Modify: `packages/context/src/ingest/ingest-runtime-assets.test.ts` + +- [ ] **Step 1: Add entity_details input-contract coverage** + +Add this test inside the existing `describe('EntityDetailsTool', ...)` block: + +```typescript +it('uses connectionId as the public input field', async () => { + expect( + tool.parseInput({ + connectionId: 'warehouse', + targets: [{ display: 'public.orders' }], + }), + ).toEqual({ + connectionId: 'warehouse', + targets: [{ display: 'public.orders' }], + }); + + expect(() => + tool.parseInput({ + connectionName: 'warehouse', + targets: [{ display: 'public.orders' }], + }), + ).toThrow(); +}); +``` + +Update the existing `tool.call(...)` inputs in the same test file from `connectionName` to `connectionId`. 
For example: + +```typescript +const result = await tool.call({ connectionId: 'warehouse', targets: [{ display: 'public.orders' }] }, context); +``` + +- [ ] **Step 2: Add sql_execution input-contract coverage** + +Add this test inside `packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts`: + +```typescript +it('uses connectionId as the public input field', () => { + expect( + tool.parseInput({ + connectionId: 'warehouse', + sql: 'select 1', + rowLimit: 5, + }), + ).toEqual({ + connectionId: 'warehouse', + sql: 'select 1', + rowLimit: 5, + }); + + expect(() => + tool.parseInput({ + connectionName: 'warehouse', + sql: 'select 1', + rowLimit: 5, + }), + ).toThrow(); +}); +``` + +Update the existing `tool.call(...)` inputs in the same test file from `connectionName` to `connectionId`. + +- [ ] **Step 3: Add discover_data input and hint coverage** + +Update the existing discover tests so the first case calls: + +```typescript +const result = await tool.call({ query: 'orders', connectionId: 'warehouse', limit: 5 }, context); +``` + +Change the routing-hint assertions to: + +```typescript +expect(result.markdown).toContain('use `entity_details({connectionId, targets: [{display}]})`'); +``` + +In the multi-connection test, use a `connectionId` hit field and assert the follow-up call is connectionId-shaped: + +```typescript +catalog.searchByName.mockImplementation(async (connectionId: string, query: string) => [ + { + kind: 'table', + connectionId, + ref: { catalog: null, db: 'public', name: `${connectionId}_${query}` }, + display: `public.${connectionId}_${query}`, + matchedOn: 'name', + }, +]); + +const result = await tool.call({ query: 'orders', limit: 10 }, multiConnectionContext); + +expect(catalog.searchByName).toHaveBeenCalledWith('analytics', 'orders', 10); +expect(catalog.searchByName).toHaveBeenCalledWith('warehouse', 'orders', 10); +expect(result.markdown).toContain('connectionId=analytics'); 
+expect(result.markdown).toContain('connectionId=warehouse'); +expect(result.markdown).toContain( + 'entity_details({connectionId: "analytics", targets: [{display: "public.analytics_orders"}]})', +); +expect(result.structured.raw?.hits.map((hit) => hit.connectionId)).toEqual(['analytics', 'warehouse']); +``` + +Add a parse contract test: + +```typescript +it('uses connectionId as the optional connection filter', () => { + expect(tool.parseInput({ query: 'orders', connectionId: 'warehouse', limit: 5 })).toEqual({ + query: 'orders', + connectionId: 'warehouse', + limit: 5, + }); + + expect(() => tool.parseInput({ query: 'orders', connectionName: 'warehouse', limit: 5 })).toThrow(); +}); +``` + +- [ ] **Step 4: Add shared catalog output coverage** + +Rename `packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.test.ts` to `packages/context/src/scan/warehouse-catalog.test.ts`. + +Update the import to: + +```typescript +import { WarehouseCatalogService } from './warehouse-catalog.js'; +``` + +Update the main detail assertion to use `connectionId`: + +```typescript +const detail = await catalog.getTable({ connectionId: 'warehouse', catalog: null, db: 'public', name: 'orders' }); + +expect(detail).toMatchObject({ + connectionId: 'warehouse', + display: 'public.orders', +}); +expect(detail).not.toHaveProperty('connectionName'); +``` + +Add raw hit coverage: + +```typescript +const hits = await catalog.searchByName('warehouse', 'orders', 5); +expect(hits[0]).toMatchObject({ + kind: 'table', + connectionId: 'warehouse', + display: 'public.orders', +}); +expect(hits[0]).not.toHaveProperty('connectionName'); +``` + +- [ ] **Step 5: Update prompt-asset test expectations first** + +In `packages/context/src/ingest/ingest-runtime-assets.test.ts`, change the identifier verification expectations to: + +```typescript +expect(shared).toContain('sql_execution({connectionId, sql: "SELECT DISTINCT'); +expect(shared).toContain('sql_execution({connectionId, 
sql: "SELECT 1 FROM'); +expect(shared).not.toContain('entity_details({connectionName'); +expect(shared).not.toContain('sql_execution({connectionName'); +``` + +- [ ] **Step 6: Run focused tests and verify they fail** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/ingest/tools/warehouse-verification/entity-details.tool.test.ts \ + src/ingest/tools/warehouse-verification/discover-data.tool.test.ts \ + src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts \ + src/scan/warehouse-catalog.test.ts \ + src/ingest/ingest-runtime-assets.test.ts +``` + +Expected: FAIL because schemas still require `connectionName`, the catalog service still returns `connectionName`, and the prompt asset still contains old tool-call examples. + +## Task 2: Move And Rename The Shared Warehouse Catalog Service + +**Files:** +- Move: `packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.ts` to `packages/context/src/scan/warehouse-catalog.ts` +- Modify: `packages/context/src/scan/index.ts` +- Delete: `packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.ts` + +- [ ] **Step 1: Move the service into the scan package** + +Run: + +```bash +git mv packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.ts packages/context/src/scan/warehouse-catalog.ts +``` + +- [ ] **Step 2: Fix imports for the new location** + +In `packages/context/src/scan/warehouse-catalog.ts`, change the imports at the top to: + +```typescript +import { getDialectForDriver } from '../connections/index.js'; +import type { KtxFileStorePort } from '../core/index.js'; +import type { + KtxConnectionDriver, + KtxSchemaColumn, + KtxSchemaForeignKey, + KtxSchemaTable, + KtxTableRef, +} from './types.js'; +``` + +- [ ] **Step 3: Rename public catalog fields and method parameters** + +In `packages/context/src/scan/warehouse-catalog.ts`, rename the service's public contract to this shape: + +```typescript +export 
interface TableDetail { + connectionId: string; + catalog: string | null; + db: string | null; + name: string; + display: string; + kind: string; + comment: string | null; + description: string | null; + rowCount: number | null; + columns: WarehouseColumnDetail[]; + foreignKeys: KtxSchemaForeignKey[]; +} + +export type RawSchemaHit = + | { + kind: 'table'; + connectionId: string; + ref: KtxTableRef; + display: string; + matchedOn: 'name' | 'db' | 'comment' | 'description'; + } + | { + kind: 'column'; + connectionId: string; + ref: KtxTableRef & { column: string }; + display: string; + matchedOn: 'name' | 'comment' | 'description'; + }; + +interface ConnectionCatalog { + connectionId: string; + syncId: string; + driver: CatalogDriver; + tables: KtxSchemaTable[]; + profile: RelationshipProfileArtifact | null; +} +``` + +Update the method signatures to: + +```typescript +async hasScan(connectionId: string): Promise +async getLatestSyncId(connectionId: string): Promise +async listTables(connectionId: string): Promise +async getTable(ref: { connectionId: string } & KtxTableRef): Promise +async resolveDisplay(connectionId: string, display: string): Promise<{ resolved: KtxTableRef | null; candidates: KtxTableRef[]; dialect: string }> +async resolveDisplayTarget(connectionId: string, display: string): Promise +async searchByName(connectionId: string, query: string, limit: number): Promise +private loadCatalog(connectionId: string): Promise +private async readCatalog(connectionId: string): Promise +``` + +Within those methods, use `connectionId` for the cache key, raw artifact root, returned `TableDetail.connectionId`, and returned `RawSchemaHit.connectionId`. 
+ +- [ ] **Step 4: Export the shared service** + +Add these exports to `packages/context/src/scan/index.ts` near the existing entity-details exports: + +```typescript +export type { + DisplayTargetResolution, + RawSchemaHit, + TableDetail, + WarehouseCatalogServiceDeps, +} from './warehouse-catalog.js'; +export { WarehouseCatalogService } from './warehouse-catalog.js'; +``` + +- [ ] **Step 5: Run the catalog test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/scan/warehouse-catalog.test.ts +``` + +Expected: PASS. + +- [ ] **Step 6: Commit the shared catalog move** + +Run: + +```bash +git add packages/context/src/scan/warehouse-catalog.ts packages/context/src/scan/warehouse-catalog.test.ts packages/context/src/scan/index.ts packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.ts +git commit -m "refactor(context): share warehouse catalog service" +``` + +## Task 3: Rename Ingest Warehouse-Verification Tool Inputs + +**Files:** +- Modify: `packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.ts` +- Modify: `packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.ts` +- Modify: `packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.ts` +- Modify: `packages/context/src/ingest/tools/warehouse-verification/index.ts` + +- [ ] **Step 1: Update imports from the shared scan service** + +In `entity-details.tool.ts`, use: + +```typescript +import { WarehouseCatalogService, type TableDetail } from '../../../scan/warehouse-catalog.js'; +``` + +In `discover-data.tool.ts`, use: + +```typescript +import { WarehouseCatalogService, type RawSchemaHit } from '../../../scan/warehouse-catalog.js'; +``` + +In `index.ts`, use: + +```typescript +import { WarehouseCatalogService } from '../../../scan/warehouse-catalog.js'; +``` + +- [ ] **Step 2: Rename entity_details input and calls** + +In `entity-details.tool.ts`, update the schema: + +```typescript +const 
entityDetailsInputSchema = z.object({ + connectionId: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/), + targets: z.array(targetSchema).min(1).max(50), +}); +``` + +Update `resolveTarget`: + +```typescript +async function resolveTarget( + catalog: WarehouseCatalogService, + connectionId: string, + target: EntityDetailsTarget, +): Promise<{ resolved: (KtxTableRef & { column?: string }) | null; candidates: KtxTableRef[] }> { + if ('display' in target) { + return catalog.resolveDisplayTarget(connectionId, target.display); + } + + const candidateResolution = await catalog.resolveDisplayTarget(connectionId, targetLabel(target)); + return { + resolved: { + catalog: target.catalog, + db: target.db, + name: target.name, + column: target.column, + }, + candidates: candidateResolution.candidates, + }; +} +``` + +Update the start of `call`: + +```typescript +async call(input: EntityDetailsInput, context: ToolContext): Promise> { + const allowed = allowedConnectionNames(context); + if (allowed && !allowed.has(input.connectionId)) { + return { + markdown: `Connection "${input.connectionId}" is not available to this ingest stage.`, + structured: { resolved: [], missing: [], scanAvailable: false }, + }; + } + + const catalog = this.catalogFactory(context); + const scanAvailable = await catalog.hasScan(input.connectionId); + if (!scanAvailable) { + return { + markdown: `No live-database scan available for connection "${input.connectionId}"; run \`ktx scan\` first.`, + structured: { resolved: [], missing: [], scanAvailable: false }, + }; + } +``` + +Update the table lookup: + +```typescript +const resolution = await resolveTarget(catalog, input.connectionId, target); +const detail = await catalog.getTable({ connectionId: input.connectionId, ...resolution.resolved }); +``` + +- [ ] **Step 3: Rename sql_execution input and calls** + +In `sql-execution.tool.ts`, update the schema: + +```typescript +const sqlExecutionInputSchema = z.object({ + connectionId: 
z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/), + sql: z.string().min(1), + rowLimit: z.number().int().positive().max(1000).optional().default(100), +}); +``` + +Update the allowed-connection guard: + +```typescript +const allowed = context.session?.allowedConnectionNames; +if (allowed && !allowed.has(input.connectionId)) { + return { + markdown: `Connection "${input.connectionId}" is not available to this ingest stage.`, + structured: { + headers: [], + rows: [], + rowCount: 0, + truncated: false, + sql: input.sql, + wrappedSql: '', + error: 'connection_not_allowed', + }, + }; +} +``` + +Update execution: + +```typescript +const result = await this.connections.executeQuery(input.connectionId, wrappedSql); +``` + +- [ ] **Step 4: Rename discover_data input, raw hits, and routing hints** + +In `discover-data.tool.ts`, update the schema: + +```typescript +const discoverDataInputSchema = z.object({ + query: z.string().optional(), + connectionId: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/).optional(), + limit: z.number().int().positive().max(50).optional().default(10), + sourceName: z.string().optional(), +}); +``` + +Update the out-of-scope check: + +```typescript +if (input.connectionId && allowed && !allowed.has(input.connectionId)) { + return { + markdown: `Connection "${input.connectionId}" is not available to this ingest stage.`, + structured: { wiki: null, sl: null, raw: null }, + }; +} +``` + +Update the source inspect mode: + +```typescript +const sl = await this.deps.slDiscoverTool.call( + { sourceName: input.sourceName, connectionId: input.connectionId }, + context, +); +``` + +Update the SL discover call: + +```typescript +const slResult = await this.deps.slDiscoverTool.call( + { query: query || undefined, connectionId: input.connectionId }, + context, +); +``` + +Update the raw search loop and hints: + +```typescript +const connections = input.connectionId ? [input.connectionId] : [...(allowed ?? 
[])].sort(); +const rawHits: RawSchemaHit[] = []; +for (const connectionId of connections) { + rawHits.push(...(await catalog.searchByName(connectionId, query, limit))); +} +if (rawHits.length > 0) { + parts.push( + '## Raw Warehouse Schema', + '> use `entity_details({connectionId, targets: [{display}]})` for full DDL + sample values', + ); + parts.push( + rawHits + .slice(0, limit) + .map( + (hit) => + `- ${hit.kind}: ${hit.display} [connectionId=${hit.connectionId}] (matched on ${hit.matchedOn}) - ` + + `follow up with \`entity_details({connectionId: "${hit.connectionId}", targets: [{display: "${hit.display}"}]})\``, + ) + .join('\n'), + ); + raw = { hits: rawHits.slice(0, limit) }; +} +``` + +- [ ] **Step 5: Run focused tool tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/ingest/tools/warehouse-verification/entity-details.tool.test.ts \ + src/ingest/tools/warehouse-verification/discover-data.tool.test.ts \ + src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit the ingest tool contract rename** + +Run: + +```bash +git add packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.ts packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.ts packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.ts packages/context/src/ingest/tools/warehouse-verification/index.ts packages/context/src/ingest/tools/warehouse-verification/*.test.ts +git commit -m "refactor(context): use connectionId in warehouse verification tools" +``` + +## Task 4: Update Prompt Assets And Runtime Tests + +**Files:** +- Modify: `packages/context/skills/_shared/identifier-verification.md` +- Modify: `packages/context/skills/dbt_ingest/SKILL.md` +- Modify: `packages/context/skills/historic_sql_patterns/SKILL.md` +- Modify: `packages/context/skills/historic_sql_table_digest/SKILL.md` +- Modify: `packages/context/skills/live_database_ingest/SKILL.md` +- Modify: `packages/context/skills/looker_ingest/SKILL.md` +- Modify: `packages/context/skills/lookml_ingest/SKILL.md` +- Modify: `packages/context/skills/metabase_ingest/SKILL.md` +- Modify: `packages/context/skills/metricflow_ingest/SKILL.md` +- Modify: `packages/context/skills/notion_synthesize/SKILL.md` +- Modify: `packages/context/skills/sl_capture/SKILL.md` +- Modify: `packages/context/skills/wiki_capture/SKILL.md` +- Modify: `packages/context/src/ingest/ingest-runtime-assets.test.ts` + +- [ ] **Step 1: Update the shared identifier verification protocol** + +Replace the tool-call examples in `packages/context/skills/_shared/identifier-verification.md` with: + +```markdown +2. `entity_details({connectionId, targets: [{display: ""}]})` - + confirm the identifier resolves; inspect native types, FK/PK, and + sampleValues. +3. For literal values from the source, such as status codes or plan tiers, + check whether they appear in `entity_details` sampleValues for the relevant + column. 
If sampleValues is short or the sample may have missed real values, + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT <column> FROM <table> LIMIT 50"})`. +4. If the candidate identifier still does not resolve, do one of: + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM <table> LIMIT 0"})`. + If it errors, the identifier is fictional. +``` + +- [ ] **Step 2: Update copied skill assets** + +In the listed `packages/context/skills/*/SKILL.md` files, replace only KTX tool-call examples: + +```text +entity_details({connectionName, targets: +``` + +with: + +```text +entity_details({connectionId, targets: +``` + +Replace: + +```text +sql_execution({connectionName, sql: +``` + +with: + +```text +sql_execution({connectionId, sql: +``` + +Replace concrete KTX tool-call examples like: + +```text +sql_execution({connectionName: "warehouse", sql: +``` + +with: + +```text +sql_execution({connectionId: "warehouse", sql: +``` + +In `packages/context/skills/sl_capture/SKILL.md`, replace the JSON field inside the example object: + +```yaml +connectionName: "warehouse", +``` + +with: + +```yaml +connectionId: "warehouse", +``` + +Do not change `packages/context/skills/looker_ingest/SKILL.md` text that defines Looker runtime `connectionName`, and do not change LookML parser docs where `connectionName` names a LookML model property. 
+ +- [ ] **Step 3: Update runtime asset tests** + +In `packages/context/src/ingest/ingest-runtime-assets.test.ts`, ensure the identifier test asserts the new examples: + +```typescript +expect(shared).toContain('sql_execution({connectionId, sql: "SELECT DISTINCT'); +expect(shared).toContain('sql_execution({connectionId, sql: "SELECT 1 FROM'); +expect(shared).not.toContain('entity_details({connectionName'); +expect(shared).not.toContain('sql_execution({connectionName'); +``` + +- [ ] **Step 4: Run prompt asset checks** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/ingest/ingest-runtime-assets.test.ts +``` + +Expected: PASS. + +- [ ] **Step 5: Verify stale tool-call examples are gone** + +Run: + +```bash +rg -n "entity_details\\(\\{connectionName|sql_execution\\(\\{connectionName|connectionName=" packages/context/skills packages/context/src/ingest/ingest-runtime-assets.test.ts +``` + +Expected: no output. If this reports Looker/LookML prose that is not a KTX tool-call example, narrow the regex and keep the Looker/LookML prose unchanged. + +- [ ] **Step 6: Commit prompt asset updates** + +Run: + +```bash +git add packages/context/skills packages/context/src/ingest/ingest-runtime-assets.test.ts +git commit -m "docs(context): update ingest verification prompts for connectionId" +``` + +## Task 5: Final Verification + +**Files:** +- Verify all files changed in Tasks 1-4. + +- [ ] **Step 1: Run focused research-agent ingest tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run \ + src/scan/warehouse-catalog.test.ts \ + src/ingest/tools/warehouse-verification/entity-details.tool.test.ts \ + src/ingest/tools/warehouse-verification/discover-data.tool.test.ts \ + src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts \ + src/ingest/ingest-runtime-assets.test.ts +``` + +Expected: PASS. + +- [ ] **Step 2: Run context type-check** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS. 
+ +- [ ] **Step 3: Run dead-code checks after TypeScript changes** + +Run: + +```bash +pnpm run dead-code +``` + +Expected: PASS. If Knip reports unrelated pre-existing findings, record the exact unrelated findings in the implementation handoff and do not add broad ignores. + +- [ ] **Step 4: Verify the v1-blocking old contract is gone** + +Run: + +```bash +rg -n "connectionName" packages/context/src/ingest/tools/warehouse-verification packages/context/src/scan/warehouse-catalog.ts packages/context/src/scan/warehouse-catalog.test.ts +``` + +Expected: no output. + +Run: + +```bash +rg -n "entity_details\\(\\{connectionName|sql_execution\\(\\{connectionName|connectionName=" packages/context/skills packages/context/src/ingest/ingest-runtime-assets.test.ts +``` + +Expected: no output. + +- [ ] **Step 5: Inspect git status** + +Run: + +```bash +git status --short +``` + +Expected: only the intended scan catalog move, warehouse-verification tools/tests, prompt assets, and ingest runtime asset test changes are present. + +- [ ] **Step 6: Commit final fixes if verification required any** + +If Steps 1-5 required follow-up edits, commit those edits: + +```bash +git add packages/context/src packages/context/skills +git commit -m "test(context): verify warehouse verification connectionId contract" +``` + +If `git status --short` is empty after the earlier task commits, skip this commit. + +## Self-Review + +- Spec coverage: This plan covers the remaining v1 requirement that ingest-side warehouse verification uses `connectionId` and shares the raw-schema catalog service instead of preserving a divergent `connectionName` contract. +- Placeholder scan: The plan contains no deferred-work marker phrases. +- Type consistency: The plan uses `connectionId` consistently in public tool inputs, `TableDetail`, `RawSchemaHit`, `WarehouseCatalogService` method parameters, tests, and prompt assets. 
diff --git a/docs/superpowers/plans/2026-05-14-research-agent-mcp-setup-agents.md b/docs/superpowers/plans/2026-05-14-research-agent-mcp-setup-agents.md new file mode 100644 index 00000000..13f66764 --- /dev/null +++ b/docs/superpowers/plans/2026-05-14-research-agent-mcp-setup-agents.md @@ -0,0 +1,938 @@ +# Research Agent MCP Setup Agents Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make `ktx setup-agents` install the `ktx-research` skill and configure or print MCP client entries that point agents at the local `ktx mcp` HTTP endpoint. + +**Architecture:** Keep `packages/cli/src/setup-agents.ts` as the setup orchestration point. Add a small MCP-client config planner/writer in the same module, backed by `.ktx/mcp.json` when present, and install the research skill from a copied runtime asset so source checkouts and published CLI builds use the same `SKILL.md`. + +**Tech Stack:** TypeScript, Vitest, Node fs/path APIs, Commander setup options, KTX MCP daemon state, JSON config writers. + +--- + +## Current Audit + +Original spec: `docs/superpowers/specs/2026-05-14-research-agent-mcp-tools-design.md` + +Implemented v1 slices confirmed in current source: + +- MCP `sql_execution`, `entity_details`, `dictionary_search`, and `discover_data` are registered in `packages/context/src/mcp/context-tools.ts`. +- Local project MCP ports wire all four tools in `packages/context/src/mcp/local-project-ports.ts`. +- Parser-backed SQL validation exists in `python/ktx-daemon/src/ktx_daemon/sql_analysis.py` and is exposed through `POST /sql/validate-read-only`. +- `ktx mcp start|stop|status|logs` exists in `packages/cli/src/commands/mcp-commands.ts`, with HTTP hosting in `packages/cli/src/mcp-http-server.ts` and daemon state in `packages/cli/src/managed-mcp-daemon.ts`. 
+- Targeted verification passed: + - `pnpm --filter @ktx/context exec vitest run src/mcp/server.test.ts src/search/discover.test.ts src/scan/entity-details.test.ts src/sl/dictionary-search.test.ts` + - `pnpm --filter @ktx/cli exec vitest run src/mcp-http-server.test.ts src/managed-mcp-daemon.test.ts src/commands/mcp-commands.test.ts src/setup-agents.test.ts` + +V1-blocking gaps remaining against the original spec: + +- `ktx setup-agents` still installs only the existing `ktx` agent files; it does not install `ktx-research`. +- `ktx setup-agents` does not write Claude Code or Cursor MCP JSON config entries. +- `ktx setup-agents` does not print Codex or opencode copy-paste snippets. +- `ktx setup-agents --remove` cannot remove written MCP JSON keys because none are written or tracked. +- The ingest-side warehouse-verification tools still use `connectionName`, `targets`, and `rowLimit`, and `WarehouseCatalogService` still exposes connection-name terminology. That is a separate v1-blocking subsystem and is not mixed into this setup-agent plan. + +Non-blocking or explicitly out-of-scope gaps: + +- Python code execution over MCP. +- Stdio MCP transport. +- OS-level auto-start. +- Native TLS, audit logging, rate limiting, per-tool authorization, and multi-project daemon routing. +- Streaming SQL results. + +## File Structure + +Create: + +- `packages/cli/src/skills/research/SKILL.md` + - Canonical research skill body from the spec. + - Copied into `dist/skills/research/SKILL.md` during `@ktx/cli` build. +- `packages/cli/scripts/copy-runtime-assets.mjs` + - Copies `src/skills` into `dist/skills` after TypeScript compilation. + +Modify: + +- `packages/cli/package.json` + - Append the runtime asset copy step to the `build` script. +- `packages/cli/src/setup-agents.ts` + - Add `local` agent scope for Claude Code's per-project private config path. + - Add `research-skill` file entries in `plannedKtxAgentFiles()`. 
+ - Read the research skill asset when writing research-skill entries. + - Add MCP endpoint resolution from `.ktx/mcp.json`, falling back to `http://localhost:7878/mcp`. + - Add JSON writers for Claude Code and Cursor MCP entries. + - Add printed snippets for Codex and opencode. + - Track written JSON keys in the install manifest. + - Print the daemon-start hint when the daemon is not currently running. +- `packages/cli/src/setup-agents.test.ts` + - Cover research skill install paths, MCP JSON writers, snippets, manifest removal, token handling, and no literal-token rendering. +- `packages/cli/src/commands/setup-commands.ts` + - Add `--local` for Claude Code local-scope setup. + - Reject `--local` with non-Claude targets and reject `--local --global`. +- `packages/cli/src/setup.ts` + - No behavior change beyond accepting `KtxAgentScope` with the new `local` value. +- `packages/cli/src/cli-program.ts` + - Keep the default bare setup `agentScope: 'project'`; no code change needed unless TypeScript requires the widened scope type in nearby annotations. + +## Task 1: Add The Research Skill Runtime Asset + +**Files:** +- Create: `packages/cli/src/skills/research/SKILL.md` +- Create: `packages/cli/scripts/copy-runtime-assets.mjs` +- Modify: `packages/cli/package.json` +- Modify: `packages/cli/src/setup-agents.test.ts` +- Modify: `packages/cli/src/setup-agents.ts` + +- [ ] **Step 1: Write the failing research-skill install tests** + +In `packages/cli/src/setup-agents.test.ts`, update the first test to expect `ktx-research` entries. 
Replace the project-scoped assertions with: + +```typescript + it('plans project-scoped CLI and research files for every target', () => { + expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'claude-code', scope: 'project', mode: 'cli' })).toEqual([ + { kind: 'file', path: join(tempDir, '.claude/skills/ktx/SKILL.md'), role: 'skill' }, + { kind: 'file', path: join(tempDir, '.claude/skills/ktx-research/SKILL.md'), role: 'research-skill' }, + { kind: 'file', path: join(tempDir, '.claude/rules/ktx.md'), role: 'rule' }, + ]); + expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'codex', scope: 'project', mode: 'cli' })).toEqual([ + { kind: 'file', path: join(tempDir, '.agents/skills/ktx/SKILL.md'), role: 'skill' }, + { kind: 'file', path: join(tempDir, '.agents/skills/ktx-research/SKILL.md'), role: 'research-skill' }, + { kind: 'file', path: join(tempDir, '.codex/instructions/ktx.md'), role: 'rule' }, + ]); + expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'cursor', scope: 'project', mode: 'cli' })).toEqual([ + { kind: 'file', path: join(tempDir, '.cursor/rules/ktx.mdc') }, + { kind: 'file', path: join(tempDir, '.cursor/rules/ktx-research.mdc'), role: 'research-skill' }, + ]); + expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'opencode', scope: 'project', mode: 'cli' })).toEqual([ + { kind: 'file', path: join(tempDir, '.opencode/commands/ktx.md') }, + { kind: 'file', path: join(tempDir, '.opencode/commands/ktx-research.md'), role: 'research-skill' }, + ]); + expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'universal', scope: 'project', mode: 'cli' })).toEqual([ + { kind: 'file', path: join(tempDir, '.agents/skills/ktx/SKILL.md') }, + { kind: 'file', path: join(tempDir, '.agents/skills/ktx-research/SKILL.md'), role: 'research-skill' }, + ]); + }); +``` + +Add this test after `installs target files, writes a manifest, and marks agents complete`: + +```typescript + it('installs the research skill from the runtime asset', 
async () => { + const io = makeIo(); + + await expect( + runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'universal', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + io.io, + ), + ).resolves.toMatchObject({ status: 'ready' }); + + const researchSkill = await readFile(join(tempDir, '.agents/skills/ktx-research/SKILL.md'), 'utf-8'); + expect(researchSkill).toContain('name: ktx-research'); + expect(researchSkill).toContain('Always run `discover_data` before writing SQL.'); + expect(researchSkill).toContain('Treat a `dictionary_search` miss as non-authoritative.'); + }); +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-agents.test.ts +``` + +Expected: FAIL because `plannedKtxAgentFiles()` does not return `ktx-research` entries and the installed research skill file does not exist. + +- [ ] **Step 3: Add the research skill asset** + +Create `packages/cli/src/skills/research/SKILL.md`: + +```markdown +--- +name: ktx-research +description: Use when answering a question that needs data from a KTX-connected database - investigating, analyzing, "how many", "show me", "what's the breakdown of", finding records by value, exploring tables, comparing periods, or any data-investigation request. Triggers even when the user does not say "research"; if the answer requires querying a configured KTX connection, this skill applies. +--- + +# KTX Research Workflow + +You have access to KTX MCP tools for investigating data. Follow this workflow. + + +1. **Discover** - call `discover_data` first to see what exists across wiki, semantic-layer sources, and raw tables. Returns refs only. +2. **Inspect top hits in parallel** - for each promising ref: + - `kind: 'wiki'` -> `wiki_read` + - `kind: 'sl_source'`, `kind: 'sl_measure'`, or `kind: 'sl_dimension'` -> `sl_read_source` + - `kind: 'table'` or `kind: 'column'` -> `entity_details` +3. 
**Resolve literals** - if the user named a value such as "Acme Corp" or "status=shipped", call `dictionary_search` to find which column holds it. +4. **Query** - + - Prefer `sl_query` when the semantic layer covers the question. + - Use `sql_execution` only for questions the semantic layer does not cover. +5. **Capture learnings** - at the end of the turn, call `memory_capture` so future turns benefit. Skip when the answer carries no durable knowledge. + + + +- Always run `discover_data` before writing SQL. Do not guess table names. +- Prefer the semantic layer over raw SQL when both can answer the question; measures are the source of truth. +- Read entity details before writing SQL against an unfamiliar table. Do not assume column names. +- Treat `sql_execution` as read-only. Writes are rejected by the server. +- Validate value mentions with `dictionary_search` instead of guessing case or spelling. Treat a `dictionary_search` miss as non-authoritative. The index is built from profile-sampled values, so a missing value may simply have been outside the sample. Follow up with `sql_execution` against the most plausible columns before concluding the value is absent. + + + +**Input:** "How many orders did Acme Corp place last month?" + +**Workflow:** +1. `dictionary_search({ values: ["Acme Corp"] })` finds `customers.name`. +2. `discover_data({ query: "orders customer monthly" })` finds an orders semantic-layer source. +3. `sl_read_source({ connectionId: "warehouse", sourceName: "orders_facts" })` confirms the source grain, measures, and dimensions. +4. `sl_query({ connectionId: "warehouse", measures: ["order_count"], filters: ["customer_name = 'Acme Corp'"] })` answers through the semantic layer. +5. `memory_capture({ userMessage, assistantMessage })` captures the durable finding. + +--- + +**Input:** "What columns does the events table have?" + +**Workflow:** +1. `discover_data({ query: "events table" })` returns a `table` ref. +2. 
`entity_details({ connectionId: "warehouse", entities: [{ table: "analytics.events" }] })` returns columns, types, and foreign keys. +3. Answer directly. No query is needed. + +``` + +- [ ] **Step 4: Copy skill assets during CLI build** + +Create `packages/cli/scripts/copy-runtime-assets.mjs`: + +```javascript +import { cp, mkdir, rm } from 'node:fs/promises'; +import { dirname, join } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const packageRoot = fileURLToPath(new URL('..', import.meta.url)); +const skillsSource = join(packageRoot, 'src', 'skills'); +const skillsTarget = join(packageRoot, 'dist', 'skills'); + +await rm(skillsTarget, { recursive: true, force: true }); +await mkdir(dirname(skillsTarget), { recursive: true }); +await cp(skillsSource, skillsTarget, { recursive: true }); +``` + +Modify `packages/cli/package.json`: + +```json +"build": "node -e \"fs.rmSync('dist', { recursive: true, force: true })\" && tsc -p tsconfig.json && node scripts/copy-runtime-assets.mjs && node ../../scripts/prepare-cli-bin.mjs" +``` + +- [ ] **Step 5: Add research-skill install entries and content loading** + +In `packages/cli/src/setup-agents.ts`, update the manifest entry role type: + +```typescript +| { kind: 'file'; path: string; role?: 'skill' | 'rule' | 'research-skill' } +``` + +Add this helper near `ktxCliLauncher()`: + +```typescript +async function readResearchSkillContent(): Promise<string> { + const path = fileURLToPath(new URL('./skills/research/SKILL.md', import.meta.url)); + const content = await readFile(path, 'utf-8'); + return content.endsWith('\n') ? content : `${content}\n`; +} +``` + +Update `plannedKtxAgentFiles()` so every supported project target includes the `ktx-research` entry shown in Step 1. For global targets, return: + +```typescript +if (input.scope === 'global') { + if (input.target === 'claude-code') { + const home = process.env.HOME ?? 
''; + return [ + { kind: 'file', path: join(home, '.claude/skills/ktx/SKILL.md'), role: 'skill' as const }, + { kind: 'file', path: join(home, '.claude/skills/ktx-research/SKILL.md'), role: 'research-skill' as const }, + { kind: 'file', path: join(home, '.claude/rules/ktx.md'), role: 'rule' as const }, + ]; + } + if (input.target === 'codex') { + const codexHome = process.env.CODEX_HOME ?? join(process.env.HOME ?? '', '.codex'); + return [ + { kind: 'file', path: join(codexHome, 'skills/ktx/SKILL.md'), role: 'skill' as const }, + { kind: 'file', path: join(codexHome, 'skills/ktx-research/SKILL.md'), role: 'research-skill' as const }, + { kind: 'file', path: join(codexHome, 'instructions/ktx.md'), role: 'rule' as const }, + ]; + } + if (input.target === 'cursor' || input.target === 'opencode') { + return []; + } + throw new Error(`Global ${input.target} installation is not supported; omit --global.`); +} +``` + +In `installTarget()`, switch the file content selection to: + +```typescript +const content = + entry.role === 'rule' + ? ruleInstructionContent({ projectDir: input.projectDir }) + : entry.role === 'research-skill' + ? await readResearchSkillContent() + : cliInstructionContent({ projectDir: input.projectDir, launcher }); +``` + +- [ ] **Step 6: Run tests to verify the research skill passes** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-agents.test.ts +``` + +Expected: PASS for the research skill install tests. MCP config tests are added in the next task and will fail until implemented. 
+ +- [ ] **Step 7: Commit** + +```bash +git add packages/cli/src/skills/research/SKILL.md packages/cli/scripts/copy-runtime-assets.mjs packages/cli/package.json packages/cli/src/setup-agents.ts packages/cli/src/setup-agents.test.ts +git commit -m "feat(cli): install KTX research skill" +``` + +## Task 2: Add MCP Client Config Planning And Rendering + +**Files:** +- Modify: `packages/cli/src/setup-agents.test.ts` +- Modify: `packages/cli/src/setup-agents.ts` + +- [ ] **Step 1: Write failing MCP config planner tests** + +In `packages/cli/src/setup-agents.test.ts`, add these tests before `removes only manifest-listed files`: + +```typescript + it('writes Claude Code project MCP config and tracks the json key', async () => { + const io = makeIo(); + + await expect( + runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'claude-code', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + io.io, + ), + ).resolves.toMatchObject({ status: 'ready' }); + + const mcpJson = JSON.parse(await readFile(join(tempDir, '.mcp.json'), 'utf-8')) as { + mcpServers: { ktx: { type: string; url: string; headers?: Record } }; + }; + expect(mcpJson.mcpServers.ktx).toEqual({ type: 'http', url: 'http://localhost:7878/mcp' }); + expect(await readKtxAgentInstallManifest(tempDir)).toMatchObject({ + entries: expect.arrayContaining([{ kind: 'json-key', path: join(tempDir, '.mcp.json'), jsonPath: ['mcpServers', 'ktx'] }]), + }); + expect(io.stdout()).toContain('Run `ktx mcp start` to enable the configured KTX MCP server.'); + }); + + it('writes Cursor project MCP config', async () => { + const io = makeIo(); + + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'cursor', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + io.io, + ); + + const cursorJson = JSON.parse(await readFile(join(tempDir, '.cursor/mcp.json'), 'utf-8')) as { + mcpServers: { ktx: 
{ url: string; headers?: Record } }; + }; + expect(cursorJson.mcpServers.ktx).toEqual({ url: 'http://localhost:7878/mcp' }); + }); + + it('prints Codex and opencode snippets without mutating printed-only config files', async () => { + const codexIo = makeIo(); + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'codex', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + codexIo.io, + ); + expect(codexIo.stdout()).toContain('[mcp_servers.ktx]'); + expect(codexIo.stdout()).toContain('url = "http://localhost:7878/mcp"'); + + const opencodeIo = makeIo(); + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'opencode', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + opencodeIo.io, + ); + expect(opencodeIo.stdout()).toContain('"mcp"'); + expect(opencodeIo.stdout()).toContain('"type": "remote"'); + await expect(readFile(join(tempDir, 'opencode.json'), 'utf-8')).rejects.toThrow(); + }); + + it('uses MCP daemon state for port and token metadata without rendering literal tokens', async () => { + await mkdir(join(tempDir, '.ktx'), { recursive: true }); + await writeFile( + join(tempDir, '.ktx/mcp.json'), + `${JSON.stringify( + { + schemaVersion: 1, + pid: 999999, + host: '127.0.0.1', + port: 8787, + tokenAuth: true, + projectDir: tempDir, + startedAt: '2026-05-14T00:00:00.000Z', + logPath: join(tempDir, '.ktx/logs/mcp.log'), + }, + null, + 2, + )}\n`, + 'utf-8', + ); + const io = makeIo(); + const previousToken = process.env.KTX_MCP_TOKEN; + process.env.KTX_MCP_TOKEN = 'secret-token'; + + try { + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'claude-code', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + io.io, + ); + + const rendered = JSON.stringify(JSON.parse(await readFile(join(tempDir, '.mcp.json'), 'utf-8'))); + 
expect(rendered).toContain('http://127.0.0.1:8787/mcp'); + expect(rendered).toContain('Bearer ${KTX_MCP_TOKEN}'); + expect(rendered).not.toContain('secret-token'); + expect(io.stdout()).toContain('Run `ktx mcp start` to enable the configured KTX MCP server.'); + } finally { + if (previousToken === undefined) delete process.env.KTX_MCP_TOKEN; else process.env.KTX_MCP_TOKEN = previousToken; + } + }); +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-agents.test.ts +``` + +Expected: FAIL because no MCP config writer or snippet renderer exists. + +- [ ] **Step 3: Add JSON helpers and MCP endpoint resolution** + +In `packages/cli/src/setup-agents.ts`, add `existsSync` and `readKtxMcpDaemonStatus` imports: + +```typescript +import { existsSync } from 'node:fs'; +import { readKtxMcpDaemonStatus } from './managed-mcp-daemon.js'; +``` + +Add these types and helpers after `type InstallEntry`: + +```typescript +interface KtxMcpEndpointInfo { + url: string; + tokenAuth: boolean; + running: boolean; +} + +interface KtxMcpClientInstallResult { + entries: InstallEntry[]; + snippets: string[]; + notices: string[]; +} + +async function readJsonObject(path: string): Promise<Record<string, unknown>> { + if (!existsSync(path)) return {}; + const parsed = JSON.parse(await readFile(path, 'utf-8')) as unknown; + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error(`Expected JSON object in ${path}`); + } + return parsed as Record<string, unknown>; +} + +function objectAtPath(root: Record<string, unknown>, jsonPath: string[]): Record<string, unknown> { + let cursor = root; + for (const segment of jsonPath) { + const current = cursor[segment]; + if (!current || typeof current !== 'object' || Array.isArray(current)) { + cursor[segment] = {}; + } + cursor = cursor[segment] as Record<string, unknown>; + } + return cursor; +} + +async function writeJsonKey(path: string, jsonPath: string[], value: unknown): Promise<void> { + const root = await readJsonObject(path); + const parent = objectAtPath(root, jsonPath.slice(0, -1)); + parent[jsonPath.at(-1) as
string] = value; + await mkdir(dirname(path), { recursive: true }); + await writeFile(path, `${JSON.stringify(root, null, 2)}\n`, 'utf-8'); +} + +async function resolveMcpEndpoint(projectDir: string): Promise<KtxMcpEndpointInfo> { + const status = await readKtxMcpDaemonStatus({ projectDir }).catch(() => null); + if (status?.kind === 'running') { + return { + url: status.url, + tokenAuth: status.state.tokenAuth, + running: true, + }; + } + if (status?.kind === 'stale' && status.state) { + return { + url: `http://${status.state.host}:${status.state.port}/mcp`, + tokenAuth: status.state.tokenAuth || Boolean(process.env.KTX_MCP_TOKEN), + running: false, + }; + } + return { + url: 'http://localhost:7878/mcp', + tokenAuth: Boolean(process.env.KTX_MCP_TOKEN), + running: false, + }; +} +``` + +- [ ] **Step 4: Add MCP entry renderers** + +Add these helpers after `resolveMcpEndpoint()`: + +```typescript +function tokenHeaders(endpoint: KtxMcpEndpointInfo): Record<string, string> | undefined { + return endpoint.tokenAuth ? { Authorization: 'Bearer ${KTX_MCP_TOKEN}' } : undefined; +} + +function claudeMcpEntry(endpoint: KtxMcpEndpointInfo): Record<string, unknown> { + return { + type: 'http', + url: endpoint.url, + ...(tokenHeaders(endpoint) ? { headers: tokenHeaders(endpoint) } : {}), + }; +} + +function cursorMcpEntry(endpoint: KtxMcpEndpointInfo): Record<string, unknown> { + return { + url: endpoint.url, + ...(tokenHeaders(endpoint) ?
{ headers: tokenHeaders(endpoint) } : {}), + }; +} + +function codexSnippet(endpoint: KtxMcpEndpointInfo): string { + if (endpoint.tokenAuth) { + return [ + 'Codex MCP config does not currently document HTTP headers.', + 'Run KTX on loopback without token auth for Codex, or configure headers after Codex documents support.', + ].join('\n'); + } + return [`[mcp_servers.ktx]`, `url = "${endpoint.url}"`].join('\n'); +} + +function opencodeSnippet(endpoint: KtxMcpEndpointInfo): string { + return JSON.stringify( + { + mcp: { + ktx: { + type: 'remote', + url: endpoint.url, + enabled: true, + ...(tokenHeaders(endpoint) ? { headers: tokenHeaders(endpoint) } : {}), + }, + }, + }, + null, + 2, + ); +} + +function claudeConfigPath(projectDir: string, scope: KtxAgentScope): { path: string; jsonPath: string[] } { + const home = process.env.HOME ?? ''; + if (scope === 'global') { + return { path: join(home, '.claude.json'), jsonPath: ['mcpServers', 'ktx'] }; + } + if (scope === 'local') { + return { path: join(home, '.claude.json'), jsonPath: ['projects', resolve(projectDir), 'mcpServers', 'ktx'] }; + } + return { path: join(resolve(projectDir), '.mcp.json'), jsonPath: ['mcpServers', 'ktx'] }; +} + +function cursorConfigPath(projectDir: string, scope: KtxAgentScope): { path: string; jsonPath: string[] } { + const home = process.env.HOME ?? ''; + return { + path: scope === 'global' ? 
join(home, '.cursor/mcp.json') : join(resolve(projectDir), '.cursor/mcp.json'), + jsonPath: ['mcpServers', 'ktx'], + }; +} +``` + +- [ ] **Step 5: Add the MCP client install planner** + +Add this function after `cursorConfigPath()`: + +```typescript +async function installMcpClientConfig(input: { + projectDir: string; + target: KtxAgentTarget; + scope: KtxAgentScope; +}): Promise<KtxMcpClientInstallResult> { + const endpoint = await resolveMcpEndpoint(input.projectDir); + const entries: InstallEntry[] = []; + const snippets: string[] = []; + const notices: string[] = []; + + if (!endpoint.running) { + notices.push('Run `ktx mcp start` to enable the configured KTX MCP server.'); + } + + if (input.target === 'claude-code') { + const config = claudeConfigPath(input.projectDir, input.scope); + await writeJsonKey(config.path, config.jsonPath, claudeMcpEntry(endpoint)); + entries.push({ kind: 'json-key', path: config.path, jsonPath: config.jsonPath }); + } else if (input.target === 'cursor') { + const config = cursorConfigPath(input.projectDir, input.scope); + await writeJsonKey(config.path, config.jsonPath, cursorMcpEntry(endpoint)); + entries.push({ kind: 'json-key', path: config.path, jsonPath: config.jsonPath }); + } else if (input.target === 'codex') { + snippets.push(`Codex MCP snippet for ~/.codex/config.toml:\n${codexSnippet(endpoint)}`); + } else if (input.target === 'opencode') { + const path = + input.scope === 'global' ? '~/.config/opencode/opencode.json' : `${relative(input.projectDir, join(input.projectDir, 'opencode.json'))}`; + snippets.push(`opencode MCP snippet for ${path}:\n${opencodeSnippet(endpoint)}`); + } + + return { entries, snippets, notices }; +} +``` + +- [ ] **Step 6: Call the MCP planner during setup** + +Keep `installTarget()` responsible only for writing agent files and returning those file entries.
+ +In `runKtxSetupAgentsStep()`, replace the current install loop: + +```typescript + const entries: InstallEntry[] = []; + for (const install of installs) entries.push(...(await installTarget({ projectDir: args.projectDir, ...install }))); +``` + +with: + +```typescript + const entries: InstallEntry[] = []; + const snippets: string[] = []; + const notices = new Set<string>(); + for (const install of installs) { + entries.push(...(await installTarget({ projectDir: args.projectDir, ...install }))); + const mcpResult = await installMcpClientConfig({ projectDir: args.projectDir, target: install.target, scope: install.scope }); + entries.push(...mcpResult.entries); + for (const snippet of mcpResult.snippets) snippets.push(snippet); + for (const notice of mcpResult.notices) notices.add(notice); + } +``` + +Immediately after the code that writes the install summary, add: + +```typescript + for (const snippet of snippets) { + io.stdout.write(`\n${snippet}\n`); + } + for (const notice of notices) { + io.stdout.write(`\n${notice}\n`); + } +``` + +- [ ] **Step 7: Run tests to verify MCP config passes** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-agents.test.ts +``` + +Expected: PASS for research-skill and MCP config tests.
+ +- [ ] **Step 8: Commit** + +```bash +git add packages/cli/src/setup-agents.ts packages/cli/src/setup-agents.test.ts +git commit -m "feat(cli): configure MCP clients in setup agents" +``` + +## Task 3: Add Claude Local Scope + +**Files:** +- Modify: `packages/cli/src/commands/setup-commands.ts` +- Modify: `packages/cli/src/setup-agents.ts` +- Modify: `packages/cli/src/setup-agents.test.ts` +- Modify: `packages/cli/src/setup.test.ts` +- Modify: `packages/cli/src/index.test.ts` + +- [ ] **Step 1: Write failing local-scope tests** + +Add this test to `packages/cli/src/setup-agents.test.ts`: + +```typescript + it('writes Claude Code local MCP config under the project key in ~/.claude.json', async () => { + const home = await mkdtemp(join(tmpdir(), 'ktx-setup-agents-home-')); + const previousHome = process.env.HOME; + process.env.HOME = home; + try { + const io = makeIo(); + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'claude-code', + scope: 'local', + mode: 'cli', + skipAgents: false, + }, + io.io, + ); + + const config = JSON.parse(await readFile(join(home, '.claude.json'), 'utf-8')) as { + projects: Record<string, { mcpServers: { ktx: unknown } }>; + }; + expect(config.projects[tempDir].mcpServers.ktx).toEqual({ type: 'http', url: 'http://localhost:7878/mcp' }); + } finally { + if (previousHome === undefined) delete process.env.HOME; else process.env.HOME = previousHome; + await rm(home, { recursive: true, force: true }); + } + }); +``` + +Add these command-level tests after the existing `dispatches setup agent flags` test in `packages/cli/src/index.test.ts`: + +```typescript + it('rejects --local with non-Claude targets', async () => { + const setup = vi.fn(async () => 0); + const setupIo = makeIo(); + + await expect( + runKtxCli( + ['--project-dir', tempDir, 'setup', '--agents', '--target', 'cursor', '--local', '--no-input'], + setupIo.io, + { setup }, + ), + ).resolves.toBe(1); + + expect(setupIo.stderr()).toContain('--local is only supported with --target claude-code'); +
expect(setup).not.toHaveBeenCalled(); + }); + + it('rejects --local and --global together', async () => { + const setup = vi.fn(async () => 0); + const setupIo = makeIo(); + + await expect( + runKtxCli( + ['--project-dir', tempDir, 'setup', '--agents', '--target', 'claude-code', '--local', '--global', '--no-input'], + setupIo.io, + { setup }, + ), + ).resolves.toBe(1); + + expect(setupIo.stderr()).toContain('Choose only one agent scope: --local or --global.'); + expect(setup).not.toHaveBeenCalled(); + }); +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-agents.test.ts src/index.test.ts +``` + +Expected: FAIL because `KtxAgentScope` does not include `local` and the setup command has no `--local` option. + +- [ ] **Step 3: Add the local scope type and command option** + +In `packages/cli/src/setup-agents.ts`, change: + +```typescript +export type KtxAgentScope = 'project' | 'global'; +``` + +to: + +```typescript +export type KtxAgentScope = 'project' | 'global' | 'local'; +``` + +In `packages/cli/src/commands/setup-commands.ts`, add `local` to `isOnlyAgentOptions()`: + +```typescript +'local', +``` + +Add the command option after `--global`: + +```typescript +.option('--local', 'Install Claude Code MCP config into the private per-project ~/.claude.json scope', false) +``` + +In the setup action before `const mode = ...`, add: + +```typescript + if (options.local && options.global) { + context.io.stderr.write('Choose only one agent scope: --local or --global.\n'); + context.setExitCode(1); + return; + } + if (options.local && options.target && options.target !== 'claude-code') { + context.io.stderr.write('--local is only supported with --target claude-code.\n'); + context.setExitCode(1); + return; + } +``` + +Replace: + +```typescript +const resolvedAgentScope = options.global ? 'global' : 'project'; +``` + +with: + +```typescript +const resolvedAgentScope = options.local ?
'local' : options.global ? 'global' : 'project'; +``` + +- [ ] **Step 4: Run local-scope tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-agents.test.ts src/index.test.ts +``` + +Expected: PASS for the new local-scope coverage. + +- [ ] **Step 5: Commit** + +```bash +git add packages/cli/src/commands/setup-commands.ts packages/cli/src/setup-agents.ts packages/cli/src/setup-agents.test.ts packages/cli/src/setup.test.ts packages/cli/src/index.test.ts +git commit -m "feat(cli): support Claude local MCP setup scope" +``` + +## Task 4: Final Verification + +**Files:** +- Verify all files changed in Tasks 1-3. + +- [ ] **Step 1: Run focused CLI tests** + +Run: + +```bash +pnpm --filter @ktx/cli exec vitest run src/setup-agents.test.ts src/commands/mcp-commands.test.ts src/mcp-http-server.test.ts src/managed-mcp-daemon.test.ts +``` + +Expected: all selected test files pass. + +- [ ] **Step 2: Run CLI type-check** + +Run: + +```bash +pnpm --filter @ktx/cli run type-check +``` + +Expected: TypeScript completes with no errors. + +- [ ] **Step 3: Run CLI build** + +Run: + +```bash +pnpm --filter @ktx/cli run build +``` + +Expected: build succeeds and `packages/cli/dist/skills/research/SKILL.md` exists. + +- [ ] **Step 4: Run dead-code check for the changed TypeScript surface** + +Run: + +```bash +pnpm run dead-code +``` + +Expected: Biome and Knip complete with no new findings from the setup-agent changes. + +- [ ] **Step 5: Inspect git status** + +Run: + +```bash +git status --short +``` + +Expected: only intended setup-agent, skill asset, package script, and test files are modified. + +## Self-Review + +Spec coverage: + +- Covers `ktx-research` skill installation paths for Claude Code, Codex, Cursor, opencode, and universal project targets. +- Covers Claude Code and Cursor JSON MCP writers. +- Covers Codex and opencode printed snippets. +- Covers token handling with `${KTX_MCP_TOKEN}` and no literal token rendering. 
+- Covers `.ktx/mcp.json` port selection and daemon-start hint. +- Covers manifest tracking for written JSON keys and removal through existing `json-key` cleanup. + +Known v1 gap not covered by this plan: + +- Ingest warehouse-verification contract convergence from `connectionName` to `connectionId`, shared service extraction, and caller/test updates remains v1-blocking and needs its own focused plan after this setup-agent slice lands. diff --git a/docs/superpowers/plans/2026-05-14-research-agent-mcp-sql-execution-foundation.md b/docs/superpowers/plans/2026-05-14-research-agent-mcp-sql-execution-foundation.md new file mode 100644 index 00000000..94611b90 --- /dev/null +++ b/docs/superpowers/plans/2026-05-14-research-agent-mcp-sql-execution-foundation.md @@ -0,0 +1,999 @@ +# Research Agent MCP SQL Execution Foundation Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add the parser-backed safety prerequisite and MCP `sql_execution` surface needed before the research-agent MCP tools can safely execute warehouse SQL. + +**Architecture:** Keep connector `executeReadOnly()` as the execution path, but make the MCP adapter require a sqlglot-backed validator before calling any connector. Extend the existing Python SQL-analysis daemon with a read-only validation endpoint, expose it through the TypeScript SQL-analysis port, then register an MCP `sql_execution` tool only when the host provides that validator and a local scan connector factory. + +**Tech Stack:** TypeScript, Vitest, Zod, Python, pytest, FastAPI, sqlglot, KTX MCP context ports, KTX scan connectors. 
+ +--- + +## Audit Summary + +Original spec: `docs/superpowers/specs/2026-05-14-research-agent-mcp-tools-design.md` + +Implemented plans that overlap with the spec: + +- `docs/superpowers/plans/2026-05-11-managed-agent-mcp-semantic-runtime.md` is implemented for the existing in-process MCP semantic runtime. Current evidence: `packages/context/src/mcp/context-tools.ts` registers `connection_*`, `wiki_*`, `sl_*`, `ingest_*`, and `scan_*` tools, and `packages/context/src/mcp/local-project-ports.ts` provides local ports for those surfaces. +- `docs/superpowers/plans/2026-05-12-warehouse-verification-tools.md` plus its May 12 and May 13 closure plans are implemented for ingest-only warehouse verification. Current evidence: `packages/context/src/ingest/tools/warehouse-verification/{discover-data,entity-details,sql-execution,warehouse-catalog.service}.ts` exist and are wired for ingest agents. + +V1-blocking gaps remaining against the original spec: + +- The public MCP research tools are not registered. `KtxMcpContextPorts` has no `discover`, `entityDetails`, `dictionarySearch`, or `sqlExecution` ports. +- The existing ingest `discover_data`, `entity_details`, and `sql_execution` tools use `connectionName`, `targets`, and `rowLimit`, and return markdown plus structured output. The spec requires MCP-shaped `connectionId`, `entities` / `maxRows`, and pure structured outputs. +- `sql_execution` cannot be safely exposed yet: `packages/context/src/connections/read-only-sql.ts` still uses first-token regex checks. The spec requires a sqlglot/AST-backed guard or connector-side read-only session before MCP registration. +- `packages/context/src/scan/entity-details.ts`, `packages/context/src/sl/dictionary-search.ts`, and `packages/context/src/search/discover.ts` do not exist. +- `WarehouseCatalogService` caches by connection only and does not invalidate when latest scan artifact identity advances. 
+- `dictionary_search` has no MCP service, no coverage metadata, and no per-connection miss reasons. +- `discover_data` has no unified ranked MCP result shape with `summary`, `snippet`, `matchedOn`, `kind`, `tableRef`, and RRF fusion across wiki, SL, and raw schema. +- `ktx mcp start|stop|status|logs` does not exist, and no HTTP Streamable MCP daemon exists. +- `ktx setup-agents` installs only the existing `ktx` CLI skill/rules; it does not install `ktx-research` or MCP client config entries/snippets. + +Non-blocking or explicitly out-of-scope gaps: + +- Python code execution over MCP. +- Stdio MCP transport. +- OS-level auto-start. +- Native TLS, audit logging, rate limiting, per-tool authorization, and multi-project daemon routing. +- Streaming SQL results. +- Full DDL-style ingest `entity_details` markdown formatting and hard write-time validation in ingest writer tools. + +This plan covers the first prerequisite blocker: parser-backed SQL validation and MCP `sql_execution`. The remaining v1-blocking tool, daemon, and setup-agent work stays visible for subsequent plans. + +## File Structure + +Create no new files. + +Modify these files: + +- `python/ktx-daemon/src/ktx_daemon/sql_analysis.py`: add a sqlglot-backed read-only SQL validator. +- `python/ktx-daemon/src/ktx_daemon/app.py`: expose `POST /sql/validate-read-only`. +- `python/ktx-daemon/tests/test_sql_analysis.py`: cover accepted SELECT/WITH and rejected CTE-DML, multi-statement, command, pragma, and parse-error payloads. +- `python/ktx-daemon/tests/test_app.py`: cover the new HTTP endpoint. +- `packages/context/src/sql-analysis/ports.ts`: add `validateReadOnly()` to `SqlAnalysisPort`. +- `packages/context/src/sql-analysis/http-sql-analysis-port.ts`: call `/sql/validate-read-only` and map its response. +- `packages/context/src/sql-analysis/http-sql-analysis-port.test.ts`: cover request and response mapping. 
+- `packages/context/src/mcp/types.ts`: add `KtxSqlExecutionMcpPort` and `sqlExecution` to `KtxMcpContextPorts`. +- `packages/context/src/mcp/context-tools.ts`: add the MCP `sql_execution` schema and registration. +- `packages/context/src/mcp/server.test.ts`: assert MCP registration and structured output for `sql_execution`. +- `packages/context/src/mcp/local-project-ports.ts`: expose local project SQL execution only when both `SqlAnalysisPort.validateReadOnly()` and a local scan connector factory are available. +- `packages/context/src/mcp/local-project-ports.test.ts`: cover validator success and validator rejection. + +### Task 1: Add sqlglot Read-Only Validation + +**Files:** +- Modify: `python/ktx-daemon/tests/test_sql_analysis.py` +- Modify: `python/ktx-daemon/src/ktx_daemon/sql_analysis.py` +- Modify: `python/ktx-daemon/tests/test_app.py` +- Modify: `python/ktx-daemon/src/ktx_daemon/app.py` + +- [ ] **Step 1: Write failing sqlglot validator tests** + +In `python/ktx-daemon/tests/test_sql_analysis.py`, update the import block to include the new request model and function: + +```python +from ktx_daemon.sql_analysis import ( + AnalyzeSqlBatchItem, + AnalyzeSqlBatchRequest, + ValidateReadOnlySqlRequest, + _columns_from_nodes, + analyze_sql_batch_response, + validate_read_only_sql_response, +) +``` + +Add these tests after `test_columns_from_nodes_ignores_non_expression_clause_values`: + +```python +def test_validate_read_only_sql_accepts_select_and_with_queries() -> None: + select_response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest( + dialect="postgres", + sql="select id, status from public.orders where status = 'paid'", + ) + ) + with_response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest( + dialect="postgres", + sql=( + "with paid as (select * from public.orders where status = 'paid') " + "select count(*) from paid" + ), + ) + ) + + assert select_response.ok is True + assert select_response.error is None + assert 
with_response.ok is True + assert with_response.error is None + + +def test_validate_read_only_sql_rejects_cte_dml() -> None: + response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest( + dialect="postgres", + sql="with x as (insert into audit.events values (1) returning *) select * from x", + ) + ) + + assert response.ok is False + assert response.error == "SQL contains read/write operation: Insert" + + +def test_validate_read_only_sql_rejects_multi_statement_payloads() -> None: + response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest( + dialect="postgres", + sql="select * from public.orders; delete from public.orders", + ) + ) + + assert response.ok is False + assert response.error == "Only one SQL statement can be executed." + + +def test_validate_read_only_sql_rejects_commands_and_pragmas() -> None: + command_response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest(dialect="postgres", sql="call refresh_stats()") + ) + pragma_response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest(dialect="sqlite", sql="pragma table_info(users)") + ) + + assert command_response.ok is False + assert command_response.error == "SQL contains read/write operation: Command" + assert pragma_response.ok is False + assert pragma_response.error == "SQL contains read/write operation: Pragma" + + +def test_validate_read_only_sql_reports_parse_errors() -> None: + response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest(dialect="postgres", sql="select * from where") + ) + + assert response.ok is False + assert response.error is not None + assert "Invalid expression" in response.error +``` + +- [ ] **Step 2: Run failing Python validator tests** + +Run: + +```bash +source .venv/bin/activate && uv run pytest python/ktx-daemon/tests/test_sql_analysis.py -q +``` + +Expected: FAIL with an import error for `ValidateReadOnlySqlRequest` or `validate_read_only_sql_response`. 
+ +- [ ] **Step 3: Implement the sqlglot validator** + +In `python/ktx-daemon/src/ktx_daemon/sql_analysis.py`, add this model after `AnalyzeSqlBatchResponse`: + +```python +class ValidateReadOnlySqlRequest(BaseModel): + dialect: str + sql: str + + +class ValidateReadOnlySqlResponse(BaseModel): + ok: bool + error: str | None = None +``` + +Add this constant after the model definitions: + +```python +_READ_ONLY_ROOT_TYPES = (exp.Select, exp.Union) +_READ_WRITE_NODE_TYPES = ( + exp.Alter, + exp.Analyze, + exp.Cache, + exp.Command, + exp.Commit, + exp.Copy, + exp.Create, + exp.Delete, + exp.Describe, + exp.Drop, + exp.Execute, + exp.Grant, + exp.Insert, + exp.Merge, + exp.Pragma, + exp.Refresh, + exp.Revoke, + exp.Rollback, + exp.Set, + exp.Show, + exp.Transaction, + exp.TruncateTable, + exp.Uncache, + exp.Update, + exp.Use, +) +``` + +Add this function after `_analyze_payload`: + +```python +def validate_read_only_sql_response( + request: ValidateReadOnlySqlRequest, +) -> ValidateReadOnlySqlResponse: + try: + statements = sqlglot.parse(request.sql, read=request.dialect) + except sqlglot.errors.SqlglotError as exc: + return ValidateReadOnlySqlResponse(ok=False, error=str(exc)) + + if len(statements) != 1: + return ValidateReadOnlySqlResponse( + ok=False, + error="Only one SQL statement can be executed.", + ) + + tree = statements[0] + if tree is None: + return ValidateReadOnlySqlResponse(ok=False, error="SQL did not parse to a statement.") + if not isinstance(tree, _READ_ONLY_ROOT_TYPES): + return ValidateReadOnlySqlResponse( + ok=False, + error=f"SQL contains read/write operation: {type(tree).__name__}", + ) + + for node in tree.walk(): + if isinstance(node, _READ_WRITE_NODE_TYPES): + return ValidateReadOnlySqlResponse( + ok=False, + error=f"SQL contains read/write operation: {type(node).__name__}", + ) + + return ValidateReadOnlySqlResponse(ok=True, error=None) +``` + +- [ ] **Step 4: Run Python validator tests** + +Run: + +```bash +source .venv/bin/activate && uv 
run pytest python/ktx-daemon/tests/test_sql_analysis.py -q +``` + +Expected: PASS. + +- [ ] **Step 5: Write failing HTTP endpoint test** + +In `python/ktx-daemon/tests/test_app.py`, add this test after `test_sql_parse_table_identifier_endpoint`: + +```python +def test_sql_validate_read_only_endpoint() -> None: + client = TestClient(create_app()) + + ok_response = client.post( + "/sql/validate-read-only", + json={"dialect": "postgres", "sql": "select * from public.orders"}, + ) + bad_response = client.post( + "/sql/validate-read-only", + json={ + "dialect": "postgres", + "sql": "with x as (insert into audit.events values (1) returning *) select * from x", + }, + ) + + assert ok_response.status_code == 200 + assert ok_response.json() == {"ok": True, "error": None} + assert bad_response.status_code == 200 + assert bad_response.json() == { + "ok": False, + "error": "SQL contains read/write operation: Insert", + } +``` + +- [ ] **Step 6: Run failing HTTP endpoint test** + +Run: + +```bash +source .venv/bin/activate && uv run pytest python/ktx-daemon/tests/test_app.py -q -k validate_read_only +``` + +Expected: FAIL with HTTP 404 for `/sql/validate-read-only`. 
+ +- [ ] **Step 7: Register the HTTP endpoint** + +In `python/ktx-daemon/src/ktx_daemon/app.py`, update the SQL-analysis import to include the new symbols: + +```python +from ktx_daemon.sql_analysis import ( + AnalyzeSqlBatchRequest, + AnalyzeSqlBatchResponse, + ValidateReadOnlySqlRequest, + ValidateReadOnlySqlResponse, + analyze_sql_batch_response, + validate_read_only_sql_response, +) +``` + +Add this endpoint immediately before the existing `@app.post("/sql/analyze-batch", ...)` route: + +```python + @app.post("/sql/validate-read-only", response_model=ValidateReadOnlySqlResponse) + async def sql_validate_read_only( + request: ValidateReadOnlySqlRequest, + ) -> ValidateReadOnlySqlResponse: + try: + return validate_read_only_sql_response(request) + except Exception as error: + logger.exception("SQL read-only validation failed: %s", error) + raise HTTPException( + status_code=500, + detail=f"SQL read-only validation failed: {error}", + ) from error +``` + +- [ ] **Step 8: Run Python HTTP endpoint test** + +Run: + +```bash +source .venv/bin/activate && uv run pytest python/ktx-daemon/tests/test_app.py -q -k validate_read_only +``` + +Expected: PASS. 
+ +- [ ] **Step 9: Commit Python validator** + +Run: + +```bash +git add python/ktx-daemon/src/ktx_daemon/sql_analysis.py python/ktx-daemon/src/ktx_daemon/app.py python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py +git commit -m "feat(daemon): validate read-only SQL with sqlglot" +``` + +### Task 2: Expose Read-Only Validation Through the TypeScript SQL-Analysis Port + +**Files:** +- Modify: `packages/context/src/sql-analysis/ports.ts` +- Modify: `packages/context/src/sql-analysis/http-sql-analysis-port.test.ts` +- Modify: `packages/context/src/sql-analysis/http-sql-analysis-port.ts` + +- [ ] **Step 1: Add the port contract** + +In `packages/context/src/sql-analysis/ports.ts`, add this interface after `SqlAnalysisBatchResult`: + +```typescript +export interface SqlReadOnlyValidationResult { + ok: boolean; + error?: string | null; +} +``` + +Update `SqlAnalysisPort` to include the new method: + +```typescript +export interface SqlAnalysisPort { + analyzeForFingerprint(sql: string, dialect: SqlAnalysisDialect): Promise; + analyzeBatch( + items: SqlAnalysisBatchItem[], + dialect: SqlAnalysisDialect, + ): Promise>; + validateReadOnly(sql: string, dialect: SqlAnalysisDialect): Promise; +} +``` + +- [ ] **Step 2: Write failing HTTP port tests** + +In `packages/context/src/sql-analysis/http-sql-analysis-port.test.ts`, add this test inside the existing `describe('createHttpSqlAnalysisPort', ...)` block: + +```typescript + it('maps read-only SQL validation responses', async () => { + const requests: Array<{ path: string; payload: Record }> = []; + const port = createHttpSqlAnalysisPort({ + baseUrl: 'http://127.0.0.1:8765', + requestJson: async (path, payload) => { + requests.push({ path, payload }); + return { ok: false, error: 'SQL contains read/write operation: Insert' }; + }, + }); + + await expect(port.validateReadOnly('with x as (insert into t values (1)) select * from x', 'postgres')).resolves.toEqual({ + ok: false, + error: 'SQL contains 
read/write operation: Insert', + }); + expect(requests).toEqual([ + { + path: '/sql/validate-read-only', + payload: { + dialect: 'postgres', + sql: 'with x as (insert into t values (1)) select * from x', + }, + }, + ]); + }); +``` + +Add this test after it: + +```typescript + it('rejects malformed read-only validation responses', async () => { + const port = createHttpSqlAnalysisPort({ + baseUrl: 'http://127.0.0.1:8765', + requestJson: async () => ({ ok: 'yes' }), + }); + + await expect(port.validateReadOnly('select 1', 'postgres')).rejects.toThrow( + 'sql analysis response is missing boolean field ok', + ); + }); +``` + +- [ ] **Step 3: Run failing HTTP port tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sql-analysis/http-sql-analysis-port.test.ts +``` + +Expected: FAIL because `validateReadOnly` is not implemented. + +- [ ] **Step 4: Implement HTTP response mapping** + +In `packages/context/src/sql-analysis/http-sql-analysis-port.ts`, update the type import to include `SqlReadOnlyValidationResult`: + +```typescript + SqlReadOnlyValidationResult, +``` + +Add this helper after `requiredStringArray`: + +```typescript +function requiredBoolean(raw: Record, field: string): boolean { + const value = raw[field]; + if (typeof value !== 'boolean') { + throw new Error(`sql analysis response is missing boolean field ${field}`); + } + return value; +} +``` + +Add this mapper after `mapBatchResponse`: + +```typescript +function mapReadOnlyValidation(raw: Record): SqlReadOnlyValidationResult { + const error = optionalString(raw, 'error'); + return { + ok: requiredBoolean(raw, 'ok'), + ...(error !== undefined ? 
{ error } : {}), + }; +} +``` + +Add this method to the object returned by `createHttpSqlAnalysisPort`: + +```typescript + async validateReadOnly(sql: string, dialect: SqlAnalysisDialect) { + const raw = await requestJson('/sql/validate-read-only', { + dialect, + sql, + }); + return mapReadOnlyValidation(raw); + }, +``` + +- [ ] **Step 5: Run HTTP port tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sql-analysis/http-sql-analysis-port.test.ts +``` + +Expected: PASS. + +- [ ] **Step 6: Commit TypeScript SQL-analysis port** + +Run: + +```bash +git add packages/context/src/sql-analysis/ports.ts packages/context/src/sql-analysis/http-sql-analysis-port.ts packages/context/src/sql-analysis/http-sql-analysis-port.test.ts +git commit -m "feat(context): expose read-only SQL validation port" +``` + +### Task 3: Register the MCP `sql_execution` Tool Contract + +**Files:** +- Modify: `packages/context/src/mcp/types.ts` +- Modify: `packages/context/src/mcp/context-tools.ts` +- Modify: `packages/context/src/mcp/server.test.ts` + +- [ ] **Step 1: Add the MCP SQL execution port types** + +In `packages/context/src/mcp/types.ts`, add these interfaces immediately before `KtxMcpContextPorts`: + +```typescript +export interface KtxSqlExecutionResponse { + headers: string[]; + headerTypes?: string[]; + rows: unknown[][]; + rowCount: number; +} + +export interface KtxSqlExecutionMcpPort { + execute(input: { connectionId: string; sql: string; maxRows: number }): Promise<KtxSqlExecutionResponse>; +} +``` + +Then add the new optional port to `KtxMcpContextPorts`: + +```typescript + sqlExecution?: KtxSqlExecutionMcpPort; +``` + +- [ ] **Step 2: Write failing MCP registration test** + +In `packages/context/src/mcp/server.test.ts`, update the type import from `./types.js` to include `KtxSqlExecutionMcpPort`.
+ +Add this test in `describe('createKtxMcpServer', ...)` after the existing connection-list registration test: + +```typescript + it('registers parser-gated sql_execution when the host provides a SQL execution port', async () => { + const fake = makeFakeServer(); + const sqlExecution: KtxSqlExecutionMcpPort = { + execute: vi.fn().mockResolvedValue({ + headers: ['status', 'count'], + headerTypes: ['text', 'bigint'], + rows: [['paid', 42]], + rowCount: 1, + }), + }; + + createKtxMcpServer({ + server: fake.server, + userContext: { userId: 'local-user' }, + contextTools: { + sqlExecution, + }, + }); + + expect(fake.tools.map((tool) => tool.name)).toEqual(['sql_execution']); + await expect( + getTool(fake.tools, 'sql_execution').handler({ + connectionId: 'warehouse', + sql: 'select status, count(*) from public.orders group by status', + maxRows: 50, + }), + ).resolves.toEqual({ + content: [ + { + type: 'text', + text: JSON.stringify( + { + headers: ['status', 'count'], + headerTypes: ['text', 'bigint'], + rows: [['paid', 42]], + rowCount: 1, + }, + null, + 2, + ), + }, + ], + structuredContent: { + headers: ['status', 'count'], + headerTypes: ['text', 'bigint'], + rows: [['paid', 42]], + rowCount: 1, + }, + }); + expect(sqlExecution.execute).toHaveBeenCalledWith({ + connectionId: 'warehouse', + sql: 'select status, count(*) from public.orders group by status', + maxRows: 50, + }); + }); +``` + +- [ ] **Step 3: Run failing MCP registration test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/server.test.ts -t sql_execution +``` + +Expected: FAIL because `sql_execution` is not registered. 
+ +- [ ] **Step 4: Add the MCP schema and registration** + +In `packages/context/src/mcp/context-tools.ts`, add this schema after `scanArtifactReadSchema`: + +```typescript +const sqlExecutionSchema = z.object({ + connectionId: connectionIdSchema, + sql: z.string().min(1), + maxRows: z.number().int().min(1).max(10_000).default(1000).optional(), +}); +``` + +Add this registration block in `registerKtxContextTools`, after the semantic-layer block and before the ingest block: + +```typescript + if (ports.sqlExecution) { + const sqlExecution = ports.sqlExecution; + registerParsedTool( + server, + 'sql_execution', + { + title: 'SQL Execution', + description: + 'Execute one parser-validated read-only SQL query against a configured KTX connection and return structured rows.', + inputSchema: sqlExecutionSchema.shape, + }, + sqlExecutionSchema, + async (input) => { + try { + return jsonToolResult( + await sqlExecution.execute({ + connectionId: input.connectionId, + sql: input.sql, + maxRows: input.maxRows ?? 1000, + }), + ); + } catch (error) { + return jsonErrorToolResult(error instanceof Error ? error.message : String(error)); + } + }, + ); + } +``` + +- [ ] **Step 5: Run MCP registration test** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/server.test.ts -t sql_execution +``` + +Expected: PASS. 
+ +- [ ] **Step 6: Commit MCP tool contract** + +Run: + +```bash +git add packages/context/src/mcp/types.ts packages/context/src/mcp/context-tools.ts packages/context/src/mcp/server.test.ts +git commit -m "feat(context): register MCP sql execution tool" +``` + +### Task 4: Implement Local Project SQL Execution With Parser Validation + +**Files:** +- Modify: `packages/context/src/mcp/local-project-ports.ts` +- Modify: `packages/context/src/mcp/local-project-ports.test.ts` + +- [ ] **Step 1: Write failing local-port success test** + +In `packages/context/src/mcp/local-project-ports.test.ts`, update the imports from `../scan/index.js` to include `type KtxQueryResult`. + +Replace the existing `testConnector` helper with this version so tests can opt into read-only SQL: + +```typescript + function testConnector( + snapshot = testSnapshot(), + queryResult?: KtxQueryResult, + ): KtxScanConnector { + return { + id: `test:${snapshot.connectionId}`, + driver: snapshot.driver, + capabilities: createKtxConnectorCapabilities({ readOnlySql: queryResult !== undefined }), + introspect: vi.fn(async () => snapshot), + executeReadOnly: queryResult === undefined ? 
undefined : vi.fn(async () => queryResult), + cleanup: vi.fn(async () => {}), + }; + } +``` + +Add this test after `tests a local project connection through the native scan connector factory`: + +```typescript + it('executes MCP SQL only after parser-backed validation passes', async () => { + const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + const connector = testConnector(testSnapshot(), { + headers: ['id'], + headerTypes: ['integer'], + rows: [[1]], + totalRows: 1, + rowCount: 1, + }); + const createConnector = vi.fn(async () => connector); + const sqlAnalysis = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(), + validateReadOnly: vi.fn(async () => ({ ok: true, error: null })), + }; + const ports = createLocalProjectMcpContextPorts(project, { + sqlAnalysis, + localScan: { + createConnector, + }, + }); + + await expect( + ports.sqlExecution?.execute({ + connectionId: 'warehouse', + sql: 'select id from public.orders', + maxRows: 5, + }), + ).resolves.toEqual({ + headers: ['id'], + headerTypes: ['integer'], + rows: [[1]], + rowCount: 1, + }); + expect(sqlAnalysis.validateReadOnly).toHaveBeenCalledWith('select id from public.orders', 'postgres'); + expect(createConnector).toHaveBeenCalledWith('warehouse'); + expect(connector.executeReadOnly).toHaveBeenCalledWith( + { + connectionId: 'warehouse', + sql: 'select id from public.orders', + maxRows: 5, + }, + { runId: 'mcp-sql-execution' }, + ); + expect(connector.cleanup).toHaveBeenCalled(); + }); +``` + +- [ ] **Step 2: Write failing local-port rejection test** + +Add this test after the success test: + +```typescript + it('rejects MCP SQL before connector execution when parser validation fails', async () => { + const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 
'env:DATABASE_URL', + }; + const connector = testConnector(testSnapshot(), { + headers: ['id'], + rows: [[1]], + totalRows: 1, + rowCount: 1, + }); + const sqlAnalysis = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(), + validateReadOnly: vi.fn(async () => ({ + ok: false, + error: 'SQL contains read/write operation: Insert', + })), + }; + const ports = createLocalProjectMcpContextPorts(project, { + sqlAnalysis, + localScan: { + createConnector: vi.fn(async () => connector), + }, + }); + + await expect( + ports.sqlExecution?.execute({ + connectionId: 'warehouse', + sql: 'with x as (insert into t values (1) returning *) select * from x', + maxRows: 1000, + }), + ).rejects.toThrow('SQL contains read/write operation: Insert'); + expect(connector.executeReadOnly).not.toHaveBeenCalled(); + }); +``` + +- [ ] **Step 3: Run failing local-port tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts -t "MCP SQL" +``` + +Expected: FAIL because `CreateLocalProjectMcpContextPortsOptions` has no `sqlAnalysis` option and no `sqlExecution` port. 
+ +- [ ] **Step 4: Add SQL-analysis option and helper imports** + +In `packages/context/src/mcp/local-project-ports.ts`, add this import with the other context imports: + +```typescript +import type { SqlAnalysisDialect, SqlAnalysisPort } from '../sql-analysis/index.js'; +``` + +Add `sqlAnalysis` to `CreateLocalProjectMcpContextPortsOptions`: + +```typescript + sqlAnalysis?: SqlAnalysisPort; +``` + +Add this helper near `dialectForDriver`: + +```typescript +function sqlAnalysisDialectForDriver(driver: string | undefined): SqlAnalysisDialect { + return dialectForDriver(driver) as SqlAnalysisDialect; +} +``` + +- [ ] **Step 5: Implement the local SQL execution port** + +In `packages/context/src/mcp/local-project-ports.ts`, add this function before `createLocalProjectMcpContextPorts`: + +```typescript +async function executeValidatedReadOnlySql( + project: KtxLocalProject, + options: CreateLocalProjectMcpContextPortsOptions, + input: { connectionId: string; sql: string; maxRows: number }, +): Promise<{ headers: string[]; headerTypes?: string[]; rows: unknown[][]; rowCount: number }> { + const connectionId = assertSafeConnectionId(input.connectionId); + const connection = project.config.connections[connectionId]; + if (!connection) { + throw new Error(`Connection "${connectionId}" is not configured in ktx.yaml`); + } + if (!options.sqlAnalysis) { + throw new Error('sql_execution requires parser-backed SQL validation.'); + } + const validation = await options.sqlAnalysis.validateReadOnly( + input.sql, + sqlAnalysisDialectForDriver(connection.driver), + ); + if (!validation.ok) { + throw new Error(validation.error ?? 
'SQL is not read-only.'); + } + const createConnector = options.localScan?.createConnector; + if (!createConnector) { + throw new Error('sql_execution requires a local scan connector factory.'); + } + + let connector: KtxScanConnector | null = null; + try { + connector = await createConnector(connectionId); + if (!connector.capabilities.readOnlySql || !connector.executeReadOnly) { + throw new Error(`Connection "${connectionId}" does not support read-only SQL execution.`); + } + const result = await connector.executeReadOnly( + { + connectionId, + sql: input.sql, + maxRows: input.maxRows, + }, + { runId: 'mcp-sql-execution' }, + ); + return { + headers: result.headers, + ...(result.headerTypes ? { headerTypes: result.headerTypes } : {}), + rows: result.rows, + rowCount: result.rowCount ?? result.rows.length, + }; + } finally { + await cleanupConnector(connector); + } +} +``` + +In `createLocalProjectMcpContextPorts`, add this conditional block immediately after the initial `ports` object is created and before the existing `if (options.localIngest)` block: + +```typescript + if (options.sqlAnalysis && options.localScan?.createConnector) { + ports.sqlExecution = { + async execute(input) { + return executeValidatedReadOnlySql(project, options, input); + }, + }; + } +``` + +- [ ] **Step 6: Run local-port tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/mcp/local-project-ports.test.ts -t "MCP SQL" +``` + +Expected: PASS. 
+ +- [ ] **Step 7: Commit local MCP SQL execution** + +Run: + +```bash +git add packages/context/src/mcp/local-project-ports.ts packages/context/src/mcp/local-project-ports.test.ts +git commit -m "feat(context): execute MCP SQL through validated connector path" +``` + +### Task 5: Verification + +**Files:** +- Verify: all modified files from Tasks 1-4 + +- [ ] **Step 1: Run Python SQL-analysis and app tests** + +Run: + +```bash +source .venv/bin/activate && uv run pytest python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py -q +``` + +Expected: PASS. + +- [ ] **Step 2: Run focused TypeScript tests** + +Run: + +```bash +pnpm --filter @ktx/context exec vitest run src/sql-analysis/http-sql-analysis-port.test.ts src/mcp/server.test.ts src/mcp/local-project-ports.test.ts +``` + +Expected: PASS. + +- [ ] **Step 3: Run type-check** + +Run: + +```bash +pnpm --filter @ktx/context run type-check +``` + +Expected: PASS. + +- [ ] **Step 4: Run Python pre-commit on changed Python files** + +Run: + +```bash +source .venv/bin/activate && uv run pre-commit run --files python/ktx-daemon/src/ktx_daemon/sql_analysis.py python/ktx-daemon/src/ktx_daemon/app.py python/ktx-daemon/tests/test_sql_analysis.py python/ktx-daemon/tests/test_app.py +``` + +Expected: PASS. If the repository has no usable pre-commit configuration in the active environment, record the exact error and keep the pytest results above as the closest Python verification. + +- [ ] **Step 5: Confirm the remaining v1 blockers are unchanged** + +Run: + +```bash +test -e packages/context/src/scan/entity-details.ts; printf 'entity-details:%s\n' "$?" +test -e packages/context/src/sl/dictionary-search.ts; printf 'dictionary-search:%s\n' "$?" +test -e packages/context/src/search/discover.ts; printf 'discover:%s\n' "$?" +test -e packages/cli/src/commands/mcp-commands.ts; printf 'mcp-commands:%s\n' "$?" +test -e packages/cli/src/skills/research/SKILL.md; printf 'research-skill:%s\n' "$?" 
+``` + +Expected: + +```text +entity-details:1 +dictionary-search:1 +discover:1 +mcp-commands:1 +research-skill:1 +``` + +These `1` exit-code markers confirm this plan landed only the SQL execution foundation and did not silently claim the remaining research-tool, daemon, or setup-agent v1 work. + +- [ ] **Step 6: Commit verification notes if any test docs changed** + +Run: + +```bash +git status --short +``` + +Expected: no uncommitted source changes after the task commits. If verification required a small documentation note, commit only that note with: + +```bash +git add docs/superpowers/plans/2026-05-14-research-agent-mcp-sql-execution-foundation.md +git commit -m "docs: record research MCP SQL execution plan" +``` diff --git a/docs/superpowers/specs/2026-05-14-research-agent-mcp-tools-design.md b/docs/superpowers/specs/2026-05-14-research-agent-mcp-tools-design.md new file mode 100644 index 00000000..a8044076 --- /dev/null +++ b/docs/superpowers/specs/2026-05-14-research-agent-mcp-tools-design.md @@ -0,0 +1,931 @@ +# Research Agent MCP Tools Design + +**Date:** 2026-05-14 +**Author:** Andrey Avtomonov +**Status:** Design — pending implementation plan + +## Background + +KTX positions itself as a standalone context layer for database agents. +External agents — Claude Code, Cursor, Codex, opencode — should be able to +connect to a local KTX instance via MCP and perform research against +configured data connections. + +The existing MCP surface (`packages/context/src/mcp/context-tools.ts`) already +exposes strong **context** primitives: wiki search/read/write, semantic-layer +list/read/write/validate/query, ingest and scan run management, memory +capture. What it is missing is the **active investigation** primitives a +research agent needs: + +- The agent cannot run raw SQL against a connection. `sl_query` only covers + semantic-layer-defined queries. 
+- The agent cannot inspect raw table or column metadata for tables that are + not yet modeled in the semantic layer. +- The agent cannot find which column holds a literal value mentioned by the + user (e.g., "Acme Corp"). +- The agent must call multiple separate search tools (`wiki_search`, + `sl_list_sources`) and reconcile results manually instead of getting a + unified ranked discovery view. + +The Kaelio research agent (reference implementation at +`/Users/andrey/conductor/workspaces/kaelio-main2/douala/server/src/cores/research-execution.core.ts`) +addresses these gaps with tools named `sql_execution`, `entity_details`, +`dictionary_search`, and `discover_data`, used in a discovery → inspection → +query loop. The corresponding KTX infrastructure already exists in pieces: + +- `KtxScanConnector.executeReadOnly` on every connector + (`packages/connector-postgres/src/connector.ts:447` and siblings) — read-only + SQL execution with `assertReadOnlySql` and `limitSqlForExecution`. +- `KtxSchemaSnapshot` from scan reports — full table/column/FK metadata. +- `SlDictionaryEntry` extraction over relationship-profiling artifacts + (`packages/context/src/sl/sl-dictionary-profile.ts`). +- Hybrid search core with Reciprocal Rank Fusion + (`packages/context/src/search/{hybrid-search-core,rrf}.ts`). + +This design exposes those primitives as four new MCP tools, adds a research +skill to guide external agents, and introduces an HTTP-only `ktx mcp` daemon +to host the MCP server. + +## Goals + +- Expose four new MCP tools that turn KTX into a research-capable context + layer for any MCP-compatible client: `discover_data`, `entity_details`, + `dictionary_search`, `sql_execution`. +- Ship a `ktx-research` skill installable via `ktx setup-agents`, describing + the discover → inspect → query → capture workflow for external agents. 
+- Provide a `ktx mcp` CLI subtree that runs the MCP server over HTTP on + localhost, with the same lifecycle pattern as the existing managed Python + daemon (`packages/cli/src/managed-python-daemon.ts`). +- Make `ktx setup-agents` install MCP client configuration for the configured + targets pointing at the local HTTP endpoint. v1 splits this by client: for + claude-code and cursor (JSON config), `setup-agents` writes the entry + directly; for codex (TOML) and opencode (different JSON wrapper), + `setup-agents` prints a copy-pasteable snippet rather than writing the file. + See the client matrix below for full per-target behavior. +- Reuse existing infrastructure (connector `executeReadOnly`, schema + snapshots, dictionary profile, hybrid search + RRF) rather than building + parallel implementations. + +## Non-goals + +- This spec does not build an agent loop inside KTX. The system prompt, step + budget, tool dispatch, and methodology tracking remain in the external + client. KTX is a context provider, not an agent runner. +- This spec does not expose Python code execution. The `ktx-daemon` + `/code/execute` endpoint exists but is not surfaced via MCP. That is a + separate design with its own sandboxing and security considerations. +- This spec does not ship widget rendering, chart creation, or scheduled + report execution. Those are presentation concerns the external client owns. +- This spec does not implement stdio MCP transport. HTTP-only. +- This spec does not implement OS-level auto-start (launchd, systemd user + units). `ktx mcp start` must be run explicitly. +- This spec does not implement remote network exposure beyond loopback. Token + auth and non-`127.0.0.1` binding are supported but TLS, audit logging, and + multi-tenant isolation are out of scope for v1. + +## Tool inventory + +Four new MCP tools, registered in `packages/context/src/mcp/context-tools.ts` +alongside the existing tools. 
+ +### Relationship to existing warehouse-verification tools + +KTX already ships ingest-side implementations of `sql_execution`, +`entity_details`, and `discover_data` at +`packages/context/src/ingest/tools/warehouse-verification/{sql-execution,entity-details,discover-data}.tool.ts`, +backed by `warehouse-catalog.service.ts`. Their contracts differ from the +MCP shapes proposed below in three concrete ways: + +- They currently take `connectionName` (slug-shaped); this spec renames + them to `connectionId` in the same change (see below). +- They take `targets` (a discriminated `display` vs. `{catalog,db,name}` + union) and `rowLimit`, not `entities` / `maxRows`. +- They return `{ markdown, structured }` with scan availability, candidate + matches, and ingest-session-allowed-connection scoping, not the + MCP-shaped pure-structured outputs in this spec. + +To avoid two divergent contracts for the same primitives, the MCP tools +**must be implemented by extracting the shared logic out of +`warehouse-verification/*` and into reusable services** +(e.g., `WarehouseCatalogService` as the source of truth for table/column +resolution and discovery, plus a shared read-only SQL executor that wraps +`assertReadOnlySql`/`limitSqlForExecution`). The ingest tools and the new +MCP tools then become thin adapters around those services with their own +input/output shapes appropriate to each surface. + +KTX has no public users yet, so the same change that introduces the MCP +tools renames the ingest-side `connectionName` parameter to `connectionId` +across `warehouse-verification/*.tool.ts`, `warehouse-catalog.service.ts`, +and any callers. `connectionId` matches the rest of the in-process MCP +surface (`sl_query`, `sl_list_sources`, `scan_trigger`, etc.) and the new +MCP tool inputs. 
The ingest tools and the new MCP tools then share both +the service layer and the parameter name; only their input/output shapes +differ (markdown+structured for the ingest surface, pure structured for +the MCP surface). + +### discover_data + +Unified ranked search across wiki, semantic-layer sources/measures/dimensions, +and raw schema tables/columns. Returns refs only with a uniform shape; the +agent dereferences top hits using the existing `wiki_read`, `sl_read_source`, +or `entity_details` tools. + +**Input schema:** + +```typescript +{ + query: z.string().min(1), + connectionId: z.string().optional(), // omit → all connections + kinds: z.array(z.enum([ + 'wiki', 'sl_source', 'sl_measure', 'sl_dimension', 'table', 'column', + ])).optional(), // omit → all kinds + limit: z.number().int().min(1).max(50).default(15).optional(), +} +``` + +**Output:** array of refs, each: + +```typescript +{ + kind: 'wiki' | 'sl_source' | 'sl_measure' | 'sl_dimension' | 'table' | 'column', + id: string, // stable id: wiki key, source name, or driver-qualified table/column display string + score: number, // RRF fused score, 0-1 range + summary: string | null, // one-line description; null when no source field is populated + snippet: string | null, // short context snippet, ≤200 chars; null when nothing meaningful to show + matchedOn: // why this result matched (powers the snippet for non-description kinds) + | 'name' | 'display' | 'description' | 'comment' | 'expr' | 'sample_value' | 'body', + connectionId?: string, // present for non-wiki kinds + tableRef?: { // present for kind 'table' and 'column' + catalog: string | null, + db: string | null, + name: string, + }, + columnName?: string, // present for kind 'column' +} +``` + +The structured `tableRef` mirrors the live `KtxSchemaTable` identity +(`packages/context/src/scan/types.ts:74-83`) so callers can pass refs into +`entity_details` without losing `catalog`/`db` qualification on drivers +that need it (BigQuery 
`project.dataset.table`, Snowflake/SQL Server +`database.schema.table`). + +#### `summary` and `snippet` provenance per kind + +Both fields are derived from existing source data, never invented or +LLM-generated. The resolver is pure and deterministic per kind. When no +source field exists for a given kind, the field is `null`; agents must +not assume a missing snippet means "no context" — they should dereference +the ref via `wiki_read`, `sl_read_source`, or `entity_details` to get +authoritative content. + +| Kind | `summary` source | `snippet` source | +|---|---|---| +| `wiki` | `WikiFrontmatter.summary` (`packages/context/src/wiki/types.ts:15`) — populated at write time | Up to 200 chars from the wiki body around the match position; falls back to first 200 chars of body when `matchedOn === 'name'`/`'display'` | +| `sl_source` | `resolveDescription(source.descriptions, priority)` (`packages/context/src/sl/descriptions.ts:16-34`) over the `user|ai|dbt|db` priority chain (`packages/context/src/sl/types.ts:5`) | When `matchedOn === 'description'`/`'body'`: a window of the resolved description; otherwise the source's `name` + first 1–2 measure or dimension names as context | +| `sl_measure` | `measure.description` (`packages/context/src/sl/types.ts:37`) | `measure.expr` truncated to 200 chars — the calculation is the most informative one-line context for a measure | +| `sl_dimension` | `resolveDescription(column.descriptions, priority)` (same precedence as `sl_source`); when empty, fall back to `null` | `${column.name} (${column.type})` formatted exactly like the existing inline rendering in `sl-search.service.ts:29-41` | +| `table` | `firstDescription(table.descriptions)` then `table.comment` (precedence already used by `warehouse-catalog.service.ts:286-287`); `null` when both are empty | When `matchedOn === 'description'`/`'comment'`: a window of that string; when `matchedOn === 'name'`/`'display'`: a comma-joined list of up to 5 of the table's column names | +| 
`column` | `resolveDescription(column.descriptions)` then `column.comment` (`warehouse-catalog.service.ts:228-245`); `null` when both are empty | When `matchedOn === 'description'`/`'comment'`: that text; when `matchedOn === 'sample_value'`: `${column.nativeType} · samples: ` formatted from `column.sampleValues` (`warehouse-catalog.service.ts:18-23`); otherwise `${column.nativeType}` | + +The `matchedOn` field is the same concept as the existing +`RawSchemaHit.matchedOn` in `warehouse-catalog.service.ts:40-54`, +extended to the wiki and SL kinds. Snippets always come from a single +already-stored field; the resolver never concatenates across sources or +invents bridging text. Length cap is enforced at the producer side (≤200 +chars after a single-pass slice; no ellipsis appended — clients render +one if they want). + +**Implementation:** new module `packages/context/src/search/discover.ts`. +Composes three sub-searches in parallel: + +1. Wiki search via the existing wiki search backend. +2. SL search over sources/measures/dimensions using existing + `sl-sources-index` (or a new lightweight index if needed for measure + granularity). +3. Raw schema search over tables and columns from `KtxSchemaSnapshot`, + indexed at scan time and stored alongside other scan artifacts. + +Results from each sub-search are fused with `packages/context/src/search/rrf.ts` +using equal weights. The `kinds` filter constrains which sub-searches run. + +### entity_details + +Read structured metadata for one or more raw tables (and optionally specific +columns) from the latest scan snapshot. The raw-data equivalent of +`sl_read_source`. + +**Input schema:** + +```typescript +{ + connectionId: z.string().min(1), + entities: z.array(z.object({ + // table accepts either a driver-display string ("project.dataset.table", + // "schema.name", "db.schema.name") or a structured ref. The resolver + // returns a structured error when the input is ambiguous across multiple + // schemas/catalogs. 
+ table: z.union([ + z.string().min(1), + z.object({ + catalog: z.string().nullable(), + db: z.string().nullable(), + name: z.string().min(1), + }), + ]), + columns: z.array(z.string()).optional(), // omit → all columns + })).min(1).max(20), +} +``` + +**Output:** for each entity, a structured record: + +```typescript +{ + connectionId: string, + tableRef: { // structured identity, lossless on every driver + catalog: string | null, // BigQuery project, Snowflake/SQL Server database + db: string | null, // schema/dataset + name: string, + }, + display: string, // driver-formatted display string + // (e.g. "project.dataset.table", "schema.name") + kind: 'table' | 'view' | 'external' | 'event_stream', // matches KtxSchemaTableKind + comment: string | null, + estimatedRows: number | null, + columns: Array<{ + name: string, + nativeType: string, + normalizedType: string, + dimensionType: 'time' | 'string' | 'number' | 'boolean', + nullable: boolean, + primaryKey: boolean, + comment: string | null, + }>, + foreignKeys: Array<{ + fromColumn: string, + toCatalog: string | null, // qualified FK target, preserves cross-db FKs + toDb: string | null, + toTable: string, + toColumn: string, + constraintName: string | null, + }>, + snapshot: { // freshness metadata, present on every response + syncId: string, // latest scan/sync identifier + extractedAt: string, // ISO-8601 UTC of the snapshot + scanRunId: string | null, // scan run id if available + }, +} +``` + +Output fields mirror `KtxSchemaTable` / `KtxSchemaColumn` / +`KtxSchemaForeignKey` from `packages/context/src/scan/types.ts:51-82`. The +full `KtxSchemaTableKind` set is preserved so BigQuery `external` tables +and warehouses with event-stream sources are not silently coerced. FK +target qualification (`toCatalog`/`toDb`) carries through so agents can +write valid SQL for cross-schema or cross-database references without +re-resolving. 
+ +If `columns` is provided, only the requested columns appear in the `columns` +array (PKs and FKs still report on the full table). + +**Implementation:** new module `packages/context/src/scan/entity-details.ts`. +Reads `KtxSchemaSnapshot` from the same store the existing `scan_*` tools +read. No new infrastructure. If the requested table is not in the latest +snapshot, the tool returns a structured error with a suggestion to run +`ktx ingest <connectionId>`. + +**Cache freshness.** Today `WarehouseCatalogService` caches `ConnectionCatalog` +per connection name with no invalidation +(`packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.ts:248-249`, +`:404-411`). For an ingest tool that runs inside a single short-lived ingest +session, that is acceptable, but the MCP daemon is long-lived and serves +clients across multiple `scan_trigger` / `ktx ingest` runs. The MCP adapter +**must** key its cache on the latest scan artifact identity (the `syncId` +derived from the artifact path, or the artifact file mtime) and re-read when +that identity advances. The same rule applies to the shared services backing +`discover_data` and `dictionary_search`. The implementation plan must +either: + +1. Extend `WarehouseCatalogService` (and equivalent dictionary/discover + services) to invalidate cached entries when the underlying artifact + identity advances, or +2. Wrap those services in an MCP-adapter cache layer that performs the + identity check before returning cached values. + +### dictionary_search + +Find which connection, source, and column **profile-sampled** a given literal
+ +**Authoritativeness.** The dictionary index is built from *sampled* values +captured during relationship profiling — by default 5 values per column, +drawn from a sample of up to 10,000 rows +(`packages/context/src/scan/relationship-profiling.ts:409-410`, +`packages/context/src/sl/sl-dictionary-profile.ts:70`). A hit confirms a +column contains the value; a miss is **not** proof that the value is absent +from the column or warehouse — the value may simply have been outside the +profile sample. The tool must surface this distinction in its output and the +research skill must teach agents not to treat a miss as exhaustive. + +**Input schema:** + +```typescript +{ + values: z.array(z.string().min(1)).min(1).max(20), + connectionId: z.string().optional(), // omit → all connections +} +``` + +**Output:** for each input value, the list of matching entries plus +per-connection provenance. Coverage and miss reasons are connection-scoped +because `loadLatestSlDictionaryEntries` iterates each connection's profile +artifact independently +(`packages/context/src/sl/sl-dictionary-profile.ts:96-112`); a single +all-connections call can mix `no_profile_artifact` (one connection never +ran an enriched scan), `value_not_in_sample` (another connection ran but +the literal was outside the sample), and matches in the same response. + +```typescript +{ + // The set of connections actually searched on this call. When the input + // omits connectionId this is every configured connection; otherwise it + // contains the single requested connection. 
+ searched: Array<{ + connectionId: string, + coverage: { + sampledRows: number | null, // profileSampleRows used at profile time + valuesPerColumn: number | null, // sampleValuesPerColumn used at profile time + profiledColumns: number, // count of columns in the dictionary index for this connection + syncId: string | null, // identifier of the profile artifact (null when missing) + profiledAt: string | null, // ISO-8601 UTC of the profile artifact (null when missing) + }, + // Per-connection status, independent of any specific input value: + // ready — profile present with profiled columns + // no_profile_artifact — enriched scan never ran for this connection + // no_candidate_columns — profile present but no columns profile-eligible + status: 'ready' | 'no_profile_artifact' | 'no_candidate_columns', + }>, + results: Array<{ + value: string, // input value + matches: Array<{ + connectionId: string, + sourceName: string, + columnName: string, + matchedValue: string, // actual value found (may differ in case) + cardinality: number | null, // column cardinality if known + }>, + // Per-connection miss reasons for this value, present when that + // connection produced no match. Connections that matched do not appear + // in `misses`. For ready connections with no match, the reason is + // 'value_not_in_sample' (non-authoritative miss). For unready + // connections, the reason mirrors their `status` above. + misses: Array<{ + connectionId: string, + reason: + | 'no_profile_artifact' + | 'no_candidate_columns' + | 'value_not_in_sample', + }>, + }>, +} +``` + +**Matching semantics:** case-insensitive substring match against the +profile-sampled values. Misses are never authoritative — they only state +that the value was not in the captured sample for the listed connection. 
+`misses[].reason` distinguishes "no enriched scan has run on this +connection" (`no_profile_artifact`), "enriched scan ran but no columns +were profile-eligible" (`no_candidate_columns`), and "scan ran but value +was not in the sample" (`value_not_in_sample`). The research skill must +direct agents to follow up a `value_not_in_sample` miss with +`sql_execution` against the most plausible columns, not to conclude the +value is absent. + +**Cache freshness:** the dictionary index is keyed on the profile artifact +identity (the `syncId` derived from its path or the artifact mtime). When +that identity advances, the daemon re-reads the artifact on next call. See +the `entity_details` cache-freshness note above for the shared rule. + +**Implementation:** new module `packages/context/src/sl/dictionary-search.ts`. +Loads `SlDictionaryEntry` records via the existing extraction code path, +builds a per-connection in-memory index on first call, and caches it until +the profile artifact identity advances. The next ingest run invalidates the +cache: the daemon either watches `.ktx/db.sqlite` for changes or checks the +artifact mtime on each call and re-reads the artifact when it has advanced. + +### sql_execution + +Execute a read-only SQL query against a configured connection and return the +result. This is the fallback path for questions the semantic layer does not cover. + +**Input schema:** + +```typescript +{ + connectionId: z.string().min(1), + sql: z.string().min(1), + maxRows: z.number().int().min(1).max(10_000).default(1000).optional(), +} +``` + +**Output:** + +```typescript +{ + headers: string[], + headerTypes?: string[], // driver-mapped type names, one per header; optional + rows: Array<Array<unknown>>, + rowCount: number, +} +``` + +`headerTypes` is optional because not every connector exposes per-column +type metadata.
The current contract makes it optional +(`KtxQueryResult.headerTypes` in `packages/context/src/scan/types.ts:272-277`), +and the SQLite connector currently omits it +(`packages/connector-sqlite/src/connector.ts:237-240`, `:301-308`). When a +connector returns header types, the MCP adapter passes them through +verbatim. When a connector does not, the MCP adapter omits the field rather +than fabricating values. + +**Implementation:** delegates to `KtxScanConnector.executeReadOnly` on the +matching connector. The connector calls `assertReadOnlySql` and +`limitSqlForExecution` (`packages/context/src/connections/read-only-sql.ts`). + +**Read-only enforcement is lexical, not parser-backed.** The current guard +inspects the first token with regex: it accepts queries whose first non-space +token is `SELECT` or `WITH`, and rejects queries whose first non-space token +matches a fixed mutating-verb list. Implications: + +- A CTE that nests a data-modifying statement (e.g., `WITH x AS (INSERT ... + RETURNING *) SELECT ...`, valid in Postgres) passes the first-token check + and would reach the connector. +- Dialect-specific read/write constructs and procedure calls that do not + start with a listed verb are not caught. + +Because `sql_execution` exposes this boundary to external MCP clients, the +tool **must not** be enabled until one of the following holds: + +1. The guard is upgraded to a sqlglot/AST-based read-only check that + inspects every statement and CTE node, with explicit tests for CTE-DML, + `CALL`, `DO`, vendor pragmas, and multi-statement payloads; or +2. Connector-side execution forces a read-only transaction / session (e.g., + `SET TRANSACTION READ ONLY` for Postgres, `READ ONLY` connection for + MySQL, equivalent for each connector), so the guard is defense-in-depth + rather than the sole boundary. + +The implementation plan that follows this spec is required to choose and +land one of those before registering `sql_execution` in the MCP surface. 
+Errors from `assertReadOnlySql` (whichever implementation) are returned as +structured tool errors so the agent can correct the query and retry. + +## Tool naming convention + +Match the existing KTX MCP convention (no prefix): `discover_data`, +`entity_details`, `dictionary_search`, `sql_execution`. The existing tools +(`wiki_search`, `sl_list_sources`, `scan_trigger`, `memory_capture`) all use +unprefixed snake_case; the new tools follow suit. + +## Connection model + +- `sql_execution` and `entity_details` require `connectionId` — these tools + cannot operate without a target. +- `discover_data` and `dictionary_search` make `connectionId` optional. Omit + it to search across all configured connections; provide it to limit the search to one connection. This + matches the existing pattern for `sl_list_sources({ connectionId? })`. +- All tools are project-locked: the MCP daemon runs in one KTX project dir; + to operate on a different project, restart the daemon with a different + `--project-dir` or `cwd`. + +## MCP daemon: `ktx mcp` + +A new CLI subtree in `packages/cli/src/commands/mcp-commands.ts`, wired into +`cli-program.ts` alongside `setup`, `connection`, `ingest`, `wiki`, `sl`, +`status`, `dev`. + +### Commands + +```bash +ktx mcp start [--port <port>] [--host <host>] [--token <token>] [--foreground] \ + [--allowed-host <host> ...] [--allowed-origin <origin> ...] +ktx mcp stop +ktx mcp status +ktx mcp logs [--follow] +``` + +`--allowed-host` and `--allowed-origin` are repeatable. They extend (not +replace) the defaults defined in the security model below. + +### `ktx mcp start` + +Starts a long-lived HTTP MCP server bound to the configured host and port, +serving every tool registered by `createKtxMcpServer`. The server stays alive +until `ktx mcp stop` is invoked or the process is terminated. + +- Default `--host` is `127.0.0.1`. Any value other than `127.0.0.1` or + `localhost` **requires** `--token` (or `KTX_MCP_TOKEN` in the environment); + the command refuses otherwise. +- Default `--port` is 7878.
If the port is in use, the command exits with an + error explaining how to choose another. Allocated port is persisted to + `.ktx/mcp.json` for subsequent `status`, `stop`, `logs`, and + `setup-agents` calls. +- `--foreground` runs the server in the foreground and pipes all logs to + stdout, for debugging. Default is background. +- Background runs detach via the same pattern as the managed Python daemon + (`packages/cli/src/managed-python-daemon.ts`): spawn a detached child, + write `pid`, `port`, `startedAt` to `.ktx/mcp.json`, return immediately + with the URL the user should configure in their client. +- Logs go to `.ktx/logs/mcp.log` (matches existing log layout). + +### `ktx mcp stop` + +Reads `.ktx/mcp.json` for the daemon PID, sends SIGTERM, waits up to 10 +seconds for graceful exit, then SIGKILLs if still running. Removes the state +file on success. + +### `ktx mcp status` + +Reads `.ktx/mcp.json`, checks the process is alive, hits the server's +`/health` endpoint, and reports: + +- Running / stopped / stale (state file present but process not alive) +- Port, host, started-at, pid +- Whether token auth is enabled +- Configured project dir + +### `ktx mcp logs` + +Tails or follows `.ktx/logs/mcp.log`. Standard `--follow` flag. + +### Lifecycle + +Manual: the user runs `ktx mcp start` after each reboot or whenever they +want the server running. No auto-start on other `ktx` commands (matches the +explicit pattern established by the daemon model). + +### Transport + +HTTP-only via `StreamableHTTPServerTransport` from +`@modelcontextprotocol/sdk/server/streamableHttp.js`. + +The `/mcp` endpoint must implement the full Streamable HTTP contract, not +just `POST`: + +- `POST /mcp` — JSON-RPC requests (and the `initialize` handshake when no + session exists). On the first `initialize` post, the server allocates a + session id and returns it in the `Mcp-Session-Id` response header. 
+- `GET /mcp` — opens an SSE stream for server-initiated messages on an + existing session. Requires a valid `Mcp-Session-Id` header. +- `DELETE /mcp` — explicit session termination by the client. Requires a + valid `Mcp-Session-Id` header; the server must drop the session and any + associated SSE streams. + +**Session model.** v1 ships **stateful** sessions: the server generates a +session id with `randomUUID()` on `initialize`, stores the transport in an +in-memory map keyed by session id, reuses it on subsequent +`POST`/`GET`/`DELETE` calls that carry the same `Mcp-Session-Id`, and +removes it on `DELETE` or transport close. Requests that carry an unknown +session id are rejected with HTTP 404 so the client knows to re-initialize. + +Health: `GET /health` returns `{ status: 'ok', projectDir, port }` for +liveness checks. `/health` is separate from `/mcp` and is not subject to +session-id requirements (but is subject to host/origin validation; see +below). + +### Security model + +- `127.0.0.1` binding is the default and requires no token auth (loopback + only). Even on loopback, the server enforces **Host and Origin header + validation** on every `/mcp` and `/health` request to defend against + browser-driven DNS-rebinding attacks (the same defense the MCP SDK + exposes in `createMcpExpressApp` / `createMcpHonoApp`). +- **Host validation** compares the incoming `Host` header to the allowed-host + list after normalizing: lowercase, strip any port, strip surrounding + brackets from IPv6 literals (`[::1]:7878` → `::1`). Comparison is exact + on the normalized host string. The default allowed-host list is + `['localhost', '127.0.0.1', '::1']`. `--allowed-host` values are appended + after the same normalization. +- **Origin validation** compares the full browser `Origin` header (scheme + + host + port) to the allowed-origin list. 
The default allowed-origin list + is empty: any request that carries an `Origin` header is rejected unless + an explicit `--allowed-origin` entry matches. Non-browser clients that + do not send an `Origin` header (Claude Code, Cursor, Codex, opencode + HTTP transports) skip the Origin check and are accepted. Each + `--allowed-origin` value must be a full origin string + (e.g., `http://localhost:7878`); KTX validates the format at startup. +- Non-loopback binding requires `--token <token>` or `KTX_MCP_TOKEN`. The + server checks `Authorization: Bearer <token>` on **every** `/mcp` method — + `POST`, `GET` (SSE), and `DELETE` — and rejects with HTTP 401 otherwise. + Token enforcement is independent of the session check; both must pass. + When `--host` is non-loopback, the allowed-host list expands to include + the normalized bound host plus any user-supplied `--allowed-host` + values. +- TLS is out of scope. For remote access, document running KTX behind a + reverse proxy (Caddy, nginx) that terminates TLS. + +## Client config installation via `ktx setup-agents` + +`ktx setup-agents` extends its existing per-target file installation +(`plannedKtxAgentFiles` in `packages/cli/src/setup-agents.ts:64`) to also +write MCP server entries. + +The per-client config matrix is **not uniform**. Each client has its own +file location, scope semantics, and entry shape; `setup-agents` must +produce the correct shape per target rather than emit one JSON blob.
+ +| Target | Scope | MCP config path | Writer behavior | +|---|---|---|---| +| claude-code | user (global) | `~/.claude.json` → root `mcpServers.ktx` | write JSON | +| claude-code | local (per-project, private) | `~/.claude.json` → `projects[<project-dir>].mcpServers.ktx` | write JSON | +| claude-code | project (shared, checked in) | `<project-dir>/.mcp.json` → `mcpServers.ktx` | write JSON | +| cursor | global | `~/.cursor/mcp.json` → `mcpServers.ktx` | write JSON | +| cursor | project | `<project-dir>/.cursor/mcp.json` → `mcpServers.ktx` | write JSON | +| codex | user (global) | `~/.codex/config.toml` → `[mcp_servers.ktx]` (TOML) | print instructions; do not auto-write in v1 | +| opencode | user (global) | `~/.config/opencode/opencode.json` → `mcp.ktx` | print instructions; do not auto-write in v1 | +| opencode | project | `<project-dir>/opencode.json` → `mcp.ktx` | print instructions; do not auto-write in v1 | + +The global `~/.claude.json` scope and the per-project `~/.claude.json` → +`projects[...]` scope are both supported because Claude Code's "user" vs. +"local" scopes write to different sub-trees of the same file; `setup-agents` +must select the scope explicitly per invocation. + +Codex and opencode entries are **printed as copy-pasteable snippets** in v1 +because their config formats (TOML for codex, a different JSON wrapper for +opencode) diverge enough from the JSON writers above that mixing them into +the same writer codepath risks silently producing invalid files. This is a +deliberate v1 scoping decision, not a permanent limitation.
+ +#### Entry shapes by target + +Claude Code (HTTP): + +```jsonc +{ + "mcpServers": { + "ktx": { + "type": "http", + "url": "http://localhost:7878/mcp" + // when token auth is active, env-var expansion only: + // "headers": { "Authorization": "Bearer ${KTX_MCP_TOKEN}" } + } + } +} +``` + +Cursor (HTTP, project `.cursor/mcp.json` or global `~/.cursor/mcp.json`): + +```jsonc +{ + "mcpServers": { + "ktx": { + "url": "http://localhost:7878/mcp" + // when token auth is active, env-var expansion only: + // "headers": { "Authorization": "Bearer ${KTX_MCP_TOKEN}" } + } + } +} +``` + +Codex (printed snippet, `~/.codex/config.toml`): + +```toml +[mcp_servers.ktx] +url = "http://localhost:7878/mcp" +# Codex MCP config does not currently document a headers field; if token +# auth is active, instruct the user to either run KTX on loopback without a +# token or wait for codex header support before enabling. +``` + +opencode (printed snippet, `opencode.json`): + +```jsonc +{ + "mcp": { + "ktx": { + "type": "remote", + "url": "http://localhost:7878/mcp", + "enabled": true + // when token auth is active, env-var expansion only: + // "headers": { "Authorization": "Bearer ${KTX_MCP_TOKEN}" } + } + } +} +``` + +#### Token handling per client + +When `--token` / `KTX_MCP_TOKEN` is active, `setup-agents` writes the bearer +token **only via environment-variable reference** (`Bearer ${KTX_MCP_TOKEN}`), +never as a literal token value. Claude Code, Cursor, and opencode all +support environment-variable expansion inside `headers` values; the +written entry references `${KTX_MCP_TOKEN}` and the user is responsible +for exporting it in the shell that launches the MCP client. + +Rules: + +- **No literal-token writes, anywhere.** Even the user-scope (private) + Claude Code / Cursor config receives env-var references, not the raw + token. This keeps the same writer codepath for every scope and avoids a + branch that materializes secrets. 
+- **Project-scope (shared, checked-in) configs are gated.** When a token is + active and the user requests a shared scope — `<project-dir>/.mcp.json` + for Claude Code, `<project-dir>/.cursor/mcp.json` for Cursor — `setup-agents` + prints a warning and offers a choice: (a) write the entry with the + `${KTX_MCP_TOKEN}` reference (the file is safe to commit; readers must + export the variable locally), or (b) skip the shared entry and rely on a + user-scope entry instead. The default is (a). +- **Verify header support per client before writing.** The list below + reflects the current state of each client's MCP config docs: + - claude-code: supports `headers` with `${VAR}` expansion on HTTP entries. + - cursor: supports `headers` with `${VAR}` expansion on HTTP entries. + - opencode: supports `headers` with `${VAR}` expansion on remote MCP + entries. + - codex: **not currently supported** in published config docs. When a + token is active and the user selects codex, `setup-agents` prints a + warning and skips the codex snippet rather than emitting an entry whose + token header codex will silently ignore. The recommended workaround is to bind KTX + to loopback without a token for codex users. +- **Implementation acceptance test.** Setup-agents writer tests must assert + that no rendered output contains the literal token string for any + scope/target combination — only the `${KTX_MCP_TOKEN}` reference. + +Port is read from `.ktx/mcp.json` if present, falling back to 7878. The +install manifest (`agentInstallManifestPath`, +`packages/cli/src/setup-agents.ts:60`) tracks each **written** entry so +`ktx setup-agents --remove` can roll back cleanly. The current manifest +entry kinds are `file` and `json-key` +(`packages/cli/src/setup-agents.ts:42-50`); the MCP client writers for +claude-code and cursor add `json-key` entries for their respective config +files.
Printed-only snippets for codex and opencode are **not** tracked in +the manifest, and `--remove` does not attempt to mutate user-written +files for those targets; the printed instructions tell the user how to +remove the entry by hand. + +If the daemon is not running when `setup-agents` runs, the command prints a +follow-up hint: "Run `ktx mcp start` to enable the configured KTX MCP +server." It does **not** auto-start the daemon (matches the manual +lifecycle decision). + +## Research skill + +A new skill source file at `packages/cli/src/skills/research/SKILL.md`, +installed by `ktx setup-agents` to all configured targets. The skill is +separate from the existing setup skill (different triggers: "work in a KTX +project" vs. "answer a data question") and lives in its own per-target +folder so global vs. project scope and removal stay clean. + +`plannedKtxAgentFiles` in `packages/cli/src/setup-agents.ts:64` is extended +to return both the existing `ktx` entries and new `ktx-research` entries: + +| Target | Scope | Path | +|---|---|---| +| claude-code | global | `~/.claude/skills/ktx-research/SKILL.md` | +| claude-code | project | `.claude/skills/ktx-research/SKILL.md` | +| codex | global | `${CODEX_HOME}/skills/ktx-research/SKILL.md` | +| codex | project | `.agents/skills/ktx-research/SKILL.md` | +| cursor | project | `.cursor/rules/ktx-research.mdc` | +| opencode | project | `.opencode/commands/ktx-research.md` | +| universal | project | `.agents/skills/ktx-research/SKILL.md` | + +The skill body is identical across targets; only the wrapper format and +file path differ to match each target's convention. + +### Skill content + +```markdown +--- +name: ktx-research +description: Use when answering a question that needs data from a KTX-connected database — investigating, analyzing, "how many", "show me", "what's the breakdown of", finding records by value, exploring tables, comparing periods, or any data-investigation request. 
Triggers even when the user does not say "research"; if the answer requires querying a configured KTX connection, this skill applies. +--- + +# KTX Research Workflow + +You have access to KTX MCP tools for investigating data. Follow this workflow. + + +1. **Discover** — call `discover_data(query)` first to see what exists across wiki, semantic-layer sources, and raw tables. Returns refs only. +2. **Inspect top hits in parallel** — for each promising ref: + - `kind: 'wiki'` → `wiki_read(key)` + - `kind: 'sl_source'` / `'sl_measure'` / `'sl_dimension'` → `sl_read_source(connectionId, sourceName)` + - `kind: 'table'` / `'column'` → `entity_details(connectionId, entities)` +3. **Resolve literals** — if the user named a value (e.g., "Acme Corp", "status=shipped"), call `dictionary_search(values)` to find which column holds it. +4. **Query** — + - Prefer `sl_query` when the semantic layer covers the question (joins, measures pre-defined). + - Use `sql_execution` only for things the semantic layer doesn't cover. +5. **Capture learnings** — at the end of the turn, call `memory_capture(userMessage, assistantMessage)` so future turns benefit. Skip when the answer carries no durable knowledge (e.g., the user only asked for schema info). + + + +- Always run `discover_data` before writing SQL. Do not guess table names. +- Prefer the semantic layer over raw SQL when both can answer the question — measures are the source of truth. +- Read entity details before writing SQL against an unfamiliar table; do not assume column names. +- Treat `sql_execution` as read-only. Writes are rejected by the server. +- Validate value mentions with `dictionary_search` instead of guessing case/spelling — but treat a `dictionary_search` *miss* as non-authoritative. The index is built from profile-sampled values, so a missing value may simply have been outside the sample. Follow up with `sql_execution` against the most plausible columns before concluding the value is absent. 
+ + + +**Input:** "How many orders did Acme Corp place last month?" + +**Output workflow:** +1. `dictionary_search(["Acme Corp"])` → finds `customers.name` +2. `discover_data("orders customer monthly")` → finds `orders_facts` SL source +3. `sl_read_source("warehouse", "orders_facts")` → confirms measure `order_count`, dim `customer_name`, dim `ordered_at` +4. `sl_query({ measures: ["order_count"], filters: ["customer_name = 'Acme Corp'", "ordered_at >= date_trunc('month', now() - interval '1 month')"], dimensions: [{ field: "ordered_at", granularity: "month" }] })` +5. `memory_capture(userMessage, assistantMessage)` + +--- + +**Input:** "What columns does the events table have?" + +**Output workflow:** +1. `discover_data("events table")` → top hit `kind: 'table', id: 'analytics.events'` +2. `entity_details("warehouse", [{ table: "analytics.events" }])` → returns columns, types, FKs +3. Answer directly. (No query needed; no `memory_capture` since no durable learning.) + +``` + +## Files + +### New + +- `packages/context/src/scan/entity-details.ts` — derives entity-detail + records from `KtxSchemaSnapshot`, sharing resolution logic with + `warehouse-verification/warehouse-catalog.service.ts` (refactored or + imported, not duplicated). +- `packages/context/src/sl/dictionary-search.ts` — builds and queries the + dictionary index over relationship-profiling artifacts. +- `packages/context/src/search/discover.ts` — composes wiki, SL, and raw + schema searches; fuses results via `rrf.ts`. Reuses the same wiki/SL/raw + search building blocks as `warehouse-verification/discover-data.tool.ts`. +- `packages/cli/src/commands/mcp-commands.ts` — `ktx mcp start|stop|status|logs`. +- `packages/cli/src/managed-mcp-daemon.ts` — daemon lifecycle (spawn, + pidfile, log management), mirroring `managed-python-daemon.ts`. +- `packages/cli/src/skills/research/SKILL.md` — research workflow skill. 
+- Tests for each new module following existing patterns + (`*.test.ts` siblings), including coverage of the per-client config + writer/printer matrix. + +### Modified + +- `packages/context/src/mcp/context-tools.ts` — register the four new tools + with their Zod schemas. +- `packages/context/src/mcp/server.ts` — extend `KtxMcpContextPorts` with the + new ports (`sqlExecution`, `entityDetails`, `dictionarySearch`, `discover`). +- `packages/context/src/mcp/types.ts` — add the new port interface + definitions. +- `packages/cli/src/cli-program.ts` — register the `mcp` command subtree. +- `packages/cli/src/setup-agents.ts` — install the research skill and write + MCP client config entries to each configured target. + +## Testing strategy + +- Unit tests for each new module (`entity-details.ts`, + `dictionary-search.ts`, `discover.ts`) using existing fixture patterns. +- MCP-level integration test in `packages/context/src/mcp/server.test.ts` + that registers a fake server, invokes each tool, and asserts the + responses. +- CLI integration test for `ktx mcp start|stop|status` lifecycle following + the pattern in `managed-python-daemon.test.ts`. +- Setup-agents tests verifying behavior per target: claude-code and cursor + writers add the correct JSON entry and a corresponding `json-key` + manifest entry that `--remove` cleans up; codex and opencode targets + produce printed snippet output and do not mutate any user config file + or add manifest entries in v1. +- Verification commands per CLAUDE.md: `pnpm --filter @ktx/context run test` + and `pnpm --filter @ktx/cli run test` for the affected packages, plus + `pnpm run type-check`. + +## Out of scope / follow-ups + +- **Python code execution via MCP.** The daemon's `/code/execute` endpoint + exists; surfacing it via MCP is a separate design with sandbox/security + considerations. +- **Stdio MCP transport.** HTTP-only for now. Stdio can be added later as an + additional transport mode without changing the tool surface. 
+- **OS-level auto-start.** Manual `ktx mcp start` only. Adding launchd / + systemd unit installation is UX polish for a later release. +- **TLS in the daemon itself.** Reverse proxy is the documented path. Native + TLS support can be added if demand emerges. +- **Multi-project / project-switching MCP.** One daemon per project. A + cross-project model would require per-call `projectDir` arguments or a + `set_project_dir` tool and is deferred. +- **Audit logging, rate limiting, per-tool authorization.** Not in scope for + v1; the security boundary is loopback or bearer token. + +## Open trade-offs + +- **`dictionary_search` requires a `--deep` (enriched) scan to have run.** The + relationship-profiling artifact that powers the dictionary index is only + produced by enriched scans. The tool reports this distinctly when missing, + but the dependency is real: without an enriched scan, the tool returns + no matches. +- **`entity_details` reads from the latest snapshot, not live.** If the + database schema changes after the last scan, the tool will reflect the + scan state, not reality. Surfacing this clearly in the tool's response + (snapshot timestamp) is part of the implementation. +- **No streaming for `sql_execution`.** Large results are capped at + `maxRows` (default 1000, max 10k). The tool returns the capped result set + in one response. Streaming partial results is left for a later iteration + if real workloads demand it.
diff --git a/packages/cli/package.json b/packages/cli/package.json index 65895f89..539618f7 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -26,7 +26,7 @@ ], "scripts": { "assets:demo": "node scripts/build-demo-assets.mjs", - "build": "node -e \"fs.rmSync('dist', { recursive: true, force: true })\" && tsc -p tsconfig.json && node ../../scripts/prepare-cli-bin.mjs", + "build": "node -e \"fs.rmSync('dist', { recursive: true, force: true })\" && tsc -p tsconfig.json && node scripts/copy-runtime-assets.mjs && node ../../scripts/prepare-cli-bin.mjs", "docs:commands": "pnpm run build && node dist/print-command-tree.js", "smoke": "vitest run src/standalone-smoke.test.ts src/example-smoke.test.ts --testTimeout 30000", "test": "vitest run --exclude src/standalone-smoke.test.ts --exclude src/example-smoke.test.ts --exclude src/setup-databases.test.ts --exclude src/scan.test.ts --exclude src/commands/connection-metabase-setup.test.ts --exclude src/setup-models.test.ts --exclude src/setup-sources.test.ts --exclude src/setup.test.ts --exclude src/connection.test.ts --exclude src/setup-embeddings.test.ts --exclude src/ingest.test.ts --exclude src/commands/connection-mapping.test.ts --exclude src/ingest-viz.test.ts --exclude src/demo.test.ts --exclude src/setup-project.test.ts --exclude src/sl.test.ts --exclude src/local-scan-connectors.test.ts --exclude src/commands/connection-notion.test.ts", @@ -45,6 +45,7 @@ "@ktx/connector-sqlserver": "workspace:*", "@ktx/context": "workspace:*", "@ktx/llm": "workspace:*", + "@modelcontextprotocol/sdk": "^1.29.0", "commander": "14.0.3", "ink": "^7.0.2", "react": "^19.2.6", diff --git a/packages/cli/scripts/copy-runtime-assets.mjs b/packages/cli/scripts/copy-runtime-assets.mjs new file mode 100644 index 00000000..a7c75658 --- /dev/null +++ b/packages/cli/scripts/copy-runtime-assets.mjs @@ -0,0 +1,11 @@ +import { cp, mkdir, rm } from 'node:fs/promises'; +import { dirname, join } from 'node:path'; +import { 
fileURLToPath } from 'node:url'; + +const packageRoot = fileURLToPath(new URL('..', import.meta.url)); +const skillsSource = join(packageRoot, 'src', 'skills'); +const skillsTarget = join(packageRoot, 'dist', 'skills'); + +await rm(skillsTarget, { recursive: true, force: true }); +await mkdir(dirname(skillsTarget), { recursive: true }); +await cp(skillsSource, skillsTarget, { recursive: true }); diff --git a/packages/cli/src/cli-program.ts b/packages/cli/src/cli-program.ts index e8bdf445..e82cf0b5 100644 --- a/packages/cli/src/cli-program.ts +++ b/packages/cli/src/cli-program.ts @@ -5,6 +5,7 @@ import type { KtxCliDeps, KtxCliIo, KtxCliPackageInfo } from './cli-runtime.js'; import { registerConnectionCommands } from './commands/connection-commands.js'; import { registerIngestCommands } from './commands/ingest-commands.js'; import { registerWikiCommands } from './commands/knowledge-commands.js'; +import { registerMcpCommands } from './commands/mcp-commands.js'; import { registerSetupCommands } from './commands/setup-commands.js'; import { registerSlCommands } from './commands/sl-commands.js'; import { registerStatusCommands } from './commands/status-commands.js'; @@ -55,7 +56,7 @@ type CommandPathNode = CommandWithGlobalOptions & { parent?: CommandPathNode | null; }; -const PROJECT_AWARE_ROOT_COMMANDS = new Set(['setup', 'connection', 'ingest', 'wiki', 'sl', 'status']); +const PROJECT_AWARE_ROOT_COMMANDS = new Set(['setup', 'connection', 'ingest', 'wiki', 'sl', 'status', 'mcp']); const COMMANDS_THAT_CREATE_PROJECT = new Set(['setup', 'ktx dev init']); const COMMANDS_WITH_OWN_MISSING_PROJECT_HANDLING = new Set(['status']); const GLOBAL_OPTIONS_WITH_VALUE = new Set(['--project-dir']); @@ -412,6 +413,7 @@ export function buildKtxProgram(options: BuildKtxProgramOptions): Command { registerWikiCommands(program, context); registerSlCommands(program, context); registerStatusCommands(program, context); + registerMcpCommands(program, context); registerDevCommands(program, 
context); return program; diff --git a/packages/cli/src/cli-runtime.ts b/packages/cli/src/cli-runtime.ts index cc1a2b98..e8be6be7 100644 --- a/packages/cli/src/cli-runtime.ts +++ b/packages/cli/src/cli-runtime.ts @@ -34,6 +34,12 @@ export interface KtxCliDeps { runtime?: (args: KtxRuntimeArgs, io: KtxCliIo) => Promise; knowledge?: (args: KtxKnowledgeArgs, io: KtxCliIo) => Promise; sl?: (args: KtxSlArgs, io: KtxCliIo) => Promise; + mcp?: { + startDaemon?: typeof import('./managed-mcp-daemon.js').startKtxMcpDaemon; + stopDaemon?: typeof import('./managed-mcp-daemon.js').stopKtxMcpDaemon; + readStatus?: typeof import('./managed-mcp-daemon.js').readKtxMcpDaemonStatus; + runServer?: typeof import('./mcp-http-server.js').runKtxMcpHttpServer; + }; } export function getKtxCliPackageInfo(): KtxCliPackageInfo { diff --git a/packages/cli/src/commands/mcp-commands.test.ts b/packages/cli/src/commands/mcp-commands.test.ts new file mode 100644 index 00000000..f31996f2 --- /dev/null +++ b/packages/cli/src/commands/mcp-commands.test.ts @@ -0,0 +1,57 @@ +import { Command } from '@commander-js/extra-typings'; +import { describe, expect, it, vi } from 'vitest'; +import type { KtxCliCommandContext } from '../cli-program.js'; +import { registerMcpCommands } from './mcp-commands.js'; + +function makeContext(overrides: Partial = {}): KtxCliCommandContext { + let exitCode = 0; + return { + io: { + stdout: { write: vi.fn() }, + stderr: { write: vi.fn() }, + }, + deps: {}, + packageInfo: { name: '@ktx/cli', version: '0.0.0-test', contextPackageName: '@ktx/context' }, + setExitCode: (code) => { + exitCode = code; + }, + runInit: vi.fn(), + writeDebug: vi.fn(), + ...overrides, + get exitCode() { + return exitCode; + }, + } as KtxCliCommandContext; +} + +describe('registerMcpCommands', () => { + it('registers the public mcp lifecycle commands', () => { + const program = new Command().exitOverride(); + registerMcpCommands(program, makeContext()); + const mcp = program.commands.find((command) => 
command.name() === 'mcp'); + + expect(mcp?.commands.map((command) => command.name()).sort()).toEqual([ + 'logs', + 'serve-internal', + 'start', + 'status', + 'stop', + ]); + expect( + (mcp?.commands.find((command) => command.name() === 'serve-internal') as { _hidden?: boolean } | undefined) + ?._hidden, + ).toBe(true); + }); + + it('rejects non-loopback start without token before spawning', async () => { + const program = new Command().exitOverride(); + const startDaemon = vi.fn(); + const context = makeContext({ deps: { mcp: { startDaemon } } }); + registerMcpCommands(program, context); + + await expect(program.parseAsync(['mcp', 'start', '--host', '0.0.0.0'], { from: 'user' })).rejects.toThrow( + 'Binding KTX MCP to 0.0.0.0 requires --token or KTX_MCP_TOKEN', + ); + expect(startDaemon).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/cli/src/commands/mcp-commands.ts b/packages/cli/src/commands/mcp-commands.ts new file mode 100644 index 00000000..7880f608 --- /dev/null +++ b/packages/cli/src/commands/mcp-commands.ts @@ -0,0 +1,136 @@ +import { spawn } from 'node:child_process'; +import { readFile } from 'node:fs/promises'; +import { fileURLToPath } from 'node:url'; +import { Command } from '@commander-js/extra-typings'; +import type { KtxCliCommandContext } from '../cli-program.js'; +import { + collectOption, + parsePositiveIntegerOption, + resolveCommandProjectDir, +} from '../cli-program.js'; +import { + mcpDaemonLayout, + readKtxMcpDaemonStatus, + startKtxMcpDaemon, + stopKtxMcpDaemon, +} from '../managed-mcp-daemon.js'; +import { buildMcpSecurityConfig, runKtxMcpHttpServer } from '../mcp-http-server.js'; + +function tokenFromOption(value: string | undefined): string | undefined { + return value ?? 
/**
 * Registers the `mcp` command group (`start`, `stop`, `status`, `logs`, and the
 * hidden `serve-internal`) on the given program.
 *
 * Daemon and server entry points are taken from `context.deps.mcp` when
 * provided, otherwise the real implementations are used.
 *
 * @param program Root commander program to attach the `mcp` group to.
 * @param context CLI context supplying io streams, package info and injectable deps.
 */
export function registerMcpCommands(program: Command, context: KtxCliCommandContext): void {
  const mcp = program.command('mcp').description('Run the KTX MCP HTTP server');

  mcp
    .command('start')
    .description('Start the KTX MCP HTTP server')
    // Value-taking options need a `<placeholder>`; without one Commander parses them as boolean flags.
    .option('--host <host>', 'Host to bind', '127.0.0.1')
    .option('--port <port>', 'Port to bind', parsePositiveIntegerOption, 7878)
    .option('--token <token>', 'Bearer token required for non-loopback binding')
    .option('--foreground', 'Run in the foreground', false)
    .option('--allowed-host <host>', 'Additional allowed Host header', collectOption, [])
    .option('--allowed-origin <origin>', 'Allowed browser Origin header', collectOption, [])
    .action(async (options, command) => {
      const projectDir = resolveCommandProjectDir(command);
      const token = tokenFromOption(options.token);
      // Called only for its validation side effect: throws before anything is spawned
      // when the binding/token/origin combination is not acceptable.
      buildMcpSecurityConfig({
        host: options.host,
        port: options.port,
        token,
        allowedHosts: options.allowedHost,
        allowedOrigins: options.allowedOrigin,
      });
      if (options.foreground) {
        await (context.deps.mcp?.runServer ?? runKtxMcpHttpServer)({
          projectDir,
          cliVersion: context.packageInfo.version,
          host: options.host,
          port: options.port,
          token,
          allowedHosts: options.allowedHost,
          allowedOrigins: options.allowedOrigin,
          io: context.io,
        });
        context.io.stdout.write(`KTX MCP server listening at http://${options.host}:${options.port}/mcp\n`);
        return;
      }
      const result = await (context.deps.mcp?.startDaemon ?? startKtxMcpDaemon)({
        projectDir,
        cliVersion: context.packageInfo.version,
        host: options.host,
        port: options.port,
        token,
        allowedHosts: options.allowedHost,
        allowedOrigins: options.allowedOrigin,
        binPath: binPath(),
      });
      context.io.stdout.write(`KTX MCP daemon started: ${result.url}\n`);
    });

  mcp
    .command('stop')
    .description('Stop the KTX MCP daemon')
    .action(async (_options, command) => {
      const result = await (context.deps.mcp?.stopDaemon ?? stopKtxMcpDaemon)({
        projectDir: resolveCommandProjectDir(command),
      });
      context.io.stdout.write(
        result.status === 'stopped' ? 'KTX MCP daemon stopped.\n' : 'KTX MCP daemon is not running.\n',
      );
    });

  mcp
    .command('status')
    .description('Show KTX MCP daemon status')
    .action(async (_options, command) => {
      const status = await (context.deps.mcp?.readStatus ?? readKtxMcpDaemonStatus)({
        projectDir: resolveCommandProjectDir(command),
      });
      context.io.stdout.write(`${status.detail}\n`);
      if (status.kind === 'running') {
        context.io.stdout.write(`URL: ${status.url}\n`);
        context.io.stdout.write(`PID: ${status.state.pid}\n`);
        context.io.stdout.write(`Token auth: ${status.state.tokenAuth ? 'enabled' : 'disabled'}\n`);
        context.io.stdout.write(`Project: ${status.state.projectDir}\n`);
      }
    });

  mcp
    .command('logs')
    .description('Print the KTX MCP daemon log')
    .option('--follow', 'Follow log output', false)
    .action(async (options, command) => {
      const logPath = mcpDaemonLayout(resolveCommandProjectDir(command)).logPath;
      if (options.follow) {
        const child = spawn('tail', ['-f', logPath], { stdio: ['ignore', 'pipe', 'pipe'] });
        child.stdout?.on('data', (chunk: Buffer) => context.io.stdout.write(chunk.toString('utf8')));
        child.stderr?.on('data', (chunk: Buffer) => context.io.stderr.write(chunk.toString('utf8')));
        await new Promise<void>((resolve, reject) => {
          // Without an 'error' listener a failed spawn (e.g. `tail` not installed)
          // surfaces as an uncaught exception instead of a command failure.
          child.once('error', reject);
          child.once('close', () => resolve());
        });
        return;
      }
      let contents: string;
      try {
        contents = await readFile(logPath, 'utf8');
      } catch (error) {
        if ((error as { code?: unknown }).code !== 'ENOENT') {
          throw error;
        }
        // The log only exists once a daemon has been started for this project.
        context.io.stderr.write(`No KTX MCP daemon log found at ${logPath}\n`);
        context.setExitCode(1);
        return;
      }
      context.io.stdout.write(contents);
    });

  mcp
    .command('serve-internal', { hidden: true })
    .option('--host <host>', 'Host to bind', '127.0.0.1')
    .requiredOption('--port <port>', 'Port to bind', parsePositiveIntegerOption)
    .option('--allowed-host <host>', 'Additional allowed Host header', collectOption, [])
    .option('--allowed-origin <origin>', 'Allowed browser Origin header', collectOption, [])
    .action(async (options, command) => {
      await (context.deps.mcp?.runServer ?? runKtxMcpHttpServer)({
        projectDir: resolveCommandProjectDir(command),
        cliVersion: context.packageInfo.version,
        host: options.host,
        port: options.port,
        // The token is read from the environment only, so it never appears in the child's argv.
        token: process.env.KTX_MCP_TOKEN,
        allowedHosts: options.allowedHost,
        allowedOrigins: options.allowedOrigin,
        io: context.io,
      });
    });
}
'existing' : 'auto'; - const resolvedAgentScope = options.global ? 'global' : 'project'; + const resolvedAgentScope = options.local ? 'local' : options.global ? 'global' : 'project'; await runSetupArgs(context, { command: 'run', projectDir: resolveCommandProjectDir(command), diff --git a/packages/cli/src/index.test.ts b/packages/cli/src/index.test.ts index 86a7468c..9c0a94fb 100644 --- a/packages/cli/src/index.test.ts +++ b/packages/cli/src/index.test.ts @@ -440,6 +440,7 @@ describe('runKtxCli', () => { expect(stdout).toContain('--agents'); expect(stdout).toContain('--target '); expect(stdout).toContain('--global'); + expect(stdout).toContain('--local'); expect(stdout).toContain('--yes'); expect(stdout).toContain('--no-input'); expect(stdout).toContain('Global Options:'); @@ -1286,6 +1287,38 @@ describe('runKtxCli', () => { ); }); + it('rejects --local with non-Claude targets', async () => { + const setup = vi.fn(async () => 0); + const setupIo = makeIo(); + + await expect( + runKtxCli( + ['--project-dir', tempDir, 'setup', '--agents', '--target', 'cursor', '--local', '--no-input'], + setupIo.io, + { setup }, + ), + ).resolves.toBe(1); + + expect(setupIo.stderr()).toContain('--local is only supported with --target claude-code'); + expect(setup).not.toHaveBeenCalled(); + }); + + it('rejects --local and --global together', async () => { + const setup = vi.fn(async () => 0); + const setupIo = makeIo(); + + await expect( + runKtxCli( + ['--project-dir', tempDir, 'setup', '--agents', '--target', 'claude-code', '--local', '--global', '--no-input'], + setupIo.io, + { setup }, + ), + ).resolves.toBe(1); + + expect(setupIo.stderr()).toContain('Choose only one agent scope: --local or --global.'); + expect(setup).not.toHaveBeenCalled(); + }); + it('rejects source-path with source-git-url', async () => { const setup = vi.fn(async () => 0); const testIo = makeIo(); diff --git a/packages/cli/src/local-adapters.test.ts b/packages/cli/src/local-adapters.test.ts index 
/**
 * Builds a daemon state fixture rooted at `projectDir`.
 *
 * @param projectDir Project directory recorded in the state and used for the log path.
 * @param overrides Fields that replace the fixture defaults.
 * @returns A complete `KtxMcpDaemonState` value.
 */
function state(projectDir: string, overrides: Partial<KtxMcpDaemonState> = {}): KtxMcpDaemonState {
  return {
    schemaVersion: 1,
    pid: 4242,
    host: '127.0.0.1',
    port: 7878,
    tokenAuth: false,
    projectDir,
    startedAt: '2026-05-14T00:00:00.000Z',
    logPath: join(projectDir, '.ktx/logs/mcp.log'),
    ...overrides,
  };
}
spawnDaemon = vi.fn(() => child(5555)); + await startKtxMcpDaemon({ + projectDir, + cliVersion: '0.0.0-test', + host: '0.0.0.0', + port: 7879, + token: 'secret-token', + allowedHosts: ['mcp.example.test'], + allowedOrigins: ['https://mcp.example.test'], + binPath: '/repo/packages/cli/dist/bin.js', + spawnDaemon, + processAlive: vi.fn(() => false), + portAvailable: vi.fn(async () => true), + now: () => new Date('2026-05-14T00:00:00.000Z'), + }); + + expect(spawnDaemon).toHaveBeenCalledWith( + process.execPath, + [ + '/repo/packages/cli/dist/bin.js', + '--project-dir', + projectDir, + 'mcp', + 'serve-internal', + '--host', + '0.0.0.0', + '--port', + '7879', + '--allowed-host', + 'mcp.example.test', + '--allowed-origin', + 'https://mcp.example.test', + ], + expect.objectContaining({ + detached: true, + env: expect.objectContaining({ KTX_MCP_TOKEN: 'secret-token' }), + }), + ); + expect(JSON.stringify(JSON.parse(await readFile(join(projectDir, '.ktx/mcp.json'), 'utf8')))).not.toContain( + 'secret-token', + ); + }); + + it('reports running when the process is alive and health passes', async () => { + await mkdir(join(projectDir, '.ktx'), { recursive: true }); + await writeFile(join(projectDir, '.ktx/mcp.json'), `${JSON.stringify(state(projectDir), null, 2)}\n`); + + const status = await readKtxMcpDaemonStatus({ + projectDir, + processAlive: vi.fn(() => true), + fetchHealth: vi.fn(async () => ({ ok: true, body: { status: 'ok', projectDir, port: 7878 } })), + }); + + expect(status.kind).toBe('running'); + if (status.kind !== 'running') { + throw new Error(`Expected running status, received ${status.kind}`); + } + expect(status.url).toBe('http://127.0.0.1:7878/mcp'); + }); + + it('stops a recorded daemon and removes state', async () => { + await mkdir(join(projectDir, '.ktx'), { recursive: true }); + await writeFile(join(projectDir, '.ktx/mcp.json'), `${JSON.stringify(state(projectDir), null, 2)}\n`); + const alive = new Set([4242]); + const killProcess = vi.fn((pid: 
/**
 * Reports whether a process with the given pid currently exists.
 *
 * Probes with signal 0, which performs the existence/permission check without
 * delivering a signal.
 *
 * @param pid Process id to probe.
 * @returns `true` when the process exists, `false` otherwise.
 */
function defaultProcessAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch (error) {
    // EPERM means the process exists but is owned by another user, so it is
    // still alive; any other failure (typically ESRCH) is treated as not running.
    return (error as { code?: unknown }).code === 'EPERM';
  }
}
/** Formats the MCP endpoint URL, bracketing IPv6 literals such as `::1` so the URL is valid. */
function mcpEndpointUrl(host: string, port: number): string {
  const authority = host.includes(':') && !host.startsWith('[') ? `[${host}]` : host;
  return `http://${authority}:${port}/mcp`;
}

/**
 * Spawns the MCP HTTP server as a detached child and records its state file.
 *
 * The child runs `<binPath> --project-dir <dir> mcp serve-internal …` with
 * stdout/stderr appended to the daemon log. The bearer token is passed through
 * the `KTX_MCP_TOKEN` environment variable and only `tokenAuth: boolean` is
 * written to the state file, never the token value.
 *
 * @param options Binding, security and project settings; `processAlive`,
 *   `portAvailable`, `spawnDaemon` and `now` are injection points for tests.
 * @returns The written state and the MCP endpoint URL.
 * @throws Error when a recorded daemon is still alive, the port is in use, or
 *   the spawned child has no pid.
 */
export async function startKtxMcpDaemon(options: {
  projectDir: string;
  cliVersion: string;
  host: string;
  port: number;
  token?: string;
  allowedHosts: string[];
  allowedOrigins: string[];
  binPath: string;
  processAlive?: (pid: number) => boolean;
  portAvailable?: (host: string, port: number) => Promise<boolean>;
  spawnDaemon?: typeof defaultSpawnDaemon;
  now?: () => Date;
}): Promise<{ status: 'started'; state: KtxMcpDaemonState; url: string }> {
  // An unreadable or invalid state file is treated as "no daemon recorded"; it is overwritten below.
  const existing = await readState(options.projectDir).catch(() => undefined);
  const processAlive = options.processAlive ?? defaultProcessAlive;
  if (existing && processAlive(existing.pid)) {
    throw new Error(`KTX MCP daemon is already recorded at ${mcpEndpointUrl(existing.host, existing.port)}`);
  }
  const portAvailable = options.portAvailable ?? defaultPortAvailable;
  if (!(await portAvailable(options.host, options.port))) {
    throw new Error(`Port ${options.port} is already in use. Choose another port with --port <port>.`);
  }

  const { logPath } = mcpDaemonLayout(options.projectDir);
  await mkdir(dirname(logPath), { recursive: true });
  const log = await open(logPath, 'a');
  try {
    const args = [
      options.binPath,
      '--project-dir',
      options.projectDir,
      'mcp',
      'serve-internal',
      '--host',
      options.host,
      '--port',
      String(options.port),
      ...options.allowedHosts.flatMap((host) => ['--allowed-host', host]),
      ...options.allowedOrigins.flatMap((origin) => ['--allowed-origin', origin]),
    ];
    const child = (options.spawnDaemon ?? defaultSpawnDaemon)(process.execPath, args, {
      detached: true,
      stdio: ['ignore', log.fd, log.fd],
      env: {
        ...process.env,
        KTX_CLI_VERSION: options.cliVersion,
        ...(options.token ? { KTX_MCP_TOKEN: options.token } : {}),
      },
    });
    if (!child.pid) {
      throw new Error('Failed to start KTX MCP daemon: child process pid was not available.');
    }
    // Let the CLI exit without waiting for the detached daemon.
    child.unref();
    const state: KtxMcpDaemonState = {
      schemaVersion: 1,
      pid: child.pid,
      host: options.host,
      port: options.port,
      tokenAuth: Boolean(options.token),
      projectDir: options.projectDir,
      startedAt: (options.now ?? (() => new Date()))().toISOString(),
      logPath,
    };
    await writeState(options.projectDir, state);
    return { status: 'started', state, url: mcpEndpointUrl(state.host, state.port) };
  } finally {
    // The child holds its own copy of the descriptor; the parent's handle is no longer needed.
    await log.close();
  }
}
/**
 * Stops the recorded MCP daemon and removes its state file.
 *
 * Sends SIGTERM, polls until the process exits or the grace period elapses,
 * then escalates to SIGKILL if it is still alive.
 *
 * @param options Project directory plus injectable `processAlive`/`killProcess`
 *   and timing overrides (`stopGraceMs` defaults to 10 s, `pollIntervalMs` to 100 ms).
 * @returns `already-stopped` when no usable state is recorded, otherwise `stopped`.
 */
export async function stopKtxMcpDaemon(options: {
  projectDir: string;
  processAlive?: (pid: number) => boolean;
  killProcess?: (pid: number, signal: NodeJS.Signals) => void;
  stopGraceMs?: number;
  pollIntervalMs?: number;
}): Promise<{ status: 'stopped' | 'already-stopped' }> {
  const { statePath } = mcpDaemonLayout(options.projectDir);
  let state: KtxMcpDaemonState | undefined;
  try {
    state = await readState(options.projectDir);
  } catch {
    // A corrupt state file carries no usable pid. Remove it so `stop` does not
    // fail forever; `start` and `status` already tolerate invalid state.
    await rm(statePath, { force: true });
    return { status: 'already-stopped' };
  }
  if (!state) {
    return { status: 'already-stopped' };
  }
  const processAlive = options.processAlive ?? defaultProcessAlive;
  const killProcess = options.killProcess ?? defaultKillProcess;
  if (processAlive(state.pid)) {
    killProcess(state.pid, 'SIGTERM');
    const deadline = Date.now() + (options.stopGraceMs ?? 10_000);
    while (Date.now() <= deadline && processAlive(state.pid)) {
      await delay(options.pollIntervalMs ?? 100);
    }
    if (processAlive(state.pid)) {
      // Graceful shutdown did not finish within the grace period.
      killProcess(state.pid, 'SIGKILL');
    }
  }
  await rm(statePath, { force: true });
  return { status: 'stopped' };
}
).toThrow('Allowed origin must be a full origin URL'); + }); +}); + +describe('isMcpRequestAuthorized', () => { + const config = buildMcpSecurityConfig({ + host: '0.0.0.0', + port: 7878, + token: 'secret-token', + allowedHosts: ['mcp.example.test'], + allowedOrigins: ['https://mcp.example.test'], + }); + + it('accepts a valid host, origin, and bearer token', () => { + expect( + isMcpRequestAuthorized( + { + path: '/mcp', + headers: { + host: 'mcp.example.test:7878', + origin: 'https://mcp.example.test', + authorization: 'Bearer secret-token', + }, + }, + config, + ), + ).toEqual({ ok: true }); + }); + + it('rejects bad host headers before MCP handling', () => { + expect( + isMcpRequestAuthorized( + { path: '/health', headers: { host: 'evil.example.test' } }, + config, + ), + ).toEqual({ ok: false, status: 403, message: 'Host header is not allowed for KTX MCP.' }); + }); + + it('rejects browser origins unless explicitly allowed', () => { + expect( + isMcpRequestAuthorized( + { + path: '/health', + headers: { host: 'mcp.example.test', origin: 'https://evil.example.test' }, + }, + config, + ), + ).toEqual({ ok: false, status: 403, message: 'Origin header is not allowed for KTX MCP.' }); + }); + + it('requires bearer auth on /mcp when token auth is enabled', () => { + expect( + isMcpRequestAuthorized( + { path: '/mcp', headers: { host: 'mcp.example.test', authorization: 'Bearer wrong' } }, + config, + ), + ).toEqual({ ok: false, status: 401, message: 'Missing or invalid KTX MCP bearer token.' 
/**
 * Sends a JSON POST to the local test server and collects the full response.
 *
 * @param port Port of the server listening on 127.0.0.1.
 * @param path Request path, e.g. `/mcp`.
 * @param body Value serialized with `JSON.stringify` as the request body.
 * @param headers Extra request headers; these override the defaults set here.
 * @returns Status code (0 when absent), response headers and the UTF-8 decoded body.
 */
function postJson(port: number, path: string, body: unknown, headers: Record<string, string> = {}) {
  return new Promise<{ status: number; headers: Record<string, string | string[] | undefined>; body: string }>(
    (resolve, reject) => {
      const payload = JSON.stringify(body);
      const req = request(
        {
          host: '127.0.0.1',
          port,
          path,
          method: 'POST',
          headers: {
            host: `127.0.0.1:${port}`,
            accept: 'application/json, text/event-stream',
            'content-type': 'application/json',
            'content-length': Buffer.byteLength(payload),
            ...headers,
          },
        },
        (res) => {
          const chunks: Buffer[] = [];
          res.on('data', (chunk: Buffer) => chunks.push(chunk));
          res.on('end', () =>
            resolve({
              status: res.statusCode ?? 0,
              headers: res.headers,
              body: Buffer.concat(chunks).toString('utf8'),
            }),
          );
        },
      );
      req.on('error', reject);
      req.end(payload);
    },
  );
}
0, + headers: res.headers, + body: Buffer.concat(chunks).toString('utf8'), + }), + ); + }, + ); + req.on('error', reject); + req.end(); + }, + ); +} + +function createTestMcpServer() { + return () => { + const server = new McpServer({ name: 'ktx-test', version: '0.0.0-test' }); + server.registerTool('ping', { inputSchema: {} }, async () => ({ + content: [{ type: 'text', text: 'pong' }], + })); + return server; + }; +} + +describe('runKtxMcpHttpServer', () => { + it('serves /health with project metadata', async () => { + const handle = await runKtxMcpHttpServer({ + projectDir: '/tmp/ktx-project', + host: '127.0.0.1', + port: 0, + allowedHosts: [], + allowedOrigins: [], + createMcpServer: createTestMcpServer(), + }); + try { + const port = (handle.server.address() as AddressInfo).port; + const response = await get(port, '/health'); + expect(response.status).toBe(200); + expect(JSON.parse(response.body)).toEqual({ + status: 'ok', + projectDir: '/tmp/ktx-project', + port, + }); + } finally { + await handle.close(); + } + }); + + it('allocates a stateful MCP session on initialize', async () => { + const handle = await runKtxMcpHttpServer({ + projectDir: '/tmp/ktx-project', + host: '127.0.0.1', + port: 0, + allowedHosts: [], + allowedOrigins: [], + createMcpServer: createTestMcpServer(), + }); + try { + const port = (handle.server.address() as AddressInfo).port; + const response = await postJson(port, '/mcp', { + jsonrpc: '2.0', + id: 1, + method: 'initialize', + params: { + protocolVersion: '2025-06-18', + capabilities: {}, + clientInfo: { name: 'vitest', version: '0.0.0' }, + }, + }); + + expect(response.status).toBe(200); + expect(response.headers['mcp-session-id']).toBeTruthy(); + } finally { + await handle.close(); + } + }); + + it('rejects unknown session ids with 404', async () => { + const handle = await runKtxMcpHttpServer({ + projectDir: '/tmp/ktx-project', + host: '127.0.0.1', + port: 0, + allowedHosts: [], + allowedOrigins: [], + createMcpServer: 
import { createHash, randomUUID, timingSafeEqual } from 'node:crypto';
import { createServer, type IncomingHttpHeaders, type IncomingMessage, type Server, type ServerResponse } from 'node:http';
import { createDefaultKtxMcpServer, createLocalProjectMcpContextPorts } from '@ktx/context/mcp';
import { createLocalProjectMemoryCapture } from '@ktx/context/memory';
import { loadKtxProject, type KtxLocalProject } from '@ktx/context/project';
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
import { isInitializeRequest } from '@modelcontextprotocol/sdk/types.js';
import type { KtxCliIo } from './cli-runtime.js';
import { createKtxCliIngestQueryExecutor } from './ingest-query-executor.js';
import { createKtxCliScanConnector } from './local-scan-connectors.js';
import { createManagedPythonSemanticLayerComputePort } from './managed-python-command.js';
import { createManagedDaemonSqlAnalysisPort } from './managed-python-http.js';
/** Outcome of checking a request against the MCP security policy. */
export type McpAuthorizationResult =
  | { ok: true }
  | { ok: false; status: 401 | 403; message: string };

/** True when the (normalized) host is one of the exact loopback names this server recognizes. */
function isLoopbackHost(host: string): boolean {
  const normalized = normalizeHostHeader(host);
  return normalized === 'localhost' || normalized === '127.0.0.1' || normalized === '::1';
}

/**
 * Reduces a Host header (or bind host) to a bare lowercase host name.
 *
 * - Trims whitespace and lowercases.
 * - `[v6]:port` and `[v6]` yield the address without brackets.
 * - `name:port` (exactly one colon) yields `name`.
 * - Anything else, including unbracketed IPv6 literals, is returned unchanged.
 *
 * @param value Raw header or host value.
 * @returns The normalized host for allow-list comparison.
 */
export function normalizeHostHeader(value: string): string {
  const trimmed = value.trim().toLowerCase();
  if (trimmed.startsWith('[')) {
    const close = trimmed.indexOf(']');
    return close >= 0 ? trimmed.slice(1, close) : trimmed.replace(/^\[/, '');
  }
  const colon = trimmed.lastIndexOf(':');
  // Only strip a port when there is a single colon; multiple colons mean a bare IPv6 literal.
  if (colon > -1 && trimmed.indexOf(':') === colon) {
    return trimmed.slice(0, colon);
  }
  return trimmed;
}

/**
 * Validates that `value` is a full origin (scheme + host, no path/query/hash)
 * and returns its canonical `URL.origin` form.
 *
 * @throws Error when the value does not parse or is not a bare origin.
 */
function fullOrigin(value: string): string {
  let parsed: URL;
  try {
    parsed = new URL(value);
  } catch {
    throw new Error(`Allowed origin must be a full origin URL: ${value}`);
  }
  if (!parsed.protocol || !parsed.host || parsed.pathname !== '/' || parsed.search || parsed.hash) {
    throw new Error(`Allowed origin must be a full origin URL: ${value}`);
  }
  return parsed.origin;
}

/**
 * Validates binding options and produces the effective security configuration.
 *
 * Loopback names are always allowed as Host values; a non-loopback bind host
 * and every `allowedHosts` entry are added after normalization.
 *
 * @param input Raw host/port/token and allow-lists from the CLI.
 * @returns Config with de-duplicated allowed hosts and canonical allowed origins;
 *   `token` is present only when one was supplied.
 * @throws Error when binding to a non-loopback host without a token, or when an
 *   allowed origin is not a full origin URL.
 */
export function buildMcpSecurityConfig(input: McpSecurityConfigInput): McpSecurityConfig {
  if (!isLoopbackHost(input.host) && !input.token) {
    throw new Error(`Binding KTX MCP to ${input.host} requires --token or KTX_MCP_TOKEN`);
  }
  // Explicit <string>: the defaults are a readonly tuple of literals, and arbitrary hosts are added below.
  const allowedHostSet = new Set<string>(DEFAULT_ALLOWED_HOSTS);
  if (!isLoopbackHost(input.host)) {
    allowedHostSet.add(normalizeHostHeader(input.host));
  }
  for (const host of input.allowedHosts) {
    allowedHostSet.add(normalizeHostHeader(host));
  }
  return {
    host: input.host,
    port: input.port,
    ...(input.token ? { token: input.token } : {}),
    allowedHosts: [...allowedHostSet],
    allowedOrigins: input.allowedOrigins.map((origin) => fullOrigin(origin)),
  };
}

/** Reads a header by case-insensitive name, taking the first value when several are present. */
function headerValue(
  headers: IncomingHttpHeaders | Record<string, string | string[] | undefined>,
  name: string,
): string | undefined {
  const value = headers[name.toLowerCase()];
  return Array.isArray(value) ? value[0] : value;
}

/**
 * Compares two strings without leaking, through timing, how many leading
 * characters match. Both sides are hashed first so `timingSafeEqual` always
 * receives equal-length buffers.
 */
function secretsMatch(actual: string, expected: string): boolean {
  const actualDigest = createHash('sha256').update(actual, 'utf8').digest();
  const expectedDigest = createHash('sha256').update(expected, 'utf8').digest();
  return timingSafeEqual(actualDigest, expectedDigest);
}

/**
 * Applies the Host, Origin and bearer-token checks to a request.
 *
 * Checks run in order: Host allow-list (403), Origin allow-list when an Origin
 * header is present (403), then — only for path `/mcp` and only when a token is
 * configured — the `Authorization: Bearer <token>` header (401).
 *
 * @param request Request path and headers.
 * @param config Effective security configuration.
 * @returns `{ ok: true }` or a rejection with status and message.
 */
export function isMcpRequestAuthorized(
  request: { path: string; headers: IncomingHttpHeaders | Record<string, string | string[] | undefined> },
  config: McpSecurityConfig,
): McpAuthorizationResult {
  const host = headerValue(request.headers, 'host');
  if (!host || !config.allowedHosts.includes(normalizeHostHeader(host))) {
    return { ok: false, status: 403, message: 'Host header is not allowed for KTX MCP.' };
  }
  const origin = headerValue(request.headers, 'origin');
  if (origin && !config.allowedOrigins.includes(origin)) {
    return { ok: false, status: 403, message: 'Origin header is not allowed for KTX MCP.' };
  }
  if (request.path === '/mcp' && config.token) {
    const auth = headerValue(request.headers, 'authorization');
    // Constant-time comparison: a plain `!==` can reveal the token prefix via response timing.
    if (auth === undefined || !secretsMatch(auth, `Bearer ${config.token}`)) {
      return { ok: false, status: 401, message: 'Missing or invalid KTX MCP bearer token.' };
    }
  }
  return { ok: true };
}
noopIo(); + const queryExecutor = createKtxCliIngestQueryExecutor(input.project); + const semanticLayerCompute = await createManagedPythonSemanticLayerComputePort({ + cliVersion: input.cliVersion, + installPolicy: 'auto', + io, + }); + const sqlAnalysis = createManagedDaemonSqlAnalysisPort({ + cliVersion: input.cliVersion, + projectDir: input.projectDir, + installPolicy: 'auto', + io, + }); + const contextTools = createLocalProjectMcpContextPorts(input.project, { + semanticLayerCompute, + queryExecutor, + sqlAnalysis, + localScan: { + createConnector: async (connectionId) => createKtxCliScanConnector(input.project, connectionId), + }, + localIngest: { + semanticLayerCompute, + queryExecutor, + }, + }); + + let memoryCapture: ReturnType | undefined; + try { + memoryCapture = createLocalProjectMemoryCapture(input.project, { semanticLayerCompute, queryExecutor }); + } catch (error) { + input.io?.stderr.write(`KTX MCP memory_capture disabled: ${error instanceof Error ? error.message : String(error)}\n`); + } + + return () => + createDefaultKtxMcpServer({ + name: 'ktx', + version: input.cliVersion, + userContext: { userId: 'local' }, + contextTools, + memoryCapture, + }); +} + +function listenerPort(server: Server, fallback: number): number { + const address = server.address(); + return typeof address === 'object' && address ? address.port : fallback; +} + +function transportAllowedHosts(config: McpSecurityConfig, server: Server): string[] { + const port = listenerPort(server, config.port); + const hosts = new Set(config.allowedHosts); + for (const host of config.allowedHosts) { + hosts.add(`${host}:${port}`); + if (config.port !== 0 && config.port !== port) { + hosts.add(`${host}:${config.port}`); + } + } + return [...hosts]; +} + +export async function runKtxMcpHttpServer(options: RunKtxMcpHttpServerOptions): Promise { + const config = buildMcpSecurityConfig(options); + const project = + options.createMcpServer === undefined + ? await (options.loadProject ?? 
loadKtxProject)({ projectDir: options.projectDir }) + : undefined; + const createMcpServer = + options.createMcpServer ?? + (await defaultMcpServerFactory({ + project: project!, + projectDir: options.projectDir, + cliVersion: options.cliVersion ?? '0.0.0-private', + io: options.io, + })); + const sessions = new Map(); + + async function newTransport(): Promise { + let transport: StreamableHTTPServerTransport; + transport = new StreamableHTTPServerTransport({ + sessionIdGenerator: () => randomUUID(), + onsessioninitialized: (sessionId) => { + sessions.set(sessionId, transport); + }, + onsessionclosed: (sessionId) => { + sessions.delete(sessionId); + }, + allowedHosts: transportAllowedHosts(config, server), + allowedOrigins: config.allowedOrigins, + enableDnsRebindingProtection: true, + }); + transport.onclose = () => { + if (transport.sessionId) { + sessions.delete(transport.sessionId); + } + }; + await createMcpServer().connect(transport); + return transport; + } + + const server = createServer(async (req, res) => { + const path = requestPath(req); + const auth = isMcpRequestAuthorized({ path, headers: req.headers }, config); + if (!auth.ok) { + writeText(res, auth.status, auth.message); + return; + } + + if (path === '/health' && req.method === 'GET') { + const port = listenerPort(server, config.port); + writeJson(res, 200, { status: 'ok', projectDir: options.projectDir, port }); + return; + } + + if (path !== '/mcp' || !['POST', 'GET', 'DELETE'].includes(req.method ?? '')) { + writeText(res, 404, 'Not found'); + return; + } + + const sessionId = req.headers['mcp-session-id']; + const normalizedSessionId = Array.isArray(sessionId) ? sessionId[0] : sessionId; + + if (req.method === 'POST') { + let body: unknown; + try { + body = await readJsonBody(req); + } catch (error) { + writeText(res, 400, `Invalid JSON body: ${error instanceof Error ? error.message : String(error)}`); + return; + } + const existing = normalizedSessionId ? 
sessions.get(normalizedSessionId) : undefined; + if (existing) { + await existing.handleRequest(req, res, body); + return; + } + if (normalizedSessionId) { + writeText(res, 404, `Unknown MCP session: ${normalizedSessionId}`); + return; + } + if (!isInitializeRequest(body)) { + writeText(res, 400, 'MCP initialize request is required before session traffic.'); + return; + } + await (await newTransport()).handleRequest(req, res, body); + return; + } + + if (!normalizedSessionId || !sessions.has(normalizedSessionId)) { + writeText(res, 404, normalizedSessionId ? `Unknown MCP session: ${normalizedSessionId}` : 'Missing MCP session id.'); + return; + } + await sessions.get(normalizedSessionId)!.handleRequest(req, res); + }); + + await new Promise((resolve, reject) => { + server.once('error', reject); + server.listen(config.port, config.host, () => { + server.off('error', reject); + resolve(); + }); + }); + + return { + server, + async close() { + for (const transport of sessions.values()) { + await transport.close(); + } + await new Promise((resolve, reject) => { + server.close((error) => (error ? 
reject(error) : resolve())); + }); + }, + }; +} diff --git a/packages/cli/src/print-command-tree.test.ts b/packages/cli/src/print-command-tree.test.ts index 9bbfa0a8..ececa88c 100644 --- a/packages/cli/src/print-command-tree.test.ts +++ b/packages/cli/src/print-command-tree.test.ts @@ -12,11 +12,12 @@ describe('renderKtxCommandTree', () => { .filter((line) => /^ {2}[├└]── \S/.test(line)) .map((line) => line.replace(/^ {2}[├└]── /, '').trim().split(' ')[0]); - for (const expected of ['setup', 'connection', 'ingest', 'sl', 'dev']) { + for (const expected of ['setup', 'connection', 'ingest', 'sl', 'mcp', 'dev']) { expect(topLevel).toContain(expected); } expect(output).toContain('│ └── test [connectionId]'); + expect(output).toContain('│ ├── status Show KTX MCP daemon status'); expect(output).not.toContain('│ ├── add'); expect(output).not.toContain('│ ├── remove'); expect(output).not.toContain('│ ├── map'); @@ -24,7 +25,6 @@ describe('renderKtxCommandTree', () => { expect(output).not.toContain('│ ├── metabase'); expect(output).not.toContain('│ ├── notion'); expect(output).not.toContain('scan '); - expect(output).not.toContain('│ ├── status'); expect(output).not.toContain('│ ├── replay'); expect(output).not.toContain('│ └── replay'); expect(output).not.toContain('│ ├── run'); diff --git a/packages/cli/src/setup-agents.test.ts b/packages/cli/src/setup-agents.test.ts index ee5c7718..9fb6903a 100644 --- a/packages/cli/src/setup-agents.test.ts +++ b/packages/cli/src/setup-agents.test.ts @@ -37,23 +37,28 @@ describe('setup agents', () => { await rm(tempDir, { recursive: true, force: true }); }); - it('plans project-scoped CLI files for every target', () => { + it('plans project-scoped CLI and research files for every target', () => { expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'claude-code', scope: 'project', mode: 'cli' })).toEqual([ { kind: 'file', path: join(tempDir, '.claude/skills/ktx/SKILL.md'), role: 'skill' }, + { kind: 'file', path: join(tempDir, 
'.claude/skills/ktx-research/SKILL.md'), role: 'research-skill' }, { kind: 'file', path: join(tempDir, '.claude/rules/ktx.md'), role: 'rule' }, ]); expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'codex', scope: 'project', mode: 'cli' })).toEqual([ { kind: 'file', path: join(tempDir, '.agents/skills/ktx/SKILL.md'), role: 'skill' }, + { kind: 'file', path: join(tempDir, '.agents/skills/ktx-research/SKILL.md'), role: 'research-skill' }, { kind: 'file', path: join(tempDir, '.codex/instructions/ktx.md'), role: 'rule' }, ]); expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'cursor', scope: 'project', mode: 'cli' })).toEqual([ { kind: 'file', path: join(tempDir, '.cursor/rules/ktx.mdc') }, + { kind: 'file', path: join(tempDir, '.cursor/rules/ktx-research.mdc'), role: 'research-skill' }, ]); expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'opencode', scope: 'project', mode: 'cli' })).toEqual([ { kind: 'file', path: join(tempDir, '.opencode/commands/ktx.md') }, + { kind: 'file', path: join(tempDir, '.opencode/commands/ktx-research.md'), role: 'research-skill' }, ]); expect(plannedKtxAgentFiles({ projectDir: tempDir, target: 'universal', scope: 'project', mode: 'cli' })).toEqual([ { kind: 'file', path: join(tempDir, '.agents/skills/ktx/SKILL.md') }, + { kind: 'file', path: join(tempDir, '.agents/skills/ktx-research/SKILL.md'), role: 'research-skill' }, ]); }); @@ -97,6 +102,31 @@ describe('setup agents', () => { expect(io.stderr()).toBe(''); }); + it('installs the research skill from the runtime asset', async () => { + const io = makeIo(); + + await expect( + runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'universal', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + io.io, + ), + ).resolves.toMatchObject({ status: 'ready' }); + + const researchSkill = await readFile(join(tempDir, '.agents/skills/ktx-research/SKILL.md'), 'utf-8'); + 
expect(researchSkill).toContain('name: ktx-research'); + expect(researchSkill).toContain('Always run `discover_data` before writing SQL.'); + expect(researchSkill).toContain('Treat a `dictionary_search` miss as non-authoritative.'); + }); + it('writes PATH-independent launcher commands for skills', async () => { const io = makeIo(); @@ -123,6 +153,178 @@ describe('setup agents', () => { expect(skill).not.toContain('sql execute'); }); + it('writes Claude Code project MCP config and tracks the json key', async () => { + const io = makeIo(); + + await expect( + runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'claude-code', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + io.io, + ), + ).resolves.toMatchObject({ status: 'ready' }); + + const mcpJson = JSON.parse(await readFile(join(tempDir, '.mcp.json'), 'utf-8')) as { + mcpServers: { ktx: { type: string; url: string; headers?: Record } }; + }; + expect(mcpJson.mcpServers.ktx).toEqual({ type: 'http', url: 'http://localhost:7878/mcp' }); + expect(await readKtxAgentInstallManifest(tempDir)).toMatchObject({ + entries: expect.arrayContaining([{ kind: 'json-key', path: join(tempDir, '.mcp.json'), jsonPath: ['mcpServers', 'ktx'] }]), + }); + expect(io.stdout()).toContain('Run `ktx mcp start` to enable the configured KTX MCP server.'); + }); + + it('writes Cursor project MCP config', async () => { + const io = makeIo(); + + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'cursor', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + io.io, + ); + + const cursorJson = JSON.parse(await readFile(join(tempDir, '.cursor/mcp.json'), 'utf-8')) as { + mcpServers: { ktx: { url: string; headers?: Record } }; + }; + expect(cursorJson.mcpServers.ktx).toEqual({ url: 'http://localhost:7878/mcp' }); + }); + + it('prints Codex and opencode snippets without mutating printed-only 
config files', async () => { + const codexIo = makeIo(); + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'codex', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + codexIo.io, + ); + expect(codexIo.stdout()).toContain('[mcp_servers.ktx]'); + expect(codexIo.stdout()).toContain('url = "http://localhost:7878/mcp"'); + + const opencodeIo = makeIo(); + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'opencode', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + opencodeIo.io, + ); + expect(opencodeIo.stdout()).toContain('"mcp"'); + expect(opencodeIo.stdout()).toContain('"type": "remote"'); + await expect(readFile(join(tempDir, 'opencode.json'), 'utf-8')).rejects.toThrow(); + }); + + it('uses MCP daemon state for port and token metadata without rendering literal tokens', async () => { + await mkdir(join(tempDir, '.ktx'), { recursive: true }); + await writeFile( + join(tempDir, '.ktx/mcp.json'), + `${JSON.stringify( + { + schemaVersion: 1, + pid: 999999, + host: '127.0.0.1', + port: 8787, + tokenAuth: true, + projectDir: tempDir, + startedAt: '2026-05-14T00:00:00.000Z', + logPath: join(tempDir, '.ktx/logs/mcp.log'), + }, + null, + 2, + )}\n`, + 'utf-8', + ); + const io = makeIo(); + const previousToken = process.env.KTX_MCP_TOKEN; + process.env.KTX_MCP_TOKEN = 'secret-token'; + + try { + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'claude-code', + scope: 'project', + mode: 'cli', + skipAgents: false, + }, + io.io, + ); + + const rendered = JSON.stringify(JSON.parse(await readFile(join(tempDir, '.mcp.json'), 'utf-8'))); + expect(rendered).toContain('http://127.0.0.1:8787/mcp'); + expect(rendered).toContain('Bearer ${KTX_MCP_TOKEN}'); + expect(rendered).not.toContain('secret-token'); + expect(io.stdout()).toContain('Run `ktx mcp 
start` to enable the configured KTX MCP server.'); + } finally { + if (previousToken === undefined) { + delete process.env.KTX_MCP_TOKEN; + } else { + process.env.KTX_MCP_TOKEN = previousToken; + } + } + }); + + it('writes Claude Code local MCP config under the project key in ~/.claude.json', async () => { + const home = await mkdtemp(join(tmpdir(), 'ktx-setup-agents-home-')); + const previousHome = process.env.HOME; + process.env.HOME = home; + try { + const io = makeIo(); + await runKtxSetupAgentsStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + agents: true, + target: 'claude-code', + scope: 'local', + mode: 'cli', + skipAgents: false, + }, + io.io, + ); + + const config = JSON.parse(await readFile(join(home, '.claude.json'), 'utf-8')) as { + projects: Record; + }; + expect(config.projects[tempDir].mcpServers.ktx).toEqual({ type: 'http', url: 'http://localhost:7878/mcp' }); + } finally { + if (previousHome === undefined) delete process.env.HOME; else process.env.HOME = previousHome; /* restore HOME exactly: assigning undefined to process.env would store the string "undefined" and leak into later tests */ + await rm(home, { recursive: true, force: true }); + } + }); + it('removes only manifest-listed files', async () => { const io = makeIo(); await runKtxSetupAgentsStep( diff --git a/packages/cli/src/setup-agents.ts b/packages/cli/src/setup-agents.ts index 7a18a969..a065fc41 100644 --- a/packages/cli/src/setup-agents.ts +++ b/packages/cli/src/setup-agents.ts @@ -1,3 +1,4 @@ +import { existsSync } from 'node:fs'; import { mkdir, readFile, rm, writeFile } from 'node:fs/promises'; import { dirname, join, relative, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; @@ -12,9 +13,10 @@ import { createKtxSetupPromptAdapter, type KtxSetupPromptOption, } from './setup-prompts.js'; +import { readKtxMcpDaemonStatus } from './managed-mcp-daemon.js'; export type KtxAgentTarget = 'claude-code' | 'codex' | 'cursor' | 'opencode' | 'universal'; -export type KtxAgentScope = 'project' | 'global'; +export type KtxAgentScope = 'project' | 'global' | 'local'; export type KtxAgentInstallMode = 'cli'; export interface 
KtxSetupAgentsArgs { @@ -45,18 +47,179 @@ export interface KtxAgentInstallManifest { installedAt: string; installs: Array<{ target: KtxAgentTarget; scope: KtxAgentScope; mode: KtxAgentInstallMode }>; entries: Array< - | { kind: 'file'; path: string; role?: 'skill' | 'rule' } + | { kind: 'file'; path: string; role?: 'skill' | 'rule' | 'research-skill' } | { kind: 'json-key'; path: string; jsonPath: string[] } >; } type InstallEntry = KtxAgentInstallManifest['entries'][number]; +interface KtxMcpEndpointInfo { + url: string; + tokenAuth: boolean; + running: boolean; +} + +interface KtxMcpClientInstallResult { + entries: InstallEntry[]; + snippets: string[]; + notices: string[]; +} + interface KtxCliLauncher { command: string; args: string[]; } +async function readJsonObject(path: string): Promise> { + if (!existsSync(path)) return {}; + const parsed = JSON.parse(await readFile(path, 'utf-8')) as unknown; + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + throw new Error(`Expected JSON object in ${path}`); + } + return parsed as Record; +} + +function objectAtPath(root: Record, jsonPath: string[]): Record { + let cursor = root; + for (const segment of jsonPath) { + const current = cursor[segment]; + if (!current || typeof current !== 'object' || Array.isArray(current)) { + cursor[segment] = {}; + } + cursor = cursor[segment] as Record; + } + return cursor; +} + +async function writeJsonKey(path: string, jsonPath: string[], value: unknown): Promise { + const root = await readJsonObject(path); + const parent = objectAtPath(root, jsonPath.slice(0, -1)); + parent[jsonPath.at(-1) as string] = value; + await mkdir(dirname(path), { recursive: true }); + await writeFile(path, `${JSON.stringify(root, null, 2)}\n`, 'utf-8'); +} + +async function resolveMcpEndpoint(projectDir: string): Promise { + const status = await readKtxMcpDaemonStatus({ projectDir }).catch(() => null); + if (status?.kind === 'running') { + return { + url: status.url, + tokenAuth: 
status.state.tokenAuth, + running: true, + }; + } + if (status?.kind === 'stale' && status.state) { + return { + url: `http://${status.state.host}:${status.state.port}/mcp`, + tokenAuth: status.state.tokenAuth || Boolean(process.env.KTX_MCP_TOKEN), + running: false, + }; + } + return { + url: 'http://localhost:7878/mcp', + tokenAuth: Boolean(process.env.KTX_MCP_TOKEN), + running: false, + }; +} + +function tokenHeaders(endpoint: KtxMcpEndpointInfo): Record | undefined { + return endpoint.tokenAuth ? { Authorization: 'Bearer ${KTX_MCP_TOKEN}' } : undefined; +} + +function claudeMcpEntry(endpoint: KtxMcpEndpointInfo): Record { + return { + type: 'http', + url: endpoint.url, + ...(tokenHeaders(endpoint) ? { headers: tokenHeaders(endpoint) } : {}), + }; +} + +function cursorMcpEntry(endpoint: KtxMcpEndpointInfo): Record { + return { + url: endpoint.url, + ...(tokenHeaders(endpoint) ? { headers: tokenHeaders(endpoint) } : {}), + }; +} + +function codexSnippet(endpoint: KtxMcpEndpointInfo): string { + if (endpoint.tokenAuth) { + return [ + 'Codex MCP config does not currently document HTTP headers.', + 'Run KTX on loopback without token auth for Codex, or configure headers after Codex documents support.', + ].join('\n'); + } + return [`[mcp_servers.ktx]`, `url = "${endpoint.url}"`].join('\n'); +} + +function opencodeSnippet(endpoint: KtxMcpEndpointInfo): string { + return JSON.stringify( + { + mcp: { + ktx: { + type: 'remote', + url: endpoint.url, + enabled: true, + ...(tokenHeaders(endpoint) ? { headers: tokenHeaders(endpoint) } : {}), + }, + }, + }, + null, + 2, + ); +} + +function claudeConfigPath(projectDir: string, scope: KtxAgentScope): { path: string; jsonPath: string[] } { + const home = process.env.HOME ?? 
''; + if (scope === 'global') { + return { path: join(home, '.claude.json'), jsonPath: ['mcpServers', 'ktx'] }; + } + if (scope === 'local') { + return { path: join(home, '.claude.json'), jsonPath: ['projects', resolve(projectDir), 'mcpServers', 'ktx'] }; + } + return { path: join(resolve(projectDir), '.mcp.json'), jsonPath: ['mcpServers', 'ktx'] }; +} + +function cursorConfigPath(projectDir: string, scope: KtxAgentScope): { path: string; jsonPath: string[] } { + const home = process.env.HOME ?? ''; + return { + path: scope === 'global' ? join(home, '.cursor/mcp.json') : join(resolve(projectDir), '.cursor/mcp.json'), + jsonPath: ['mcpServers', 'ktx'], + }; +} + +async function installMcpClientConfig(input: { + projectDir: string; + target: KtxAgentTarget; + scope: KtxAgentScope; +}): Promise { + const endpoint = await resolveMcpEndpoint(input.projectDir); + const entries: InstallEntry[] = []; + const snippets: string[] = []; + const notices: string[] = []; + + if (!endpoint.running) { + notices.push('Run `ktx mcp start` to enable the configured KTX MCP server.'); + } + + if (input.target === 'claude-code') { + const config = claudeConfigPath(input.projectDir, input.scope); + await writeJsonKey(config.path, config.jsonPath, claudeMcpEntry(endpoint)); + entries.push({ kind: 'json-key', path: config.path, jsonPath: config.jsonPath }); + } else if (input.target === 'cursor') { + const config = cursorConfigPath(input.projectDir, input.scope); + await writeJsonKey(config.path, config.jsonPath, cursorMcpEntry(endpoint)); + entries.push({ kind: 'json-key', path: config.path, jsonPath: config.jsonPath }); + } else if (input.target === 'codex') { + snippets.push(`Codex MCP snippet for ~/.codex/config.toml:\n${codexSnippet(endpoint)}`); + } else if (input.target === 'opencode') { + const path = input.scope === 'global' ? 
'~/.config/opencode/opencode.json' : relative(input.projectDir, join(input.projectDir, 'opencode.json')); + snippets.push(`opencode MCP snippet for ${path}:\n${opencodeSnippet(endpoint)}`); + } + + return { entries, snippets, notices }; +} + export function agentInstallManifestPath(projectDir: string): string { return join(resolve(projectDir), '.ktx/agents/install-manifest.json'); } @@ -72,6 +235,7 @@ export function plannedKtxAgentFiles(input: { const home = process.env.HOME ?? ''; return [ { kind: 'file', path: join(home, '.claude/skills/ktx/SKILL.md'), role: 'skill' as const }, + { kind: 'file', path: join(home, '.claude/skills/ktx-research/SKILL.md'), role: 'research-skill' as const }, { kind: 'file', path: join(home, '.claude/rules/ktx.md'), role: 'rule' as const }, ]; } @@ -79,25 +243,44 @@ export function plannedKtxAgentFiles(input: { const codexHome = process.env.CODEX_HOME ?? join(process.env.HOME ?? '', '.codex'); return [ { kind: 'file', path: join(codexHome, 'skills/ktx/SKILL.md'), role: 'skill' as const }, + { kind: 'file', path: join(codexHome, 'skills/ktx-research/SKILL.md'), role: 'research-skill' as const }, { kind: 'file', path: join(codexHome, 'instructions/ktx.md'), role: 'rule' as const }, ]; } + if (input.target === 'cursor' || input.target === 'opencode') { + return []; + } throw new Error(`Global ${input.target} installation is not supported; omit --global.`); } const root = resolve(input.projectDir); - const cliEntries: Partial> = { - 'claude-code': { kind: 'file', path: join(root, '.claude/skills/ktx/SKILL.md'), role: 'skill' }, - codex: { kind: 'file', path: join(root, '.agents/skills/ktx/SKILL.md'), role: 'skill' }, - cursor: { kind: 'file', path: join(root, '.cursor/rules/ktx.mdc') }, - opencode: { kind: 'file', path: join(root, '.opencode/commands/ktx.md') }, - universal: { kind: 'file', path: join(root, '.agents/skills/ktx/SKILL.md') }, + const cliEntries: Partial> = { + 'claude-code': [ + { kind: 'file', path: join(root, 
'.claude/skills/ktx/SKILL.md'), role: 'skill' }, + { kind: 'file', path: join(root, '.claude/skills/ktx-research/SKILL.md'), role: 'research-skill' }, + ], + codex: [ + { kind: 'file', path: join(root, '.agents/skills/ktx/SKILL.md'), role: 'skill' }, + { kind: 'file', path: join(root, '.agents/skills/ktx-research/SKILL.md'), role: 'research-skill' }, + ], + cursor: [ + { kind: 'file', path: join(root, '.cursor/rules/ktx.mdc') }, + { kind: 'file', path: join(root, '.cursor/rules/ktx-research.mdc'), role: 'research-skill' }, + ], + opencode: [ + { kind: 'file', path: join(root, '.opencode/commands/ktx.md') }, + { kind: 'file', path: join(root, '.opencode/commands/ktx-research.md'), role: 'research-skill' }, + ], + universal: [ + { kind: 'file', path: join(root, '.agents/skills/ktx/SKILL.md') }, + { kind: 'file', path: join(root, '.agents/skills/ktx-research/SKILL.md'), role: 'research-skill' }, + ], }; const ruleEntries: Partial> = { 'claude-code': { kind: 'file', path: join(root, '.claude/rules/ktx.md'), role: 'rule' }, codex: { kind: 'file', path: join(root, '.codex/instructions/ktx.md'), role: 'rule' }, }; - return [cliEntries[input.target], ruleEntries[input.target]].filter( + return [...(cliEntries[input.target] ?? []), ruleEntries[input.target]].filter( (entry): entry is InstallEntry => entry !== undefined, ); } @@ -109,6 +292,12 @@ function ktxCliLauncher(): KtxCliLauncher { }; } +async function readResearchSkillContent(): Promise { + const path = fileURLToPath(new URL('./skills/research/SKILL.md', import.meta.url)); + const content = await readFile(path, 'utf-8'); + return content.endsWith('\n') ? 
content : `${content}\n`; +} + function shellQuote(value: string): string { if (/^[A-Za-z0-9_/:=.,@%+-]+$/.test(value)) { return value; @@ -283,16 +472,22 @@ export function formatInstallSummary( projectDir: string, ): string { const entriesByTarget = new Map(); - let idx = 0; for (const install of installs) { - const planned = plannedKtxAgentFiles({ projectDir, ...install }); - entriesByTarget.set(install.target, entries.slice(idx, idx + planned.length)); - idx += planned.length; + const plannedFilePaths = new Set( + plannedKtxAgentFiles({ projectDir, ...install }) + .filter((entry) => entry.kind === 'file') + .map((entry) => entry.path), + ); + entriesByTarget.set( + install.target, + entries.filter((entry) => entry.kind === 'file' && plannedFilePaths.has(entry.path)), + ); } const fileHints: Record = { skill: 'teaches your agent which KTX commands to run', rule: 'tells your agent when to use KTX', + 'research-skill': 'teaches your agent the KTX MCP research workflow', }; const lines: string[] = []; @@ -304,7 +499,7 @@ export function formatInstallSummary( install.scope === 'global' ? entry.path : relative(projectDir, entry.path); if (entry.kind === 'file') { const isRule = entry.role === 'rule' || fileEntryLabels[install.target] === 'Rule installed'; - const label = isRule ? 'Rule installed' : fileEntryLabels[install.target]; + const label = entry.role === 'research-skill' ? 'Research skill installed' : isRule ? 'Rule installed' : fileEntryLabels[install.target]; - const hint = fileHints[isRule ? 'rule' : (entry.role ?? 'skill')] ?? ''; + const hint = fileHints[entry.role === 'research-skill' ? 'research-skill' : isRule ? 'rule' : (entry.role ?? 'skill')] ?? ''; /* research-skill entries keep their own hint even on rule-labelled targets, matching the label precedence above */ lines.push(` + ${label} — ${hint}`); lines.push(` ${displayPath}`); @@ -327,6 +522,8 @@ async function installTarget(input: { const content = entry.role === 'rule' ? ruleInstructionContent({ projectDir: input.projectDir }) + : entry.role === 'research-skill' + ? 
await readResearchSkillContent() : cliInstructionContent({ projectDir: input.projectDir, launcher }); await mkdir(dirname(entry.path), { recursive: true }); await writeFile(entry.path, content, 'utf-8'); @@ -391,11 +588,25 @@ export async function runKtxSetupAgentsStep( const installs = targets.map((target) => ({ target, scope: args.scope, mode })); const entries: InstallEntry[] = []; + const snippets: string[] = []; + const notices = new Set(); try { - for (const install of installs) entries.push(...(await installTarget({ projectDir: args.projectDir, ...install }))); + for (const install of installs) { + entries.push(...(await installTarget({ projectDir: args.projectDir, ...install }))); + const mcpResult = await installMcpClientConfig({ projectDir: args.projectDir, target: install.target, scope: install.scope }); + entries.push(...mcpResult.entries); + for (const snippet of mcpResult.snippets) snippets.push(snippet); + for (const notice of mcpResult.notices) notices.add(notice); + } await writeManifest(args.projectDir, mergeManifest(args.projectDir, await readKtxAgentInstallManifest(args.projectDir), installs, entries)); await markAgentsComplete(args.projectDir); io.stdout.write(`\nAgent integration complete\n\n${formatInstallSummary(installs, entries, args.projectDir)}\n`); + for (const snippet of snippets) { + io.stdout.write(`\n${snippet}\n`); + } + for (const notice of notices) { + io.stdout.write(`\n${notice}\n`); + } return { status: 'ready', projectDir: args.projectDir, installs }; } catch (error) { io.stderr.write(`${error instanceof Error ? 
error.message : String(error)}\n`); diff --git a/packages/cli/src/skills/research/SKILL.md b/packages/cli/src/skills/research/SKILL.md new file mode 100644 index 00000000..e8e354a3 --- /dev/null +++ b/packages/cli/src/skills/research/SKILL.md @@ -0,0 +1,49 @@ +--- +name: ktx-research +description: Use when answering a question that needs data from a KTX-connected database - investigating, analyzing, "how many", "show me", "what's the breakdown of", finding records by value, exploring tables, comparing periods, or any data-investigation request. Triggers even when the user does not say "research"; if the answer requires querying a configured KTX connection, this skill applies. +--- + +# KTX Research Workflow + +You have access to KTX MCP tools for investigating data. Follow this workflow. + + +1. **Discover** - call `discover_data` first to see what exists across wiki, semantic-layer sources, and raw tables. Returns refs only. +2. **Inspect top hits in parallel** - for each promising ref: + - `kind: 'wiki'` -> `wiki_read` + - `kind: 'sl_source'`, `kind: 'sl_measure'`, or `kind: 'sl_dimension'` -> `sl_read_source` + - `kind: 'table'` or `kind: 'column'` -> `entity_details` +3. **Resolve literals** - if the user named a value such as "Acme Corp" or "status=shipped", call `dictionary_search` to find which column holds it. +4. **Query** - + - Prefer `sl_query` when the semantic layer covers the question. + - Use `sql_execution` only for questions the semantic layer does not cover. +5. **Capture learnings** - at the end of the turn, call `memory_capture` so future turns benefit. Skip when the answer carries no durable knowledge. + + + +- Always run `discover_data` before writing SQL. Do not guess table names. +- Prefer the semantic layer over raw SQL when both can answer the question; measures are the source of truth. +- Read entity details before writing SQL against an unfamiliar table. Do not assume column names. +- Treat `sql_execution` as read-only. 
Writes are rejected by the server. +- Validate value mentions with `dictionary_search` instead of guessing case or spelling. Treat a `dictionary_search` miss as non-authoritative. The index is built from profile-sampled values, so a missing value may simply have been outside the sample. Follow up with `sql_execution` against the most plausible columns before concluding the value is absent. + + +**Input:** "How many orders did Acme Corp place last month?" + +**Workflow:** +1. `discover_data({ query: "orders customer monthly" })` finds an orders semantic-layer source. +2. `sl_read_source({ connectionId: "warehouse", sourceName: "orders_facts" })` confirms the source grain, measures, and dimensions. +3. `dictionary_search({ values: ["Acme Corp"] })` finds `customers.name`. +4. `sl_query({ connectionId: "warehouse", measures: ["order_count"], filters: ["customer_name = 'Acme Corp'"] })` answers through the semantic layer. +5. `memory_capture({ userMessage, assistantMessage })` captures the durable finding. + +--- + +**Input:** "What columns does the events table have?" + +**Workflow:** +1. `discover_data({ query: "events table" })` returns a `table` ref. +2. `entity_details({ connectionId: "warehouse", entities: [{ table: "analytics.events" }] })` returns columns, types, and foreign keys. +3. Answer directly. No query is needed. + diff --git a/packages/context/skills/_shared/identifier-verification.md b/packages/context/skills/_shared/identifier-verification.md index 775203bd..1741d880 100644 --- a/packages/context/skills/_shared/identifier-verification.md +++ b/packages/context/skills/_shared/identifier-verification.md @@ -8,16 +8,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2.
`entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. diff --git a/packages/context/skills/dbt_ingest/SKILL.md b/packages/context/skills/dbt_ingest/SKILL.md index a3ce0151..fdaf586f 100644 --- a/packages/context/skills/dbt_ingest/SKILL.md +++ b/packages/context/skills/dbt_ingest/SKILL.md @@ -41,16 +41,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. 
If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. diff --git a/packages/context/skills/historic_sql_patterns/SKILL.md b/packages/context/skills/historic_sql_patterns/SKILL.md index 5e898c47..057a7c78 100644 --- a/packages/context/skills/historic_sql_patterns/SKILL.md +++ b/packages/context/skills/historic_sql_patterns/SKILL.md @@ -31,16 +31,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. 
If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. diff --git a/packages/context/skills/historic_sql_table_digest/SKILL.md b/packages/context/skills/historic_sql_table_digest/SKILL.md index 0815e3dc..99cf6936 100644 --- a/packages/context/skills/historic_sql_table_digest/SKILL.md +++ b/packages/context/skills/historic_sql_table_digest/SKILL.md @@ -27,16 +27,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. 
diff --git a/packages/context/skills/live_database_ingest/SKILL.md b/packages/context/skills/live_database_ingest/SKILL.md index 2b9cb6d8..48a476ef 100644 --- a/packages/context/skills/live_database_ingest/SKILL.md +++ b/packages/context/skills/live_database_ingest/SKILL.md @@ -37,16 +37,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. 
diff --git a/packages/context/skills/looker_ingest/SKILL.md b/packages/context/skills/looker_ingest/SKILL.md index 7a41fa6e..45e0f906 100644 --- a/packages/context/skills/looker_ingest/SKILL.md +++ b/packages/context/skills/looker_ingest/SKILL.md @@ -34,16 +34,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. 
diff --git a/packages/context/skills/lookml_ingest/SKILL.md b/packages/context/skills/lookml_ingest/SKILL.md index 52b08438..91640504 100644 --- a/packages/context/skills/lookml_ingest/SKILL.md +++ b/packages/context/skills/lookml_ingest/SKILL.md @@ -64,16 +64,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. @@ -85,11 +85,11 @@ SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: **Required flow before writing any overlay or standalone**: 1. Call `sl_discover({ query: "" })` for each base table you're about to touch. That returns the real columns. -2. If the table isn't in the manifest, use the warehouse `connectionName` +2. 
If the table isn't in the manifest, use the warehouse `connectionId` returned by `discover_data` or the target connection chosen from `sl_discover`, then call a dialect-appropriate SQL probe with that - connection name, for example: - `sql_execution({connectionName: "warehouse", sql: "SELECT 1 FROM analytics.orders LIMIT 0"})`. + connection id, for example: + `sql_execution({connectionId: "warehouse", sql: "SELECT 1 FROM analytics.orders LIMIT 0"})`. Replace `warehouse`, `analytics`, and `orders` with the verified connection, schema or dataset, and table from the WorkUnit evidence. 3. Use only those names in `sql:`, `columns:`, and `grain:`. Map each `dimension_group` to ONE `{ name: , type: time, role: time }` entry - never one per timeframe. diff --git a/packages/context/skills/metabase_ingest/SKILL.md b/packages/context/skills/metabase_ingest/SKILL.md index aefd067f..921d0c50 100644 --- a/packages/context/skills/metabase_ingest/SKILL.md +++ b/packages/context/skills/metabase_ingest/SKILL.md @@ -57,16 +57,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. 
If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. diff --git a/packages/context/skills/metricflow_ingest/SKILL.md b/packages/context/skills/metricflow_ingest/SKILL.md index 42caf604..f46d29ec 100644 --- a/packages/context/skills/metricflow_ingest/SKILL.md +++ b/packages/context/skills/metricflow_ingest/SKILL.md @@ -42,16 +42,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. 
@@ -82,8 +82,8 @@ The `model:` field on a semantic_model is a string like `ref('table_name')`, `so If `sl_discover` errors because no such table exists, use `discover_data` and `entity_details` to find the warehouse target. If a SQL probe is still needed, -call `sql_execution` with the same warehouse connection name, for example: -`sql_execution({connectionName: "warehouse", sql: "SELECT 1 FROM analytics.orders LIMIT 0"})`. +call `sql_execution` with the same warehouse connection id, for example: +`sql_execution({connectionId: "warehouse", sql: "SELECT 1 FROM analytics.orders LIMIT 0"})`. **Never invent column names** - every column in computed `columns:`, `column_overrides:`, `grain:`, and `sql:` must be sourced from raw files, `entity_details`, or a successful SQL probe. diff --git a/packages/context/skills/notion_synthesize/SKILL.md b/packages/context/skills/notion_synthesize/SKILL.md index e799ce7c..877b568b 100644 --- a/packages/context/skills/notion_synthesize/SKILL.md +++ b/packages/context/skills/notion_synthesize/SKILL.md @@ -79,16 +79,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 
4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. diff --git a/packages/context/skills/sl_capture/SKILL.md b/packages/context/skills/sl_capture/SKILL.md index 22e55859..272f6860 100644 --- a/packages/context/skills/sl_capture/SKILL.md +++ b/packages/context/skills/sl_capture/SKILL.md @@ -214,16 +214,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. - Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. 
@@ -239,7 +239,7 @@ SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: 3. `sl_read_source({ connectionId, sourceName })` - read the raw YAML before editing. 4. For modifications: `sl_edit_source({ connectionId, sourceName, yaml_edits: [{ oldText, newText, reason }] })` with exact-string replacements. `oldText` must match exactly and be unique in the file. 5. For new sources or full rewrites: `sl_write_source({ connectionId, sourceName, source })` with the full structured source definition. -6. For join discovery: use `sql_execution({connectionName: "warehouse", sql: "SELECT count(*) FROM public.orders o JOIN public.customers c ON c.id = o.customer_id LIMIT 20"})` with the target warehouse connection name and dialect-correct table names to verify the join key exists in both tables and assess cardinality before declaring the join. +6. For join discovery: use `sql_execution({connectionId: "warehouse", sql: "SELECT count(*) FROM public.orders o JOIN public.customers c ON c.id = o.customer_id LIMIT 20"})` with the target warehouse connection id and dialect-correct table names to verify the join key exists in both tables and assess cardinality before declaring the join. 7. Cross-reference knowledge: author the edge once on the **wiki** side via `sl_refs: [source_name]` in the page's front-matter. The reverse edge (wiki pages that cite an SL source) is derived automatically by the reconciler - do not add a `knowledge_refs:` field to SL YAMLs. 8. `sl_validate` - run after writing or editing to surface schema issues, duplicate measure names, and cross-source validation errors. Read-only; the writes are already committed (the squash-at-end flow will collapse them into one commit). 
@@ -315,7 +315,7 @@ Prior turn: user asked to correlate LTV with protocol count; assistant joined `f sl_read_source({ connectionId: "warehouse", sourceName: "fct_orders" }) → no joins section yet sql_execution({ - connectionName: "warehouse", + connectionId: "warehouse", sql: "SELECT COUNT(*), COUNT(DISTINCT a.admin_user_id) FROM public.fct_orders a JOIN public.fct_mau_multiprotocol b ON a.admin_user_id = b.admin_user_id LIMIT 1" }) → confirms cardinality (many orders per MAU row = many_to_one) diff --git a/packages/context/skills/wiki_capture/SKILL.md b/packages/context/skills/wiki_capture/SKILL.md index 55601f99..831161c5 100644 --- a/packages/context/skills/wiki_capture/SKILL.md +++ b/packages/context/skills/wiki_capture/SKILL.md @@ -60,16 +60,16 @@ Before writing a wiki page or SL source on any topic: Before emitting any `schema.table` or `schema.table.column` into a wiki body, SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`: -2. `entity_details({connectionName, targets: [{display: ""}]})` - +2. `entity_details({connectionId, targets: [{display: ""}]})` - confirm the identifier resolves; inspect native types, FK/PK, and sampleValues. 3. For literal values from the source, such as status codes or plan tiers, check whether they appear in `entity_details` sampleValues for the relevant column. If sampleValues is short or the sample may have missed real values, - run a `sql_execution` probe with the same warehouse connection name: - `sql_execution({connectionName, sql: "SELECT DISTINCT FROM LIMIT 50"})`. + run a `sql_execution` probe with the same warehouse connection id: + `sql_execution({connectionId, sql: "SELECT DISTINCT FROM LIMIT 50"})`. 4. If the candidate identifier still does not resolve, do one of: - - Use `sql_execution({connectionName, sql: "SELECT 1 FROM LIMIT 0"})`. + - Use `sql_execution({connectionId, sql: "SELECT 1 FROM LIMIT 0"})`. If it errors, the identifier is fictional. 
- Wrap the identifier in `[unverified - from ]` in the wiki body, citing the exact raw path that mentioned it. diff --git a/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts b/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts index 36461bb2..3b5697d0 100644 --- a/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts +++ b/packages/context/src/ingest/adapters/historic-sql/historic-sql.adapter.test.ts @@ -18,6 +18,9 @@ const sqlAnalysis: SqlAnalysisPort = { async analyzeBatch() { return new Map(); }, + async validateReadOnly() { + return { ok: true }; + }, }; const reader: HistoricSqlReader = { @@ -79,6 +82,9 @@ describe('HistoricSqlSourceAdapter', () => { ], ]); }, + async validateReadOnly() { + return { ok: true }; + }, }; const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis: batchSqlAnalysis, diff --git a/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts b/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts index 5540c991..e610de76 100644 --- a/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +++ b/packages/context/src/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts @@ -159,6 +159,7 @@ function acceptanceSqlAnalysis(): SqlAnalysisPort { ); }, ), + validateReadOnly: vi.fn(async () => ({ ok: true })), }; } diff --git a/packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts b/packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts index b2af032f..d09a4d40 100644 --- a/packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts +++ b/packages/context/src/ingest/adapters/historic-sql/stage-unified.test.ts @@ -83,6 +83,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { ], ['bad-parse', { tablesTouched: [], columnsByClause: {}, error: 'parse failed' }], ])), + validateReadOnly: vi.fn(async () => ({ ok: true })), }; await 
stageHistoricSqlAggregatedSnapshot({ @@ -207,6 +208,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { }, ], ])), + validateReadOnly: vi.fn(async () => ({ ok: true })), }; await stageHistoricSqlAggregatedSnapshot({ @@ -283,6 +285,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { }, ], ])), + validateReadOnly: vi.fn(async () => ({ ok: true })), }; await stageHistoricSqlAggregatedSnapshot({ @@ -403,6 +406,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { }, ], ])), + validateReadOnly: vi.fn(async () => ({ ok: true })), }; await stageHistoricSqlAggregatedSnapshot({ diff --git a/packages/context/src/ingest/ingest-runtime-assets.test.ts b/packages/context/src/ingest/ingest-runtime-assets.test.ts index c77bee11..6b9d83ba 100644 --- a/packages/context/src/ingest/ingest-runtime-assets.test.ts +++ b/packages/context/src/ingest/ingest-runtime-assets.test.ts @@ -94,11 +94,15 @@ describe('ingest runtime assets', () => { it('packages identifier verification prompt assets', async () => { const shared = await readFile(join(skillsDir, '_shared', 'identifier-verification.md'), 'utf-8'); + const legacyConnectionPrefix = ['connection', 'Name'].join(''); + expect(shared).toContain('## Identifier Verification Protocol'); expect(shared).toContain('discover_data'); expect(shared).toContain('entity_details'); expect(shared).toContain('sql_execution'); - expect(shared).toContain('sql_execution({connectionName, sql: "SELECT DISTINCT'); - expect(shared).toContain('sql_execution({connectionName, sql: "SELECT 1 FROM'); + expect(shared).toContain('sql_execution({connectionId, sql: "SELECT DISTINCT'); + expect(shared).toContain('sql_execution({connectionId, sql: "SELECT 1 FROM'); + expect(shared).not.toContain(`entity_details({${legacyConnectionPrefix}`); + expect(shared).not.toContain(`sql_execution({${legacyConnectionPrefix}`); }); }); diff --git a/packages/context/src/ingest/local-adapters.test.ts b/packages/context/src/ingest/local-adapters.test.ts index 
17269698..a4e9eea6 100644 --- a/packages/context/src/ingest/local-adapters.test.ts +++ b/packages/context/src/ingest/local-adapters.test.ts @@ -97,6 +97,9 @@ describe('local ingest adapters', () => { async analyzeBatch() { return new Map(); }, + async validateReadOnly() { + return { ok: true }; + }, }; const adapters = createDefaultLocalIngestAdapters(project, { historicSql: { @@ -140,6 +143,9 @@ describe('local ingest adapters', () => { async analyzeBatch() { return new Map(); }, + async validateReadOnly() { + return { ok: true }; + }, }, reader, queryClient, @@ -166,6 +172,9 @@ describe('local ingest adapters', () => { async analyzeBatch() { return new Map(); }, + async validateReadOnly() { + return { ok: true }; + }, }, postgresQueryClient: { async executeQuery() { @@ -258,6 +267,9 @@ describe('local ingest adapters', () => { async analyzeBatch() { return new Map(); }, + async validateReadOnly() { + return { ok: true }; + }, }, postgresQueryClient: { async executeQuery() { diff --git a/packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.test.ts b/packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.test.ts index 979873a4..8982e300 100644 --- a/packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.test.ts +++ b/packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.test.ts @@ -1,7 +1,7 @@ import { beforeEach, describe, expect, it, vi } from 'vitest'; +import type { WarehouseCatalogService } from '../../../scan/warehouse-catalog.js'; import type { BaseTool, ToolContext } from '../../../tools/index.js'; import { DiscoverDataTool } from './discover-data.tool.js'; -import type { WarehouseCatalogService } from './warehouse-catalog.service.js'; describe('DiscoverDataTool', () => { const wikiSearchTool = { call: vi.fn() } as unknown as BaseTool & { call: ReturnType }; @@ -36,7 +36,7 @@ describe('DiscoverDataTool', () => { catalog.searchByName.mockResolvedValue([ { kind: 'table', - 
connectionName: 'warehouse', + connectionId: 'warehouse', ref: { catalog: null, db: 'public', name: 'orders' }, display: 'public.orders', matchedOn: 'name', @@ -45,28 +45,28 @@ describe('DiscoverDataTool', () => { }); it('groups wiki, semantic layer, and raw schema hits with routing hints', async () => { - const result = await tool.call({ query: 'orders', connectionName: 'warehouse', limit: 5 }, context); + const result = await tool.call({ query: 'orders', connectionId: 'warehouse', limit: 5 }, context); expect(result.markdown).toContain('## Wiki Pages'); expect(result.markdown).toContain('use `wiki_read(blockKey)` for full content'); expect(result.markdown).toContain('## Semantic Layer Sources'); expect(result.markdown).toContain('use `sl_read_source(sourceName)` for the YAML'); expect(result.markdown).toContain('## Raw Warehouse Schema'); - expect(result.markdown).toContain('use `entity_details({connectionName, targets: [{display}]})`'); + expect(result.markdown).toContain('use `entity_details({connectionId, targets: [{display}]})`'); expect(result.structured.raw?.hits).toHaveLength(1); }); - it('includes connectionName on raw schema hits so entity_details can follow up', async () => { + it('includes connectionId on raw schema hits so entity_details can follow up', async () => { const multiConnectionContext: ToolContext = { ...context, session: { allowedConnectionNames: new Set(['warehouse', 'analytics']) } as any, }; - catalog.searchByName.mockImplementation(async (connectionName: string, query: string) => [ + catalog.searchByName.mockImplementation(async (connectionId: string, query: string) => [ { kind: 'table', - connectionName, - ref: { catalog: null, db: 'public', name: `${connectionName}_${query}` }, - display: `public.${connectionName}_${query}`, + connectionId, + ref: { catalog: null, db: 'public', name: `${connectionId}_${query}` }, + display: `public.${connectionId}_${query}`, matchedOn: 'name', }, ]); @@ -75,16 +75,16 @@ describe('DiscoverDataTool', 
() => { expect(catalog.searchByName).toHaveBeenCalledWith('analytics', 'orders', 10); expect(catalog.searchByName).toHaveBeenCalledWith('warehouse', 'orders', 10); - expect(result.markdown).toContain('connectionName=analytics'); - expect(result.markdown).toContain('connectionName=warehouse'); + expect(result.markdown).toContain('connectionId=analytics'); + expect(result.markdown).toContain('connectionId=warehouse'); expect(result.markdown).toContain( - 'entity_details({connectionName: "analytics", targets: [{display: "public.analytics_orders"}]})', + 'entity_details({connectionId: "analytics", targets: [{display: "public.analytics_orders"}]})', ); - expect(result.structured.raw?.hits.map((hit) => hit.connectionName)).toEqual(['analytics', 'warehouse']); + expect(result.structured.raw?.hits.map((hit) => hit.connectionId)).toEqual(['analytics', 'warehouse']); }); it('refuses explicit out-of-scope connection names', async () => { - const result = await tool.call({ query: 'orders', connectionName: 'billing' }, context); + const result = await tool.call({ query: 'orders', connectionId: 'billing' }, context); expect(result.markdown).toContain('Connection "billing" is not available to this ingest stage.'); expect(result.structured).toEqual({ wiki: null, sl: null, raw: null }); @@ -99,7 +99,7 @@ describe('DiscoverDataTool', () => { structured: { sourceName: 'orders' }, }); - const result = await tool.call({ sourceName: 'orders', connectionName: 'warehouse' }, context); + const result = await tool.call({ sourceName: 'orders', connectionId: 'warehouse' }, context); expect(slDiscoverTool.call).toHaveBeenCalledWith({ sourceName: 'orders', connectionId: 'warehouse' }, context); expect(wikiSearchTool.call).not.toHaveBeenCalled(); @@ -112,8 +112,20 @@ describe('DiscoverDataTool', () => { slDiscoverTool.call.mockResolvedValueOnce({ markdown: '', structured: { totalSources: 0, sources: [] } }); catalog.searchByName.mockResolvedValueOnce([]); - const result = await tool.call({ 
query: 'customer source', connectionName: 'warehouse' }, context); + const result = await tool.call({ query: 'customer source', connectionId: 'warehouse' }, context); expect(result.markdown).toContain('No matches for "customer source" across wiki, semantic layer, or raw warehouse schema.'); }); + + it('uses connectionId as the optional connection filter', () => { + const legacyConnectionField = ['connection', 'Name'].join(''); + + expect(tool.parseInput({ query: 'orders', connectionId: 'warehouse', limit: 5 })).toEqual({ + query: 'orders', + connectionId: 'warehouse', + limit: 5, + }); + + expect(() => tool.parseInput({ query: 'orders', [legacyConnectionField]: 'warehouse', limit: 5 })).toThrow(); + }); }); diff --git a/packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.ts b/packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.ts index 667d8f83..4d13ea6b 100644 --- a/packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.ts +++ b/packages/context/src/ingest/tools/warehouse-verification/discover-data.tool.ts @@ -1,13 +1,13 @@ import { z } from 'zod'; +import { WarehouseCatalogService, type RawSchemaHit } from '../../../scan/warehouse-catalog.js'; import { BaseTool, type ToolContext, type ToolOutput } from '../../../tools/index.js'; -import { WarehouseCatalogService, type RawSchemaHit } from './warehouse-catalog.service.js'; const discoverDataInputSchema = z.object({ query: z.string().optional(), - connectionName: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/).optional(), + connectionId: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/).optional(), limit: z.number().int().positive().max(50).optional().default(10), sourceName: z.string().optional(), -}); +}).strict(); type DiscoverDataInput = z.input; @@ -62,16 +62,16 @@ export class DiscoverDataTool extends BaseTool { async call(input: DiscoverDataInput, context: ToolContext): Promise> { const allowed = allowedConnectionNames(context); - if 
(input.connectionName && allowed && !allowed.has(input.connectionName)) { + if (input.connectionId && allowed && !allowed.has(input.connectionId)) { return { - markdown: `Connection "${input.connectionName}" is not available to this ingest stage.`, + markdown: `Connection "${input.connectionId}" is not available to this ingest stage.`, structured: { wiki: null, sl: null, raw: null }, }; } if (input.sourceName) { const sl = await this.deps.slDiscoverTool.call( - { sourceName: input.sourceName, connectionId: input.connectionName }, + { sourceName: input.sourceName, connectionId: input.connectionId }, context, ); return { markdown: sl.markdown, structured: { wiki: null, sl: sl.structured, raw: null } }; @@ -93,7 +93,7 @@ export class DiscoverDataTool extends BaseTool { } const slResult = await this.deps.slDiscoverTool.call( - { query: query || undefined, connectionId: input.connectionName }, + { query: query || undefined, connectionId: input.connectionId }, context, ); if (totalSources(slResult.structured) > 0) { @@ -107,23 +107,23 @@ export class DiscoverDataTool extends BaseTool { } const catalog = this.deps.catalogFactory(context); - const connections = input.connectionName ? [input.connectionName] : [...(allowed ?? [])].sort(); + const connections = input.connectionId ? [input.connectionId] : [...(allowed ?? 
[])].sort(); const rawHits: RawSchemaHit[] = []; - for (const connectionName of connections) { - rawHits.push(...(await catalog.searchByName(connectionName, query, limit))); + for (const connectionId of connections) { + rawHits.push(...(await catalog.searchByName(connectionId, query, limit))); } if (rawHits.length > 0) { parts.push( '## Raw Warehouse Schema', - '> use `entity_details({connectionName, targets: [{display}]})` for full DDL + sample values', + '> use `entity_details({connectionId, targets: [{display}]})` for full DDL + sample values', ); parts.push( rawHits .slice(0, limit) .map( (hit) => - `- ${hit.kind}: ${hit.display} [connectionName=${hit.connectionName}] (matched on ${hit.matchedOn}) - ` + - `follow up with \`entity_details({connectionName: "${hit.connectionName}", targets: [{display: "${hit.display}"}]})\``, + `- ${hit.kind}: ${hit.display} [connectionId=${hit.connectionId}] (matched on ${hit.matchedOn}) - ` + + `follow up with \`entity_details({connectionId: "${hit.connectionId}", targets: [{display: "${hit.display}"}]})\``, ) .join('\n'), ); diff --git a/packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.test.ts b/packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.test.ts index 24a14863..e6cdbdc8 100644 --- a/packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.test.ts +++ b/packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.test.ts @@ -3,9 +3,9 @@ import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; import { initKtxProject, type KtxLocalProject } from '../../../project/index.js'; +import { WarehouseCatalogService } from '../../../scan/warehouse-catalog.js'; import type { ToolContext } from '../../../tools/index.js'; import { EntityDetailsTool } from './entity-details.tool.js'; -import { WarehouseCatalogService } from './warehouse-catalog.service.js'; 
describe('EntityDetailsTool', () => { let tempDir: string; @@ -32,11 +32,11 @@ describe('EntityDetailsTool', () => { await rm(tempDir, { recursive: true, force: true }); }); - async function seedLiveDatabaseScan(connectionName = 'warehouse', syncId = 'sync-1') { - const root = `raw-sources/${connectionName}/live-database/${syncId}`; + async function seedLiveDatabaseScan(connectionId = 'warehouse', syncId = 'sync-1') { + const root = `raw-sources/${connectionId}/live-database/${syncId}`; await project.fileStore.writeFile( `${root}/connection.json`, - JSON.stringify({ connectionId: connectionName, driver: 'postgres', extractedAt: '2026-05-12T00:00:00.000Z' }, null, 2), + JSON.stringify({ connectionId, driver: 'postgres', extractedAt: '2026-05-12T00:00:00.000Z' }, null, 2), 'ktx', 'ktx@example.com', 'seed connection', @@ -84,7 +84,7 @@ describe('EntityDetailsTool', () => { `${root}/enrichment/relationship-profile.json`, JSON.stringify( { - connectionId: connectionName, + connectionId, driver: 'postgres', tables: [{ table: { catalog: null, db: 'public', name: 'orders' }, rowCount: 12 }], columns: { @@ -109,7 +109,7 @@ describe('EntityDetailsTool', () => { } it('returns scoped table detail for a display target', async () => { - const result = await tool.call({ connectionName: 'warehouse', targets: [{ display: 'public.orders' }] }, context); + const result = await tool.call({ connectionId: 'warehouse', targets: [{ display: 'public.orders' }] }, context); expect(result.markdown).toContain('### public.orders'); expect(result.markdown).toContain('- status (text, nullable=false)'); @@ -120,7 +120,7 @@ describe('EntityDetailsTool', () => { it('resolves display targets that include a column name', async () => { const result = await tool.call( - { connectionName: 'warehouse', targets: [{ display: 'public.orders.status' }] }, + { connectionId: 'warehouse', targets: [{ display: 'public.orders.status' }] }, context, ); @@ -133,7 +133,7 @@ describe('EntityDetailsTool', () => { 
it('reports missing explicit columns instead of returning an empty column list', async () => { const result = await tool.call( - { connectionName: 'warehouse', targets: [{ display: 'public.orders.plan_tier' }] }, + { connectionId: 'warehouse', targets: [{ display: 'public.orders.plan_tier' }] }, context, ); @@ -146,7 +146,7 @@ describe('EntityDetailsTool', () => { it('reports missing structured table targets in model-visible markdown', async () => { const result = await tool.call( { - connectionName: 'warehouse', + connectionId: 'warehouse', targets: [{ catalog: null, db: 'public', name: 'orderz' }], }, context, @@ -161,7 +161,7 @@ describe('EntityDetailsTool', () => { it('reports missing structured column targets in model-visible markdown', async () => { const result = await tool.call( { - connectionName: 'warehouse', + connectionId: 'warehouse', targets: [{ catalog: null, db: 'public', name: 'orders', column: 'plan_tier' }], }, context, @@ -175,7 +175,7 @@ describe('EntityDetailsTool', () => { it('returns a no-scan state distinct from not found', async () => { const result = await tool.call( - { connectionName: 'empty', targets: [{ display: 'public.orders' }] }, + { connectionId: 'empty', targets: [{ display: 'public.orders' }] }, { ...context, session: { ...context.session!, allowedConnectionNames: new Set(['empty']) } }, ); @@ -184,9 +184,30 @@ describe('EntityDetailsTool', () => { }); it('refuses out-of-scope connections', async () => { - const result = await tool.call({ connectionName: 'billing', targets: [{ display: 'public.orders' }] }, context); + const result = await tool.call({ connectionId: 'billing', targets: [{ display: 'public.orders' }] }, context); expect(result.markdown).toContain('Connection "billing" is not available to this ingest stage.'); expect(result.structured.scanAvailable).toBe(false); }); + + it('uses connectionId as the public input field', async () => { + const legacyConnectionField = ['connection', 'Name'].join(''); + + expect( + 
tool.parseInput({ + connectionId: 'warehouse', + targets: [{ display: 'public.orders' }], + }), + ).toEqual({ + connectionId: 'warehouse', + targets: [{ display: 'public.orders' }], + }); + + expect(() => + tool.parseInput({ + [legacyConnectionField]: 'warehouse', + targets: [{ display: 'public.orders' }], + }), + ).toThrow(); + }); }); diff --git a/packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.ts b/packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.ts index 27cf55a0..79ce92b2 100644 --- a/packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.ts +++ b/packages/context/src/ingest/tools/warehouse-verification/entity-details.tool.ts @@ -1,7 +1,7 @@ import { z } from 'zod'; import type { KtxTableRef } from '../../../scan/types.js'; +import { WarehouseCatalogService, type TableDetail } from '../../../scan/warehouse-catalog.js'; import { BaseTool, type ToolContext, type ToolOutput } from '../../../tools/index.js'; -import { WarehouseCatalogService, type TableDetail } from './warehouse-catalog.service.js'; const targetSchema = z.union([ z.object({ display: z.string().min(1) }), @@ -14,9 +14,9 @@ const targetSchema = z.union([ ]); const entityDetailsInputSchema = z.object({ - connectionName: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/), + connectionId: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/), targets: z.array(targetSchema).min(1).max(50), -}); +}).strict(); type EntityDetailsInput = z.infer; type EntityDetailsTarget = EntityDetailsInput['targets'][number]; @@ -47,14 +47,14 @@ function appendMissingTargetMarkdown(parts: string[], target: EntityDetailsTarge async function resolveTarget( catalog: WarehouseCatalogService, - connectionName: string, + connectionId: string, target: EntityDetailsTarget, ): Promise<{ resolved: (KtxTableRef & { column?: string }) | null; candidates: KtxTableRef[] }> { if ('display' in target) { - return catalog.resolveDisplayTarget(connectionName, 
target.display); + return catalog.resolveDisplayTarget(connectionId, target.display); } - const candidateResolution = await catalog.resolveDisplayTarget(connectionName, targetLabel(target)); + const candidateResolution = await catalog.resolveDisplayTarget(connectionId, targetLabel(target)); return { resolved: { catalog: target.catalog, @@ -107,18 +107,18 @@ export class EntityDetailsTool extends BaseTool async call(input: EntityDetailsInput, context: ToolContext): Promise> { const allowed = allowedConnectionNames(context); - if (allowed && !allowed.has(input.connectionName)) { + if (allowed && !allowed.has(input.connectionId)) { return { - markdown: `Connection "${input.connectionName}" is not available to this ingest stage.`, + markdown: `Connection "${input.connectionId}" is not available to this ingest stage.`, structured: { resolved: [], missing: [], scanAvailable: false }, }; } const catalog = this.catalogFactory(context); - const scanAvailable = await catalog.hasScan(input.connectionName); + const scanAvailable = await catalog.hasScan(input.connectionId); if (!scanAvailable) { return { - markdown: `No live-database scan available for connection "${input.connectionName}"; run \`ktx scan\` first.`, + markdown: `No live-database scan available for connection "${input.connectionId}"; run \`ktx scan\` first.`, structured: { resolved: [], missing: [], scanAvailable: false }, }; } @@ -128,13 +128,13 @@ export class EntityDetailsTool extends BaseTool const missing: EntityDetailsStructured['missing'] = []; for (const target of input.targets) { - const resolution = await resolveTarget(catalog, input.connectionName, target); + const resolution = await resolveTarget(catalog, input.connectionId, target); if (!resolution.resolved) { missing.push({ target, candidates: resolution.candidates }); appendMissingTargetMarkdown(parts, target, resolution.candidates); continue; } - const detail = await catalog.getTable({ connectionName: input.connectionName, ...resolution.resolved 
}); + const detail = await catalog.getTable({ connectionId: input.connectionId, ...resolution.resolved }); if (!detail) { missing.push({ target, candidates: resolution.candidates }); appendMissingTargetMarkdown(parts, target, resolution.candidates); diff --git a/packages/context/src/ingest/tools/warehouse-verification/index.ts b/packages/context/src/ingest/tools/warehouse-verification/index.ts index e6ac2c1c..0478305c 100644 --- a/packages/context/src/ingest/tools/warehouse-verification/index.ts +++ b/packages/context/src/ingest/tools/warehouse-verification/index.ts @@ -1,10 +1,10 @@ import type { KtxFileStorePort } from '../../../core/index.js'; import type { SlConnectionCatalogPort } from '../../../sl/index.js'; +import { WarehouseCatalogService } from '../../../scan/warehouse-catalog.js'; import type { BaseTool, ToolContext } from '../../../tools/index.js'; import { DiscoverDataTool } from './discover-data.tool.js'; import { EntityDetailsTool } from './entity-details.tool.js'; import { SqlExecutionTool } from './sql-execution.tool.js'; -import { WarehouseCatalogService } from './warehouse-catalog.service.js'; export function createWarehouseVerificationTools(deps: { connections: SlConnectionCatalogPort; diff --git a/packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts b/packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts index 1cc63cac..ec7ef0ba 100644 --- a/packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts +++ b/packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.test.ts @@ -19,7 +19,7 @@ describe('SqlExecutionTool', () => { connections.executeQuery.mockResolvedValue({ headers: ['status'], rows: [['paid']], totalRows: 1 }); const result = await tool.call( - { connectionName: 'warehouse', sql: 'select status from public.orders', rowLimit: 5 }, + { connectionId: 'warehouse', sql: 'select status from public.orders', rowLimit: 5 }, context, ); @@ 
-34,7 +34,7 @@ describe('SqlExecutionTool', () => { it.each(['insert into x values (1)', 'drop table x', 'vacuum'])('rejects mutating SQL: %s', async (sql) => { connections.executeQuery.mockClear(); - const result = await tool.call({ connectionName: 'warehouse', sql }, context); + const result = await tool.call({ connectionId: 'warehouse', sql }, context); expect(result.markdown).toContain('Only read-only SELECT/WITH queries can be executed locally.'); expect(connections.executeQuery).not.toHaveBeenCalled(); @@ -44,11 +44,35 @@ describe('SqlExecutionTool', () => { connections.executeQuery.mockRejectedValue(new Error('relation "orbit_analytics.customer" does not exist')); const result = await tool.call( - { connectionName: 'warehouse', sql: 'select 1 from orbit_analytics.customer', rowLimit: 1 }, + { connectionId: 'warehouse', sql: 'select 1 from orbit_analytics.customer', rowLimit: 1 }, context, ); expect(result.markdown).toContain('relation "orbit_analytics.customer" does not exist'); expect(result.structured.error).toContain('relation "orbit_analytics.customer" does not exist'); }); + + it('uses connectionId as the public input field', () => { + const legacyConnectionField = ['connection', 'Name'].join(''); + + expect( + tool.parseInput({ + connectionId: 'warehouse', + sql: 'select 1', + rowLimit: 5, + }), + ).toEqual({ + connectionId: 'warehouse', + sql: 'select 1', + rowLimit: 5, + }); + + expect(() => + tool.parseInput({ + [legacyConnectionField]: 'warehouse', + sql: 'select 1', + rowLimit: 5, + }), + ).toThrow(); + }); }); diff --git a/packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.ts b/packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.ts index 03375938..8b2e3b5c 100644 --- a/packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.ts +++ b/packages/context/src/ingest/tools/warehouse-verification/sql-execution.tool.ts @@ -4,10 +4,10 @@ import type { SlConnectionCatalogPort } from 
'../../../sl/index.js'; import { BaseTool, type ToolContext, type ToolOutput } from '../../../tools/index.js'; const sqlExecutionInputSchema = z.object({ - connectionName: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/), + connectionId: z.string().regex(/^[a-zA-Z0-9][a-zA-Z0-9_-]*$/), sql: z.string().min(1), rowLimit: z.number().int().positive().max(1000).optional().default(100), -}); +}).strict(); type SqlExecutionInput = z.input; @@ -54,9 +54,9 @@ export class SqlExecutionTool extends BaseTool { async call(input: SqlExecutionInput, context: ToolContext): Promise> { const allowed = context.session?.allowedConnectionNames; - if (allowed && !allowed.has(input.connectionName)) { + if (allowed && !allowed.has(input.connectionId)) { return { - markdown: `Connection "${input.connectionName}" is not available to this ingest stage.`, + markdown: `Connection "${input.connectionId}" is not available to this ingest stage.`, structured: { headers: [], rows: [], @@ -83,7 +83,7 @@ export class SqlExecutionTool extends BaseTool { } try { - const result = await this.connections.executeQuery(input.connectionName, wrappedSql); + const result = await this.connections.executeQuery(input.connectionId, wrappedSql); const headers = result.headers ?? []; const rows = result.rows ?? []; const rowCount = result.totalRows ?? 
rows.length; diff --git a/packages/context/src/mcp/context-tools.ts b/packages/context/src/mcp/context-tools.ts index 9f84b586..773155bf 100644 --- a/packages/context/src/mcp/context-tools.ts +++ b/packages/context/src/mcp/context-tools.ts @@ -143,6 +143,45 @@ const scanArtifactReadSchema = z.object({ path: z.string().min(1), }); +const entityDetailsTableRefSchema = z.object({ + catalog: z.string().nullable(), + db: z.string().nullable(), + name: z.string().min(1), +}); + +const entityDetailsSchema = z.object({ + connectionId: connectionIdSchema, + entities: z + .array( + z.object({ + table: z.union([z.string().min(1), entityDetailsTableRefSchema]), + columns: z.array(z.string().min(1)).optional(), + }), + ) + .min(1) + .max(20), +}); + +const dictionarySearchSchema = z.object({ + values: z.array(z.string().min(1)).min(1).max(20), + connectionId: connectionIdSchema.optional(), +}); + +const discoverDataKindSchema = z.enum(['wiki', 'sl_source', 'sl_measure', 'sl_dimension', 'table', 'column']); + +const discoverDataSchema = z.object({ + query: z.string().min(1), + connectionId: connectionIdSchema.optional(), + kinds: z.array(discoverDataKindSchema).optional(), + limit: z.number().int().min(1).max(50).default(15).optional(), +}); + +const sqlExecutionSchema = z.object({ + connectionId: connectionIdSchema, + sql: z.string().min(1), + maxRows: z.number().int().min(1).max(10_000).default(1000).optional(), +}); + export function jsonToolResult(structuredContent: T): KtxMcpToolResult { return { content: [{ type: 'text', text: JSON.stringify(structuredContent, null, 2) }], @@ -361,6 +400,81 @@ export function registerKtxContextTools(deps: RegisterKtxContextToolsDeps): void ); } + if (ports.entityDetails) { + const entityDetails = ports.entityDetails; + registerParsedTool( + server, + 'entity_details', + { + title: 'Entity Details', + description: 'Read raw table and column metadata from the latest KTX live-database scan snapshot.', + inputSchema: entityDetailsSchema.shape, 
+ }, + entityDetailsSchema, + async (input) => jsonToolResult(await entityDetails.read(input)), + ); + } + + if (ports.dictionarySearch) { + const dictionarySearch = ports.dictionarySearch; + registerParsedTool( + server, + 'dictionary_search', + { + title: 'Dictionary Search', + description: + 'Search profile-sampled warehouse values and report matching connection/source/column locations plus non-authoritative miss reasons.', + inputSchema: dictionarySearchSchema.shape, + }, + dictionarySearchSchema, + async (input) => jsonToolResult(await dictionarySearch.search(input)), + ); + } + + if (ports.discover) { + const discover = ports.discover; + registerParsedTool( + server, + 'discover_data', + { + title: 'Discover Data', + description: + 'Search across KTX wiki pages, semantic-layer sources/measures/dimensions, and raw warehouse schema refs.', + inputSchema: discoverDataSchema.shape, + }, + discoverDataSchema, + async (input) => jsonToolResult(await discover.search(input)), + ); + } + + if (ports.sqlExecution) { + const sqlExecution = ports.sqlExecution; + registerParsedTool( + server, + 'sql_execution', + { + title: 'SQL Execution', + description: + 'Execute one parser-validated read-only SQL query against a configured KTX connection and return structured rows.', + inputSchema: sqlExecutionSchema.shape, + }, + sqlExecutionSchema, + async (input) => { + try { + return jsonToolResult( + await sqlExecution.execute({ + connectionId: input.connectionId, + sql: input.sql, + maxRows: input.maxRows ?? 1000, + }), + ); + } catch (error) { + return jsonErrorToolResult(error instanceof Error ? 
error.message : String(error)); + } + }, + ); + } + if (ports.ingest) { const ingest = ports.ingest; registerParsedTool( diff --git a/packages/context/src/mcp/index.ts b/packages/context/src/mcp/index.ts index c3f02a66..df1bc6c5 100644 --- a/packages/context/src/mcp/index.ts +++ b/packages/context/src/mcp/index.ts @@ -5,6 +5,9 @@ export { createDefaultKtxMcpServer, createKtxMcpServer } from './server.js'; export type { KtxConnectionSummary, KtxConnectionsMcpPort, + KtxDiscoverDataMcpPort, + KtxDictionarySearchMcpPort, + KtxEntityDetailsMcpPort, KtxIngestDiffSummary, KtxIngestMcpPort, KtxIngestStatusResponse, diff --git a/packages/context/src/mcp/local-project-ports.test.ts b/packages/context/src/mcp/local-project-ports.test.ts index 4d01b846..0c000831 100644 --- a/packages/context/src/mcp/local-project-ports.test.ts +++ b/packages/context/src/mcp/local-project-ports.test.ts @@ -5,7 +5,12 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { AgentRunnerService } from '../agent/index.js'; import { FakeSourceAdapter, type MemoryFlowReplayInput } from '../ingest/index.js'; import { initKtxProject } from '../project/index.js'; -import { createKtxConnectorCapabilities, type KtxScanConnector, type KtxSchemaSnapshot } from '../scan/index.js'; +import { + createKtxConnectorCapabilities, + type KtxQueryResult, + type KtxScanConnector, + type KtxSchemaSnapshot, +} from '../scan/index.js'; import { writeLocalSlSource } from '../sl/index.js'; import { createLocalProjectMcpContextPorts } from './local-project-ports.js'; @@ -60,16 +65,119 @@ describe('createLocalProjectMcpContextPorts', () => { }; } - function testConnector(snapshot = testSnapshot()): KtxScanConnector { + function testConnector(snapshot = testSnapshot(), queryResult?: KtxQueryResult): KtxScanConnector { return { id: `test:${snapshot.connectionId}`, driver: snapshot.driver, - capabilities: createKtxConnectorCapabilities(), + capabilities: createKtxConnectorCapabilities({ 
readOnlySql: queryResult !== undefined }), introspect: vi.fn(async () => snapshot), + executeReadOnly: queryResult === undefined ? undefined : vi.fn(async () => queryResult), cleanup: vi.fn(async () => {}), }; } + async function seedScanReport(projectDir: string, syncId = 'sync-1'): Promise { + const root = `raw-sources/warehouse/live-database/${syncId}`; + await mkdir(join(projectDir, root, 'tables'), { recursive: true }); + await writeFile( + join(projectDir, root, 'connection.json'), + JSON.stringify( + { + connectionId: 'warehouse', + driver: 'postgres', + extractedAt: '2026-05-14T09:00:00.000Z', + scope: { schemas: ['public'] }, + }, + null, + 2, + ), + 'utf-8', + ); + await writeFile( + join(projectDir, root, 'tables', 'orders.json'), + JSON.stringify( + { + catalog: null, + db: 'public', + name: 'orders', + kind: 'table', + comment: 'Customer orders', + estimatedRows: 12, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: null, + }, + ], + foreignKeys: [], + }, + null, + 2, + ), + 'utf-8', + ); + await writeFile( + join(projectDir, root, 'scan-report.json'), + JSON.stringify( + { + connectionId: 'warehouse', + driver: 'postgres', + syncId, + runId: 'scan-1', + trigger: 'mcp', + mode: 'structural', + dryRun: false, + artifactPaths: { + rawSourcesDir: root, + reportPath: `${root}/scan-report.json`, + manifestShards: [], + enrichmentArtifacts: [], + }, + diffSummary: { + tablesAdded: 0, + tablesModified: 0, + tablesDeleted: 0, + tablesUnchanged: 1, + columnsAdded: 0, + columnsModified: 0, + columnsDeleted: 0, + }, + manifestShardsWritten: 0, + structuralSyncStats: { + tablesCreated: 1, + tablesUpdated: 0, + tablesDeleted: 0, + columnsCreated: 0, + columnsUpdated: 0, + columnsDeleted: 0, + }, + enrichment: { + dataDictionary: 'skipped', + tableDescriptions: 'skipped', + columnDescriptions: 'skipped', + embeddings: 'skipped', + 
deterministicRelationships: 'skipped', + llmRelationshipValidation: 'skipped', + statisticalValidation: 'skipped', + }, + capabilityGaps: [], + warnings: [], + relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 }, + enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] }, + createdAt: '2026-05-14T09:00:00.000Z', + }, + null, + 2, + ), + 'utf-8', + ); + } + it('lists local project connections from ktx.yaml', async () => { const project = await initKtxProject({ projectDir: tempDir }); project.config.connections.warehouse = { @@ -119,6 +227,382 @@ describe('createLocalProjectMcpContextPorts', () => { expect(connector.cleanup).toHaveBeenCalled(); }); + it('executes MCP SQL only after parser-backed validation passes', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + const connector = testConnector(testSnapshot(), { + headers: ['id'], + headerTypes: ['integer'], + rows: [[1]], + totalRows: 1, + rowCount: 1, + }); + const createConnector = vi.fn(async () => connector); + const sqlAnalysis = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(), + validateReadOnly: vi.fn(async () => ({ ok: true, error: null })), + }; + const ports = createLocalProjectMcpContextPorts(project, { + sqlAnalysis, + localScan: { + createConnector, + }, + }); + + await expect( + ports.sqlExecution?.execute({ + connectionId: 'warehouse', + sql: 'select id from public.orders', + maxRows: 5, + }), + ).resolves.toEqual({ + headers: ['id'], + headerTypes: ['integer'], + rows: [[1]], + rowCount: 1, + }); + expect(sqlAnalysis.validateReadOnly).toHaveBeenCalledWith('select id from public.orders', 'postgres'); + expect(createConnector).toHaveBeenCalledWith('warehouse'); + expect(connector.executeReadOnly).toHaveBeenCalledWith( + { + connectionId: 'warehouse', + sql: 'select id from public.orders', + maxRows: 5, + }, + { runId: 
'mcp-sql-execution' }, + ); + expect(connector.cleanup).toHaveBeenCalled(); + }); + + it('rejects MCP SQL before connector execution when parser validation fails', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + const connector = testConnector(testSnapshot(), { + headers: ['id'], + rows: [[1]], + totalRows: 1, + rowCount: 1, + }); + const sqlAnalysis = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(), + validateReadOnly: vi.fn(async () => ({ + ok: false, + error: 'SQL contains read/write operation: Insert', + })), + }; + const ports = createLocalProjectMcpContextPorts(project, { + sqlAnalysis, + localScan: { + createConnector: vi.fn(async () => connector), + }, + }); + + await expect( + ports.sqlExecution?.execute({ + connectionId: 'warehouse', + sql: 'with x as (insert into t values (1) returning *) select * from x', + maxRows: 1000, + }), + ).rejects.toThrow('SQL contains read/write operation: Insert'); + expect(connector.executeReadOnly).not.toHaveBeenCalled(); + }); + + it('exposes local scan entity details through MCP ports', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + await seedScanReport(project.projectDir); + const ports = createLocalProjectMcpContextPorts(project); + + await expect( + ports.entityDetails?.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders', columns: ['id'] }], + }), + ).resolves.toMatchObject({ + results: [ + { + ok: true, + connectionId: 'warehouse', + display: 'public.orders', + columns: [{ name: 'id', nativeType: 'integer' }], + snapshot: { syncId: 'sync-1', scanRunId: 'scan-1' }, + }, + ], + }); + }); + + it('returns a structured local entity details error when no scan exists', async () => { + const project = await initKtxProject({ projectDir: 
tempDir }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + const ports = createLocalProjectMcpContextPorts(project); + + await expect( + ports.entityDetails?.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders' }], + }), + ).resolves.toMatchObject({ + results: [ + { + ok: false, + connectionId: 'warehouse', + error: { code: 'scan_missing' }, + }, + ], + }); + }); + + it('exposes local dictionary search through MCP ports', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + await project.fileStore.writeFile( + 'raw-sources/warehouse/live-database/sync-1/enrichment/relationship-profile.json', + `${JSON.stringify( + { + connectionId: 'warehouse', + driver: 'postgres', + sqlAvailable: true, + queryCount: 4, + tables: [], + columns: { + 'orders.status': { + table: { catalog: null, db: 'public', name: 'orders' }, + column: 'status', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 2, + sampleValues: ['paid', 'refunded'], + }, + }, + warnings: [], + }, + null, + 2, + )}\n`, + 'ktx', + 'ktx@example.com', + 'Seed dictionary profile', + ); + + const ports = createLocalProjectMcpContextPorts(project); + + await expect(ports.dictionarySearch?.search({ values: ['paid'] })).resolves.toMatchObject({ + searched: [{ connectionId: 'warehouse', status: 'ready' }], + results: [ + { + value: 'paid', + matches: [{ connectionId: 'warehouse', sourceName: 'orders', columnName: 'status', matchedValue: 'paid' }], + misses: [], + }, + ], + }); + }); + + it('reports missing local dictionary profiles through MCP ports', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + + const ports = createLocalProjectMcpContextPorts(project); + + await 
expect(ports.dictionarySearch?.search({ values: ['paid'] })).resolves.toEqual({ + searched: [ + { + connectionId: 'warehouse', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 0, + syncId: null, + profiledAt: null, + }, + status: 'no_profile_artifact', + }, + ], + results: [ + { + value: 'paid', + matches: [], + misses: [{ connectionId: 'warehouse', reason: 'no_profile_artifact' }], + }, + ], + }); + }); + + it('exposes local project discover_data across wiki, semantic-layer, and raw schema', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + project.config.connections.warehouse = { + driver: 'postgres', + url: 'env:DATABASE_URL', + }; + await project.fileStore.writeFile( + 'wiki/global/orders-playbook.md', + [ + '---', + 'summary: Paid order operations', + 'tags: [orders]', + 'refs: []', + 'sl_refs: []', + 'usage_mode: auto', + '---', + '', + 'Paid orders are used for customer activity analysis.', + '', + ].join('\n'), + 'ktx', + 'ktx@example.com', + 'seed wiki', + ); + await project.fileStore.writeFile( + 'semantic-layer/warehouse/orders.yaml', + [ + 'name: orders', + 'descriptions:', + ' user: Paid order facts', + 'table: public.orders', + 'grain: [id]', + 'columns:', + ' - name: status', + ' type: string', + ' descriptions:', + ' user: Payment status', + 'measures:', + ' - name: order_count', + ' expr: count(*)', + ' description: Number of paid orders', + '', + ].join('\n'), + 'ktx', + 'ktx@example.com', + 'seed sl', + ); + await project.fileStore.writeFile( + 'raw-sources/warehouse/live-database/sync-1/connection.json', + JSON.stringify({ connectionId: 'warehouse', driver: 'postgres', extractedAt: '2026-05-14T09:00:00.000Z' }, null, 2), + 'ktx', + 'ktx@example.com', + 'seed connection', + ); + await project.fileStore.writeFile( + 'raw-sources/warehouse/live-database/sync-1/tables/public-orders.json', + JSON.stringify( + { + catalog: null, + db: 'public', + name: 'orders', + kind: 'table', + comment: 
'Orders table', + estimatedRows: 10, + columns: [ + { + name: 'status', + nativeType: 'text', + normalizedType: 'text', + dimensionType: 'string', + nullable: false, + primaryKey: false, + comment: 'Order status', + sampleValues: ['paid'], + }, + ], + foreignKeys: [], + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed table', + ); + await project.fileStore.writeFile( + 'raw-sources/warehouse/live-database/sync-1/scan-report.json', + JSON.stringify( + { + connectionId: 'warehouse', + driver: 'postgres', + syncId: 'sync-1', + runId: 'scan-1', + trigger: 'mcp', + mode: 'enriched', + dryRun: false, + artifactPaths: { + rawSourcesDir: 'raw-sources/warehouse/live-database/sync-1', + reportPath: 'raw-sources/warehouse/live-database/sync-1/scan-report.json', + manifestShards: [], + enrichmentArtifacts: [], + }, + diffSummary: { + tablesAdded: 1, + tablesModified: 0, + tablesDeleted: 0, + tablesUnchanged: 0, + columnsAdded: 0, + columnsModified: 0, + columnsDeleted: 0, + }, + manifestShardsWritten: 0, + structuralSyncStats: { + tablesCreated: 0, + tablesUpdated: 0, + tablesDeleted: 0, + columnsCreated: 0, + columnsUpdated: 0, + columnsDeleted: 0, + }, + enrichment: { + dataDictionary: 'completed', + tableDescriptions: 'completed', + columnDescriptions: 'completed', + embeddings: 'skipped', + deterministicRelationships: 'skipped', + llmRelationshipValidation: 'skipped', + statisticalValidation: 'skipped', + }, + capabilityGaps: [], + warnings: [], + relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 }, + enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] }, + createdAt: '2026-05-14T09:00:00.000Z', + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed scan report', + ); + + const ports = createLocalProjectMcpContextPorts(project); + const results = await ports.discover?.search({ query: 'paid orders', connectionId: 'warehouse', limit: 10 }); + + expect(results).toEqual( + expect.arrayContaining([ + 
expect.objectContaining({ kind: 'wiki', id: 'orders-playbook' }), + expect.objectContaining({ kind: 'sl_source', id: 'orders', connectionId: 'warehouse' }), + expect.objectContaining({ kind: 'table', id: 'public.orders', connectionId: 'warehouse' }), + ]), + ); + }); + it('triggers canonical bundle ingest and reads status, report, and replay through MCP ports', async () => { const project = await initKtxProject({ projectDir: tempDir }); project.config.connections.warehouse = { diff --git a/packages/context/src/mcp/local-project-ports.ts b/packages/context/src/mcp/local-project-ports.ts index 0c325453..8088f27a 100644 --- a/packages/context/src/mcp/local-project-ports.ts +++ b/packages/context/src/mcp/local-project-ports.ts @@ -18,6 +18,7 @@ import { import { createLocalKtxEmbeddingProviderFromConfig, KtxIngestEmbeddingPortAdapter } from '../llm/index.js'; import type { KtxLocalProject } from '../project/index.js'; import { + createKtxEntityDetailsService, getLocalScanReport, getLocalScanStatus, type KtxConnectionDriver, @@ -26,8 +27,11 @@ import { type LocalScanMcpOptions, runLocalScan, } from '../scan/index.js'; +import { createKtxDiscoverDataService } from '../search/index.js'; +import type { SqlAnalysisDialect, SqlAnalysisPort } from '../sql-analysis/index.js'; import { compileLocalSlQuery, + createKtxDictionarySearchService, type LocalSlSourceSearchResult, type LocalSlSourceSummary, listLocalSlSources, @@ -44,6 +48,7 @@ import type { KtxScanArtifactReadResponse, KtxScanArtifactSummary, KtxScanArtifactType, + KtxSqlExecutionResponse, } from './types.js'; const LOCAL_AUTHOR = 'ktx'; @@ -53,6 +58,7 @@ const SL_SHAPE_WARNING = 'Local stdio validation checks YAML shape only; Python interface CreateLocalProjectMcpContextPortsOptions { semanticLayerCompute?: KtxSemanticLayerComputePort; queryExecutor?: KtxSqlQueryExecutorPort; + sqlAnalysis?: SqlAnalysisPort; localIngest?: LocalIngestMcpOptions; localScan?: LocalScanMcpOptions; embeddingService?: KtxEmbeddingPort | 
null; @@ -77,6 +83,10 @@ function dialectForDriver(driver: string | undefined): string { return map[normalized] ?? 'postgres'; } +function sqlAnalysisDialectForDriver(driver: string | undefined): SqlAnalysisDialect { + return dialectForDriver(driver) as SqlAnalysisDialect; +} + function assertSafePathToken(kind: string, value: string): string { if ( value.trim().length === 0 || @@ -378,6 +388,53 @@ function statusFromIngestReport(report: IngestReportSnapshot): KtxIngestStatusRe }; } +async function executeValidatedReadOnlySql( + project: KtxLocalProject, + options: CreateLocalProjectMcpContextPortsOptions, + input: { connectionId: string; sql: string; maxRows: number }, +): Promise { + const connectionId = assertSafeConnectionId(input.connectionId); + const connection = project.config.connections[connectionId]; + if (!connection) { + throw new Error(`Connection "${connectionId}" is not configured in ktx.yaml`); + } + if (!options.sqlAnalysis) { + throw new Error('sql_execution requires parser-backed SQL validation.'); + } + const validation = await options.sqlAnalysis.validateReadOnly(input.sql, sqlAnalysisDialectForDriver(connection.driver)); + if (!validation.ok) { + throw new Error(validation.error ?? 'SQL is not read-only.'); + } + const createConnector = options.localScan?.createConnector; + if (!createConnector) { + throw new Error('sql_execution requires a local scan connector factory.'); + } + + let connector: KtxScanConnector | null = null; + try { + connector = await createConnector(connectionId); + if (!connector.capabilities.readOnlySql || !connector.executeReadOnly) { + throw new Error(`Connection "${connectionId}" does not support read-only SQL execution.`); + } + const result = await connector.executeReadOnly( + { + connectionId, + sql: input.sql, + maxRows: input.maxRows, + }, + { runId: 'mcp-sql-execution' }, + ); + return { + headers: result.headers, + ...(result.headerTypes ? 
{ headerTypes: result.headerTypes } : {}), + rows: result.rows, + rowCount: result.rowCount ?? result.rows.length, + }; + } finally { + await cleanupConnector(connector); + } +} + export function createLocalProjectMcpContextPorts( project: KtxLocalProject, options: CreateLocalProjectMcpContextPortsOptions = {}, @@ -575,8 +632,31 @@ export function createLocalProjectMcpContextPorts( }); }, }, + entityDetails: { + async read(input) { + return createKtxEntityDetailsService(project).read(input); + }, + }, + dictionarySearch: { + async search(input) { + return createKtxDictionarySearchService(project).search(input); + }, + }, + discover: { + async search(input) { + return createKtxDiscoverDataService(project, { userId: 'local', embeddingService }).search(input); + }, + }, }; + if (options.sqlAnalysis && options.localScan?.createConnector) { + ports.sqlExecution = { + async execute(input) { + return executeValidatedReadOnlySql(project, options, input); + }, + }; + } + if (options.localIngest) { ports.ingest = { async trigger(input) { diff --git a/packages/context/src/mcp/server.test.ts b/packages/context/src/mcp/server.test.ts index e02f2574..abf678bb 100644 --- a/packages/context/src/mcp/server.test.ts +++ b/packages/context/src/mcp/server.test.ts @@ -6,11 +6,16 @@ import { createLocalProjectMemoryCapture } from '../memory/index.js'; import { initKtxProject } from '../project/index.js'; import { createKtxMcpServer } from './server.js'; import type { + KtxDiscoverDataMcpPort, + KtxDictionarySearchMcpPort, + KtxEntityDetailsMcpPort, KtxIngestMcpPort, KtxKnowledgeMcpPort, KtxMcpContextPorts, KtxScanMcpPort, KtxSemanticLayerMcpPort, + KtxSqlExecutionMcpPort, + KtxSqlExecutionResponse, MemoryCapturePort, } from './types.js'; @@ -64,6 +69,242 @@ describe('createKtxMcpServer', () => { }); }); + it('registers parser-gated sql_execution when the host provides a SQL execution port', async () => { + const fake = makeFakeServer(); + const response: KtxSqlExecutionResponse = { + 
headers: ['status', 'count'], + headerTypes: ['text', 'bigint'], + rows: [['paid', 42]], + rowCount: 1, + }; + const sqlExecution: KtxSqlExecutionMcpPort = { + execute: vi.fn().mockResolvedValue(response), + }; + + createKtxMcpServer({ + server: fake.server, + userContext: { userId: 'local-user' }, + contextTools: { + sqlExecution, + }, + }); + + expect(fake.tools.map((tool) => tool.name)).toEqual(['sql_execution']); + await expect( + getTool(fake.tools, 'sql_execution').handler({ + connectionId: 'warehouse', + sql: 'select status, count(*) from public.orders group by status', + maxRows: 50, + }), + ).resolves.toEqual({ + content: [ + { + type: 'text', + text: JSON.stringify( + { + headers: ['status', 'count'], + headerTypes: ['text', 'bigint'], + rows: [['paid', 42]], + rowCount: 1, + }, + null, + 2, + ), + }, + ], + structuredContent: { + headers: ['status', 'count'], + headerTypes: ['text', 'bigint'], + rows: [['paid', 42]], + rowCount: 1, + }, + }); + expect(sqlExecution.execute).toHaveBeenCalledWith({ + connectionId: 'warehouse', + sql: 'select status, count(*) from public.orders group by status', + maxRows: 50, + }); + }); + + it('registers entity_details when the host provides an entity-details port', async () => { + const fake = makeFakeServer(); + const entityDetails: KtxEntityDetailsMcpPort = { + read: vi.fn().mockResolvedValue({ + results: [ + { + ok: true, + connectionId: 'warehouse', + tableRef: { catalog: null, db: 'public', name: 'orders' }, + display: 'public.orders', + kind: 'table', + comment: 'Customer orders', + estimatedRows: 12, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: null, + }, + ], + foreignKeys: [], + snapshot: { + syncId: 'sync-1', + extractedAt: '2026-05-14T09:00:00.000Z', + scanRunId: 'scan-1', + }, + }, + ], + }), + }; + + createKtxMcpServer({ + server: fake.server, + userContext: { userId: 'local-user' }, + 
contextTools: { entityDetails }, + }); + + expect(fake.tools.map((tool) => tool.name)).toEqual(['entity_details']); + await expect( + getTool(fake.tools, 'entity_details').handler({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders', columns: ['id'] }], + }), + ).resolves.toMatchObject({ + structuredContent: { + results: [ + { + ok: true, + connectionId: 'warehouse', + display: 'public.orders', + columns: [{ name: 'id' }], + }, + ], + }, + }); + expect(entityDetails.read).toHaveBeenCalledWith({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders', columns: ['id'] }], + }); + }); + + it('registers dictionary_search when the host provides a dictionary-search port', async () => { + const fake = makeFakeServer(); + const dictionarySearch: KtxDictionarySearchMcpPort = { + search: vi.fn().mockResolvedValue({ + searched: [ + { + connectionId: 'warehouse', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 1, + syncId: 'sync-1', + profiledAt: null, + }, + status: 'ready', + }, + ], + results: [ + { + value: 'paid', + matches: [ + { + connectionId: 'warehouse', + sourceName: 'orders', + columnName: 'status', + matchedValue: 'paid', + cardinality: 3, + }, + ], + misses: [], + }, + ], + }), + }; + + createKtxMcpServer({ + server: fake.server, + userContext: { userId: 'local-user' }, + contextTools: { dictionarySearch }, + }); + + expect(fake.tools.map((tool) => tool.name)).toEqual(['dictionary_search']); + await expect( + getTool(fake.tools, 'dictionary_search').handler({ + connectionId: 'warehouse', + values: ['paid'], + }), + ).resolves.toMatchObject({ + structuredContent: { + searched: [{ connectionId: 'warehouse', status: 'ready' }], + results: [ + { + value: 'paid', + matches: [{ connectionId: 'warehouse', sourceName: 'orders', columnName: 'status' }], + misses: [], + }, + ], + }, + }); + expect(dictionarySearch.search).toHaveBeenCalledWith({ + connectionId: 'warehouse', + values: ['paid'], + }); + }); + + 
it('registers discover_data when the host provides a discover port', async () => { + const fake = makeFakeServer(); + const discover: KtxDiscoverDataMcpPort = { + search: vi.fn().mockResolvedValue([ + { + kind: 'table', + id: 'public.orders', + score: 1, + summary: 'Orders table', + snippet: 'id, status', + matchedOn: 'name', + connectionId: 'warehouse', + tableRef: { catalog: null, db: 'public', name: 'orders' }, + }, + ]), + }; + + createKtxMcpServer({ + server: fake.server, + userContext: { userId: 'local-user' }, + contextTools: { discover }, + }); + + expect(fake.tools.map((tool) => tool.name)).toEqual(['discover_data']); + await expect( + getTool(fake.tools, 'discover_data').handler({ + query: 'orders', + connectionId: 'warehouse', + kinds: ['table'], + limit: 5, + }), + ).resolves.toMatchObject({ + structuredContent: [ + { + kind: 'table', + id: 'public.orders', + connectionId: 'warehouse', + tableRef: { catalog: null, db: 'public', name: 'orders' }, + }, + ], + }); + expect(discover.search).toHaveBeenCalledWith({ + query: 'orders', + connectionId: 'warehouse', + kinds: ['table'], + limit: 5, + }); + }); + it('registers memory capture tools without host app dependencies', async () => { const fake = makeFakeServer(); const capture: MemoryCapturePort = { diff --git a/packages/context/src/mcp/types.ts b/packages/context/src/mcp/types.ts index f68444b2..ab53f56e 100644 --- a/packages/context/src/mcp/types.ts +++ b/packages/context/src/mcp/types.ts @@ -1,7 +1,11 @@ import type { IngestReportSnapshot, MemoryFlowReplayInput, TableUsageOutput } from '../ingest/index.js'; import type { MemoryCaptureService } from '../memory/index.js'; +import type { KtxEntityDetailsInput, KtxEntityDetailsResponse } from '../scan/entity-details.js'; import type { KtxScanMode, KtxScanReport } from '../scan/index.js'; +import type { KtxDiscoverDataInput, KtxDiscoverDataResponse } from '../search/index.js'; import type { + KtxDictionarySearchInput, + KtxDictionarySearchResponse, 
SemanticLayerQueryInput, SlDictionaryMatch, SlSearchLaneSummary, @@ -312,10 +316,37 @@ export interface KtxScanMcpPort { readArtifact?(input: { runId: string; path: string }): Promise; } +export interface KtxEntityDetailsMcpPort { + read(input: KtxEntityDetailsInput): Promise; +} + +export interface KtxDictionarySearchMcpPort { + search(input: KtxDictionarySearchInput): Promise; +} + +export interface KtxDiscoverDataMcpPort { + search(input: KtxDiscoverDataInput): Promise; +} + +export interface KtxSqlExecutionResponse { + headers: string[]; + headerTypes?: string[]; + rows: unknown[][]; + rowCount: number; +} + +export interface KtxSqlExecutionMcpPort { + execute(input: { connectionId: string; sql: string; maxRows: number }): Promise; +} + export interface KtxMcpContextPorts { connections?: KtxConnectionsMcpPort; knowledge?: KtxKnowledgeMcpPort; semanticLayer?: KtxSemanticLayerMcpPort; + entityDetails?: KtxEntityDetailsMcpPort; + dictionarySearch?: KtxDictionarySearchMcpPort; + discover?: KtxDiscoverDataMcpPort; + sqlExecution?: KtxSqlExecutionMcpPort; ingest?: KtxIngestMcpPort; scan?: KtxScanMcpPort; } diff --git a/packages/context/src/memory/memory-runtime-assets.test.ts b/packages/context/src/memory/memory-runtime-assets.test.ts index 973d7271..68a53cfd 100644 --- a/packages/context/src/memory/memory-runtime-assets.test.ts +++ b/packages/context/src/memory/memory-runtime-assets.test.ts @@ -166,17 +166,17 @@ describe('memory runtime assets', () => { } }); - it('ships only the KTX connectionName sql_execution call shape in writer guidance', async () => { + it('ships only the KTX connectionId sql_execution call shape in writer guidance', async () => { const shared = await readFile(join(skillsDir, '_shared', 'identifier-verification.md'), 'utf-8'); const bodies = [{ name: '_shared/identifier-verification.md', body: shared }]; - expect(shared).toContain('sql_execution({connectionName, sql: "SELECT DISTINCT'); - 
expect(shared).toContain('sql_execution({connectionName, sql: "SELECT 1 FROM'); + expect(shared).toContain('sql_execution({connectionId, sql: "SELECT DISTINCT'); + expect(shared).toContain('sql_execution({connectionId, sql: "SELECT 1 FROM'); for (const skillName of verificationWriterSkills) { const body = await readFile(join(skillsDir, skillName, 'SKILL.md'), 'utf-8'); bodies.push({ name: `${skillName}/SKILL.md`, body }); - expect(body).toContain('sql_execution({connectionName'); + expect(body).toContain('sql_execution({connectionId'); expect(body).not.toContain('sql_execution({ sql'); expect(body).not.toContain('session shape'); expect(body).not.toContain('connection is already pinned by the ingest session'); @@ -186,8 +186,8 @@ describe('memory runtime assets', () => { const calls = sqlExecutionCallBlocks(body); expect(calls.length, `${name} should contain sql_execution guidance`).toBeGreaterThan(0); expect( - calls.filter((call) => !call.includes('connectionName')), - `${name} has sql_execution calls without connectionName`, + calls.filter((call) => !call.includes('connectionId')), + `${name} has sql_execution calls without connectionId`, ).toEqual([]); expect(body, `${name} has a connectionless multiline sql_execution call`).not.toMatch( /sql_execution\(\{\s*sql\s*:/, diff --git a/packages/context/src/scan/entity-details.test.ts b/packages/context/src/scan/entity-details.test.ts new file mode 100644 index 00000000..db81ad11 --- /dev/null +++ b/packages/context/src/scan/entity-details.test.ts @@ -0,0 +1,291 @@ +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { initKtxProject, type KtxLocalProject } from '../project/index.js'; +import { createKtxEntityDetailsService } from './entity-details.js'; +import type { KtxConnectionDriver, KtxScanReport, KtxSchemaTable } from './types.js'; + 
+describe('createKtxEntityDetailsService', () => { + let tempDir: string; + let project: KtxLocalProject; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-entity-details-service-')); + project = await initKtxProject({ projectDir: join(tempDir, 'project') }); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + function scanReport(input: { + connectionId: string; + syncId: string; + runId: string; + driver?: KtxConnectionDriver; + createdAt?: string; + }): KtxScanReport { + const rawSourcesDir = `raw-sources/${input.connectionId}/live-database/${input.syncId}`; + return { + connectionId: input.connectionId, + driver: input.driver ?? 'postgres', + syncId: input.syncId, + runId: input.runId, + trigger: 'mcp', + mode: 'structural', + dryRun: false, + artifactPaths: { + rawSourcesDir, + reportPath: `${rawSourcesDir}/scan-report.json`, + manifestShards: [], + enrichmentArtifacts: [], + }, + diffSummary: { + tablesAdded: 0, + tablesModified: 0, + tablesDeleted: 0, + tablesUnchanged: 1, + columnsAdded: 0, + columnsModified: 0, + columnsDeleted: 0, + }, + manifestShardsWritten: 0, + structuralSyncStats: { + tablesCreated: 1, + tablesUpdated: 0, + tablesDeleted: 0, + columnsCreated: 0, + columnsUpdated: 0, + columnsDeleted: 0, + }, + enrichment: { + dataDictionary: 'skipped', + tableDescriptions: 'skipped', + columnDescriptions: 'skipped', + embeddings: 'skipped', + deterministicRelationships: 'skipped', + llmRelationshipValidation: 'skipped', + statisticalValidation: 'skipped', + }, + capabilityGaps: [], + warnings: [], + relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 }, + enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] }, + createdAt: input.createdAt ?? '2026-05-14T09:00:00.000Z', + }; + } + + function ordersTable(input: { db?: string | null; estimatedRows?: number | null } = {}): KtxSchemaTable { + return { + catalog: null, + db: input.db ?? 
'public', + name: 'orders', + kind: 'table', + comment: 'Customer orders', + estimatedRows: input.estimatedRows ?? 12, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: 'Order id', + }, + { + name: 'status', + nativeType: 'text', + normalizedType: 'text', + dimensionType: 'string', + nullable: false, + primaryKey: false, + comment: 'Order status', + }, + ], + foreignKeys: [ + { + fromColumn: 'customer_id', + toCatalog: null, + toDb: 'public', + toTable: 'customers', + toColumn: 'id', + constraintName: 'orders_customer_id_fkey', + }, + ], + }; + } + + async function seedScan(input: { + connectionId?: string; + syncId: string; + runId: string; + driver?: KtxConnectionDriver; + extractedAt?: string; + tables?: KtxSchemaTable[]; + }): Promise { + const connectionId = input.connectionId ?? 'warehouse'; + const report = scanReport({ + connectionId, + syncId: input.syncId, + runId: input.runId, + driver: input.driver, + createdAt: input.extractedAt, + }); + const root = report.artifactPaths.rawSourcesDir; + await project.fileStore.writeFile( + `${root}/connection.json`, + JSON.stringify( + { + connectionId, + driver: report.driver, + extractedAt: input.extractedAt ?? report.createdAt, + scope: { schemas: ['public'] }, + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed connection', + ); + for (const table of input.tables ?? [ordersTable()]) { + await project.fileStore.writeFile( + `${root}/tables/${table.db ?? 
'default'}-${table.name}.json`, + JSON.stringify(table, null, 2), + 'ktx', + 'ktx@example.com', + `seed ${table.name}`, + ); + } + await project.fileStore.writeFile( + `${root}/scan-report.json`, + JSON.stringify(report, null, 2), + 'ktx', + 'ktx@example.com', + 'seed scan report', + ); + } + + it('returns the latest scan snapshot table details for a display string', async () => { + await seedScan({ syncId: 'sync-1', runId: 'scan-old', extractedAt: '2026-05-14T08:00:00.000Z' }); + await seedScan({ + syncId: 'sync-2', + runId: 'scan-new', + extractedAt: '2026-05-14T09:00:00.000Z', + tables: [ordersTable({ estimatedRows: 99 })], + }); + const service = createKtxEntityDetailsService(project); + + const result = await service.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders' }], + }); + + expect(result.results).toHaveLength(1); + expect(result.results[0]).toMatchObject({ + ok: true, + connectionId: 'warehouse', + display: 'public.orders', + estimatedRows: 99, + snapshot: { + syncId: 'sync-2', + scanRunId: 'scan-new', + extractedAt: '2026-05-14T09:00:00.000Z', + }, + columns: [ + { name: 'id', nativeType: 'integer', primaryKey: true }, + { name: 'status', nativeType: 'text', nullable: false }, + ], + }); + }); + + it('filters requested columns while keeping full-table foreign keys', async () => { + await seedScan({ syncId: 'sync-1', runId: 'scan-1' }); + const service = createKtxEntityDetailsService(project); + + const result = await service.read({ + connectionId: 'warehouse', + entities: [{ table: { catalog: null, db: 'public', name: 'orders' }, columns: ['status'] }], + }); + + expect(result.results[0]).toMatchObject({ + ok: true, + columns: [{ name: 'status' }], + foreignKeys: [ + { + fromColumn: 'customer_id', + toDb: 'public', + toTable: 'customers', + toColumn: 'id', + }, + ], + }); + }); + + it('returns a structured missing-scan error', async () => { + const service = createKtxEntityDetailsService(project); + + const result = await 
service.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders' }], + }); + + expect(result.results).toEqual([ + { + ok: false, + connectionId: 'warehouse', + table: 'public.orders', + error: { + code: 'scan_missing', + message: 'No live-database scan found for connection "warehouse"; run `ktx ingest warehouse` or `ktx scan warehouse`.', + }, + }, + ]); + }); + + it('reports ambiguous bare table names across schemas', async () => { + await seedScan({ + syncId: 'sync-1', + runId: 'scan-1', + tables: [ordersTable({ db: 'public' }), ordersTable({ db: 'archive' })], + }); + const service = createKtxEntityDetailsService(project); + + const result = await service.read({ + connectionId: 'warehouse', + entities: [{ table: 'orders' }], + }); + + expect(result.results[0]).toMatchObject({ + ok: false, + error: { + code: 'ambiguous_table', + candidates: [ + { tableRef: { catalog: null, db: 'archive', name: 'orders' }, display: 'archive.orders' }, + { tableRef: { catalog: null, db: 'public', name: 'orders' }, display: 'public.orders' }, + ], + }, + }); + }); + + it('reports missing requested columns with available column candidates', async () => { + await seedScan({ syncId: 'sync-1', runId: 'scan-1' }); + const service = createKtxEntityDetailsService(project); + + const result = await service.read({ + connectionId: 'warehouse', + entities: [{ table: 'public.orders', columns: ['status', 'plan_tier'] }], + }); + + expect(result.results[0]).toMatchObject({ + ok: false, + error: { + code: 'column_not_found', + message: 'Column(s) not found on public.orders: plan_tier', + candidates: ['id', 'status'], + }, + }); + }); +}); diff --git a/packages/context/src/scan/entity-details.ts b/packages/context/src/scan/entity-details.ts new file mode 100644 index 00000000..6e95690e --- /dev/null +++ b/packages/context/src/scan/entity-details.ts @@ -0,0 +1,315 @@ +import type { KtxLocalProject } from '../project/index.js'; +import { readLocalScanStructuralSnapshot } from 
'./local-structural-artifacts.js'; +import type { + KtxConnectionDriver, + KtxScanReport, + KtxSchemaColumn, + KtxSchemaSnapshot, + KtxSchemaTable, + KtxTableRef, +} from './types.js'; + +export type KtxEntityDetailsTableInput = string | KtxTableRef; + +export interface KtxEntityDetailsInput { + connectionId: string; + entities: Array<{ + table: KtxEntityDetailsTableInput; + columns?: string[]; + }>; +} + +export interface KtxEntityDetailsSnapshotInfo { + syncId: string; + extractedAt: string; + scanRunId: string | null; +} + +export interface KtxEntityDetailsColumn { + name: string; + nativeType: string; + normalizedType: string; + dimensionType: KtxSchemaColumn['dimensionType']; + nullable: boolean; + primaryKey: boolean; + comment: string | null; +} + +export interface KtxEntityDetailsRecord { + ok: true; + connectionId: string; + tableRef: KtxTableRef; + display: string; + kind: KtxSchemaTable['kind']; + comment: string | null; + estimatedRows: number | null; + columns: KtxEntityDetailsColumn[]; + foreignKeys: KtxSchemaTable['foreignKeys']; + snapshot: KtxEntityDetailsSnapshotInfo; +} + +export type KtxEntityDetailsErrorCode = 'scan_missing' | 'table_not_found' | 'ambiguous_table' | 'column_not_found'; + +export interface KtxEntityDetailsErrorResult { + ok: false; + connectionId: string; + table: KtxEntityDetailsTableInput; + snapshot?: KtxEntityDetailsSnapshotInfo; + error: { + code: KtxEntityDetailsErrorCode; + message: string; + candidates?: Array<{ tableRef: KtxTableRef; display: string }> | string[]; + }; +} + +export interface KtxEntityDetailsResponse { + results: Array; +} + +interface LatestScan { + report: KtxScanReport; + snapshot: KtxSchemaSnapshot; +} + +interface ResolveResult { + table: KtxSchemaTable | null; + error?: Omit & { message: string }; +} + +function normalize(value: string | null | undefined): string { + return (value ?? 
'').toLowerCase(); +} + +function refsEqual(left: KtxTableRef, right: KtxTableRef): boolean { + return ( + normalize(left.catalog) === normalize(right.catalog) && + normalize(left.db) === normalize(right.db) && + normalize(left.name) === normalize(right.name) + ); +} + +function cleanIdentifierPart(part: string): string { + return part.trim().replace(/^["'`\[]|["'`\]]$/g, ''); +} + +function splitDisplay(display: string): string[] { + return display + .trim() + .split('.') + .map(cleanIdentifierPart) + .filter(Boolean); +} + +function displayForTable(driver: KtxConnectionDriver, table: KtxTableRef): string { + if (driver === 'sqlite') { + return table.name; + } + return [table.catalog, table.db, table.name].filter((part): part is string => Boolean(part)).join('.'); +} + +function tableRef(table: KtxSchemaTable): KtxTableRef { + return { catalog: table.catalog, db: table.db, name: table.name }; +} + +function candidateList( + driver: KtxConnectionDriver, + tables: KtxSchemaTable[], +): Array<{ tableRef: KtxTableRef; display: string }> { + return tables + .map((table) => ({ + tableRef: tableRef(table), + display: displayForTable(driver, table), + })) + .sort((left, right) => left.display.localeCompare(right.display)); +} + +function parseDisplayRef(driver: KtxConnectionDriver, display: string): KtxTableRef | null { + const parts = splitDisplay(display); + if (driver === 'sqlite') { + return parts.length === 1 ? { catalog: null, db: null, name: parts[0]! } : null; + } + if (driver === 'bigquery' || driver === 'snowflake' || driver === 'sqlserver') { + return parts.length === 3 ? { catalog: parts[0]!, db: parts[1]!, name: parts[2]! } : null; + } + if (parts.length === 2) { + return { catalog: null, db: parts[0]!, name: parts[1]! }; + } + if (parts.length === 3) { + return { catalog: parts[0]!, db: parts[1]!, name: parts[2]! 
}; + } + return null; +} + +/** Resolves a requested table against the snapshot. A structured ref is matched with refsEqual. A string is first parsed with parseDisplayRef and matched as a ref; only when parsing yields null does it fall back to a case-insensitive match on the bare table name, which reports ambiguous_table when more than one table shares that name. Every error result carries a candidates list. */ +function resolveTable(snapshot: KtxSchemaSnapshot, input: KtxEntityDetailsTableInput): ResolveResult { + if (typeof input !== 'string') { + const table = snapshot.tables.find((candidate) => refsEqual(candidate, input)) ?? null; + return table + ? { table } + : { + table: null, + error: { + code: 'table_not_found', + message: `Table not found in latest scan: ${displayForTable(snapshot.driver, input)}`, + candidates: candidateList(snapshot.driver, snapshot.tables), + }, + }; + } + + const parsed = parseDisplayRef(snapshot.driver, input); + if (parsed) { + const table = snapshot.tables.find((candidate) => refsEqual(candidate, parsed)) ?? null; + return table + ? { table } + : { + table: null, + error: { + code: 'table_not_found', + message: `Table not found in latest scan: ${input}`, + candidates: candidateList(snapshot.driver, snapshot.tables), + }, + }; + } + + /* NOTE(review): this fallback compares against the raw input, so quote characters that cleanIdentifierPart would strip are not stripped here - confirm intended. */ + const byName = snapshot.tables.filter((candidate) => normalize(candidate.name) === normalize(input)); + if (byName.length === 1) { + return { table: byName[0]! 
}; + } + if (byName.length > 1) { + return { + table: null, + error: { + code: 'ambiguous_table', + message: `Table name "${input}" is ambiguous across schemas/catalogs; pass a structured table ref.`, + candidates: candidateList(snapshot.driver, byName), + }, + }; + } + return { + table: null, + error: { + code: 'table_not_found', + message: `Table not found in latest scan: ${input}`, + candidates: candidateList(snapshot.driver, snapshot.tables), + }, + }; +} + +/** Copies the seven column fields exposed by entity details (name, nativeType, normalizedType, dimensionType, nullable, primaryKey, comment) from a schema column. */ +function toColumn(column: KtxSchemaColumn): KtxEntityDetailsColumn { + return { + name: column.name, + nativeType: column.nativeType, + normalizedType: column.normalizedType, + dimensionType: column.dimensionType, + nullable: column.nullable, + primaryKey: column.primaryKey, + comment: column.comment, + }; +} + +/** Builds the snapshot info attached to results: syncId and run id come from the scan report (run id defaults to null), extractedAt comes from the snapshot. */ +function snapshotInfo(report: KtxScanReport, snapshot: KtxSchemaSnapshot): KtxEntityDetailsSnapshotInfo { + return { + syncId: report.syncId, + extractedAt: snapshot.extractedAt, + scanRunId: report.runId ?? null, + }; +} + +/** Reads a file through the project file store and JSON-parses its content. The parsed value is cast to the expected type without any runtime validation. */ +async function readJson(project: KtxLocalProject, path: string): Promise { + return JSON.parse((await project.fileStore.readFile(path)).content) as T; +} + +/** Loads the latest scan report and structural snapshot for a connection. Returns null when listing the live-database root throws or when no scan-report.json path exists. The snapshot is read from the raw sources directory named in the report, falling back to the directory that holds the report. NOTE(review): the latest report is the last path in plain string sort order, which assumes sync ids sort lexicographically by recency - confirm. */ +async function latestScan(project: KtxLocalProject, connectionId: string): Promise { + const root = `raw-sources/${connectionId}/live-database`; + let listed; + try { + listed = await project.fileStore.listFiles(root); + } catch { + return null; + } + const reportPath = listed.files.filter((path) => path.endsWith('/scan-report.json')).sort().at(-1); + if (!reportPath) { + return null; + } + const report = await readJson(project, reportPath); + const rawSourcesDir = report.artifactPaths.rawSourcesDir ?? 
reportPath.slice(0, -'/scan-report.json'.length); + const snapshot = await readLocalScanStructuralSnapshot({ + project, + connectionId, + driver: report.driver, + rawSourcesDir, + extractedAtFallback: report.createdAt, + }); + return { report, snapshot }; +} + +/** Creates the entity details service for a local project. read() loads the latest scan on every call and returns one result per requested entity, in request order: scan_missing for every entity when no scan exists, the table resolution error when the table cannot be resolved, column_not_found when a requested column is missing, otherwise the table record limited to the requested columns (all columns when none are requested). */ +export function createKtxEntityDetailsService(project: KtxLocalProject) { + return { + async read(input: KtxEntityDetailsInput): Promise { + const scan = await latestScan(project, input.connectionId); + if (!scan) { + return { + results: input.entities.map((entity) => ({ + ok: false, + connectionId: input.connectionId, + table: entity.table, + error: { + code: 'scan_missing', + message: `No live-database scan found for connection "${input.connectionId}"; run \`ktx ingest ${input.connectionId}\` or \`ktx scan ${input.connectionId}\`.`, + }, + })), + }; + } + + const info = snapshotInfo(scan.report, scan.snapshot); + const results: KtxEntityDetailsResponse['results'] = []; + for (const entity of input.entities) { + const resolved = resolveTable(scan.snapshot, entity.table); + if (!resolved.table) { + results.push({ + ok: false, + connectionId: input.connectionId, + table: entity.table, + snapshot: info, + error: resolved.error!, + }); + continue; + } + + /* Requested column names are matched case-insensitively; duplicate requests collapse in the Set. */ + const requested = new Set((entity.columns ?? []).map((column) => normalize(column))); + const columns = requested.size + ? 
resolved.table.columns.filter((column) => requested.has(normalize(column.name))) + : resolved.table.columns; + if (requested.size && columns.length !== requested.size) { + const found = new Set(columns.map((column) => normalize(column.name))); + const missing = [...requested].filter((column) => !found.has(column)); + results.push({ + ok: false, + connectionId: input.connectionId, + table: entity.table, + snapshot: info, + error: { + code: 'column_not_found', + message: `Column(s) not found on ${displayForTable(scan.snapshot.driver, resolved.table)}: ${missing.join(', ')}`, + candidates: resolved.table.columns.map((column) => column.name), + }, + }); + continue; + } + + results.push({ + ok: true, + connectionId: input.connectionId, + tableRef: tableRef(resolved.table), + display: displayForTable(scan.snapshot.driver, resolved.table), + kind: resolved.table.kind, + comment: resolved.table.comment, + estimatedRows: resolved.table.estimatedRows, + columns: columns.map(toColumn), + foreignKeys: resolved.table.foreignKeys, + snapshot: info, + }); + } + return { results }; + }, + }; +} diff --git a/packages/context/src/scan/index.ts b/packages/context/src/scan/index.ts index e7207b49..4360fec7 100644 --- a/packages/context/src/scan/index.ts +++ b/packages/context/src/scan/index.ts @@ -60,6 +60,24 @@ export { ktxScanErrorMessage, skippedKtxScanEnrichmentSummary, } from './enrichment-summary.js'; +export type { + KtxEntityDetailsColumn, + KtxEntityDetailsErrorCode, + KtxEntityDetailsErrorResult, + KtxEntityDetailsInput, + KtxEntityDetailsRecord, + KtxEntityDetailsResponse, + KtxEntityDetailsSnapshotInfo, + KtxEntityDetailsTableInput, +} from './entity-details.js'; +export { createKtxEntityDetailsService } from './entity-details.js'; +export type { + DisplayTargetResolution, + RawSchemaHit, + TableDetail, + WarehouseCatalogServiceDeps, +} from './warehouse-catalog.js'; +export { WarehouseCatalogService } from './warehouse-catalog.js'; export type { KtxColumnSampleUpdate, 
KtxDescriptionSource, diff --git a/packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.test.ts b/packages/context/src/scan/warehouse-catalog.test.ts similarity index 85% rename from packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.test.ts rename to packages/context/src/scan/warehouse-catalog.test.ts index 03340ace..7cabc1df 100644 --- a/packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.test.ts +++ b/packages/context/src/scan/warehouse-catalog.test.ts @@ -2,8 +2,8 @@ import { mkdtemp, rm } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { initKtxProject, type KtxLocalProject } from '../../../project/index.js'; -import { WarehouseCatalogService } from './warehouse-catalog.service.js'; +import { initKtxProject, type KtxLocalProject } from '../project/index.js'; +import { WarehouseCatalogService } from './warehouse-catalog.js'; describe('WarehouseCatalogService', () => { let tempDir: string; @@ -18,8 +18,8 @@ describe('WarehouseCatalogService', () => { await rm(tempDir, { recursive: true, force: true }); }); - async function seedLiveDatabaseScan(connectionName = 'warehouse', syncId = 'sync-2', driver = 'postgres') { - const root = `raw-sources/${connectionName}/live-database/${syncId}`; + async function seedLiveDatabaseScan(connectionId = 'warehouse', syncId = 'sync-2', driver = 'postgres') { + const root = `raw-sources/${connectionId}/live-database/${syncId}`; const tableRef = { catalog: driver === 'bigquery' ? 'analytics' : null, db: driver === 'sqlite' ? 
null : 'public', @@ -27,7 +27,7 @@ describe('WarehouseCatalogService', () => { }; await project.fileStore.writeFile( `${root}/connection.json`, - JSON.stringify({ connectionId: connectionName, driver, extractedAt: '2026-05-12T00:00:00.000Z' }, null, 2), + JSON.stringify({ connectionId, driver, extractedAt: '2026-05-12T00:00:00.000Z' }, null, 2), 'ktx', 'ktx@example.com', 'seed connection', @@ -75,7 +75,7 @@ describe('WarehouseCatalogService', () => { `${root}/enrichment/relationship-profile.json`, JSON.stringify( { - connectionId: connectionName, + connectionId, driver, sqlAvailable: true, queryCount: 3, @@ -113,10 +113,10 @@ describe('WarehouseCatalogService', () => { const catalog = new WarehouseCatalogService({ fileStore: project.fileStore }); await expect(catalog.getLatestSyncId('warehouse')).resolves.toBe('sync-2'); - const detail = await catalog.getTable({ connectionName: 'warehouse', catalog: null, db: 'public', name: 'orders' }); + const detail = await catalog.getTable({ connectionId: 'warehouse', catalog: null, db: 'public', name: 'orders' }); expect(detail).toMatchObject({ - connectionName: 'warehouse', + connectionId: 'warehouse', display: 'public.orders', rowCount: 12, columns: [ @@ -124,11 +124,20 @@ describe('WarehouseCatalogService', () => { { name: 'status', nativeType: 'text', sampleValues: ['paid', 'refunded'], distinctCount: 2 }, ], }); + expect(detail).not.toHaveProperty(['connection', 'Name'].join('')); + + const hits = await catalog.searchByName('warehouse', 'orders', 5); + expect(hits[0]).toMatchObject({ + kind: 'table', + connectionId: 'warehouse', + display: 'public.orders', + }); + expect(hits[0]).not.toHaveProperty(['connection', 'Name'].join('')); }); it('returns scanAvailable=false when no live-database scan exists', async () => { const catalog = new WarehouseCatalogService({ fileStore: project.fileStore }); - await expect(catalog.getTable({ connectionName: 'missing', catalog: null, db: 'public', name: 'orders' })).resolves.toBeNull(); 
+ await expect(catalog.getTable({ connectionId: 'missing', catalog: null, db: 'public', name: 'orders' })).resolves.toBeNull(); await expect(catalog.hasScan('missing')).resolves.toBe(false); }); diff --git a/packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.ts b/packages/context/src/scan/warehouse-catalog.ts similarity index 87% rename from packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.ts rename to packages/context/src/scan/warehouse-catalog.ts index b916107c..8cbe324d 100644 --- a/packages/context/src/ingest/tools/warehouse-verification/warehouse-catalog.service.ts +++ b/packages/context/src/scan/warehouse-catalog.ts @@ -1,12 +1,12 @@ -import { getDialectForDriver } from '../../../connections/index.js'; -import type { KtxFileStorePort } from '../../../core/index.js'; +import { getDialectForDriver } from '../connections/index.js'; +import type { KtxFileStorePort } from '../core/index.js'; import type { KtxConnectionDriver, KtxSchemaColumn, KtxSchemaForeignKey, KtxSchemaTable, KtxTableRef, -} from '../../../scan/types.js'; +} from './types.js'; type CatalogDriver = KtxConnectionDriver | 'sqlite3'; @@ -24,7 +24,7 @@ interface WarehouseColumnDetail extends KtxSchemaColumn { } export interface TableDetail { - connectionName: string; + connectionId: string; catalog: string | null; db: string | null; name: string; @@ -40,14 +40,14 @@ export interface TableDetail { export type RawSchemaHit = | { kind: 'table'; - connectionName: string; + connectionId: string; ref: KtxTableRef; display: string; matchedOn: 'name' | 'db' | 'comment' | 'description'; } | { kind: 'column'; - connectionName: string; + connectionId: string; ref: KtxTableRef & { column: string }; display: string; matchedOn: 'name' | 'comment' | 'description'; @@ -80,7 +80,7 @@ interface RelationshipProfileArtifact { } interface ConnectionCatalog { - connectionName: string; + connectionId: string; syncId: string; driver: CatalogDriver; tables: 
KtxSchemaTable[]; @@ -250,21 +250,21 @@ export class WarehouseCatalogService { constructor(private readonly deps: WarehouseCatalogServiceDeps) {} - async hasScan(connectionName: string): Promise { - return (await this.loadCatalog(connectionName)) !== null; + async hasScan(connectionId: string): Promise { + return (await this.loadCatalog(connectionId)) !== null; } - async getLatestSyncId(connectionName: string): Promise { - return (await this.loadCatalog(connectionName))?.syncId ?? null; + async getLatestSyncId(connectionId: string): Promise { + return (await this.loadCatalog(connectionId))?.syncId ?? null; } - async listTables(connectionName: string): Promise { - const catalog = await this.loadCatalog(connectionName); + async listTables(connectionId: string): Promise { + const catalog = await this.loadCatalog(connectionId); return catalog?.tables.map((table) => ({ catalog: table.catalog, db: table.db, name: table.name })) ?? []; } - async getTable(ref: { connectionName: string } & KtxTableRef): Promise { - const catalog = await this.loadCatalog(ref.connectionName); + async getTable(ref: { connectionId: string } & KtxTableRef): Promise { + const catalog = await this.loadCatalog(ref.connectionId); if (!catalog) { return null; } @@ -277,7 +277,7 @@ export class WarehouseCatalogService { const profileColumns = catalog.profile?.columns ?? 
{}; return { - connectionName: ref.connectionName, + connectionId: ref.connectionId, catalog: table.catalog, db: table.db, name: table.name, @@ -310,14 +310,14 @@ export class WarehouseCatalogService { } async resolveDisplay( - connectionName: string, + connectionId: string, display: string, ): Promise<{ resolved: KtxTableRef | null; candidates: KtxTableRef[]; dialect: string; }> { - const catalog = await this.loadCatalog(connectionName); + const catalog = await this.loadCatalog(connectionId); if (!catalog) { return { resolved: null, candidates: [], dialect: 'unknown' }; } @@ -333,14 +333,14 @@ export class WarehouseCatalogService { return { resolved: { catalog: table.catalog, db: table.db, name: table.name }, candidates: [], dialect }; } - async resolveDisplayTarget(connectionName: string, display: string): Promise { - const catalog = await this.loadCatalog(connectionName); + async resolveDisplayTarget(connectionId: string, display: string): Promise { + const catalog = await this.loadCatalog(connectionId); if (!catalog) { return { resolved: null, candidates: [], dialect: 'unknown' }; } const dialect = getDialectForDriver(catalog.driver).type; - const tableResolution = await this.resolveDisplay(connectionName, display); + const tableResolution = await this.resolveDisplay(connectionId, display); if (tableResolution.resolved) { return tableResolution; } @@ -367,8 +367,8 @@ export class WarehouseCatalogService { }; } - async searchByName(connectionName: string, query: string, limit: number): Promise { - const catalog = await this.loadCatalog(connectionName); + async searchByName(connectionId: string, query: string, limit: number): Promise { + const catalog = await this.loadCatalog(connectionId); if (!catalog) { return []; } @@ -378,7 +378,7 @@ export class WarehouseCatalogService { if (tableMatch) { hits.push({ kind: 'table', - connectionName, + connectionId, ref: { catalog: table.catalog, db: table.db, name: table.name }, display: formatDisplay(catalog.driver, 
table), matchedOn: tableMatch, @@ -391,7 +391,7 @@ export class WarehouseCatalogService { } hits.push({ kind: 'column', - connectionName, + connectionId, ref: { catalog: table.catalog, db: table.db, name: table.name, column: column.name }, display: `${formatDisplay(catalog.driver, table)}.${column.name}`, matchedOn: columnMatch, @@ -401,18 +401,18 @@ export class WarehouseCatalogService { return hits.slice(0, Math.max(0, limit)); } - private loadCatalog(connectionName: string): Promise { - const existing = this.catalogs.get(connectionName); + private loadCatalog(connectionId: string): Promise { + const existing = this.catalogs.get(connectionId); if (existing) { return existing; } - const pending = this.readCatalog(connectionName); - this.catalogs.set(connectionName, pending); + const pending = this.readCatalog(connectionId); + this.catalogs.set(connectionId, pending); return pending; } - private async readCatalog(connectionName: string): Promise { - const root = `raw-sources/${connectionName}/live-database`; + private async readCatalog(connectionId: string): Promise { + const root = `raw-sources/${connectionId}/live-database`; const listed = await this.deps.fileStore.listFiles(root); const connectionFiles = listed.files.filter((file) => file.endsWith('/connection.json')).sort(); const latestConnectionPath = connectionFiles.at(-1); @@ -438,7 +438,7 @@ export class WarehouseCatalogService { } return { - connectionName, + connectionId, syncId, driver: connection.driver ?? profile?.driver ?? 
'postgres', tables, diff --git a/packages/context/src/search/discover.test.ts b/packages/context/src/search/discover.test.ts new file mode 100644 index 00000000..7f9df413 --- /dev/null +++ b/packages/context/src/search/discover.test.ts @@ -0,0 +1,264 @@ +import { mkdtemp, rm } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { initKtxProject, type KtxLocalProject } from '../project/index.js'; +import { writeLocalKnowledgePage } from '../wiki/local-knowledge.js'; +import { createKtxDiscoverDataService } from './discover.js'; + +describe('createKtxDiscoverDataService', () => { + let tempDir: string; + let project: KtxLocalProject; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-discover-data-')); + project = await initKtxProject({ projectDir: join(tempDir, 'project') }); + project.config.connections.warehouse = { driver: 'postgres', url: 'env:DATABASE_URL' }; + project.config.connections.billing = { driver: 'postgres', url: 'env:BILLING_DATABASE_URL' }; + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + async function seedWiki(): Promise { + await writeLocalKnowledgePage(project, { + key: 'orders-playbook', + scope: 'GLOBAL', + summary: 'Paid order operations', + content: 'Use paid orders and order_count to inspect monthly customer activity for Acme Corp.', + tags: ['orders'], + }); + } + + async function seedSl(): Promise { + await project.fileStore.writeFile( + 'semantic-layer/warehouse/orders.yaml', + [ + 'name: orders', + 'descriptions:', + ' user: Paid order facts', + 'table: public.orders', + 'grain: [id]', + 'columns:', + ' - name: status', + ' type: string', + ' descriptions:', + ' user: Payment status for the order', + ' - name: ordered_at', + ' type: time', + 'measures:', + ' - name: order_count', + ' expr: count(*)', + ' description: Number of paid orders', + 
'', + ].join('\n'), + 'ktx', + 'ktx@example.com', + 'seed sl source', + ); + } + + async function seedScan(input: { + connectionId?: string; + syncId: string; + tableName?: string; + comment?: string; + sampleValues?: string[]; + }): Promise { + const connectionId = input.connectionId ?? 'warehouse'; + const root = `raw-sources/${connectionId}/live-database/${input.syncId}`; + const tableName = input.tableName ?? 'orders'; + await project.fileStore.writeFile( + `${root}/connection.json`, + JSON.stringify( + { + connectionId, + driver: 'postgres', + extractedAt: `2026-05-14T09:00:00.000Z`, + scope: { schemas: ['public'] }, + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed scan connection', + ); + await project.fileStore.writeFile( + `${root}/tables/public-${tableName}.json`, + JSON.stringify( + { + catalog: null, + db: 'public', + name: tableName, + kind: 'table', + comment: input.comment ?? 'Orders table from warehouse', + estimatedRows: 123, + descriptions: { db: input.comment ?? 'Orders table from warehouse' }, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: 'Order id', + }, + { + name: 'status', + nativeType: 'text', + normalizedType: 'text', + dimensionType: 'string', + nullable: false, + primaryKey: false, + comment: 'Order status', + sampleValues: input.sampleValues ?? 
['paid', 'pending'], + }, + ], + foreignKeys: [], + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed table', + ); + await project.fileStore.writeFile( + `${root}/scan-report.json`, + JSON.stringify( + { + connectionId, + driver: 'postgres', + syncId: input.syncId, + runId: `scan-${input.syncId}`, + trigger: 'mcp', + mode: 'enriched', + dryRun: false, + artifactPaths: { + rawSourcesDir: root, + reportPath: `${root}/scan-report.json`, + manifestShards: [], + enrichmentArtifacts: [], + }, + diffSummary: { + tablesAdded: 1, + tablesModified: 0, + tablesDeleted: 0, + tablesUnchanged: 0, + columnsAdded: 0, + columnsModified: 0, + columnsDeleted: 0, + }, + manifestShardsWritten: 0, + structuralSyncStats: { + tablesCreated: 0, + tablesUpdated: 0, + tablesDeleted: 0, + columnsCreated: 0, + columnsUpdated: 0, + columnsDeleted: 0, + }, + enrichment: { + dataDictionary: 'completed', + tableDescriptions: 'completed', + columnDescriptions: 'completed', + embeddings: 'skipped', + deterministicRelationships: 'skipped', + llmRelationshipValidation: 'skipped', + statisticalValidation: 'skipped', + }, + capabilityGaps: [], + warnings: [], + relationships: { accepted: 0, review: 0, rejected: 0, skipped: 0 }, + enrichmentState: { resumedStages: [], completedStages: [], failedStages: [] }, + createdAt: '2026-05-14T09:00:00.000Z', + }, + null, + 2, + ), + 'ktx', + 'ktx@example.com', + 'seed scan report', + ); + } + + it('returns unified ranked refs across wiki, semantic-layer, and raw schema', async () => { + await seedWiki(); + await seedSl(); + await seedScan({ syncId: 'sync-1', sampleValues: ['paid', 'refunded'] }); + const service = createKtxDiscoverDataService(project, { userId: 'local-user' }); + + const results = await service.search({ query: 'paid orders', connectionId: 'warehouse', limit: 10 }); + + expect(results.map((result) => result.kind)).toEqual( + expect.arrayContaining(['wiki', 'sl_source', 'sl_measure', 'sl_dimension', 'table', 'column']), + ); + 
expect(results.every((result) => result.score >= 0 && result.score <= 1)).toBe(true); + expect(results.every((result) => result.snippet === null || result.snippet.length <= 200)).toBe(true); + expect(results).toContainEqual( + expect.objectContaining({ + kind: 'table', + id: 'public.orders', + connectionId: 'warehouse', + tableRef: { catalog: null, db: 'public', name: 'orders' }, + matchedOn: expect.stringMatching(/name|description|comment|display/), + }), + ); + expect(results).toContainEqual( + expect.objectContaining({ + kind: 'column', + id: 'public.orders.status', + connectionId: 'warehouse', + columnName: 'status', + matchedOn: expect.stringMatching(/name|comment|description|sample_value/), + }), + ); + expect(results).toContainEqual( + expect.objectContaining({ + kind: 'sl_measure', + id: 'orders.order_count', + connectionId: 'warehouse', + summary: 'Number of paid orders', + snippet: 'count(*)', + matchedOn: expect.stringMatching(/name|description|expr/), + }), + ); + }); + + it('honors kind filters and connection scope', async () => { + await seedWiki(); + await seedSl(); + await seedScan({ syncId: 'sync-1', connectionId: 'warehouse', tableName: 'orders' }); + await seedScan({ syncId: 'sync-2', connectionId: 'billing', tableName: 'invoices', comment: 'Billing invoices' }); + const service = createKtxDiscoverDataService(project); + + const results = await service.search({ + query: 'orders', + connectionId: 'warehouse', + kinds: ['table', 'column'], + limit: 10, + }); + + expect(results.every((result) => result.kind === 'table' || result.kind === 'column')).toBe(true); + expect(results.every((result) => result.connectionId === 'warehouse')).toBe(true); + expect(results.some((result) => result.id.includes('invoices'))).toBe(false); + expect(results.some((result) => result.kind === 'wiki')).toBe(false); + }); + + it('re-reads the latest scan artifacts on each call', async () => { + await seedScan({ syncId: 'sync-1', tableName: 'orders', comment: 'Old orders 
table' }); + const service = createKtxDiscoverDataService(project); + await expect( + service.search({ query: 'orders', connectionId: 'warehouse', kinds: ['table'], limit: 10 }), + ).resolves.toEqual(expect.arrayContaining([expect.objectContaining({ id: 'public.orders' })])); + + await seedScan({ syncId: 'sync-2', tableName: 'invoices', comment: 'Invoice facts' }); + const fresh = await service.search({ query: 'invoice', connectionId: 'warehouse', kinds: ['table'], limit: 10 }); + + expect(fresh).toEqual(expect.arrayContaining([expect.objectContaining({ id: 'public.invoices' })])); + expect(fresh.some((result) => result.id === 'public.orders')).toBe(false); + }); +}); diff --git a/packages/context/src/search/discover.ts b/packages/context/src/search/discover.ts new file mode 100644 index 00000000..53694f6a --- /dev/null +++ b/packages/context/src/search/discover.ts @@ -0,0 +1,466 @@ +import type { KtxEmbeddingPort } from '../core/index.js'; +import type { KtxLocalProject } from '../project/index.js'; +import type { KtxScanReport, KtxSchemaColumn, KtxSchemaTable, KtxTableRef } from '../scan/index.js'; +import { DEFAULT_PRIORITY, loadLocalSlSourceRecords, resolveDescription } from '../sl/index.js'; +import { readLocalKnowledgePage, searchLocalKnowledgePages } from '../wiki/local-knowledge.js'; +import { HybridSearchCore, type FusedSearchCandidate, type SearchCandidateGenerator } from './index.js'; + +export type KtxDiscoverDataKind = 'wiki' | 'sl_source' | 'sl_measure' | 'sl_dimension' | 'table' | 'column'; +export type KtxDiscoverDataMatchedOn = 'name' | 'display' | 'description' | 'comment' | 'expr' | 'sample_value' | 'body'; + +export interface KtxDiscoverDataInput { + query: string; + connectionId?: string; + kinds?: KtxDiscoverDataKind[]; + limit?: number; +} + +export interface KtxDiscoverDataRef { + kind: KtxDiscoverDataKind; + id: string; + score: number; + summary: string | null; + snippet: string | null; + matchedOn: KtxDiscoverDataMatchedOn; + 
connectionId?: string; + tableRef?: KtxTableRef; + columnName?: string; +} + +export type KtxDiscoverDataResponse = KtxDiscoverDataRef[]; + +export interface KtxDiscoverDataServiceOptions { + userId?: string; + embeddingService?: KtxEmbeddingPort | null; +} + +interface CandidateRecord { + ref: Omit; + rankScore: number; +} + +type RawTable = KtxSchemaTable & { + descriptions?: Record; + columns: Array; sampleValues?: unknown[] }>; +}; + +interface LatestScan { + report: KtxScanReport; + rawSourcesDir: string; + tables: RawTable[]; +} + +const ALL_KINDS: KtxDiscoverDataKind[] = ['wiki', 'sl_source', 'sl_measure', 'sl_dimension', 'table', 'column']; + +/** Lower-cases a value, treating null and undefined as the empty string. */ +function normalize(value: string | null | undefined): string { + return (value ?? '').toLowerCase(); +} + +/** Lower-cases the query and splits it into terms on every run of characters other than a-z, 0-9 and underscore, dropping empty terms. NOTE(review): non-ASCII letters act as separators, so a query written only in such letters yields no terms - confirm intended. */ +function queryTerms(query: string): string[] { + return query + .toLowerCase() + .split(/[^a-z0-9_]+/u) + .map((term) => term.trim()) + .filter(Boolean); +} + +/** Reports whether the given kind is in the requested set. */ +function hasKind(kinds: ReadonlySet, kind: KtxDiscoverDataKind): boolean { + return kinds.has(kind); +} + +/** Collapses whitespace runs to single spaces, trims, and truncates to at most 200 characters. Returns null for null, undefined or the empty string. */ +function cap200(value: string | null | undefined): string | null { + if (!value) { + return null; + } + const compact = value.replace(/\s+/g, ' ').trim(); + return compact.length > 200 ? compact.slice(0, 200) : compact; +} + +/** Returns an excerpt of the text around the earliest occurrence of any term: from up to 60 characters before the match to 140 characters after its start, passed through cap200. Uses the start of the text when no term occurs. Returns null for empty text. */ +function snippetAround(text: string | null | undefined, terms: readonly string[]): string | null { + if (!text) { + return null; + } + const lower = text.toLowerCase(); + const index = + terms + .map((term) => lower.indexOf(term)) + .filter((position) => position >= 0) + .sort((a, b) => a - b)[0] ?? 
0; + return cap200(text.slice(Math.max(0, index - 60), index + 140)); +} + +/** Fraction of terms that occur as substrings of the lower-cased value, between 0 and 1. Returns 0 when the value is empty or there are no terms. */ +function textScore(value: string | null | undefined, terms: readonly string[]): number { + const haystack = normalize(value); + if (!haystack || terms.length === 0) { + return 0; + } + const matched = terms.filter((term) => haystack.includes(term)).length; + return matched / terms.length; +} + +/** Scores each field as textScore times its weight and returns the highest-scoring field with a positive score; ties are broken by matchedOn in localeCompare order. Returns null when no field matches. */ +function bestField( + fields: Array<{ matchedOn: KtxDiscoverDataMatchedOn; text: string | null | undefined; weight: number }>, + terms: readonly string[], +): { matchedOn: KtxDiscoverDataMatchedOn; score: number; text: string | null } | null { + const scored = fields + .map((field) => ({ + matchedOn: field.matchedOn, + score: textScore(field.text, terms) * field.weight, + text: field.text ?? null, + })) + .filter((field) => field.score > 0) + .sort((left, right) => right.score - left.score || left.matchedOn.localeCompare(right.matchedOn)); + return scored[0] ?? null; +} + +/** Joins the non-empty catalog, db and name parts of a table ref with dots. */ +function displayForTable(table: KtxTableRef): string { + return [table.catalog, table.db, table.name].filter((part): part is string => Boolean(part)).join('.'); +} + +/** Projects a schema table down to its catalog, db and name triple. */ +function tableRef(table: KtxSchemaTable): KtxTableRef { + return { catalog: table.catalog, db: table.db, name: table.name }; +} + +/** Reads a file through the project file store and JSON-parses its content. The parsed value is cast to the expected type without any runtime validation. */ +async function readJson(project: KtxLocalProject, path: string): Promise { + return JSON.parse((await project.fileStore.readFile(path)).content) as T; +} + +/** Loads the latest scan report for a connection together with every table JSON file under its raw sources directory. Returns null when listing the live-database root throws or when no scan-report.json path exists. NOTE(review): the later listing of the tables directory is not wrapped in try/catch, so a failure there propagates to the caller - confirm intended. */ +async function latestScan(project: KtxLocalProject, connectionId: string): Promise { + const root = `raw-sources/${connectionId}/live-database`; + let files: string[]; + try { + files = (await project.fileStore.listFiles(root)).files; + } catch { + return null; + } + + const reportPath = files + .filter((path) => path.endsWith('/scan-report.json')) + .sort() + .at(-1); + if (!reportPath) { + return null; + } + const report = await readJson(project, reportPath); + const rawSourcesDir = report.artifactPaths.rawSourcesDir ?? 
reportPath.slice(0, -'/scan-report.json'.length); + const listedTables = await project.fileStore.listFiles(`${rawSourcesDir}/tables`); + const tables: RawTable[] = []; + for (const path of listedTables.files.filter((file) => file.endsWith('.json')).sort()) { + tables.push(await readJson(project, path)); + } + return { report, rawSourcesDir, tables }; +} + +/** Returns just the given connection id when one is supplied, otherwise every configured connection id in sorted order. */ +function configuredConnectionIds(project: KtxLocalProject, connectionId?: string): string[] { + return connectionId ? [connectionId] : Object.keys(project.config.connections).sort(); +} + +/** Builds wiki candidates from searchLocalKnowledgePages results. Each result page is re-read for its content; the rank score is the search score plus the best field score (0 when no field matches), and matchedOn defaults to body. The search limit passed down is never below 25. Records are returned sorted by descending rank score, then by id. */ +async function wikiCandidates( + project: KtxLocalProject, + input: KtxDiscoverDataInput, + options: KtxDiscoverDataServiceOptions, + terms: readonly string[], +): Promise { + const searchResults = await searchLocalKnowledgePages(project, { + query: input.query, + userId: options.userId, + embeddingService: options.embeddingService ?? null, + limit: Math.max(input.limit ?? 15, 25), + }); + const records: CandidateRecord[] = []; + for (const result of searchResults) { + const page = await readLocalKnowledgePage(project, { key: result.key, userId: options.userId }); + const content = page?.content ?? ''; + const matched = bestField( + [ + { matchedOn: 'name', text: result.key, weight: 1.1 }, + { matchedOn: 'description', text: result.summary, weight: 1 }, + { matchedOn: 'body', text: content, weight: 0.8 }, + ], + terms, + ); + records.push({ + rankScore: result.score + (matched?.score ?? 0), + ref: { + kind: 'wiki', + id: result.key, + summary: result.summary || null, + snippet: snippetAround(content, terms), + matchedOn: matched?.matchedOn ?? 
'body', + }, + }); + } + return records.sort((left, right) => right.rankScore - left.rankScore || left.ref.id.localeCompare(right.ref.id)); +} + +/** Builds semantic-layer candidates (sl_source, sl_measure, sl_dimension) for each searched connection, emitting only the requested kinds and only entries for which bestField finds a match. A failure to load the sources of a connection is swallowed and treated as no sources. Records are returned sorted by descending rank score, then by id. */ +async function slCandidates( + project: KtxLocalProject, + input: KtxDiscoverDataInput, + kinds: ReadonlySet, + terms: readonly string[], +): Promise { + const records: CandidateRecord[] = []; + for (const connectionId of configuredConnectionIds(project, input.connectionId)) { + const sources = await loadLocalSlSourceRecords(project, { connectionId }).catch(() => []); + for (const sourceRecord of sources) { + const source = sourceRecord.source; + if (hasKind(kinds, 'sl_source')) { + const description = resolveDescription(source.descriptions, { priority: DEFAULT_PRIORITY }); + const matched = bestField( + [ + { matchedOn: 'name', text: source.name, weight: 1.2 }, + { matchedOn: 'description', text: description, weight: 1 }, + { matchedOn: 'display', text: source.table ?? source.sql ?? null, weight: 0.8 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'sl_source', + id: source.name, + connectionId, + summary: description, + snippet: + matched.matchedOn === 'description' + ? snippetAround(description, terms) + : cap200( + `${source.name}: ${[ + ...source.measures.map((measure) => measure.name), + ...source.columns.map((column) => column.name), + ] + .slice(0, 3) + .join(', ')}`, + ), + matchedOn: matched.matchedOn, + }, + }); + } + } + + if (hasKind(kinds, 'sl_measure')) { + for (const measure of source.measures) { + const matched = bestField( + [ + { matchedOn: 'name', text: measure.name, weight: 1.2 }, + { matchedOn: 'description', text: measure.description, weight: 1 }, + { matchedOn: 'expr', text: measure.expr, weight: 0.9 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'sl_measure', + id: `${source.name}.${measure.name}`, + connectionId, + summary: measure.description ?? 
null, + snippet: cap200(measure.expr), + matchedOn: matched.matchedOn, + }, + }); + } + } + } + + if (hasKind(kinds, 'sl_dimension')) { + for (const column of source.columns) { + const description = resolveDescription(column.descriptions, { priority: DEFAULT_PRIORITY }); + const matched = bestField( + [ + { matchedOn: 'name', text: column.name, weight: 1.2 }, + { matchedOn: 'display', text: `${source.name}.${column.name}`, weight: 1.1 }, + { matchedOn: 'description', text: description, weight: 1 }, + { matchedOn: 'expr', text: column.expr, weight: 0.9 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'sl_dimension', + id: `${source.name}.${column.name}`, + connectionId, + summary: description, + snippet: cap200(`${column.name} (${column.type})`), + matchedOn: matched.matchedOn, + }, + }); + } + } + } + } + } + return records.sort((left, right) => right.rankScore - left.rankScore || left.ref.id.localeCompare(right.ref.id)); +} + +async function rawCandidates( + project: KtxLocalProject, + input: KtxDiscoverDataInput, + kinds: ReadonlySet, + terms: readonly string[], +): Promise { + const records: CandidateRecord[] = []; + for (const connectionId of configuredConnectionIds(project, input.connectionId)) { + const scan = await latestScan(project, connectionId); + if (!scan) { + continue; + } + for (const table of scan.tables) { + const ref = tableRef(table); + const display = displayForTable(ref); + const tableDescription = resolveDescription(table.descriptions, { priority: DEFAULT_PRIORITY }) ?? 
table.comment; + if (hasKind(kinds, 'table')) { + const matched = bestField( + [ + { matchedOn: 'name', text: table.name, weight: 1.2 }, + { matchedOn: 'display', text: display, weight: 1.1 }, + { matchedOn: 'description', text: tableDescription, weight: 1 }, + { matchedOn: 'comment', text: table.comment, weight: 1 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'table', + id: display, + connectionId, + tableRef: ref, + summary: tableDescription, + snippet: + matched.matchedOn === 'description' || matched.matchedOn === 'comment' + ? snippetAround(matched.text, terms) + : cap200(table.columns.slice(0, 5).map((column) => column.name).join(', ')), + matchedOn: matched.matchedOn, + }, + }); + } + } + + if (hasKind(kinds, 'column')) { + for (const column of table.columns) { + const columnDescription = resolveDescription(column.descriptions, { priority: DEFAULT_PRIORITY }) ?? column.comment; + const samples = (column.sampleValues ?? []).map((value) => String(value)).slice(0, 5); + const matched = bestField( + [ + { matchedOn: 'name', text: column.name, weight: 1.2 }, + { matchedOn: 'display', text: `${display}.${column.name}`, weight: 1.1 }, + { matchedOn: 'description', text: columnDescription, weight: 1 }, + { matchedOn: 'comment', text: column.comment, weight: 1 }, + { matchedOn: 'sample_value', text: samples.join(' '), weight: 1.3 }, + ], + terms, + ); + if (matched) { + records.push({ + rankScore: matched.score, + ref: { + kind: 'column', + id: `${display}.${column.name}`, + connectionId, + tableRef: ref, + columnName: column.name, + summary: columnDescription, + snippet: + matched.matchedOn === 'sample_value' + ? cap200(`${column.nativeType} - samples: ${samples.join(', ')}`) + : matched.matchedOn === 'description' || matched.matchedOn === 'comment' + ? 
snippetAround(matched.text, terms) + : cap200(column.nativeType), + matchedOn: matched.matchedOn, + }, + }); + } + } + } + } + } + return records.sort((left, right) => right.rankScore - left.rankScore || left.ref.id.localeCompare(right.ref.id)); +} + +function generator( + name: string, + candidates: CandidateRecord[], + refsByKey: Map>, +): SearchCandidateGenerator { + candidates.forEach((candidate) => + refsByKey.set(`${candidate.ref.kind}:${candidate.ref.connectionId ?? ''}:${candidate.ref.id}`, candidate.ref), + ); + return { + lane: name, + weight: 1, + async generate() { + return { + candidates: candidates.map((candidate, index) => ({ + id: `${candidate.ref.kind}:${candidate.ref.connectionId ?? ''}:${candidate.ref.id}`, + rank: index + 1, + rawScore: candidate.rankScore, + })), + }; + }, + }; +} + +function hydrate( + fused: FusedSearchCandidate[], + refsByKey: Map>, +): KtxDiscoverDataRef[] { + const maxScore = Math.max(...fused.map((candidate) => candidate.score), 0); + return fused + .map((candidate) => { + const ref = refsByKey.get(candidate.id); + if (!ref) { + return null; + } + return { + ...ref, + score: maxScore > 0 ? Number((candidate.score / maxScore).toFixed(6)) : 0, + }; + }) + .filter((result): result is KtxDiscoverDataRef => result !== null); +} + +export function createKtxDiscoverDataService( + project: KtxLocalProject, + options: KtxDiscoverDataServiceOptions = {}, +): { search(input: KtxDiscoverDataInput): Promise } { + return { + async search(input) { + const limit = Math.max(1, Math.min(input.limit ?? 15, 50)); + const query = input.query.trim(); + if (!query) { + return []; + } + const kinds = new Set(input.kinds ?? 
ALL_KINDS); + const terms = queryTerms(query); + const refsByKey = new Map>(); + const generators: SearchCandidateGenerator[] = []; + + if (hasKind(kinds, 'wiki')) { + generators.push(generator('wiki', await wikiCandidates(project, { ...input, limit }, options, terms), refsByKey)); + } + if (hasKind(kinds, 'sl_source') || hasKind(kinds, 'sl_measure') || hasKind(kinds, 'sl_dimension')) { + generators.push(generator('semantic_layer', await slCandidates(project, { ...input, limit }, kinds, terms), refsByKey)); + } + if (hasKind(kinds, 'table') || hasKind(kinds, 'column')) { + generators.push(generator('raw_schema', await rawCandidates(project, { ...input, limit }, kinds, terms), refsByKey)); + } + if (generators.length === 0) { + return []; + } + + const result = await new HybridSearchCore().search({ + queryText: query, + limit, + generators, + laneWeights: { wiki: 1, semantic_layer: 1, raw_schema: 1 }, + }); + return hydrate(result.results, refsByKey); + }, + }; +} diff --git a/packages/context/src/search/index.ts b/packages/context/src/search/index.ts index a62ae7bc..9cec3602 100644 --- a/packages/context/src/search/index.ts +++ b/packages/context/src/search/index.ts @@ -10,6 +10,15 @@ export { assertSearchBackendCapabilities, assertSearchBackendConformanceCase, } from './backend-conformance.js'; +export { createKtxDiscoverDataService } from './discover.js'; +export type { + KtxDiscoverDataInput, + KtxDiscoverDataKind, + KtxDiscoverDataMatchedOn, + KtxDiscoverDataRef, + KtxDiscoverDataResponse, + KtxDiscoverDataServiceOptions, +} from './discover.js'; export { HybridSearchCore } from './hybrid-search-core.js'; export { defaultLaneCandidatePoolLimit, normalizeSearchQuery } from './query.js'; export { diff --git a/packages/context/src/sl/dictionary-search.test.ts b/packages/context/src/sl/dictionary-search.test.ts new file mode 100644 index 00000000..7c3e2d1f --- /dev/null +++ b/packages/context/src/sl/dictionary-search.test.ts @@ -0,0 +1,228 @@ +import { mkdtemp, rm 
} from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import { initKtxProject, type KtxLocalProject } from '../project/index.js'; +import { createKtxDictionarySearchService } from './dictionary-search.js'; + +describe('createKtxDictionarySearchService', () => { + let tempDir: string; + let project: KtxLocalProject; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'ktx-dictionary-search-')); + project = await initKtxProject({ projectDir: join(tempDir, 'project') }); + project.config.connections.warehouse = { driver: 'postgres', url: 'env:DATABASE_URL' }; + project.config.connections.billing = { driver: 'postgres', url: 'env:BILLING_DATABASE_URL' }; + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + async function seedProfile(input: { + connectionId: string; + syncId: string; + columns: Record; + }): Promise { + await project.fileStore.writeFile( + `raw-sources/${input.connectionId}/live-database/${input.syncId}/enrichment/relationship-profile.json`, + `${JSON.stringify( + { + connectionId: input.connectionId, + driver: 'postgres', + sqlAvailable: true, + queryCount: 4, + tables: [], + columns: input.columns, + warnings: [], + }, + null, + 2, + )}\n`, + 'ktx', + 'ktx@example.com', + 'Seed relationship profile', + ); + } + + it('returns matches and non-authoritative misses across configured connections', async () => { + await seedProfile({ + connectionId: 'warehouse', + syncId: 'sync-1', + columns: { + 'orders.status': { + table: { catalog: null, db: 'public', name: 'orders' }, + column: 'status', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 3, + sampleValues: ['paid', 'refunded', 'pending'], + }, + }, + }); + await seedProfile({ + connectionId: 'billing', + syncId: 'sync-2', + columns: { + 'customers.name': { + table: { catalog: null, db: 'public', name: 
'customers' }, + column: 'name', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 4, + sampleValues: ['Acme Corp', 'Globex'], + }, + }, + }); + const service = createKtxDictionarySearchService(project); + + await expect(service.search({ values: ['PAID', 'missing'] })).resolves.toEqual({ + searched: [ + { + connectionId: 'billing', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 1, + syncId: 'sync-2', + profiledAt: null, + }, + status: 'ready', + }, + { + connectionId: 'warehouse', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 1, + syncId: 'sync-1', + profiledAt: null, + }, + status: 'ready', + }, + ], + results: [ + { + value: 'PAID', + matches: [ + { + connectionId: 'warehouse', + sourceName: 'orders', + columnName: 'status', + matchedValue: 'paid', + cardinality: 3, + }, + ], + misses: [{ connectionId: 'billing', reason: 'value_not_in_sample' }], + }, + { + value: 'missing', + matches: [], + misses: [ + { connectionId: 'billing', reason: 'value_not_in_sample' }, + { connectionId: 'warehouse', reason: 'value_not_in_sample' }, + ], + }, + ], + }); + }); + + it('distinguishes missing profile artifacts from profiles with no candidate columns', async () => { + await seedProfile({ + connectionId: 'billing', + syncId: 'sync-empty', + columns: { + 'events.id': { + table: { catalog: null, db: 'public', name: 'events' }, + column: 'id', + nativeType: 'integer', + normalizedType: 'integer', + distinctCount: 100, + sampleValues: [1, 2, 3], + }, + }, + }); + const service = createKtxDictionarySearchService(project); + + await expect(service.search({ values: ['Acme'] })).resolves.toEqual({ + searched: [ + { + connectionId: 'billing', + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 0, + syncId: 'sync-empty', + profiledAt: null, + }, + status: 'no_candidate_columns', + }, + { + connectionId: 'warehouse', + coverage: { + sampledRows: null, + valuesPerColumn: null, + 
profiledColumns: 0, + syncId: null, + profiledAt: null, + }, + status: 'no_profile_artifact', + }, + ], + results: [ + { + value: 'Acme', + matches: [], + misses: [ + { connectionId: 'billing', reason: 'no_candidate_columns' }, + { connectionId: 'warehouse', reason: 'no_profile_artifact' }, + ], + }, + ], + }); + }); + + it('scopes search to the requested connection', async () => { + await seedProfile({ + connectionId: 'warehouse', + syncId: 'sync-1', + columns: { + 'orders.status': { + table: { catalog: null, db: 'public', name: 'orders' }, + column: 'status', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 3, + sampleValues: ['paid'], + }, + }, + }); + await seedProfile({ + connectionId: 'billing', + syncId: 'sync-2', + columns: { + 'invoices.status': { + table: { catalog: null, db: 'public', name: 'invoices' }, + column: 'status', + nativeType: 'text', + normalizedType: 'string', + distinctCount: 2, + sampleValues: ['paid'], + }, + }, + }); + const service = createKtxDictionarySearchService(project); + + await expect(service.search({ connectionId: 'billing', values: ['paid'] })).resolves.toMatchObject({ + searched: [{ connectionId: 'billing', status: 'ready' }], + results: [ + { + value: 'paid', + matches: [{ connectionId: 'billing', sourceName: 'invoices', columnName: 'status', matchedValue: 'paid' }], + misses: [], + }, + ], + }); + }); +}); diff --git a/packages/context/src/sl/dictionary-search.ts b/packages/context/src/sl/dictionary-search.ts new file mode 100644 index 00000000..041b828d --- /dev/null +++ b/packages/context/src/sl/dictionary-search.ts @@ -0,0 +1,214 @@ +import type { KtxLocalProject } from '../project/index.js'; +import { loadLatestSlDictionaryEntries, type SlDictionaryEntry } from './sl-dictionary-profile.js'; + +export type KtxDictionarySearchStatus = 'ready' | 'no_profile_artifact' | 'no_candidate_columns'; +export type KtxDictionarySearchMissReason = 'no_profile_artifact' | 'no_candidate_columns' | 
'value_not_in_sample'; + +export interface KtxDictionarySearchInput { + values: string[]; + connectionId?: string; +} + +export interface KtxDictionarySearchCoverage { + sampledRows: number | null; + valuesPerColumn: number | null; + profiledColumns: number; + syncId: string | null; + profiledAt: string | null; +} + +export interface KtxDictionarySearchSearchedConnection { + connectionId: string; + coverage: KtxDictionarySearchCoverage; + status: KtxDictionarySearchStatus; +} + +export interface KtxDictionarySearchMatch { + connectionId: string; + sourceName: string; + columnName: string; + matchedValue: string; + cardinality: number | null; +} + +export interface KtxDictionarySearchMiss { + connectionId: string; + reason: KtxDictionarySearchMissReason; +} + +export interface KtxDictionarySearchValueResult { + value: string; + matches: KtxDictionarySearchMatch[]; + misses: KtxDictionarySearchMiss[]; +} + +export interface KtxDictionarySearchResponse { + searched: KtxDictionarySearchSearchedConnection[]; + results: KtxDictionarySearchValueResult[]; +} + +interface RelationshipProfileArtifact { + connectionId?: string; + profileSampleRows?: unknown; + sampleValuesPerColumn?: unknown; + profiledAt?: unknown; + extractedAt?: unknown; +} + +function uniqueSorted(values: Iterable): string[] { + return [...new Set([...values].filter((value) => value.trim().length > 0))].sort((left, right) => + left.localeCompare(right), + ); +} + +function latestProfileSyncId(path: string): string | null { + const parts = path.split('/'); + return parts.at(-3) ?? null; +} + +function optionalNumber(value: unknown): number | null { + return typeof value === 'number' && Number.isFinite(value) ? value : null; +} + +function optionalString(value: unknown): string | null { + return typeof value === 'string' && value.trim().length > 0 ? 
value : null; +} + +async function latestProfilePath(project: KtxLocalProject, connectionId: string): Promise { + const root = `raw-sources/${connectionId}/live-database`; + let files: string[]; + try { + files = (await project.fileStore.listFiles(root)).files; + } catch { + return null; + } + + return ( + files + .filter((path) => path.endsWith('/enrichment/relationship-profile.json')) + .sort((left, right) => left.localeCompare(right)) + .at(-1) ?? null + ); +} + +async function readProfile(project: KtxLocalProject, path: string): Promise { + const raw = await project.fileStore.readFile(path); + const parsed = JSON.parse(raw.content) as unknown; + return typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed) + ? (parsed as RelationshipProfileArtifact) + : {}; +} + +function profiledColumnCount(entries: readonly SlDictionaryEntry[]): number { + return new Set(entries.map((entry) => `${entry.sourceName}\u001f${entry.columnName}`)).size; +} + +async function searchedConnection( + project: KtxLocalProject, + connectionId: string, + entries: readonly SlDictionaryEntry[], +): Promise { + const path = await latestProfilePath(project, connectionId); + if (!path) { + return { + connectionId, + coverage: { + sampledRows: null, + valuesPerColumn: null, + profiledColumns: 0, + syncId: null, + profiledAt: null, + }, + status: 'no_profile_artifact', + }; + } + + const profile = await readProfile(project, path); + const count = profiledColumnCount(entries); + return { + connectionId, + coverage: { + sampledRows: optionalNumber(profile.profileSampleRows), + valuesPerColumn: optionalNumber(profile.sampleValuesPerColumn), + profiledColumns: count, + syncId: latestProfileSyncId(path), + profiledAt: optionalString(profile.profiledAt) ?? optionalString(profile.extractedAt), + }, + status: count > 0 ? 
'ready' : 'no_candidate_columns', + }; +} + +function entryMatchesValue(entry: SlDictionaryEntry, value: string): boolean { + return entry.value.toLowerCase().includes(value.toLowerCase()); +} + +function toMatch(entry: SlDictionaryEntry): KtxDictionarySearchMatch { + return { + connectionId: entry.connectionId, + sourceName: entry.sourceName, + columnName: entry.columnName, + matchedValue: entry.value, + cardinality: entry.cardinality, + }; +} + +function sortMatches(matches: KtxDictionarySearchMatch[]): KtxDictionarySearchMatch[] { + return matches.sort( + (left, right) => + left.connectionId.localeCompare(right.connectionId) || + left.sourceName.localeCompare(right.sourceName) || + left.columnName.localeCompare(right.columnName) || + left.matchedValue.localeCompare(right.matchedValue), + ); +} + +function missReason(status: KtxDictionarySearchStatus): KtxDictionarySearchMissReason { + return status === 'ready' ? 'value_not_in_sample' : status; +} + +export function createKtxDictionarySearchService(project: KtxLocalProject): { + search(input: KtxDictionarySearchInput): Promise; +} { + return { + async search(input) { + const connectionIds = input.connectionId + ? [input.connectionId] + : uniqueSorted(Object.keys(project.config.connections)); + const entries = await loadLatestSlDictionaryEntries(project, connectionIds); + const entriesByConnection = new Map(); + for (const connectionId of connectionIds) { + entriesByConnection.set( + connectionId, + entries.filter((entry) => entry.connectionId === connectionId), + ); + } + + const searched = ( + await Promise.all( + connectionIds.map((connectionId) => + searchedConnection(project, connectionId, entriesByConnection.get(connectionId) ?? 
[]), + ), + ) + ).sort((left, right) => left.connectionId.localeCompare(right.connectionId)); + const searchedByConnection = new Map(searched.map((connection) => [connection.connectionId, connection])); + + return { + searched, + results: input.values.map((value) => { + const matches = sortMatches(entries.filter((entry) => entryMatchesValue(entry, value)).map(toMatch)); + const matchedConnections = new Set(matches.map((match) => match.connectionId)); + return { + value, + matches, + misses: searched + .filter((connection) => !matchedConnections.has(connection.connectionId)) + .map((connection) => ({ + connectionId: connection.connectionId, + reason: missReason(searchedByConnection.get(connection.connectionId)?.status ?? 'no_profile_artifact'), + })), + }; + }), + }; + }, + }; +} diff --git a/packages/context/src/sl/index.ts b/packages/context/src/sl/index.ts index 1a0167cb..600a5a93 100644 --- a/packages/context/src/sl/index.ts +++ b/packages/context/src/sl/index.ts @@ -25,6 +25,18 @@ export { } from './semantic-layer.service.js'; export { loadLatestSlDictionaryEntries } from './sl-dictionary-profile.js'; export type { SlDictionaryEntry } from './sl-dictionary-profile.js'; +export { createKtxDictionarySearchService } from './dictionary-search.js'; +export type { + KtxDictionarySearchCoverage, + KtxDictionarySearchInput, + KtxDictionarySearchMatch, + KtxDictionarySearchMiss, + KtxDictionarySearchMissReason, + KtxDictionarySearchResponse, + KtxDictionarySearchSearchedConnection, + KtxDictionarySearchStatus, + KtxDictionarySearchValueResult, +} from './dictionary-search.js'; export { buildSemanticLayerSourceSearchText, SlSearchService } from './sl-search.service.js'; export { SqliteSlSourcesIndex, type SqliteSlSourcesIndexOptions } from './sqlite-sl-sources-index.js'; export * from './local-sl.js'; diff --git a/packages/context/src/sql-analysis/http-sql-analysis-port.test.ts b/packages/context/src/sql-analysis/http-sql-analysis-port.test.ts index 6e22fd47..2d759369 
100644 --- a/packages/context/src/sql-analysis/http-sql-analysis-port.test.ts +++ b/packages/context/src/sql-analysis/http-sql-analysis-port.test.ts @@ -108,6 +108,44 @@ describe('createHttpSqlAnalysisPort', () => { }); }); + it('maps read-only SQL validation responses', async () => { + const requests: Array<{ path: string; payload: Record }> = []; + const port = createHttpSqlAnalysisPort({ + baseUrl: 'http://127.0.0.1:8765', + requestJson: async (path, payload) => { + requests.push({ path, payload }); + return { ok: false, error: 'SQL contains read/write operation: Insert' }; + }, + }); + + await expect( + port.validateReadOnly('with x as (insert into t values (1)) select * from x', 'postgres'), + ).resolves.toEqual({ + ok: false, + error: 'SQL contains read/write operation: Insert', + }); + expect(requests).toEqual([ + { + path: '/sql/validate-read-only', + payload: { + dialect: 'postgres', + sql: 'with x as (insert into t values (1)) select * from x', + }, + }, + ]); + }); + + it('rejects malformed read-only validation responses', async () => { + const port = createHttpSqlAnalysisPort({ + baseUrl: 'http://127.0.0.1:8765', + requestJson: async () => ({ ok: 'yes' }), + }); + + await expect(port.validateReadOnly('select 1', 'postgres')).rejects.toThrow( + 'sql analysis response is missing boolean field ok', + ); + }); + it('rejects malformed SQL batch responses instead of inventing defaults', async () => { const requestJson = vi.fn(async () => ({ results: { diff --git a/packages/context/src/sql-analysis/http-sql-analysis-port.ts b/packages/context/src/sql-analysis/http-sql-analysis-port.ts index 9da37556..238b8863 100644 --- a/packages/context/src/sql-analysis/http-sql-analysis-port.ts +++ b/packages/context/src/sql-analysis/http-sql-analysis-port.ts @@ -9,6 +9,7 @@ import type { SqlAnalysisLiteralSlot, SqlAnalysisLiteralSlotType, SqlAnalysisPort, + SqlReadOnlyValidationResult, } from './ports.js'; export type KtxSqlAnalysisHttpJsonRunner = ( @@ -96,6 +97,14 @@ 
function requiredStringArray(raw: Record, field: string): strin return value; } +function requiredBoolean(raw: Record, field: string): boolean { + const value = raw[field]; + if (typeof value !== 'boolean') { + throw new Error(`sql analysis response is missing boolean field ${field}`); + } + return value; +} + function requiredObject(raw: Record, field: string): Record { const value = raw[field]; if (!value || typeof value !== 'object' || Array.isArray(value)) { @@ -187,6 +196,14 @@ function mapBatchResponse(raw: Record): Map): SqlReadOnlyValidationResult { + const error = optionalString(raw, 'error'); + return { + ok: requiredBoolean(raw, 'ok'), + ...(error !== undefined ? { error } : {}), + }; +} + export function createHttpSqlAnalysisPort(options: HttpSqlAnalysisPortOptions): SqlAnalysisPort { const requestJson = options.requestJson ?? postJson(options.baseUrl); @@ -205,5 +222,12 @@ export function createHttpSqlAnalysisPort(options: HttpSqlAnalysisPortOptions): }); return mapBatchResponse(raw); }, + async validateReadOnly(sql: string, dialect: SqlAnalysisDialect) { + const raw = await requestJson('/sql/validate-read-only', { + dialect, + sql, + }); + return mapReadOnlyValidation(raw); + }, }; } diff --git a/packages/context/src/sql-analysis/index.ts b/packages/context/src/sql-analysis/index.ts index 8338b822..c01a8aaa 100644 --- a/packages/context/src/sql-analysis/index.ts +++ b/packages/context/src/sql-analysis/index.ts @@ -9,4 +9,5 @@ export type { SqlAnalysisLiteralSlot, SqlAnalysisLiteralSlotType, SqlAnalysisPort, + SqlReadOnlyValidationResult, } from './ports.js'; diff --git a/packages/context/src/sql-analysis/ports.ts b/packages/context/src/sql-analysis/ports.ts index 3361a7c4..891515b7 100644 --- a/packages/context/src/sql-analysis/ports.ts +++ b/packages/context/src/sql-analysis/ports.ts @@ -38,10 +38,16 @@ export interface SqlAnalysisBatchResult { error?: string | null; } +export interface SqlReadOnlyValidationResult { + ok: boolean; + error?: string | 
null; +} + export interface SqlAnalysisPort { analyzeForFingerprint(sql: string, dialect: SqlAnalysisDialect): Promise; analyzeBatch( items: SqlAnalysisBatchItem[], dialect: SqlAnalysisDialect, ): Promise>; + validateReadOnly(sql: string, dialect: SqlAnalysisDialect): Promise; } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9c66ab51..22e0c035 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -106,6 +106,9 @@ importers: '@ktx/llm': specifier: workspace:* version: link:../llm + '@modelcontextprotocol/sdk': + specifier: ^1.29.0 + version: 1.29.0(zod@4.4.3) commander: specifier: 14.0.3 version: 14.0.3 diff --git a/python/ktx-daemon/src/ktx_daemon/app.py b/python/ktx-daemon/src/ktx_daemon/app.py index 76325719..0d7016dd 100644 --- a/python/ktx-daemon/src/ktx_daemon/app.py +++ b/python/ktx-daemon/src/ktx_daemon/app.py @@ -51,7 +51,10 @@ from ktx_daemon.source_generation import ( from ktx_daemon.sql_analysis import ( AnalyzeSqlBatchRequest, AnalyzeSqlBatchResponse, + ValidateReadOnlySqlRequest, + ValidateReadOnlySqlResponse, analyze_sql_batch_response, + validate_read_only_sql_response, ) from ktx_daemon.table_identifier import ( ParseTableIdentifierBatchRequest, @@ -198,6 +201,19 @@ def create_app( detail=f"Table identifier parsing failed: {error}", ) from error + @app.post("/sql/validate-read-only", response_model=ValidateReadOnlySqlResponse) + async def sql_validate_read_only( + request: ValidateReadOnlySqlRequest, + ) -> ValidateReadOnlySqlResponse: + try: + return validate_read_only_sql_response(request) + except Exception as error: + logger.exception("SQL read-only validation failed: %s", error) + raise HTTPException( + status_code=500, + detail=f"SQL read-only validation failed: {error}", + ) from error + @app.post("/sql/analyze-batch", response_model=AnalyzeSqlBatchResponse) async def sql_analyze_batch( request: AnalyzeSqlBatchRequest, diff --git a/python/ktx-daemon/src/ktx_daemon/sql_analysis.py b/python/ktx-daemon/src/ktx_daemon/sql_analysis.py index 
d5deb240..ebecf83c 100644 --- a/python/ktx-daemon/src/ktx_daemon/sql_analysis.py +++ b/python/ktx-daemon/src/ktx_daemon/sql_analysis.py @@ -34,6 +34,46 @@ class AnalyzeSqlBatchResponse(BaseModel): results: dict[str, AnalyzeSqlBatchResult] +class ValidateReadOnlySqlRequest(BaseModel): + dialect: str + sql: str + + +class ValidateReadOnlySqlResponse(BaseModel): + ok: bool + error: str | None = None + + +_READ_ONLY_ROOT_TYPES = (exp.Select, exp.Union) +_READ_WRITE_NODE_TYPES = ( + exp.Alter, + exp.Analyze, + exp.Cache, + exp.Command, + exp.Commit, + exp.Copy, + exp.Create, + exp.Delete, + exp.Describe, + exp.Drop, + exp.Execute, + exp.Grant, + exp.Insert, + exp.Merge, + exp.Pragma, + exp.Refresh, + exp.Revoke, + exp.Rollback, + exp.Set, + exp.Show, + exp.Transaction, + exp.TruncateTable, + exp.Uncache, + exp.Update, + exp.Use, +) + + def _ordered_unique(values: list[str]) -> list[str]: seen: set[str] = set() result: list[str] = [] @@ -137,6 +177,42 @@ def _analyze_payload( return _analyze_one(item_id, sql, dialect) +def validate_read_only_sql_response( + request: ValidateReadOnlySqlRequest, +) -> ValidateReadOnlySqlResponse: + try: + statements = sqlglot.parse(request.sql, read=request.dialect) + except sqlglot.errors.SqlglotError as exc: + return ValidateReadOnlySqlResponse(ok=False, error=f"Invalid expression: {exc}") + + if len(statements) != 1: + return ValidateReadOnlySqlResponse( + ok=False, + error="Only one SQL statement can be executed.", + ) + + tree = statements[0] + if tree is None: + return ValidateReadOnlySqlResponse( + ok=False, + error="SQL did not parse to a statement.", + ) + if not isinstance(tree, _READ_ONLY_ROOT_TYPES): + return ValidateReadOnlySqlResponse( + ok=False, + error=f"SQL contains read/write operation: {type(tree).__name__}", + ) + + for node in tree.walk(): + if isinstance(node, _READ_WRITE_NODE_TYPES): + return ValidateReadOnlySqlResponse( + ok=False, + error=f"SQL contains read/write operation: {type(node).__name__}", + ) + + return 
ValidateReadOnlySqlResponse(ok=True, error=None) + + def _worker_count(request: AnalyzeSqlBatchRequest) -> int: if len(request.items) <= 1: return 1 diff --git a/python/ktx-daemon/tests/test_app.py b/python/ktx-daemon/tests/test_app.py index eb2c3d68..3c1ce18d 100644 --- a/python/ktx-daemon/tests/test_app.py +++ b/python/ktx-daemon/tests/test_app.py @@ -280,6 +280,30 @@ def test_sql_parse_table_identifier_endpoint() -> None: assert body["results"]["template"]["reason"] == "looker_template_unresolved" +def test_sql_validate_read_only_endpoint() -> None: + client = TestClient(create_app()) + + ok_response = client.post( + "/sql/validate-read-only", + json={"dialect": "postgres", "sql": "select * from public.orders"}, + ) + bad_response = client.post( + "/sql/validate-read-only", + json={ + "dialect": "postgres", + "sql": "with x as (insert into audit.events values (1) returning *) select * from x", + }, + ) + + assert ok_response.status_code == 200 + assert ok_response.json() == {"ok": True, "error": None} + assert bad_response.status_code == 200 + assert bad_response.json() == { + "ok": False, + "error": "SQL contains read/write operation: Insert", + } + + def test_sql_analyze_batch_endpoint_returns_per_item_results() -> None: client = TestClient(create_app()) diff --git a/python/ktx-daemon/tests/test_sql_analysis.py b/python/ktx-daemon/tests/test_sql_analysis.py index c1fc35f8..855d16fd 100644 --- a/python/ktx-daemon/tests/test_sql_analysis.py +++ b/python/ktx-daemon/tests/test_sql_analysis.py @@ -3,8 +3,10 @@ from __future__ import annotations from ktx_daemon.sql_analysis import ( AnalyzeSqlBatchItem, AnalyzeSqlBatchRequest, + ValidateReadOnlySqlRequest, _columns_from_nodes, analyze_sql_batch_response, + validate_read_only_sql_response, ) @@ -56,3 +58,74 @@ def test_analyze_sql_batch_returns_per_item_parse_errors() -> None: def test_columns_from_nodes_ignores_non_expression_clause_values() -> None: assert _columns_from_nodes([True, False, None]) == [] + + +def 
test_validate_read_only_sql_accepts_select_and_with_queries() -> None: + select_response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest( + dialect="postgres", + sql="select id, status from public.orders where status = 'paid'", + ) + ) + with_response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest( + dialect="postgres", + sql=( + "with paid as (select * from public.orders where status = 'paid') " + "select count(*) from paid" + ), + ) + ) + + assert select_response.ok is True + assert select_response.error is None + assert with_response.ok is True + assert with_response.error is None + + +def test_validate_read_only_sql_rejects_cte_dml() -> None: + response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest( + dialect="postgres", + sql="with x as (insert into audit.events values (1) returning *) select * from x", + ) + ) + + assert response.ok is False + assert response.error == "SQL contains read/write operation: Insert" + + +def test_validate_read_only_sql_rejects_multi_statement_payloads() -> None: + response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest( + dialect="postgres", + sql="select * from public.orders; delete from public.orders", + ) + ) + + assert response.ok is False + assert response.error == "Only one SQL statement can be executed." 
+ + +def test_validate_read_only_sql_rejects_commands_and_pragmas() -> None: + command_response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest(dialect="postgres", sql="call refresh_stats()") + ) + pragma_response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest(dialect="sqlite", sql="pragma table_info(users)") + ) + + assert command_response.ok is False + assert command_response.error == "SQL contains read/write operation: Command" + assert pragma_response.ok is False + assert pragma_response.error == "SQL contains read/write operation: Pragma" + + +def test_validate_read_only_sql_reports_parse_errors() -> None: + response = validate_read_only_sql_response( + ValidateReadOnlySqlRequest(dialect="postgres", sql="select * from where") + ) + + assert response.ok is False + assert response.error is not None + assert "Invalid expression" in response.error diff --git a/scripts/conductor-scripts.test.mjs b/scripts/conductor-scripts.test.mjs index 5a84194c..fa97a501 100644 --- a/scripts/conductor-scripts.test.mjs +++ b/scripts/conductor-scripts.test.mjs @@ -28,6 +28,7 @@ describe('Conductor workspace scripts', () => { assert.match(setupScript, /pnpm install --frozen-lockfile --prefer-offline/); assert.match(setupScript, /pnpm run native:rebuild/); assert.match(setupScript, /pnpm run build/); + assert.match(setupScript, /pnpm run artifacts:build/); assert.match(setupScript, /packages\/cli\/dist\/bin\.js status --no-input/); assert.doesNotMatch(setupScript, /scripts\/conductor\//); }); diff --git a/scripts/conductor-setup.sh b/scripts/conductor-setup.sh index d5058f07..6f270508 100755 --- a/scripts/conductor-setup.sh +++ b/scripts/conductor-setup.sh @@ -136,6 +136,9 @@ pnpm run native:rebuild echo "Building KTX packages..." pnpm run build +echo "Building KTX runtime artifacts..." +pnpm run artifacts:build + echo "Running KTX setup doctor..." node packages/cli/dist/bin.js status --no-input