feat(cli): add `ktx dev schema` command to emit a JSON Schema for ktx.yaml

Annotates the Zod config schema with `.describe()` text on every field, and adds `generateKtxProjectConfigJsonSchema()` plus a `ktx dev schema` command that prints a draft-07 JSON Schema for ktx.yaml to stdout (or writes it to a file with `--output <file>`) for use by editors and LLM agents.
2026-07-25 12:01:03 +02:00 · 2026-05-14 16:15:20 +02:00 · 2026-05-14 16:15:20 +02:00 · bd4b41f681
commit bd4b41f681
parent b3be54e3fa
4 changed files with 299 additions and 110 deletions
--- a/packages/cli/src/dev.ts
+++ b/packages/cli/src/dev.ts
@ -46,5 +46,23 @@ export function registerDevCommands(program: Command, context: KtxCliCommandCont
      },
    );

+  dev // `schema` subcommand: emits the ktx.yaml JSON Schema to stdout, or to a file when --output is given
+    .command('schema')
+    .description('Print a JSON Schema describing ktx.yaml (for editors and LLM agents)')
+    .option('--output <file>', 'Write the schema to a file instead of stdout')
+    .action(async (options: { output?: string }) => {
+      const { generateKtxProjectConfigJsonSchema } = await import('@ktx/context/project'); // dynamic import: loaded only when this command actually runs
+      const json = `${JSON.stringify(generateKtxProjectConfigJsonSchema(), null, 2)}\n`; // pretty-printed with 2-space indent plus a trailing newline
+      if (options.output) {
+        const { writeFile } = await import('node:fs/promises');
+        const target = resolve(options.output); // NOTE(review): `resolve` is not imported in this hunk; presumably node:path's resolve from the file header (confirm)
+        await writeFile(target, json, 'utf8');
+        context.io.stdout.write(`Wrote ${target}\n`); // file mode: stdout gets only the confirmation line, not the schema
+      } else {
+        context.io.stdout.write(json);
+      }
+      context.setExitCode(0); // not reached if any await above rejects
+    });
+
  registerRuntimeCommands(dev, context);
 }
--- a/packages/context/src/project/config.test.ts
+++ b/packages/context/src/project/config.test.ts
@ -1,6 +1,7 @@
 import { describe, expect, it } from 'vitest';
 import {
  buildDefaultKtxProjectConfig,
+  generateKtxProjectConfigJsonSchema,
  parseKtxProjectConfig,
  serializeKtxProjectConfig,
  validateKtxProjectConfig,
@ -485,3 +486,49 @@ scan:
    });
  });
 });
+
+describe('generateKtxProjectConfigJsonSchema', () => {
+  const schema = generateKtxProjectConfigJsonSchema(); // built once in the describe body and shared by every test below
+
+  it('emits draft-07 metadata', () => {
+    expect(schema.$schema).toBe('http://json-schema.org/draft-07/schema#');
+    expect(schema.$id).toBe('https://ktx.dev/schemas/ktx-project-config.json');
+    expect(schema.title).toBe('ktx.yaml');
+    expect(schema.type).toBe('object');
+  });
+
+  it('exposes every top-level ktx.yaml section under properties', () => {
+    const properties = schema.properties as Record<string, unknown>;
+    expect(Object.keys(properties).sort()).toEqual( // both sides are sorted, so key order in the emitted schema does not matter
+      ['agent', 'connections', 'ingest', 'llm', 'memory', 'project', 'scan', 'setup', 'storage'].sort(),
+    );
+  });
+
+  it('marks "project" as required', () => {
+    expect(schema.required).toEqual(expect.arrayContaining(['project'])); // arrayContaining: additional required keys would still pass
+  });
+
+  it('carries .describe() text on top-level fields', () => {
+    const properties = schema.properties as Record<string, { description?: string }>;
+    expect(properties.project?.description).toMatch(/Project identifier/);
+    expect(properties.llm?.description).toMatch(/LLM/);
+    expect(properties.scan?.description).toMatch(/Schema-scan/);
+  });
+
+  it('propagates enum values through to nested fields', () => {
+    const llm = (schema.properties as Record<string, { properties?: Record<string, unknown> }>).llm;
+    const provider = llm?.properties?.provider as { properties?: Record<string, unknown> };
+    const backend = provider?.properties?.backend as { enum?: readonly string[] };
+    expect(backend?.enum).toEqual(['none', 'anthropic', 'vertex', 'gateway']); // toEqual on an array: the exact order is asserted
+
+    const storage = (schema.properties as Record<string, { properties?: Record<string, unknown> }>).storage;
+    const state = storage?.properties?.state as { enum?: readonly string[] };
+    expect(state?.enum).toEqual(['sqlite', 'postgres']); // order-sensitive as well
+  });
+
+  it('carries descriptions on deeply nested leaves', () => {
+    const scan = (schema.properties as Record<string, { properties?: Record<string, unknown> }>).scan;
+    const relationships = scan?.properties?.relationships as { properties?: Record<string, { description?: string }> };
+    expect(relationships?.properties?.acceptThreshold?.description).toMatch(/auto-accepted/);
+  });
+});
--- a/packages/context/src/project/config.ts
+++ b/packages/context/src/project/config.ts
@ -19,137 +19,247 @@ const DEPRECATED_KEY_HINTS: Record<string, string> = {
  'scan.enrichment.embeddings.provider': 'use scan.enrichment.embeddings.backend',
 };

-const apiCredentialsSchema = z.strictObject({
-  api_key: z.string().min(1).optional(),
-  base_url: z.string().min(1).optional(),
-});
+const apiCredentialsSchema = z
+  .strictObject({
+    api_key: z.string().min(1).optional().describe('API key for the provider. Read from this value or the provider-specific environment variable.'),
+    base_url: z.string().min(1).optional().describe('Override the provider\'s default API base URL (e.g. a proxy or self-hosted gateway).'),
+  })
+  .describe('API credentials block: optional key and base URL for an LLM or embedding provider.');

-const vertexProviderSchema = z.strictObject({
-  project: z.string().min(1).optional(),
-  location: z.string().default(''),
-});
+const vertexProviderSchema = z
+  .strictObject({
+    project: z.string().min(1).optional().describe('Google Cloud project ID hosting the Vertex AI endpoint.'),
+    location: z.string().default('').describe('Vertex AI region (e.g. "us-east5"). Empty string falls back to the SDK default.'),
+  })
+  .describe('Google Vertex AI provider configuration.');

-const sentenceTransformersSchema = z.strictObject({
-  base_url: z.string().default(''),
-  pathPrefix: z.string().optional(),
-});
+const sentenceTransformersSchema = z
+  .strictObject({
+    base_url: z.string().default('').describe('Base URL of the sentence-transformers HTTP server. Empty string uses the managed local runtime.'),
+    pathPrefix: z.string().optional().describe('Optional URL path prefix prepended to embedding requests.'),
+  })
+  .describe('Sentence-transformers embedding server configuration.');

-const llmProviderSchema = z.strictObject({
-  backend: z.enum(KTX_LLM_BACKENDS).default('none'),
-  vertex: vertexProviderSchema.optional(),
-  anthropic: apiCredentialsSchema.optional(),
-  gateway: apiCredentialsSchema.optional(),
-});
+const llmProviderSchema = z
+  .strictObject({
+    backend: z
+      .enum(KTX_LLM_BACKENDS)
+      .default('none')
+      .describe('LLM provider backend. "none" disables LLM features; "anthropic" / "vertex" / "gateway" require the matching nested credentials block.'),
+    vertex: vertexProviderSchema.optional().describe('Vertex AI credentials, used when backend is "vertex".'),
+    anthropic: apiCredentialsSchema.optional().describe('Anthropic API credentials, used when backend is "anthropic".'),
+    gateway: apiCredentialsSchema.optional().describe('AI Gateway credentials, used when backend is "gateway".'),
+  })
+  .describe('LLM provider selection and credentials.');

-const promptCachingSchema = z.strictObject({
-  enabled: z.boolean().optional(),
-  systemTtl: z.enum(KTX_PROMPT_CACHE_TTLS).optional(),
-  toolsTtl: z.enum(KTX_PROMPT_CACHE_TTLS).optional(),
-  historyTtl: z.enum(KTX_PROMPT_CACHE_TTLS).optional(),
-  vertexFallbackTo5m: z.boolean().optional(),
-});
+const promptCachingSchema = z
+  .strictObject({
+    enabled: z.boolean().optional().describe('Master switch for Anthropic-style prompt caching. When omitted, the backend\'s default applies.'),
+    systemTtl: z.enum(KTX_PROMPT_CACHE_TTLS).optional().describe('Cache TTL for the system prompt segment ("5m" or "1h").'),
+    toolsTtl: z.enum(KTX_PROMPT_CACHE_TTLS).optional().describe('Cache TTL for the tools/schema segment ("5m" or "1h").'),
+    historyTtl: z.enum(KTX_PROMPT_CACHE_TTLS).optional().describe('Cache TTL for conversation-history cache breakpoints ("5m" or "1h").'),
+    vertexFallbackTo5m: z.boolean().optional().describe('When true, transparently downgrade 1h TTLs to 5m on Vertex, which does not support 1h caching.'),
+  })
+  .describe('Prompt-caching tunables for Anthropic-compatible providers.');

-const llmSchema = z.strictObject({
-  provider: llmProviderSchema.prefault({}),
-  models: z.partialRecord(z.enum(KTX_MODEL_ROLES), z.string().min(1)).default({}),
-  promptCaching: promptCachingSchema.optional(),
-});
+const llmSchema = z
+  .strictObject({
+    provider: llmProviderSchema.prefault({}).describe('LLM provider backend and credentials.'),
+    models: z
+      .partialRecord(z.enum(KTX_MODEL_ROLES), z.string().min(1))
+      .default({})
+      .describe('Per-role model overrides keyed by KTX model role (e.g. "default", "triage"). Values are provider-specific model identifiers.'),
+    promptCaching: promptCachingSchema.optional().describe('Optional prompt-caching tunables.'),
+  })
+  .describe('LLM provider, per-role model overrides, and prompt-caching tunables.');

-const embeddingSchema = z.strictObject({
-  backend: z.enum(KTX_EMBEDDING_BACKENDS).default('deterministic'),
-  model: z.string().min(1).optional(),
-  dimensions: z.int().positive().default(8),
-  openai: apiCredentialsSchema.optional(),
-  sentenceTransformers: sentenceTransformersSchema.optional(),
-  batchSize: z.int().positive().optional(),
-});
+const embeddingSchema = z
+  .strictObject({
+    backend: z
+      .enum(KTX_EMBEDDING_BACKENDS)
+      .default('deterministic')
+      .describe('Embedding backend. "deterministic" is a built-in hash-based vector for offline use; "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'),
+    model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small"). Ignored by the "deterministic" backend.'),
+    dimensions: z.int().positive().default(8).describe('Embedding vector dimensionality. Must match the chosen model when using a real provider.'),
+    openai: apiCredentialsSchema.optional().describe('OpenAI credentials, used when backend is "openai".'),
+    sentenceTransformers: sentenceTransformersSchema.optional().describe('Sentence-transformers server config, used when backend is "sentence-transformers".'),
+    batchSize: z.int().positive().optional().describe('Number of texts per embedding API call. Omit to use the backend default.'),
+  })
+  .describe('Embedding backend, model, and provider credentials.');

-const workUnitsSchema = z.strictObject({
-  stepBudget: z.int().positive().default(40),
-  maxConcurrency: z.int().positive().default(1),
-  failureMode: z.enum(KTX_WORK_UNIT_FAILURE_MODES).default('continue'),
-});
+const workUnitsSchema = z
+  .strictObject({
+    stepBudget: z.int().positive().default(40).describe('Maximum number of agent steps allowed per work unit before it is force-terminated.'),
+    maxConcurrency: z.int().positive().default(1).describe('Maximum number of work units run concurrently during ingest.'),
+    failureMode: z
+      .enum(KTX_WORK_UNIT_FAILURE_MODES)
+      .default('continue')
+      .describe('Behavior when a work unit fails: "abort" stops the whole ingest run; "continue" records the failure and keeps going.'),
+  })
+  .describe('Concurrency and failure handling for ingest work units.');

-const ingestSchema = z.strictObject({
-  adapters: z.array(z.string().min(1)).default([]),
-  embeddings: embeddingSchema.prefault({ backend: 'deterministic', model: 'deterministic' }),
-  workUnits: workUnitsSchema.prefault({}),
-});
+const ingestSchema = z
+  .strictObject({
+    adapters: z
+      .array(z.string().min(1))
+      .default([])
+      .describe('Ingest adapter identifiers to run (e.g. "metabase", "looker", "historic-sql"). Empty array means no adapters are run.'),
+    embeddings: embeddingSchema
+      .prefault({ backend: 'deterministic', model: 'deterministic' })
+      .describe('Embedding configuration used when ingest adapters need to embed documents.'),
+    workUnits: workUnitsSchema.prefault({}).describe('Concurrency and failure handling for ingest work units.'),
+  })
+  .describe('Ingest pipeline configuration: adapters, embeddings, and work-unit policy.');

-const scanEnrichmentSchema = z.strictObject({
-  mode: z.enum(KTX_ENRICHMENT_MODES).default('none'),
-  embeddings: embeddingSchema.optional(),
-});
+const scanEnrichmentSchema = z
+  .strictObject({
+    mode: z
+      .enum(KTX_ENRICHMENT_MODES)
+      .default('none')
+      .describe('Column/table enrichment mode. "none" disables enrichment; "deterministic" uses local heuristics; "llm" calls the configured LLM provider.'),
+    embeddings: embeddingSchema.optional().describe('Optional embedding override for enrichment-time vectorization. Falls back to ingest.embeddings when omitted.'),
+  })
+  .describe('Schema-scan enrichment: how columns and tables are described.');

-const scanRelationshipsSchema = z.strictObject({
-  enabled: z.boolean().default(true),
-  llmProposals: z.boolean().default(true),
-  validationRequiredForManifest: z.boolean().default(true),
-  acceptThreshold: z.number().min(0).max(1).default(0.85),
-  reviewThreshold: z.number().min(0).max(1).default(0.55),
-  maxLlmTablesPerBatch: z.int().positive().default(40),
-  maxCandidatesPerColumn: z.int().positive().default(25),
-  profileSampleRows: z.int().positive().default(10000),
-  validationConcurrency: z.int().positive().default(4),
-  validationBudget: z.union([z.literal('all'), z.int().nonnegative()]).optional(),
-});
+const scanRelationshipsSchema = z
+  .strictObject({
+    enabled: z.boolean().default(true).describe('Master switch for relationship discovery during scan.'),
+    llmProposals: z.boolean().default(true).describe('When true, propose relationships using the configured LLM in addition to deterministic candidates.'),
+    validationRequiredForManifest: z
+      .boolean()
+      .default(true)
+      .describe('When true, only relationships that pass database-side validation are written to the manifest.'),
+    acceptThreshold: z
+      .number()
+      .min(0)
+      .max(1)
+      .default(0.85)
+      .describe('Confidence score (0–1) at or above which an LLM-proposed relationship is auto-accepted into the manifest.'),
+    reviewThreshold: z
+      .number()
+      .min(0)
+      .max(1)
+      .default(0.55)
+      .describe('Confidence score (0–1) at or above which a proposal is surfaced for human review (but not auto-accepted).'),
+    maxLlmTablesPerBatch: z
+      .int()
+      .positive()
+      .default(40)
+      .describe('Maximum number of tables included in a single LLM relationship-proposal batch.'),
+    maxCandidatesPerColumn: z
+      .int()
+      .positive()
+      .default(25)
+      .describe('Maximum number of candidate join partners considered per column during relationship discovery.'),
+    profileSampleRows: z.int().positive().default(10000).describe('Number of rows sampled per table when profiling values for relationship inference.'),
+    validationConcurrency: z.int().positive().default(4).describe('Number of relationship validation queries run in parallel against the database.'),
+    validationBudget: z
+      .union([z.literal('all'), z.int().nonnegative()])
+      .optional()
+      .describe('Cap on validation queries per scan run. Use "all" for unlimited, an integer for a hard cap, or omit for the runtime default.'),
+  })
+  .describe('Schema-scan relationship discovery and validation tunables.');

-const scanSchema = z.strictObject({
-  enrichment: scanEnrichmentSchema.prefault({}),
-  relationships: scanRelationshipsSchema.prefault({}),
-});
+const scanSchema = z
+  .strictObject({
+    enrichment: scanEnrichmentSchema.prefault({}).describe('Column/table enrichment configuration.'),
+    relationships: scanRelationshipsSchema.prefault({}).describe('Relationship discovery and validation configuration.'),
+  })
+  .describe('Schema-scan configuration: enrichment and relationship discovery.');

 const setupSchema = z
  .strictObject({
-    database_connection_ids: z.array(z.string().min(1)).default([]),
-    completed_steps: z.unknown().optional(),
+    database_connection_ids: z
+      .array(z.string().min(1))
+      .default([])
+      .describe('Connection IDs (keys of the top-level `connections` map) that the setup wizard treats as the project\'s primary databases.'),
+    completed_steps: z
+      .unknown()
+      .optional()
+      .describe('Deprecated. Accepted for backward compatibility but ignored; KTX no longer tracks setup progress here.'),
  })
-  .transform(({ database_connection_ids }) => ({ database_connection_ids }))
+  .transform(({ database_connection_ids }) => ({ database_connection_ids })) // keeps only database_connection_ids, so completed_steps is dropped from the parsed value
+  .describe('Setup-wizard state captured during `ktx setup`.');

-const storageGitSchema = z.strictObject({
-  auto_commit: z.boolean().default(true),
-  author: z.string().min(1).default('ktx <ktx@example.com>'),
-});
+const storageGitSchema = z
+  .strictObject({
+    auto_commit: z.boolean().default(true).describe('When true, KTX automatically commits state changes to the local Git-backed store.'),
+    author: z
+      .string()
+      .min(1)
+      .default('ktx <ktx@example.com>')
+      .describe('Git author identity used for auto-commits, in standard "Name <email>" form.'),
+  })
+  .describe('Git-backed storage commit policy.');

-const storageSchema = z.strictObject({
-  state: z.enum(KTX_STORAGE_STATES).default('sqlite'),
-  search: z.enum(KTX_SEARCH_BACKENDS).default('sqlite-fts5'),
-  git: storageGitSchema.prefault({}),
-});
+const storageSchema = z
+  .strictObject({
+    state: z
+      .enum(KTX_STORAGE_STATES)
+      .default('sqlite')
+      .describe('Backend for KTX state storage. "sqlite" uses .ktx/db.sqlite; "postgres" expects a configured Postgres connection.'),
+    search: z
+      .enum(KTX_SEARCH_BACKENDS)
+      .default('sqlite-fts5')
+      .describe('Backend for search indexes. "sqlite-fts5" uses SQLite FTS5; "postgres-hybrid" uses Postgres lexical + vector hybrid search.'),
+    git: storageGitSchema.prefault({}).describe('Git-backed storage commit policy.'),
+  })
+  .describe('Storage backends and commit policy for KTX state and search indexes.');

-const connectionSchema = z.looseObject({
-  driver: z.string().min(1).optional(),
-  url: z.string().optional(),
-});
+const connectionSchema = z
+  .looseObject({
+    driver: z.string().min(1).optional().describe('Connector driver identifier (e.g. "postgres", "bigquery", "snowflake").'),
+    url: z.string().optional().describe('Connection URL or DSN. Format depends on the driver; may contain environment-variable references.'),
+  })
+  .describe('A single database/connector connection entry. Additional driver-specific fields are accepted and passed through.');

-const agentSchema = z.strictObject({
-  run_research: z
-    .strictObject({
-      enabled: z.boolean().default(false),
-      max_iterations: z.number().int().nonnegative().default(20),
-      default_toolset: z.array(z.string().min(1)).default(['sl_query', 'wiki_search', 'sl_read_source']),
-    })
-    .prefault({}),
-});
+const agentSchema = z
+  .strictObject({
+    run_research: z
+      .strictObject({
+        enabled: z.boolean().default(false).describe('Master switch for the research agent.'),
+        max_iterations: z
+          .number()
+          .int()
+          .nonnegative()
+          .default(20)
+          .describe('Maximum number of tool-call iterations the research agent may take per run.'),
+        default_toolset: z
+          .array(z.string().min(1))
+          .default(['sl_query', 'wiki_search', 'sl_read_source'])
+          .describe('Default list of tool identifiers exposed to the research agent.'),
+      })
+      .prefault({})
+      .describe('Research-agent configuration.'),
+  })
+  .describe('Agent feature configuration.');

-const memorySchema = z.strictObject({
-  auto_commit: z.boolean().default(true),
-});
+const memorySchema = z
+  .strictObject({
+    auto_commit: z.boolean().default(true).describe('When true, KTX automatically commits memory updates to the Git-backed store.'),
+  })
+  .describe('Memory subsystem configuration.');

-const ktxProjectConfigSchema = z.strictObject({
-  project: z
-    .string({ error: 'ktx.yaml field "project" is required' })
-    .trim()
-    .min(1, 'ktx.yaml field "project" is required'),
-  setup: setupSchema.optional(),
-  connections: z.record(z.string(), connectionSchema).default({}),
-  storage: storageSchema.prefault({}),
-  llm: llmSchema.prefault({}),
-  ingest: ingestSchema.prefault({}),
-  agent: agentSchema.prefault({}),
-  memory: memorySchema.prefault({}),
-  scan: scanSchema.prefault({}),
-});
+const ktxProjectConfigSchema = z
+  .strictObject({
+    project: z
+      .string({ error: 'ktx.yaml field "project" is required' })
+      .trim()
+      .min(1, 'ktx.yaml field "project" is required')
+      .describe('Project identifier; used in logs, ktx state files, and as the default workspace name.'),
+    setup: setupSchema.optional().describe('Setup-wizard state. Written by `ktx setup`; may be omitted.'),
+    connections: z
+      .record(z.string(), connectionSchema)
+      .default({})
+      .describe('Map of connection ID to connector configuration. Keys are user-chosen names referenced elsewhere in the config.'),
+    storage: storageSchema.prefault({}).describe('Storage backends and commit policy for KTX state and search indexes.'),
+    llm: llmSchema.prefault({}).describe('LLM provider, per-role model overrides, and prompt-caching tunables.'),
+    ingest: ingestSchema.prefault({}).describe('Ingest pipeline configuration.'),
+    agent: agentSchema.prefault({}).describe('Agent feature configuration.'),
+    memory: memorySchema.prefault({}).describe('Memory subsystem configuration.'),
+    scan: scanSchema.prefault({}).describe('Schema-scan configuration: enrichment and relationship discovery.'),
+  })
+  .describe('Configuration schema for KTX project files (ktx.yaml).');

 export type KtxProjectConfig = z.infer<typeof ktxProjectConfigSchema>;
 export type KtxProjectLlmConfig = z.infer<typeof llmSchema>;
@ -260,6 +370,19 @@ export function validateKtxProjectConfig(raw: string): KtxConfigValidation {
  return { ok: false, issues: collectIssues(result.error, parsed) };
 }

+export function generateKtxProjectConfigJsonSchema(): Record<string, unknown> { // Builds the JSON Schema document for ktx.yaml from ktxProjectConfigSchema.
+  const schema = z.toJSONSchema(ktxProjectConfigSchema, {
+    target: 'draft-7',
+    io: 'input', // describe the input shape (what may be written in ktx.yaml) rather than the parsed output type
+  }) as Record<string, unknown>;
+  return {
+    $schema: 'http://json-schema.org/draft-07/schema#', // literal metadata is listed first so these keys lead the returned object
+    $id: 'https://ktx.dev/schemas/ktx-project-config.json',
+    title: 'ktx.yaml',
+    ...schema, // spread last: any same-named key produced by Zod overrides the literals above
+  };
+}
+
 export function serializeKtxProjectConfig(config: KtxProjectConfig): string {
  const serializedConfig =
    config.ingest.adapters.length === 0
--- a/packages/context/src/project/index.ts
+++ b/packages/context/src/project/index.ts
@ -10,6 +10,7 @@ export type {
 } from './config.js';
 export {
  buildDefaultKtxProjectConfig,
+  generateKtxProjectConfigJsonSchema,
  parseKtxProjectConfig,
  serializeKtxProjectConfig,
  validateKtxProjectConfig,