ktx/packages/context/src/project/config.ts

394 lines
18 KiB
TypeScript
Raw Normal View History

import { KTX_MODEL_ROLES } from '@ktx/llm';
2026-05-10 23:12:26 +02:00
import YAML from 'yaml';
import * as z from 'zod';
const KTX_LLM_BACKENDS = ['none', 'anthropic', 'vertex', 'gateway'] as const;
const KTX_EMBEDDING_BACKENDS = ['none', 'deterministic', 'openai', 'sentence-transformers'] as const;
const KTX_PROMPT_CACHE_TTLS = ['5m', '1h'] as const;
const KTX_ENRICHMENT_MODES = ['none', 'deterministic', 'llm'] as const;
const KTX_WORK_UNIT_FAILURE_MODES = ['abort', 'continue'] as const;
const KTX_STORAGE_STATES = ['sqlite', 'postgres'] as const;
const KTX_SEARCH_BACKENDS = ['sqlite-fts5', 'postgres-hybrid'] as const;
const DEPRECATED_KEY_HINTS: Record<string, string> = {
'llm.provider.provider': 'use llm.provider.backend',
'ingest.llm': 'use top-level llm.provider, llm.models, and ingest.workUnits',
'ingest.embeddings.provider': 'use ingest.embeddings.backend',
'scan.enrichment.backend': 'use scan.enrichment.mode',
'scan.enrichment.llm': 'use top-level llm.provider and llm.models',
'scan.enrichment.embeddings.provider': 'use scan.enrichment.embeddings.backend',
};
const apiCredentialsSchema = z
.strictObject({
api_key: z.string().min(1).optional().describe('API key for the provider. Read from this value or the provider-specific environment variable.'),
base_url: z.string().min(1).optional().describe('Override the provider\'s default API base URL (e.g. a proxy or self-hosted gateway).'),
})
.describe('API credentials block: optional key and base URL for an LLM or embedding provider.');
const vertexProviderSchema = z
.strictObject({
project: z.string().min(1).optional().describe('Google Cloud project ID hosting the Vertex AI endpoint.'),
location: z.string().default('').describe('Vertex AI region (e.g. "us-east5"). Empty string falls back to the SDK default.'),
})
.describe('Google Vertex AI provider configuration.');
const sentenceTransformersSchema = z
.strictObject({
base_url: z.string().default('').describe('Base URL of the sentence-transformers HTTP server. Empty string uses the managed local runtime.'),
pathPrefix: z.string().optional().describe('Optional URL path prefix prepended to embedding requests.'),
})
.describe('Sentence-transformers embedding server configuration.');
const llmProviderSchema = z
.strictObject({
backend: z
.enum(KTX_LLM_BACKENDS)
.default('none')
.describe('LLM provider backend. "none" disables LLM features; "anthropic" / "vertex" / "gateway" require the matching nested credentials block.'),
vertex: vertexProviderSchema.optional().describe('Vertex AI credentials, used when backend is "vertex".'),
anthropic: apiCredentialsSchema.optional().describe('Anthropic API credentials, used when backend is "anthropic".'),
gateway: apiCredentialsSchema.optional().describe('AI Gateway credentials, used when backend is "gateway".'),
})
.describe('LLM provider selection and credentials.');
const promptCachingSchema = z
.strictObject({
enabled: z.boolean().optional().describe('Master switch for Anthropic-style prompt caching. When omitted, the backend\'s default applies.'),
systemTtl: z.enum(KTX_PROMPT_CACHE_TTLS).optional().describe('Cache TTL for the system prompt segment ("5m" or "1h").'),
toolsTtl: z.enum(KTX_PROMPT_CACHE_TTLS).optional().describe('Cache TTL for the tools/schema segment ("5m" or "1h").'),
historyTtl: z.enum(KTX_PROMPT_CACHE_TTLS).optional().describe('Cache TTL for conversation-history cache breakpoints ("5m" or "1h").'),
vertexFallbackTo5m: z.boolean().optional().describe('When true, transparently downgrade 1h TTLs to 5m on Vertex, which does not support 1h caching.'),
})
.describe('Prompt-caching tunables for Anthropic-compatible providers.');
const llmSchema = z
.strictObject({
provider: llmProviderSchema.prefault({}).describe('LLM provider backend and credentials.'),
models: z
.partialRecord(z.enum(KTX_MODEL_ROLES), z.string().min(1))
.default({})
.describe('Per-role model overrides keyed by KTX model role (e.g. "default", "triage"). Values are provider-specific model identifiers.'),
promptCaching: promptCachingSchema.optional().describe('Optional prompt-caching tunables.'),
})
.describe('LLM provider, per-role model overrides, and prompt-caching tunables.');
const embeddingSchema = z
.strictObject({
backend: z
.enum(KTX_EMBEDDING_BACKENDS)
.default('deterministic')
.describe('Embedding backend. "deterministic" is a built-in hash-based vector for offline use; "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'),
model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small"). Ignored by the "deterministic" backend.'),
dimensions: z.int().positive().default(8).describe('Embedding vector dimensionality. Must match the chosen model when using a real provider.'),
openai: apiCredentialsSchema.optional().describe('OpenAI credentials, used when backend is "openai".'),
sentenceTransformers: sentenceTransformersSchema.optional().describe('Sentence-transformers server config, used when backend is "sentence-transformers".'),
batchSize: z.int().positive().optional().describe('Number of texts per embedding API call. Omit to use the backend default.'),
})
.describe('Embedding backend, model, and provider credentials.');
const workUnitsSchema = z
.strictObject({
stepBudget: z.int().positive().default(40).describe('Maximum number of agent steps allowed per work unit before it is force-terminated.'),
maxConcurrency: z.int().positive().default(1).describe('Maximum number of work units run concurrently during ingest.'),
failureMode: z
.enum(KTX_WORK_UNIT_FAILURE_MODES)
.default('continue')
.describe('Behavior when a work unit fails: "abort" stops the whole ingest run; "continue" records the failure and keeps going.'),
})
.describe('Concurrency and failure handling for ingest work units.');
const ingestSchema = z
.strictObject({
adapters: z
.array(z.string().min(1))
.default([])
.describe('Ingest adapter identifiers to run (e.g. "metabase", "looker", "historic-sql"). Empty array means no adapters are run.'),
embeddings: embeddingSchema
.prefault({ backend: 'deterministic', model: 'deterministic' })
.describe('Embedding configuration used when ingest adapters need to embed documents.'),
workUnits: workUnitsSchema.prefault({}).describe('Concurrency and failure handling for ingest work units.'),
})
.describe('Ingest pipeline configuration: adapters, embeddings, and work-unit policy.');
const scanEnrichmentSchema = z
.strictObject({
mode: z
.enum(KTX_ENRICHMENT_MODES)
.default('none')
.describe('Column/table enrichment mode. "none" disables enrichment; "deterministic" uses local heuristics; "llm" calls the configured LLM provider.'),
embeddings: embeddingSchema.optional().describe('Optional embedding override for enrichment-time vectorization. Falls back to ingest.embeddings when omitted.'),
})
.describe('Schema-scan enrichment: how columns and tables are described.');
const scanRelationshipsSchema = z
.strictObject({
enabled: z.boolean().default(true).describe('Master switch for relationship discovery during scan.'),
llmProposals: z.boolean().default(true).describe('When true, propose relationships using the configured LLM in addition to deterministic candidates.'),
validationRequiredForManifest: z
.boolean()
.default(true)
.describe('When true, only relationships that pass database-side validation are written to the manifest.'),
acceptThreshold: z
.number()
.min(0)
.max(1)
.default(0.85)
.describe('Confidence score (01) at or above which an LLM-proposed relationship is auto-accepted into the manifest.'),
reviewThreshold: z
.number()
.min(0)
.max(1)
.default(0.55)
.describe('Confidence score (01) at or above which a proposal is surfaced for human review (but not auto-accepted).'),
maxLlmTablesPerBatch: z
.int()
.positive()
.default(40)
.describe('Maximum number of tables included in a single LLM relationship-proposal batch.'),
maxCandidatesPerColumn: z
.int()
.positive()
.default(25)
.describe('Maximum number of candidate join partners considered per column during relationship discovery.'),
profileSampleRows: z.int().positive().default(10000).describe('Number of rows sampled per table when profiling values for relationship inference.'),
validationConcurrency: z.int().positive().default(4).describe('Number of relationship validation queries run in parallel against the database.'),
validationBudget: z
.union([z.literal('all'), z.int().nonnegative()])
.optional()
.describe('Cap on validation queries per scan run. Use "all" for unlimited, an integer for a hard cap, or omit for the runtime default.'),
})
.describe('Schema-scan relationship discovery and validation tunables.');
const scanSchema = z
.strictObject({
enrichment: scanEnrichmentSchema.prefault({}).describe('Column/table enrichment configuration.'),
relationships: scanRelationshipsSchema.prefault({}).describe('Relationship discovery and validation configuration.'),
})
.describe('Schema-scan configuration: enrichment and relationship discovery.');
const setupSchema = z
.strictObject({
database_connection_ids: z
.array(z.string().min(1))
.default([])
.describe('Connection IDs (keys of the top-level `connections` map) that the setup wizard treats as the project\'s primary databases.'),
completed_steps: z
.unknown()
.optional()
.describe('Deprecated. Accepted for backward compatibility but ignored; KTX no longer tracks setup progress here.'),
})
.transform(({ database_connection_ids }) => ({ database_connection_ids }))
.describe('Setup-wizard state captured during `ktx setup`.');
const storageGitSchema = z
.strictObject({
auto_commit: z.boolean().default(true).describe('When true, KTX automatically commits state changes to the local Git-backed store.'),
author: z
.string()
.min(1)
.default('ktx <ktx@example.com>')
.describe('Git author identity used for auto-commits, in standard "Name <email>" form.'),
})
.describe('Git-backed storage commit policy.');
const storageSchema = z
.strictObject({
state: z
.enum(KTX_STORAGE_STATES)
.default('sqlite')
.describe('Backend for KTX state storage. "sqlite" uses .ktx/db.sqlite; "postgres" expects a configured Postgres connection.'),
search: z
.enum(KTX_SEARCH_BACKENDS)
.default('sqlite-fts5')
.describe('Backend for search indexes. "sqlite-fts5" uses SQLite FTS5; "postgres-hybrid" uses Postgres lexical + vector hybrid search.'),
git: storageGitSchema.prefault({}).describe('Git-backed storage commit policy.'),
})
.describe('Storage backends and commit policy for KTX state and search indexes.');
const connectionSchema = z
.looseObject({
driver: z.string().min(1).optional().describe('Connector driver identifier (e.g. "postgres", "bigquery", "snowflake").'),
url: z.string().optional().describe('Connection URL or DSN. Format depends on the driver; may contain environment-variable references.'),
})
.describe('A single database/connector connection entry. Additional driver-specific fields are accepted and passed through.');
const agentSchema = z
.strictObject({
run_research: z
.strictObject({
enabled: z.boolean().default(false).describe('Master switch for the research agent.'),
max_iterations: z
.number()
.int()
.nonnegative()
.default(20)
.describe('Maximum number of tool-call iterations the research agent may take per run.'),
default_toolset: z
.array(z.string().min(1))
.default(['sl_query', 'wiki_search', 'sl_read_source'])
.describe('Default list of tool identifiers exposed to the research agent.'),
})
.prefault({})
.describe('Research-agent configuration.'),
})
.describe('Agent feature configuration.');
const memorySchema = z
.strictObject({
auto_commit: z.boolean().default(true).describe('When true, KTX automatically commits memory updates to the Git-backed store.'),
})
.describe('Memory subsystem configuration.');
const ktxProjectConfigSchema = z
.strictObject({
setup: setupSchema.optional().describe('Setup-wizard state. Written by `ktx setup`; may be omitted.'),
connections: z
.record(z.string(), connectionSchema)
.default({})
.describe('Map of connection ID to connector configuration. Keys are user-chosen names referenced elsewhere in the config.'),
storage: storageSchema.prefault({}).describe('Storage backends and commit policy for KTX state and search indexes.'),
llm: llmSchema.prefault({}).describe('LLM provider, per-role model overrides, and prompt-caching tunables.'),
ingest: ingestSchema.prefault({}).describe('Ingest pipeline configuration.'),
agent: agentSchema.prefault({}).describe('Agent feature configuration.'),
memory: memorySchema.prefault({}).describe('Memory subsystem configuration.'),
scan: scanSchema.prefault({}).describe('Schema-scan configuration: enrichment and relationship discovery.'),
})
.describe('Configuration schema for KTX project files (ktx.yaml).');
export type KtxProjectConfig = z.infer<typeof ktxProjectConfigSchema>;
export type KtxProjectLlmConfig = z.infer<typeof llmSchema>;
export type KtxProjectLlmProviderConfig = z.infer<typeof llmProviderSchema>;
export type KtxProjectEmbeddingConfig = z.infer<typeof embeddingSchema>;
export type KtxScanEnrichmentConfig = z.infer<typeof scanEnrichmentSchema>;
export type KtxIngestWorkUnitsConfig = z.infer<typeof workUnitsSchema>;
export type KtxScanRelationshipConfig = z.infer<typeof scanRelationshipsSchema>;
export type KtxProjectScanConfig = z.infer<typeof scanSchema>;
export type KtxProjectConnectionConfig = z.infer<typeof connectionSchema>;
export type KtxProjectSetupConfig = z.infer<typeof setupSchema>;
export type KtxStorageState = z.infer<typeof storageSchema>['state'];
export type KtxSearchBackend = z.infer<typeof storageSchema>['search'];
export interface KtxConfigIssue {
path: string;
message: string;
fix?: string;
}
export interface KtxConfigValidation {
ok: boolean;
issues: KtxConfigIssue[];
2026-05-10 23:12:26 +02:00
}
function isRecord(value: unknown): value is Record<string, unknown> {
return typeof value === 'object' && value !== null && !Array.isArray(value);
}
function dottedPath(path: ReadonlyArray<PropertyKey>): string {
return path.map((segment) => String(segment)).join('.');
2026-05-10 23:12:26 +02:00
}
function valueAtPath(root: unknown, path: ReadonlyArray<PropertyKey>): unknown {
let cursor: unknown = root;
for (const segment of path) {
if (cursor === null || typeof cursor !== 'object') return undefined;
cursor = (cursor as Record<PropertyKey, unknown>)[segment];
2026-05-10 23:12:26 +02:00
}
return cursor;
2026-05-10 23:12:26 +02:00
}
function formatIssue(issue: z.core.$ZodIssue, input: unknown): KtxConfigIssue[] {
const basePath = dottedPath(issue.path);
2026-05-10 23:12:26 +02:00
if (issue.code === 'unrecognized_keys') {
const keys = (issue as { keys?: readonly string[] }).keys ?? [];
return keys.map((key) => {
const fullPath = basePath.length > 0 ? `${basePath}.${key}` : key;
const hint = DEPRECATED_KEY_HINTS[fullPath];
if (hint !== undefined) {
return { path: fullPath, message: `Unsupported ${fullPath}: ${hint}`, fix: hint };
}
return { path: fullPath, message: `Unsupported ${fullPath}: unknown field` };
});
2026-05-10 23:12:26 +02:00
}
const lastSegment = issue.path[issue.path.length - 1];
if (lastSegment === 'backend' && (issue.code === 'invalid_value' || issue.code === 'invalid_type')) {
const value = valueAtPath(input, issue.path);
return [{ path: basePath, message: `Unsupported ${basePath}: ${String(value)}` }];
2026-05-10 23:12:26 +02:00
}
return [{ path: basePath, message: basePath.length > 0 ? `${basePath}: ${issue.message}` : issue.message }];
2026-05-10 23:12:26 +02:00
}
function collectIssues(error: z.ZodError, input: unknown): KtxConfigIssue[] {
return error.issues.flatMap((issue) => formatIssue(issue, input));
2026-05-10 23:12:26 +02:00
}
function formatZodError(error: z.ZodError, input: unknown): string {
return collectIssues(error, input)
.map((issue) => issue.message)
.join('\n');
2026-05-10 23:12:26 +02:00
}
export function buildDefaultKtxProjectConfig(): KtxProjectConfig {
return ktxProjectConfigSchema.parse({});
2026-05-10 23:12:26 +02:00
}
2026-05-10 23:51:24 +02:00
export function parseKtxProjectConfig(raw: string): KtxProjectConfig {
2026-05-10 23:12:26 +02:00
const parsed = YAML.parse(raw) as unknown;
if (!isRecord(parsed)) {
2026-05-10 23:51:24 +02:00
throw new Error('ktx.yaml must contain a YAML object');
2026-05-10 23:12:26 +02:00
}
const result = ktxProjectConfigSchema.safeParse(parsed);
if (!result.success) {
throw new Error(formatZodError(result.error, parsed));
2026-05-10 23:12:26 +02:00
}
return result.data;
}
2026-05-10 23:12:26 +02:00
export function validateKtxProjectConfig(raw: string): KtxConfigValidation {
let parsed: unknown;
try {
parsed = YAML.parse(raw);
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
return { ok: false, issues: [{ path: '', message: `ktx.yaml parse error: ${message}` }] };
2026-05-10 23:12:26 +02:00
}
if (!isRecord(parsed)) {
return { ok: false, issues: [{ path: '', message: 'ktx.yaml must contain a YAML object' }] };
2026-05-10 23:12:26 +02:00
}
const result = ktxProjectConfigSchema.safeParse(parsed);
if (result.success) {
return { ok: true, issues: [] };
2026-05-10 23:12:26 +02:00
}
return { ok: false, issues: collectIssues(result.error, parsed) };
2026-05-10 23:12:26 +02:00
}
export function generateKtxProjectConfigJsonSchema(): Record<string, unknown> {
const schema = z.toJSONSchema(ktxProjectConfigSchema, {
target: 'draft-7',
io: 'input',
}) as Record<string, unknown>;
return {
$schema: 'http://json-schema.org/draft-07/schema#',
$id: 'https://ktx.dev/schemas/ktx-project-config.json',
title: 'ktx.yaml',
...schema,
};
}
2026-05-10 23:51:24 +02:00
export function serializeKtxProjectConfig(config: KtxProjectConfig): string {
feat: merge ingest and scan * docs: add CLI component reuse guidance * docs: add unified ingest ux design * Refine unified ingest UX design after adversarial review iteration 1 * Refine unified ingest UX design after adversarial review iteration 2 * Refine unified ingest UX design after adversarial review iteration 3 * feat(cli): route public connection ingest command * feat(cli): hide standalone scan from public help * feat(cli): plan public ingest depth and query history * feat(cli): execute public database ingest facets * feat(ingest): read connection query history config * fix(cli): use public ingest wording * fix(config): stop generating ingest adapter allow lists * docs: document public ingest command * test: align ingest surface expectations * docs: add unified ingest public CLI surface plan * feat(cli): preflight deep public ingest readiness * feat(setup): store query history in connection context * feat(setup): store database context depth * feat(setup): verify context readiness by database depth * fix(setup): keep context build foreground only * fix(config): reject reserved ingest connection ids * test: close unified ingest v1 expectations * docs: add unified ingest v1 closure plan * fix(ingest): bypass adapter allow-list for public source ingest * fix(ingest): honor query history window intent * fix(ingest): hide scan internals from public database ingest * feat(ingest): use foreground view for interactive public ingest * fix(setup): use schema context and query history wording * test(cli): verify unified ingest public output * docs: add unified ingest v1 public output closure plan * fix(setup): forward query history flags * fix(setup): prompt for postgres query history * fix(status): report query history readiness * fix(ingest): remove legacy public guidance * fix(ingest): polish foreground retry copy * docs(examples): use unified query history wording * chore(ingest): finish public query history cleanup * docs: add unified ingest v1 query history status cleanup plan * test(docs): cover unified ingest public docs * docs: align ingest CLI reference with unified UX * docs: update context build guides for unified ingest * docs: update setup and primary source ingest wording * docs: stop advertising adapter-backed example ingest * docs: close unified ingest public docs gaps * docs: add unified ingest v1 docs site closure plan * fix: render unified ingest foreground warnings * fix: explain query history schema order * fix: add public ingest retry guidance * fix: align setup next steps with unified ingest * fix: remove scan wording from demo progress * test: verify unified ingest ux closure * docs: add unified ingest v1 foreground and retry closure plan * fix(cli): preserve query-history pull config in public ingest * fix(cli): omit hidden commands from docs command tree * test(cli): close unified ingest final public surface checks * docs: add unified ingest v1 final public surface closure plan * fix(cli): use public source labels in ingest reports * fix(cli): suppress low-level public ingest output * test(cli): verify unified ingest public plain output * docs: add unified ingest v1 public plain output closure plan * fix(cli): add public ingest copy sanitizers * fix(cli): sanitize public ingest progress copy * fix(cli): rename setup schema scope prompt * docs(plan): add progress copy closure; test: align setup back-nav fixture Adds the iter9 plan and updates the setup back-navigation test fixture to pass disableQueryHistory plus listSchemas/listTables stubs that the unified ingest setup step now requires. * docs(plan): add final ux labels plan with narrowed label scans * fix(cli): aggregate unsupported query-history warnings * fix(cli): align setup database labels * test(cli): fix setup database test type-check * fix(cli): remove primary-source wording from setup output * test(cli): verify unified ingest setup closure * docs(plan): add unified ingest v1 verification copy closure plan * fix(cli): remove top-level scan command * fix(cli): remove legacy ingest and wiki commands * Merge scan into ingest flow * feat(cli): split ingest progress into per-phase rows, rename work units to tasks Each database target in the unified ingest dashboard now renders one row per real subprocess (Schema, then Query history when enabled) instead of a single combined bar. Each phase has its own monotonic 0-100% bar so the progress never snaps back to zero when historic-sql starts after scan completes. Completed phases keep their final bar, summary, and elapsed time visible as an inline audit trail; queued and skipped phases are shown explicitly. Also rename user-facing "work units" / "Failed work units" to "tasks" / "Failed tasks" in ingest output and parseIngestSummary. The parser still accepts the legacy "Work units:" wording in captured output for backward compat. Internal memory-flow event names and type fields are left alone. * Fix test harness failures * Fix CI smoke checks --------- Co-authored-by: Andrey Avtomonov <7889985+andreybavt@users.noreply.github.com>
2026-05-14 01:43:06 +02:00
const serializedConfig =
config.ingest.adapters.length === 0
? {
...config,
ingest: {
embeddings: config.ingest.embeddings,
workUnits: config.ingest.workUnits,
},
}
: config;
return `${YAML.stringify(serializedConfig, { indent: 2, lineWidth: 0 }).trimEnd()}\n`;
2026-05-10 23:12:26 +02:00
}