Merge remote-tracking branch 'origin/main' into explore-research-agent-tools

# Conflicts:
#	packages/context/skills/metricflow_ingest/SKILL.md
This commit is contained in:
Andrey Avtomonov 2026-05-15 02:12:30 +02:00
commit 05d666e75f
103 changed files with 4149 additions and 1024 deletions

View file

@ -0,0 +1,11 @@
<svg viewBox="0 0 200 200" xmlns="http://www.w3.org/2000/svg" role="img" aria-label="ktx mascot">
<g fill="none" stroke="#F5F1EA" stroke-width="16" stroke-linecap="round">
<path d="M 62 110 Q 32 130 44 152"/>
<path d="M 88 116 Q 80 152 70 174"/>
<path d="M 112 116 Q 120 152 130 174"/>
</g>
<path d="M 134 108 C 162 116, 172 96, 162 78 C 154 64, 168 56, 178 60" fill="none" stroke="#FF8A4C" stroke-width="16" stroke-linecap="round"/>
<path d="M 48 102 C 48 56, 78 30, 100 30 C 122 30, 152 56, 152 102 C 152 116, 132 120, 100 120 C 68 120, 48 116, 48 102 Z" fill="#F5F1EA"/>
<path d="M 80 84 Q 86 77 92 84" fill="none" stroke="#1B3139" stroke-width="3.5" stroke-linecap="round"/>
<path d="M 108 84 Q 114 77 120 84" fill="none" stroke="#1B3139" stroke-width="3.5" stroke-linecap="round"/>
</svg>

After

Width:  |  Height:  |  Size: 818 B

11
assets/ktx-mascot.svg Normal file
View file

@ -0,0 +1,11 @@
<svg viewBox="0 0 200 200" xmlns="http://www.w3.org/2000/svg" role="img" aria-label="ktx mascot">
<g fill="none" stroke="#1B3139" stroke-width="16" stroke-linecap="round">
<path d="M 62 110 Q 32 130 44 152"/>
<path d="M 88 116 Q 80 152 70 174"/>
<path d="M 112 116 Q 120 152 130 174"/>
</g>
<path d="M 134 108 C 162 116, 172 96, 162 78 C 154 64, 168 56, 178 60" fill="none" stroke="#FF8A4C" stroke-width="16" stroke-linecap="round"/>
<path d="M 48 102 C 48 56, 78 30, 100 30 C 122 30, 152 56, 152 102 C 152 116, 132 120, 100 120 C 68 120, 48 116, 48 102 Z" fill="#1B3139"/>
<path d="M 80 84 Q 86 77 92 84" fill="none" stroke="#F5F1EA" stroke-width="3.5" stroke-linecap="round"/>
<path d="M 108 84 Q 114 77 120 84" fill="none" stroke="#F5F1EA" stroke-width="3.5" stroke-linecap="round"/>
</svg>

After

Width:  |  Height:  |  Size: 818 B

View file

@ -1,22 +1,28 @@
export function Logo() {
return (
<div className="flex items-center gap-2 group">
<div className="flex items-center gap-2.5 group">
<div className="relative flex items-center justify-center transition-transform duration-300 ease-out group-hover:rotate-[-4deg]">
<img
src="/brand/ktx-mascot.png"
src="/brand/ktx-mascot.svg"
alt=""
aria-hidden="true"
className="h-8 w-8 object-contain"
className="h-14 w-14 object-contain block dark:hidden"
/>
<img
src="/brand/ktx-mascot-dark.svg"
alt=""
aria-hidden="true"
className="h-14 w-14 object-contain hidden dark:block"
/>
</div>
<span
className="text-[15px] font-semibold text-fd-foreground tracking-tight"
className="text-[17px] font-semibold text-fd-foreground tracking-tight"
style={{ fontFamily: "var(--font-display), var(--font-sans), sans-serif" }}
>
KTX
</span>
<span
className="text-[13px] font-medium text-fd-muted-foreground/80 tracking-tight border-l border-fd-border pl-2 ml-0.5"
className="text-[14px] font-medium text-fd-muted-foreground/80 tracking-tight border-l border-fd-border pl-2 ml-0.5"
style={{ fontFamily: "var(--font-display), var(--font-sans), sans-serif" }}
>
Docs

View file

@ -1,22 +1,37 @@
---
title: AI Resources
description: Machine-readable docs and prompt recipes for coding assistants reading KTX documentation.
description: Machine-readable docs, retrieval paths, and prompt recipes for coding assistants using KTX documentation.
---
Use this section when a coding assistant, IDE agent, or automation system needs to understand the KTX documentation.
Use this section when a coding assistant, IDE agent, or automation system needs
to read, cite, or update KTX documentation. These resources are optimized for
retrieval: agents can fetch small Markdown pages, use the full corpus only when
needed, and copy prompts that point them at current setup and CLI behavior.
> **Documentation index**
>
> Start with [`/llms.txt`](/llms.txt) to discover the available docs. Use [`/llms-full.txt`](/llms-full.txt) when the assistant needs the complete docs corpus in one Markdown response.
> Start with [`/llms.txt`](/llms.txt) to discover the available docs. Use
> [`/llms-full.txt`](/llms-full.txt) when the assistant needs the complete docs
> corpus in one Markdown response.
## Choose the right path
## What agents can do
| Need | Recommended path |
|------|------------------|
| Find the right setup or CLI page | Fetch [`/llms.txt`](/llms.txt), then read the smallest matching `.md` page |
| Answer a setup question | Read [Agent Quickstart](/docs/ai-resources/agent-quickstart), then [Quickstart](/docs/getting-started/quickstart) or [ktx setup](/docs/cli-reference/ktx-setup) |
| Quote a command or flag | Read the matching [CLI Reference](/docs/cli-reference) page as Markdown |
| Update docs in this repo | Use [Agent Instructions](/docs/ai-resources/agent-instructions) and verify generated Markdown routes after editing |
| Reuse a prompt | Copy from [Prompt Recipes](/docs/ai-resources/prompt-recipes) |
## Section map
| Goal | Use this page |
|------|---------------|
| Tell a coding assistant how to approach KTX docs | [Agent Quickstart](/docs/ai-resources/agent-quickstart) |
| Fetch docs as Markdown instead of HTML | [Markdown Access](/docs/ai-resources/markdown-access) |
| Add lightweight instructions to an assistant prompt | [Agent Instructions](/docs/ai-resources/agent-instructions) |
| Copy prompts for common agent workflows | [Prompt Recipes](/docs/ai-resources/prompt-recipes) |
| Give an assistant a task-first route through the docs | [Agent Quickstart](/docs/ai-resources/agent-quickstart) |
| Fetch docs as Markdown instead of rendered HTML | [Markdown Access](/docs/ai-resources/markdown-access) |
| Add lightweight KTX docs guidance to a system prompt | [Agent Instructions](/docs/ai-resources/agent-instructions) |
| Copy prompts for setup, command lookup, and docs editing | [Prompt Recipes](/docs/ai-resources/prompt-recipes) |
## Available resources
@ -26,13 +41,24 @@ Use this section when a coding assistant, IDE agent, or automation system needs
| [`/llms-full.txt`](/llms-full.txt) | Complete docs corpus in one plain-text Markdown response |
| `/docs/<path>.md` | Per-page Markdown for any docs page |
| Page-level actions | Copy Markdown, view Markdown, or copy MDX from rendered docs pages |
| Prompt recipes | Reusable prompts for docs lookup, setup help, and docs editing |
| Prompt recipes | Reusable prompts for docs lookup, setup help, command discovery, and docs editing |
## Agent usage notes
When an assistant is unsure where to begin, use this order:
When an assistant is unsure where to begin, use this retrieval order:
1. Read [`/llms.txt`](/llms.txt).
2. Fetch the specific Markdown page for the task.
3. Use [Agent Quickstart](/docs/ai-resources/agent-quickstart) to choose the next command or page.
4. Use page-level copy actions when the user wants the exact Markdown or MDX source.
2. Fetch one or two specific Markdown pages for the task.
3. Use [Agent Quickstart](/docs/ai-resources/agent-quickstart) to choose the
next command, guide, or CLI reference page.
4. Use [`/llms-full.txt`](/llms-full.txt) only when the answer requires broad
context across setup, integrations, concepts, and CLI reference.
5. Use page-level copy actions when the user wants exact generated Markdown or
source MDX.
## Boundaries
AI Resources explain how agents consume the docs. To install KTX into an
agent client, use [Agent Clients](/docs/integrations/agent-clients). To set up a
project, use [Quickstart](/docs/getting-started/quickstart) or
[`ktx setup`](/docs/cli-reference/ktx-setup).

View file

@ -37,9 +37,7 @@ ktx
```
The public context-build entrypoint is `ktx ingest [connectionId]` or
`ktx ingest --all`. Legacy command shapes such as `ktx scan`, `ktx ingest run`,
`ktx ingest status`, `ktx ingest replay`, `ktx ingest watch`, and
`ktx setup status` are not part of the current public CLI.
`ktx ingest --all`.
## Global Options

View file

@ -77,7 +77,7 @@ python/
examples/ # Example projects and fixtures
scripts/ # Workspace scripts (benchmarks, verification, release)
docs/ # Documentation site (Fumadocs)
docs-site/ # Documentation site (Fumadocs)
```
All TypeScript packages are ESM (`"type": "module"`) and use `NodeNext` module resolution. The Python projects use `pyproject.toml` for dependency management.

View file

@ -0,0 +1,54 @@
---
title: Community
description: Contribute to KTX through code, docs, connectors, and examples.
---
KTX is an open-source context layer for database agents. The project welcomes
focused contributions that improve setup, integrations, CLI behavior,
documentation, connector coverage, and examples.
## Where to start
| Goal | Start here |
|------|------------|
| Prepare a local development checkout | [Contributing](/docs/community/contributing#development-setup) |
| Understand the workspace layout | [Repository structure](/docs/community/contributing#repository-structure) |
| Run verification before a pull request | [Running tests](/docs/community/contributing#running-tests) |
| Add a database connector | [Adding a connector](/docs/community/contributing#adding-a-connector) |
| Update docs for a user-visible CLI or setup change | [PR guidelines](/docs/community/contributing#pr-guidelines) |
## Contribution areas
| Area | Good first context |
|------|--------------------|
| CLI and setup | `packages/cli`, especially setup steps, command definitions, status checks, and smoke tests |
| Context engine | `packages/context`, including project config, ingest orchestration, and semantic search |
| Connectors | `packages/connector-*`, plus connector-specific tests and integration docs |
| Python semantic layer | `python/ktx-sl` for planning and SQL generation |
| Python daemon | `python/ktx-daemon` for the portable runtime API |
| Documentation | `docs-site/content/docs` for public docs and `docs-site/tests` for docs behavior |
## Development loop
```bash
pnpm install
uv sync --all-groups
pnpm run setup:dev
pnpm run link:dev
ktx-dev status
```
Use `ktx-dev` for local CLI testing after linking the development binary. Use
the published `ktx` command when you are testing the released package in a
separate analytics project.
## Before submitting
1. Keep the change focused on one behavior, connector, doc area, or workflow.
2. Run the smallest tests that cover the changed surface.
3. Run broader checks when changing shared exports, setup state, or generated files.
4. Update `docs-site/content/docs/` when user-visible setup, CLI, configuration, or integration behavior changes.
5. Do not commit local secrets, generated build output, virtualenvs, dependency directories, or local databases.
For complete contributor setup and verification commands, read
[Contributing](/docs/community/contributing).

View file

@ -1,5 +1,5 @@
{
"title": "Community",
"defaultOpen": true,
"pages": ["contributing"]
"pages": ["index", "contributing"]
}

View file

@ -27,6 +27,161 @@ The industry has moved through three distinct approaches to getting AI and data
A context layer is the infrastructure that gives agents the business knowledge they need to produce correct analytics artifacts. It includes a semantic layer - that's a critical component - but it's not the whole thing.
<div
className="not-prose my-10 overflow-hidden rounded-lg border border-fd-border bg-fd-card shadow-sm"
aria-label="How KTX turns source systems into agent-ready context"
>
<div className="border-b border-fd-border bg-fd-muted/35 px-5 py-4">
<p className="mb-1 text-xs font-semibold uppercase text-fd-primary">
{"How KTX works"}
</p>
<p className="max-w-3xl text-sm leading-6 text-fd-muted-foreground">
{"KTX pulls structured metadata and human knowledge from your analytics stack, reconciles it into reviewable files, then gives agents a trusted surface for search, SQL generation, validation, and edits."}
</p>
</div>
<div className="grid gap-0 lg:grid-cols-[1.05fr_2.1rem_0.95fr_2.1rem_1.15fr_2.1rem_0.95fr]">
<section className="bg-fd-background p-4">
<p className="mb-3 text-[11px] font-semibold uppercase tracking-wide text-fd-muted-foreground">
{"Source systems"}
</p>
<div className="space-y-2">
<div className="border-l-2 border-fd-primary bg-fd-card px-3 py-2">
<p className="text-sm font-semibold text-fd-foreground">
{"Warehouses"}
</p>
<p className="mt-0.5 text-xs leading-5 text-fd-muted-foreground">
{"schemas, types, row counts, constraints, query history"}
</p>
</div>
<div className="border-l-2 border-amber-500 bg-fd-card px-3 py-2">
<p className="text-sm font-semibold text-fd-foreground">
{"Modeling tools"}
</p>
<p className="mt-0.5 text-xs leading-5 text-fd-muted-foreground">
{"dbt, MetricFlow, LookML"}
</p>
</div>
<div className="border-l-2 border-orange-500 bg-fd-card px-3 py-2">
<p className="text-sm font-semibold text-fd-foreground">
{"BI systems"}
</p>
<p className="mt-0.5 text-xs leading-5 text-fd-muted-foreground">
{"Looker explores, Metabase questions, dashboards"}
</p>
</div>
<div className="border-l-2 border-slate-500 bg-fd-card px-3 py-2 dark:border-cyan-200">
<p className="text-sm font-semibold text-fd-foreground">
{"Notion and team knowledge"}
</p>
<p className="mt-0.5 text-xs leading-5 text-fd-muted-foreground">
{"runbooks, definitions, policies, analyst notes"}
</p>
</div>
</div>
</section>
<div className="hidden items-center justify-center bg-fd-background lg:flex" aria-hidden="true">
<span className="h-px w-full bg-fd-border" />
</div>
<section className="relative bg-[#102226] p-4 text-white dark:bg-[#0b181b]">
<div className="absolute inset-y-0 left-0 w-1 bg-fd-primary" />
<p className="mb-3 text-[11px] font-semibold uppercase tracking-wide text-cyan-200">
{"KTX ingest"}
</p>
<div className="space-y-3">
<div>
<p className="text-sm font-semibold">{"Extract"}</p>
<p className="mt-0.5 text-xs leading-5 text-cyan-50/75">
{"adapters read metadata, files, APIs, and warehouse evidence"}
</p>
</div>
<div>
<p className="text-sm font-semibold">{"Reconcile"}</p>
<p className="mt-0.5 text-xs leading-5 text-cyan-50/75">
{"the ingest agent merges new facts with existing context"}
</p>
</div>
<div>
<p className="text-sm font-semibold">{"Verify"}</p>
<p className="mt-0.5 text-xs leading-5 text-cyan-50/75">
{"validation and provenance make each write auditable"}
</p>
</div>
</div>
</section>
<div className="hidden items-center justify-center bg-fd-background lg:flex" aria-hidden="true">
<span className="h-px w-full bg-fd-border" />
</div>
<section className="bg-fd-background p-4">
<p className="mb-3 text-[11px] font-semibold uppercase tracking-wide text-fd-muted-foreground">
{"KTX project"}
</p>
<dl className="grid gap-2 text-sm">
<div className="rounded-md border border-fd-border bg-fd-card px-3 py-2 shadow-[0_1px_0_rgba(0,0,0,0.03)]">
<dt className="font-mono text-xs text-fd-foreground">
{"semantic-layer/"}
</dt>
<dd className="mt-1 text-xs leading-5 text-fd-muted-foreground">
{"sources, columns, joins, grain, measures, segments, filters"}
</dd>
</div>
<div className="rounded-md border border-fd-border bg-fd-card px-3 py-2 shadow-[0_1px_0_rgba(0,0,0,0.03)]">
<dt className="font-mono text-xs text-fd-foreground">{"wiki/"}</dt>
<dd className="mt-1 text-xs leading-5 text-fd-muted-foreground">
{"business definitions, rules, gotchas, semantic references"}
</dd>
</div>
<div className="rounded-md border border-fd-border bg-fd-card px-3 py-2 shadow-[0_1px_0_rgba(0,0,0,0.03)]">
<dt className="font-mono text-xs text-fd-foreground">
{"raw-sources/"}
</dt>
<dd className="mt-1 text-xs leading-5 text-fd-muted-foreground">
{"scan artifacts, extracted metadata, relationship evidence"}
</dd>
</div>
<div className="rounded-md border border-fd-border bg-fd-card px-3 py-2 shadow-[0_1px_0_rgba(0,0,0,0.03)]">
<dt className="font-mono text-xs text-fd-foreground">{".ktx/"}</dt>
<dd className="mt-1 text-xs leading-5 text-fd-muted-foreground">
{"local indexes, embeddings, session state, caches"}
</dd>
</div>
</dl>
</section>
<div className="hidden items-center justify-center bg-fd-background lg:flex" aria-hidden="true">
<span className="h-px w-full bg-fd-border" />
</div>
<section className="bg-fd-muted/35 p-4">
<p className="mb-3 text-[11px] font-semibold uppercase tracking-wide text-fd-muted-foreground">
{"Agent workflows"}
</p>
<div className="space-y-2 text-sm">
<div className="rounded-md border border-fd-border bg-fd-card px-3 py-2">
{"Search sources and wiki pages"}
</div>
<div className="rounded-md border border-fd-border bg-fd-card px-3 py-2">
{"Compile trusted SQL"}
</div>
<div className="rounded-md border border-fd-border bg-fd-card px-3 py-2">
{"Explain metrics and provenance"}
</div>
<div className="rounded-md border border-fd-border bg-fd-card px-3 py-2">
{"Patch files, validate, open review"}
</div>
</div>
</section>
</div>
<div className="border-t border-dashed border-fd-border bg-fd-background px-5 py-3 text-sm text-fd-muted-foreground">
{"Reviewed agent and analyst edits flow back into the same YAML and Markdown files, so the next ingest run starts from the team's accepted context."}
</div>
</div>
KTX organizes context into four pillars:
- Semantic sources

View file

@ -1,254 +1,286 @@
---
title: Quickstart
description: Set up KTX and build your first context in under 10 minutes.
description: Set up KTX, build local context, and connect your coding agent.
---
This guide walks you through `ktx setup` - an interactive wizard that configures your LLM provider, connects your database, optionally ingests from your existing tools, builds context, and installs agent integration.
This guide gets a local analytics project ready for KTX. You will install the
CLI, run the setup wizard, connect a database, build context, and install agent
rules that teach your coding assistant which KTX commands to run.
If you are a coding assistant trying to decide which KTX docs page to read, start with the [Agent Quickstart](/docs/ai-resources/agent-quickstart). This page is the human setup walkthrough.
If you are a coding assistant choosing a docs route, start with the
[Agent Quickstart](/docs/ai-resources/agent-quickstart). This page is the
human setup walkthrough.
## Workflow summary
## What setup does
Use this sequence when you are setting up KTX in an analytics project:
`ktx setup` is the main project workflow. It can create or resume `ktx.yaml`,
configure model and embedding providers, add database connections, add optional
context sources, build the first context artifacts, and install agent
integration.
1. `npm install -g @kaelio/ktx` - install the published KTX CLI from npm.
2. `ktx setup` - create or resume a KTX project.
When you run bare `ktx` in an interactive terminal outside a KTX project, the
CLI opens the same setup experience. Inside an existing project, `ktx setup`
resumes incomplete work or opens a menu for changing setup, connecting an
agent, checking status, or exploring a demo project.
The setup wizard is stateful. If it exits before completion, rerun `ktx setup` in the same project directory to resume from the first incomplete step.
## Install the CLI
## Install and run setup
Install the published [`@kaelio/ktx`](https://www.npmjs.com/package/@kaelio/ktx) CLI:
Install the published `@kaelio/ktx` package:
```bash
npm install -g @kaelio/ktx
```
Then run the setup wizard:
Then run setup from the analytics project directory:
```bash
ktx setup
```
The local checkout flow is only for contributors working on KTX itself. See [Contributing](/docs/community/contributing) for that setup.
The local checkout workflow is only for KTX contributors. See
[Contributing](/docs/community/contributing) for that path.
The wizard walks through six steps. You can go back at any point, and if you exit early, rerunning `ktx setup` resumes where you left off.
## Step 1: Choose the project
## Step 1: Configure LLM
In an interactive terminal, setup can create a new KTX project or resume the
nearest existing project. The main project file is `ktx.yaml`.
KTX uses an Anthropic model to enrich schema descriptions, generate semantic sources during ingestion, and reconcile metadata from your tools.
For scripted setup, pass the project directory explicitly:
The wizard asks how to find your API key:
```
◆ How should KTX find your Anthropic API key?
│ ○ Use ANTHROPIC_API_KEY from the environment
│ ○ Paste a key and save it as a local secret file
```bash
ktx setup --project-dir ./analytics
```
If you choose to paste a key, KTX saves it in `.ktx/secrets/anthropic-api-key` with local file permissions. Your `ktx.yaml` stores a `file:` reference, never the raw key.
If setup exits early, rerun `ktx setup` in the same directory. KTX tracks
completed setup steps and resumes from the remaining work.
Next, choose a model:
## Step 2: Configure the LLM
```
◆ Which Anthropic model should KTX use?
│ ○ Claude Sonnet 4.6 (recommended)
│ ○ Claude Opus 4.6
│ ○ Claude Haiku 4.5
│ ○ Enter a model ID manually
```
KTX uses a Claude model for ingest agents that turn schemas, SQL, BI metadata,
and documents into semantic-layer sources and wiki context.
KTX runs a health check to verify your key and model work before saving.
Setup supports two LLM provider paths:
## Step 2: Configure embeddings
| Provider | Use when | Credential model |
|----------|----------|------------------|
| Anthropic API | You have an Anthropic API key | `ANTHROPIC_API_KEY` or a local `file:` secret |
| Google Vertex AI for Anthropic Claude | Your organization runs Claude through Google Cloud | Application Default Credentials plus Vertex project and location |
KTX uses embeddings for semantic search over sources, wiki content, schema metadata, and relationship evidence.
For Anthropic API, setup can read the key from the environment or save a pasted
key to `.ktx/secrets/anthropic-api-key`. `ktx.yaml` stores an `env:` or `file:`
reference, not the raw key.
```
◆ Which embedding option should KTX use?
│ ○ Local sentence-transformers embeddings
│ ○ OpenAI embeddings (recommended)
```
For Vertex AI, setup uses Google Application Default Credentials. It can read
your active `gcloud` project, list visible projects, or accept explicit
`--vertex-project` and `--vertex-location` values.
**OpenAI embeddings** use `text-embedding-3-small` (1536 dimensions) and require an `OPENAI_API_KEY`.
Setup checks the selected model before saving. Anthropic API setup fetches live
Claude model choices when possible and falls back to bundled defaults if model
discovery is unavailable.
**Local embeddings** use `all-MiniLM-L6-v2` (384 dimensions) via the KTX managed Python runtime. No API key is needed. KTX can install and start the runtime during setup; to prepare it ahead of time, run:
## Step 3: Configure embeddings
KTX uses embeddings for semantic search over semantic-layer sources, wiki
context, schema metadata, and relationship evidence.
| Backend | Default model | Notes |
|---------|---------------|-------|
| OpenAI | `text-embedding-3-small` | Recommended for hosted embeddings. Requires an OpenAI API key. |
| Local sentence-transformers | `all-MiniLM-L6-v2` | Runs through the KTX-managed Python runtime. No hosted embedding key is required. |
OpenAI setup reads `OPENAI_API_KEY` or saves a local secret file. Local
sentence-transformers setup can install and start the managed runtime during
setup. To prepare that runtime before setup, run:
```bash
ktx dev runtime install --feature local-embeddings --yes
ktx dev runtime start --feature local-embeddings
```
## Step 3: Connect a database
## Step 4: Add a database
Select one or more databases for KTX to connect to. The wizard supports
SQLite, PostgreSQL, MySQL, ClickHouse, SQL Server, BigQuery, and Snowflake.
KTX needs at least one primary database connection before it can build database
context. The wizard supports SQLite, PostgreSQL, MySQL, ClickHouse, SQL Server,
BigQuery, and Snowflake.
For PostgreSQL, you can enter connection details field by field or paste a connection URL:
You can usually enter connection fields interactively or provide a URL. URLs
that contain credentials can be stored as local files under `.ktx/secrets/` or
referenced with `env:NAME` in `ktx.yaml`.
```
◆ How do you want to connect to PostgreSQL?
│ ○ Enter connection details (host, port, database, user)
│ ○ Paste a connection URL
```
After saving a connection, setup tests it and builds fast schema context:
If your URL contains credentials, KTX saves it to `.ktx/secrets/` and writes a `file:` reference in `ktx.yaml`. You can also use `env:DATABASE_URL` to reference an environment variable.
After connecting, KTX automatically runs a connection test and builds fast
schema context:
```
Testing postgres-warehouse
```text
Testing warehouse
Connection test passed
Driver: PostgreSQL - Status: ok
Building schema context for postgres-warehouse
Building schema context for warehouse
Running fast database ingest
Schema context complete for postgres-warehouse
Changes: 42 new tables
Database ready
postgres-warehouse - PostgreSQL - schema context complete
warehouse - PostgreSQL - schema context complete
```
For PostgreSQL, Snowflake, and BigQuery, the wizard can enable query-history
ingest when the warehouse history feature is available. Query history is stored
under `connections.<id>.context.queryHistory` in `ktx.yaml`.
PostgreSQL, BigQuery, and Snowflake can also enable query-history ingest. Query
history helps KTX learn common query patterns, joins, service-account filters,
and warehouse-specific usage.
## Step 4: Add context sources
## Step 5: Add context sources
Context sources let KTX ingest metadata from your existing analytics tools. This step is optional - you can skip it and add sources later.
Context sources are optional, but they make the first context layer much richer.
Setup can add:
```
◆ Which context sources should KTX ingest?
│ ◻ dbt
│ ◻ MetricFlow
│ ◻ Metabase
│ ◻ Looker
│ ◻ LookML
│ ◻ Notion
```
| Source | Typical input | What KTX learns |
|--------|---------------|-----------------|
| dbt | Local project or Git repo | Models, columns, tests, descriptions, tags |
| MetricFlow | Local project or Git repo | Semantic models, metrics, dimensions, entities |
| LookML | Local files or Git repo | Views, explores, dimensions, measures, joins |
| Looker | API URL and credentials | Explores, looks, dashboards, model metadata |
| Metabase | API URL and key | Questions, dashboards, BI database mappings |
| Notion | Integration token and crawl settings | Business docs and knowledge pages |
For **dbt**, point KTX at a local path or git URL. KTX reads your `dbt_project.yml` and schema files to extract model metadata:
Setup maps BI and source metadata back to your primary warehouse connection so
generated context points at the right tables.
```
◆ dbt source location
│ ○ Local path
│ ○ Git URL
```
You can skip this step and add sources later by rerunning `ktx setup`.
For **Metabase** and **Looker**, you provide an API URL and credentials. KTX maps BI databases to your KTX primary source connections so it knows which warehouse tables the BI metadata refers to.
## Step 6: Build context
Context sources are saved to `ktx.yaml` and built during the next step.
The context build turns configured databases and sources into local artifacts
agents can read. It runs database ingest first, then source ingest and memory
updates.
## Step 5: Build context
Fast database ingest records deterministic schema grounding. Deep ingest adds
AI-enriched descriptions, embeddings, relationship evidence, and query-history
context when configured.
This is where KTX builds agent-ready context. It uses the database context
depth saved by setup and ingests metadata from any configured context sources.
When the build finishes, setup verifies that agent-ready context exists:
```
◆ Build KTX context for agents?
│ ○ Build context now (recommended)
│ ○ Leave context unbuilt and exit setup
```
Fast database context builds deterministic schema grounding. Deep database
context also generates AI descriptions, embeddings, and relationship evidence
when those capabilities are configured.
For a small database (under 50 tables), this can take a few minutes. Larger
warehouses can take longer. Context builds run in the foreground; press
<kbd>Ctrl+C</kbd> to stop the current run and rerun `ktx setup` or `ktx ingest`
when you are ready to try again.
When the build completes, KTX verifies that agent-ready context was produced:
```
```text
KTX context is ready for agents.
Databases:
postgres-warehouse: deep context complete
warehouse: deep context complete
Context sources:
dbt-main: memory update complete
dbt_main: memory update complete
Verification:
Agent context: ready
Semantic search: ready
```
## Step 6: Install agent integration
If a foreground build is interrupted, rerun `ktx setup` or build the same target
with `ktx ingest <connectionId>`.
The final step connects KTX to your coding agent. Choose how agents should access the project:
## Step 7: Install agent integration
```
◆ How should agents use this KTX project?
│ ○ CLI tools and skills
The final setup step installs project-local rules for your coding assistant.
Supported targets are Claude Code, Codex, Cursor, OpenCode, and the universal
`.agents` target.
You can also run this step later:
```bash
ktx setup --agents --target codex
```
Then select which agents to install for:
Claude Code and Codex also support global installs:
```
◆ Which agent targets should KTX install?
│ ◻ Claude Code
│ ◻ Codex
│ ◻ Cursor
│ ◻ OpenCode
│ ◻ Custom agent (.agents)
```bash
ktx setup --agents --target codex --global
```
**CLI mode** writes a skill file (e.g., `.claude/skills/ktx/SKILL.md`) that teaches the agent to call KTX commands directly.
**Custom agent** uses the universal `.agents` target for agents that can read project-local skills.
Agent rules are CLI-based. Each rule file points agents at the path of the KTX
CLI that created it, so agents do not need a separate `ktx` binary in `PATH`.
If the CLI path changes after reinstalling or moving a checkout, rerun
`ktx setup --agents`.
## Generated files
KTX writes project state as plain files so agents can inspect and edit changes in git.
KTX writes plain files so people and agents can inspect changes in git.
| Path | Created by | Purpose |
|------|------------|---------|
| `ktx.yaml` | `ktx setup` | Main project configuration: connections, LLM settings, embeddings, and context sources |
| `.ktx/secrets/*` | `ktx setup` when file-backed secrets are selected | Local secret files referenced from `ktx.yaml`; do not commit these |
| `semantic-layer/<connection-id>/*.yaml` | context build, ingestion, or direct file edits | Semantic source definitions agents use for SQL generation |
| `wiki/global/*.md` | ingestion, memory capture, or direct file edits | Shared business context and metric definitions |
| `wiki/user/<user-id>/*.md` | memory capture or direct file edits | User-scoped notes for one agent/user context |
| `.claude/skills/ktx/SKILL.md`, `.agents/skills/ktx/SKILL.md` | CLI-mode agent integration setup | Agent instructions for calling public `ktx` commands |
| Path | Purpose |
|------|---------|
| `ktx.yaml` | Project configuration for LLMs, embeddings, connections, context sources, and setup state |
| `.ktx/secrets/*` | Local secret files referenced from `ktx.yaml`; do not commit these |
| `.ktx/setup/*` | Local setup and context-build state |
| `.ktx/agents/install-manifest.json` | Manifest used to manage installed agent files |
| `semantic-layer/<connection-id>/*.yaml` | Semantic source definitions used for SQL generation |
| `wiki/global/*.md` | Shared business context and metric definitions |
| `wiki/user/<user-id>/*.md` | User-scoped notes and local context |
| `.claude/skills/ktx/SKILL.md` | Claude Code project skill |
| `.agents/skills/ktx/SKILL.md` | Codex or universal project skill |
| `.cursor/rules/ktx.mdc` | Cursor project rule |
| `.opencode/commands/ktx.md` | OpenCode project command |
## Verify it worked
## Verify setup
Check your project status:
Run:
```bash
ktx status
```
```
Example output:
```text
KTX project: /home/user/analytics
Project ready: yes
LLM ready: yes (claude-sonnet-4-6)
Embeddings ready: yes (text-embedding-3-small)
Databases configured: yes (postgres-warehouse)
Context sources configured: yes (dbt-main)
Databases configured: yes (warehouse)
Context sources configured: yes (dbt_main)
KTX context built: yes
Agent integration ready: yes (claude-code:project)
Agent integration ready: yes (codex:project)
```
Use JSON when an agent or script needs a structured readiness check:
```bash
ktx status --json
```
## Scripted setup example
Use non-interactive setup when creating repeatable fixtures or automation:
```bash
ktx setup \
--project-dir ./analytics \
--no-input \
--skip-llm \
--skip-embeddings \
--database postgres \
--new-database-connection-id warehouse \
--database-url env:DATABASE_URL \
--database-schema public
```
Then build context:
```bash
ktx ingest warehouse --fast
```
See [ktx setup](/docs/cli-reference/ktx-setup) for the full list of automation
flags.
## Common errors
| Error or symptom | Likely cause | Recovery |
|------------------|--------------|----------|
| `ktx: command not found` | The KTX package is not installed globally, or the shell cannot find the global binary | Run `npm install -g @kaelio/ktx` and open a new shell |
| LLM health check fails | Missing, invalid, or unauthorized Anthropic API key | Export `ANTHROPIC_API_KEY` or rerun `ktx setup` and choose the file-backed secret option |
| OpenAI embedding check fails | `OPENAI_API_KEY` is missing when OpenAI embeddings are selected | Export `OPENAI_API_KEY`, or rerun setup and choose local sentence-transformers embeddings |
| Local embeddings hang or fail | The managed Python runtime cannot start or the local model runtime is unavailable | Install `uv`, run `ktx dev runtime status`, then run `ktx dev runtime install --feature local-embeddings --yes` and rerun setup |
| Database connection test fails | Credentials, network access, warehouse, database, or schema value is wrong | Test the same URL with the database's native client, then rerun `ktx setup` and reconfigure the connection |
| `KTX context built: no` in `ktx status` | Setup saved configuration but did not build context | Run `ktx setup` and choose to build context now |
| Agent integration is incomplete | Setup skipped the agents step or the target was not installed | Run `ktx setup --agents --target codex` using the target you need |
| Symptom | Likely cause | Recovery |
|---------|--------------|----------|
| `ktx: command not found` | The global package is not installed or your shell cannot find it | Reinstall `@kaelio/ktx` and open a new shell |
| Setup resumes the wrong project | `KTX_PROJECT_DIR` or the nearest `ktx.yaml` points somewhere else | Pass `--project-dir <path>` |
| Anthropic health check fails | API key, model id, or access is invalid | Fix `ANTHROPIC_API_KEY` or rerun setup with a different key or model |
| Vertex AI health check fails | Vertex API, Claude access, project, location, or IAM permissions are missing | Check the project, location, Application Default Credentials, and Vertex AI permissions |
| OpenAI embeddings fail | `OPENAI_API_KEY` is missing or invalid | Export the key or choose local sentence-transformers embeddings |
| Local embeddings fail | The managed Python runtime cannot install or start | Run `ktx dev runtime status`, then run `ktx dev runtime install --feature local-embeddings --yes` |
| Database test fails | Credentials, network access, database, warehouse, or schema is wrong | Test the same values with the database's native client, then rerun setup |
| Context is not built | Setup saved configuration but skipped or interrupted the build | Run `ktx setup` or `ktx ingest --all` |
| Agent integration is incomplete | Setup skipped the agents step or installed a different target | Run `ktx setup --agents --target <target>` |
## Next steps
- **Build more context** - learn about [database ingest](/docs/guides/building-context), relationship detection, and source ingestion workflows in the Building Context guide.
- **Refine your semantic layer** - the [Writing Context](/docs/guides/writing-context) guide covers source YAML, measures, joins, and wiki pages.
- **Understand the architecture** - read [The Context Layer](/docs/concepts/the-context-layer) to learn why a context layer is more than a semantic layer.
- **Connect more agents** - see the [Agent Clients](/docs/integrations/agent-clients) integration page for per-tool setup details.
- Build and refresh context with [Building Context](/docs/guides/building-context).
- Edit semantic sources and wiki pages with [Writing Context](/docs/guides/writing-context).
- Connect more tools with [Agent Clients](/docs/integrations/agent-clients).
- Read [The Context Layer](/docs/concepts/the-context-layer) to understand the architecture.

View file

@ -1,171 +1,195 @@
---
title: Building Context
description: Build database and source context from configured KTX connections.
description: Build and refresh KTX context from databases, source tools, query history, and text.
---
Building context reads your configured connections and writes local context that
agents can use. Database connections produce schema context, and source
connections such as dbt, Looker, Metabase, and Notion produce semantic sources
and wiki pages.
Building context turns configured connections into local semantic-layer sources
and wiki pages. Agents use those files to understand your schema, business
definitions, metric logic, joins, and known caveats before they write SQL.
Use this guide after `ktx setup` has created `ktx.yaml` and at least one
database or context-source connection.
## The build loop
Most projects use this loop:
1. Check readiness with `ktx status`.
2. Build one connection with `ktx ingest <connectionId>`, or build everything
with `ktx ingest --all`.
3. Search or inspect the generated files under `semantic-layer/` and `wiki/`.
4. Edit source YAML or Markdown when business logic needs refinement.
5. Validate and query representative sources before handing the context to an
agent.
`ktx ingest --all` runs database connections first, then context-source
connections. That order lets dbt, BI, Notion, and text ingest attach context to
known warehouse tables.
## Database ingest
Database ingest connects to your warehouse and extracts structural metadata.
KTX stores the results locally so agents can understand your schema without
querying the database directly.
### Running database ingest
Database ingest connects to a configured warehouse and records local schema
context. It gives agents table, column, type, constraint, and row-count
grounding without requiring them to inspect the database directly.
```bash
ktx ingest <connection-id>
```
This runs a fast schema ingest by default. You can choose the depth with public
flags:
| Flag | What it does |
|------|-------------|
| `--fast` | Tables, columns, types, constraints, and row counts |
| `--deep` | Fast ingest plus AI-enriched database context |
```bash
# Build one connection quickly
ktx ingest my-postgres --fast
# Build AI-enriched database context
ktx ingest my-postgres --deep
# Build one configured database connection
ktx ingest warehouse
# Build all configured connections
ktx ingest --all
```
### Checking results
Depth controls how much context KTX builds:
Every ingest prints a summary and writes local artifacts. Use `ktx status`
after ingest to review project readiness and follow-up setup work:
| Flag | Best for | What it does |
|------|----------|--------------|
| `--fast` | First setup, quick refreshes, CI smoke checks | Deterministic schema ingest with tables, columns, types, constraints, and row counts |
| `--deep` | Agent-ready context for real analysis | Fast ingest plus AI-enriched descriptions, embeddings, relationship evidence, and optional query history |
Examples:
```bash
ktx status
ktx ingest warehouse --fast
ktx ingest warehouse --deep
ktx ingest --all --deep
```
### Relationship detection
Deep ingest needs LLM and embedding readiness. If those providers are not
configured, run `ktx setup` or use `--fast`.
Many databases lack declared foreign keys. KTX infers relationships by scoring column pairs across seven signals - name similarity, type compatibility, value overlap, embedding similarity, profile uniqueness, null rate, and structural priors. The weighted score determines each candidate's status:
## Query history
| Score range | Status | Meaning |
|-------------|--------|---------|
| &ge; 0.85 | `accepted` | High confidence - applied automatically |
| 0.55 &ndash; 0.84 | `review` | Plausible - needs human review |
| &lt; 0.55 | `rejected` | Low confidence - not applied |
PostgreSQL, BigQuery, and Snowflake can add query-history context. This helps
KTX learn common joins, filters, service-account patterns, redaction rules, and
frequently used query templates.
Deep database ingest can include relationship evidence where the connector can
provide it. Relationship review and calibration subcommands are not part of the
current public CLI surface.
## Ingestion
Ingestion pulls semantic context from your existing analytics tools - dbt projects, Looker models, Metabase questions, and more - and writes it into your KTX project as semantic sources and wiki pages.
### How it works
Each ingest run follows this flow:
1. An **adapter** extracts metadata from your tool (dbt manifest, LookML files, Metabase API, etc.)
2. An **LLM agent** reconciles the extracted metadata with your existing context - it merges intelligently rather than overwriting
3. **Semantic sources** (YAML) and **wiki pages** (Markdown) are written to your project directory
### Running an ingest
Enable it during setup, set it under `connections.<id>.context.queryHistory`,
or request it for a single run:
```bash
ktx ingest my-dbt-source
ktx ingest warehouse --deep --query-history
ktx ingest warehouse --query-history-window-days 30
```
Useful output flags:
Use `--no-query-history` when you want to skip a stored query-history setting
for one run.
## Relationship evidence
Many databases do not declare all foreign keys. KTX can score relationship
candidates using signals such as name similarity, type compatibility, value
overlap, embedding similarity, uniqueness, null rate, and structural priors.
The public CLI does not expose separate relationship review subcommands.
Relationship evidence is built as part of deep database ingest when the
connector and readiness checks support it.
## Context-source ingest
Context-source connections pull business metadata from tools your team already
uses. The current public `ktx ingest` command is connection-centric: pass one
configured connection id, or pass `--all`.
```bash
# Build one source connection
ktx ingest dbt_main
# Build every configured database and source connection
ktx ingest --all
```
Supported source types:
| Driver | Typical source | Output |
|--------|----------------|--------|
| `dbt` | dbt project or Git repo | Semantic sources with model, column, test, tag, and description metadata |
| `metricflow` | MetricFlow project or Git repo | Metrics, dimensions, entities, and semantic joins |
| `lookml` | LookML files or Git repo | Views, explores, dimensions, measures, and joins |
| `looker` | Looker API | Explores, looks, dashboards, and model metadata |
| `metabase` | Metabase API | Questions, dashboards, table metadata, and mappings |
| `notion` | Notion API | Wiki pages and business knowledge |
Source ingest extracts metadata, reconciles it with existing local context, and
writes semantic-layer YAML plus wiki Markdown. It merges rather than blindly
overwriting local edits.
## Text ingest
Use `ktx ingest text` for notes, Markdown files, runbooks, Slack exports, or
other free-form knowledge that should become searchable KTX memory.
```bash
# Capture a Markdown file
ktx ingest text docs/revenue-notes.md --connection-id warehouse
# Capture one stdin item
printf "Refunds are excluded from net revenue." | ktx ingest text -
# Capture direct text
ktx ingest text --text "ARR excludes one-time implementation fees."
```
Useful flags:
| Flag | Description |
|------|-------------|
| `--json` | Output as JSON |
| `--plain` | Plain text output |
| `--connection-id <connectionId>` | Attach the captured memory to a KTX connection |
| `--user-id <id>` | Attribute the capture to a user scope (default: `local-cli`) |
| `--json` | Print structured output |
| `--fail-fast` | Stop after the first failed text item |
Foreground context builds do not detach into background control sessions. If a
run is interrupted, rerun `ktx ingest <connection-id>` or `ktx ingest --all`.
Text ingest is a good fit for small, high-signal documents. For system-specific
connectors such as Notion, dbt, or Metabase, prefer configured source ingest so
KTX can preserve source metadata.
### Supported context sources
## Output and artifacts
| Driver | Source | What gets ingested |
|--------|--------|--------------------|
| `dbt` | dbt project | Model definitions, column descriptions, tests, tags |
| `metricflow` | MetricFlow semantic models | Metrics, dimensions, entities, semantic joins |
| `lookml` | LookML files | Views, explores, dimensions, measures, joins |
| `looker` | Looker API | Explores, looks, dashboard metadata |
| `metabase` | Metabase API | Questions, dashboards, table metadata |
| `notion` | Notion API | Database pages, knowledge articles |
Every ingest run prints a summary. Use `--json` when an agent or script needs a
structured plan and per-target results.
Query history is a database connection facet. Enable it with
`connections.<id>.context.queryHistory` or pass `--query-history` for a current
run. See [Context Sources](/docs/integrations/context-sources) for
driver-specific setup and auth configuration.
### What gets generated
A typical dbt ingest produces semantic sources and wiki pages in your project:
**Semantic source** (`semantic-layer/my-postgres/orders.yaml`):
```yaml title="semantic-layer/my-postgres/orders.yaml"
name: orders
table: public.orders
grain:
- order_id
columns:
- name: order_id
type: string
description: Unique order identifier
- name: customer_id
type: string
description: Foreign key to customers table
- name: order_date
type: time
role: time
description: Date the order was placed
- name: total_amount
type: number
description: Total order value in USD
measures:
- name: total_revenue
expr: SUM(total_amount)
description: Sum of all order values
- name: order_count
expr: COUNT(DISTINCT order_id)
description: Number of distinct orders
joins:
- to: customers
on: orders.customer_id = customers.customer_id
relationship: many_to_one
```bash
ktx ingest --all --json
```
**Wiki page** (`wiki/global/order-status-definitions.md`):
Typical generated files:
```markdown
---
summary: Business definitions for order status values
tags: [orders, definitions]
sl_refs: [orders]
---
| Path | Created by | Purpose |
|------|------------|---------|
| `semantic-layer/<connection-id>/*.yaml` | Database and source ingest | Queryable semantic source definitions |
| `wiki/global/*.md` | Source, text, and memory ingest | Shared business definitions and notes |
| `wiki/user/<user-id>/*.md` | Text and memory ingest | User-scoped context |
| `.ktx/setup/context-build.json` | Setup context build | Resume and readiness state for setup |
## Order Statuses
Ingest sessions also record transcripts with tool calls, LLM responses, and
write decisions. Inspect them when you need to debug why a source or wiki page
was written a certain way.
- **pending**: Order placed but not yet processed
- **confirmed**: Payment received, awaiting fulfillment
- **shipped**: Order dispatched to carrier
- **delivered**: Order received by customer
- **cancelled**: Order cancelled before shipment
## Example: first full refresh
Orders in "pending" status for more than 48 hours are flagged for review.
After interactive setup:
```bash
ktx status
ktx ingest --all --deep
ktx status
```
### Ingest transcripts
Then inspect what changed:
Every ingest session records a full transcript: tool calls, LLM responses, and
write decisions. Inspect the stored transcript files when you need to debug why
a source was written a certain way.
```bash
git status --short
ktx sl list --json
ktx wiki search "revenue" --json --limit 10
```
## Common errors
| Symptom | Likely cause | Recovery |
|---------|--------------|----------|
| Connection not configured | The connection id is missing from `ktx.yaml` | Add it with `ktx setup` |
| Deep readiness is missing | The LLM or embedding provider is not configured | Run `ktx setup`, or rerun with `--fast` |
| Query history is unsupported | The selected database driver does not expose query history | Run schema ingest without query-history flags |
| No target selected | You omitted both a connection id and `--all` | Run `ktx ingest <connectionId>` or `ktx ingest --all` |
| Depth or query-history flags have no effect | The flags were supplied for a context-source connection | Use those flags only for database connections |
| Text ingest stops early | `--fail-fast` stopped on the first failed item | Fix the item or rerun without `--fail-fast` |

View file

@ -1,59 +1,167 @@
---
title: Serving Agents
description: Expose your context to Claude Code, Cursor, Codex, and other coding agents.
description: Expose KTX context to Claude Code, Codex, Cursor, OpenCode, and custom agents.
---
Once you've built and refined your context, expose it to coding agents through
the public KTX CLI. Claude Code, Cursor, Codex, OpenCode, and custom agent
workflows can call the same commands you use at a terminal.
KTX serves agents through the public CLI and project-local instruction files.
Agents do not need a separate server. They read the generated rules, call KTX
commands, inspect local context files, and use JSON output when they need
structured results.
## CLI Commands
## Recommended setup
KTX public commands support JSON output for the context reads that agents use
most often. Use `--project-dir` when the agent is not already running inside the
KTX project directory.
### Available commands
Run the agent install step from a KTX project:
```bash
ktx setup --agents
```
Or install a specific target:
```bash
ktx setup --agents --target codex
```
Supported targets:
| Target | Generated project file |
|--------|------------------------|
| Claude Code | `.claude/skills/ktx/SKILL.md` |
| Codex | `.agents/skills/ktx/SKILL.md` |
| Cursor | `.cursor/rules/ktx.mdc` |
| OpenCode | `.opencode/commands/ktx.md` |
| Universal `.agents` | `.agents/skills/ktx/SKILL.md` |
Claude Code and Codex also support global installs:
```bash
ktx setup --agents --target claude-code --global
ktx setup --agents --target codex --global
```
KTX records installed files in `.ktx/agents/install-manifest.json`. Rerun
`ktx setup --agents` after moving a checkout or reinstalling the CLI so the
generated instructions point at the current CLI path.
## Agent command set
All supported agent clients use the same command surface. Use `--project-dir`
when the agent is running outside the KTX project directory.
### Readiness
```bash
# Check setup and context readiness
ktx status --json
```
**Semantic layer:**
Agents should run this before relying on context. It reports project, LLM,
embedding, database, context-source, context-build, and agent-integration
readiness.
### Semantic layer discovery
```bash
# List sources
ktx sl list --json
ktx sl list --json --connection-id my-postgres
ktx sl search "revenue" --json
ktx sl list --connection-id warehouse --json
ktx sl search "revenue" --json --limit 10
```
# Run a query from a JSON file
ktx sl query --json \
--connection-id my-postgres \
--query-file query.json \
Agents use these commands to discover source names, connection ids, measures,
dimensions, and likely files to inspect.
### Semantic-layer validation and queries
```bash
ktx sl validate orders --connection-id warehouse
```
Compile SQL before executing:
```bash
ktx sl query \
--connection-id warehouse \
--measure orders.total_revenue \
--dimension orders.created_date \
--format sql
```
Execute only when the task calls for live data:
```bash
ktx sl query \
--connection-id warehouse \
--measure orders.total_revenue \
--dimension orders.status \
--execute \
--max-rows 100
```
**Wiki:**
For complex calls, agents can write a JSON query object and pass it with
`--query-file`.
### Wiki context
```bash
# Search wiki pages
ktx wiki list --json
ktx wiki search "revenue recognition" --json --limit 10
```
## Setting Up Your Agent
Agents should search wiki context when a question depends on business
definitions, metric caveats, process rules, or terms that are not obvious from
schema names.
The fastest way to connect an agent is through the setup wizard:
### Context refresh
Agents can refresh context when the user asks them to:
```bash
ktx setup
ktx ingest warehouse --fast
ktx ingest --all
ktx ingest text docs/revenue-notes.md --connection-id warehouse
```
The agents step auto-detects installed tools and generates the right
configuration. For manual setup or per-tool details, see the
[Agent Clients](/docs/integrations/agent-clients) integration page.
Use `--deep` only when LLM and embedding setup is ready and the user expects an
AI-enriched refresh.
After configuration, the agent can immediately call KTX commands to list
sources, search wiki pages, and query your semantic layer.
## Good agent behavior
Agents should:
- Run `ktx status --json` before using KTX context.
- Use `ktx sl search` and `ktx wiki search` before writing SQL from memory.
- Inspect the relevant YAML or Markdown files after search returns candidates.
- Compile SQL with `ktx sl query --format sql` before executing.
- Use `--max-rows` whenever executing a live query.
- Validate edited semantic sources with `ktx sl validate`.
- Keep generated context changes reviewable in git.
Agents should not assume a background server, ORPC route, frontend app, or
external migration system exists. KTX is a local context layer with a CLI and
plain project files.
## Manual setup
Manual setup is useful for custom agents that can read project-local
instructions but are not yet supported as named targets.
1. Install the universal target:
```bash
ktx setup --agents --target universal
```
2. Configure the agent to read `.agents/skills/ktx/SKILL.md`.
3. Open the agent in the KTX project directory.
4. Ask it to run `ktx status --json` and summarize readiness.
For per-client notes, see [Agent Clients](/docs/integrations/agent-clients).
## Troubleshooting
| Symptom | Likely cause | Recovery |
|---------|--------------|----------|
| Agent says KTX is unavailable | Agent did not load the generated instruction file | Rerun `ktx setup --agents --target <target>` and restart the agent session |
| Agent command cannot find the project | Agent is running outside the KTX directory | Add `--project-dir <path>` or open the agent in the project root |
| Generated rules point at a missing CLI path | CLI was moved, rebuilt, or reinstalled | Rerun `ktx setup --agents` |
| Agent cannot find a metric | Context is missing or stale | Run `ktx sl search`, inspect source YAML, then refresh with `ktx ingest` if needed |
| Agent query returns too many rows | The command executed without a result cap | Require `--max-rows` for executed queries |

View file

@ -1,295 +1,341 @@
---
title: Writing Context
description: Write and refine semantic sources and wiki pages.
description: Edit semantic sources and wiki pages so agents use your business logic.
---
After building context through scanning and ingestion, you'll want to refine it - edit semantic sources to match your business logic, add wiki pages that capture tribal knowledge, and query your data through the semantic layer to verify everything works.
KTX context is meant to be edited. Ingest gives you a grounded first draft, then
you refine source YAML and wiki Markdown until agents can answer data questions
with the same definitions your team uses.
## Agent workflow summary
Use this guide when you are adding measures, fixing joins, documenting business
rules, or reviewing context changes made by an agent.
Agents should refine context in this order:
## Editing workflow
1. `ktx sl list --json` - discover available sources and connection ids.
2. `ktx sl search <query> --json` - find source candidates for a concept.
3. Edit the source YAML directly in `semantic-layer/<connection-id>/`.
4. `ktx sl validate <source> --connection-id <id>` - verify columns, joins, and table references.
5. `ktx sl query ... --format sql` - compile a representative query without executing it.
6. `ktx wiki search ...` - check business context captured by ingest or memory.
Use this order for most context changes:
## Semantic Sources
1. Discover existing context.
Semantic sources are YAML files that describe your tables, columns, measures, and joins. They're the core of the context layer - the structured definitions that agents use to generate correct SQL.
```bash
ktx sl list --json
ktx sl search "revenue" --json
ktx wiki search "revenue recognition" --json --limit 10
```
### Listing sources
2. Edit the smallest relevant files under `semantic-layer/<connection-id>/` or
`wiki/`.
3. Validate semantic source changes.
```bash
# List all sources across connections
ktx sl list
```bash
ktx sl validate orders --connection-id warehouse
```
# List sources for a specific connection
ktx sl list --connection-id my-postgres
4. Compile a representative query before executing it.
# Output as JSON
ktx sl list --json
```bash
ktx sl query \
--connection-id warehouse \
--measure orders.total_revenue \
--dimension orders.created_date \
--format sql
```
5. Search again using likely user wording to confirm the new context is
discoverable.
## Semantic sources
Semantic sources are YAML files that describe queryable entities. A source is
usually a table, but it can also point at a custom SQL expression. Sources
define the vocabulary agents use for measures, dimensions, segments, joins, and
grain-aware query planning.
Source files live at:
```text
semantic-layer/<connection-id>/<source-name>.yaml
```
### Searching sources
```bash
ktx sl search "revenue" --connection-id my-postgres --json
```
Search returns ranked source summaries. To inspect or edit a source, open the
YAML file under `semantic-layer/<connection-id>/`.
### The source schema
A semantic source defines a single queryable entity - usually a table or a SQL expression. Here's a fully annotated example:
### Minimal source
```yaml
name: orders
description: Customer orders with line-item totals
table: public.orders # or use `sql:` for a custom SQL expression
description: Customer orders with booked revenue.
table: public.orders
grain:
- order_id # columns that uniquely identify a row
- order_id
columns:
- name: order_id
type: string
description: Unique order identifier.
- name: order_date
type: time
role: time
description: Date the order was placed.
- name: total_amount
type: number
description: Booked order value in USD.
measures:
- name: total_revenue
expr: SUM(total_amount)
description: Sum of booked order value before refunds.
```
### Full source shape
```yaml
name: orders
description: Customer orders with line-item totals.
table: public.orders
grain:
- order_id
columns:
- name: order_id
type: string # string | number | time | boolean
description: Unique order identifier
type: string
description: Unique order identifier.
- name: order_date
type: time
role: time # marks this as the default time dimension
description: Date the order was placed
role: time
description: Date the order was placed.
- name: status
type: string
visibility: public # public (default) | internal | hidden
description: Current order status
visibility: public
description: Current order status.
- name: _etl_loaded_at
type: time
visibility: hidden # hidden columns are excluded from agent queries
description: Internal ETL timestamp
visibility: hidden
description: Internal load timestamp.
- name: total_amount
type: number
description: Order total in USD
description: Order total in USD.
measures:
- name: total_revenue
expr: SUM(total_amount)
description: Sum of all order values
description: Sum of all order values.
- name: order_count
expr: COUNT(DISTINCT order_id)
description: Number of distinct orders
description: Number of distinct orders.
- name: avg_order_value
expr: AVG(total_amount)
description: Average order value
description: Average booked order value.
- name: high_value_revenue
expr: SUM(total_amount)
filter: total_amount > 100
description: Revenue from orders over $100
description: Revenue from orders over $100.
segments:
- name: us_orders
expr: country = 'US'
description: Orders from US customers
- name: completed_orders
expr: status = 'completed'
description: Orders that completed fulfillment.
joins:
- to: customers
on: orders.customer_id = customers.customer_id
relationship: many_to_one # many_to_one | one_to_many | one_to_one
relationship: many_to_one
- to: order_items
on: orders.order_id = order_items.order_id
relationship: one_to_many
alias: items # optional alias for the joined source
alias: items
```
Key fields:
### Source fields
| Field | Required | Description |
|-------|----------|-------------|
| `name` | Yes | Source identifier (lowercase, underscores) |
| `table` or `sql` | Yes | Database table or custom SQL expression (exactly one) |
| `grain` | Yes | Columns that define row uniqueness |
| `columns` | No | Column definitions with type, role, visibility |
| `measures` | No | Aggregation expressions (SUM, COUNT, AVG, etc.) |
| `joins` | No | Relationships to other sources |
| `segments` | No | Named filter conditions |
| `inherits_columns_from` | No | Inherit column metadata from a manifest entry |
| `name` | Yes | Source identifier. Use lowercase words and underscores. |
| `table` or `sql` | Yes | Database table or custom SQL expression. Use exactly one. |
| `grain` | Yes | Columns that uniquely identify a row at the source grain. |
| `columns` | No | Column definitions with type, role, visibility, and descriptions. |
| `measures` | No | Aggregation expressions such as `SUM`, `COUNT`, and `AVG`. |
| `segments` | No | Named predicates agents can reuse. |
| `joins` | No | Relationships to other semantic sources. |
| `inherits_columns_from` | No | Inherit column metadata from a manifest entry. |
Source component fields:
### Component fields
| Component | Field | Required | Description |
|-----------|-------|----------|-------------|
| Column | `name` | Yes | Column identifier as used in SQL expressions |
| Column | `type` | Yes | Agent-facing type: `string`, `number`, `time`, or `boolean` |
| Column | `role` | No | Special role such as `time` for default time dimensions |
| Column | `visibility` | No | `public`, `internal`, or `hidden` |
| Column | `description` | Strongly recommended | Human-readable business meaning |
| Measure | `name` | Yes | Queryable metric name |
| Measure | `expr` | Yes | SQL aggregation expression at the source grain |
| Measure | `filter` | No | SQL predicate applied only to this measure |
| Measure | `description` | Strongly recommended | Definition agents can cite and compare |
| Segment | `name` | Yes | Reusable filter name |
| Segment | `expr` | Yes | SQL predicate for the segment |
| Join | `to` | Yes | Target semantic source name |
| Join | `on` | Yes | SQL join condition using source names or aliases |
| Join | `relationship` | Yes | `many_to_one`, `one_to_many`, or `one_to_one` |
| Join | `alias` | No | Query alias for repeated or clearer joins |
| Column | `name` | Yes | Column identifier used in SQL expressions. |
| Column | `type` | Yes | Agent-facing type: `string`, `number`, `time`, or `boolean`. |
| Column | `role` | No | Special role such as `time` for default time dimensions. |
| Column | `visibility` | No | `public`, `internal`, or `hidden`. |
| Column | `description` | Strongly recommended | Business meaning and usage notes. |
| Measure | `name` | Yes | Queryable metric name. |
| Measure | `expr` | Yes | SQL aggregation expression at the source grain. |
| Measure | `filter` | No | SQL predicate applied only to this measure. |
| Measure | `description` | Strongly recommended | Definition agents can cite and compare. |
| Segment | `name` | Yes | Reusable filter name. |
| Segment | `expr` | Yes | SQL predicate for the segment. |
| Join | `to` | Yes | Target semantic source name. |
| Join | `on` | Yes | SQL join condition using source names or aliases. |
| Join | `relationship` | Yes | `many_to_one`, `one_to_many`, or `one_to_one`. |
| Join | `alias` | No | Query alias for repeated or clearer joins. |
Column visibility controls what agents see:
### Visibility
| Visibility | Behavior |
|------------|----------|
| `public` | Included in agent queries and listings (default) |
| `internal` | Available for joins and measures but not shown to agents |
| `hidden` | Excluded entirely - useful for ETL columns |
| Visibility | Agent behavior |
|------------|----------------|
| `public` | Included in listings and available for agent queries. |
| `internal` | Available for joins and measures, but not highlighted to agents. |
| `hidden` | Excluded from agent-facing context. Use for ETL fields and sensitive internals. |
### Editing a source
## Measures
Edit source files directly. They live at
`semantic-layer/<connection-id>/<source-name>.yaml` in your project directory.
Good measures have precise names, SQL expressions at the correct grain, and
descriptions that say what is included and excluded.
### Validating sources
Validation checks a source definition against the actual database schema:
```bash
ktx sl validate orders --connection-id my-postgres
```yaml
measures:
- name: net_revenue
expr: SUM(total_amount - refunded_amount)
filter: status = 'completed'
description: Completed order revenue after refunds, excluding cancelled orders.
```
This catches mismatches - columns that don't exist in the table, type mismatches, invalid join targets - before an agent tries to use the source.
Prefer one canonical measure plus wiki synonyms over several nearly identical
measures. If your team uses multiple definitions, document the distinction in a
wiki page and link it with `sl_refs`.
### Querying
## Joins and grain
The semantic layer compiles your measures and dimensions into SQL, optionally executing it against the database:
`grain` and `relationship` prevent agents from producing double-counted SQL.
State the row grain even when it seems obvious.
```yaml
grain:
- order_id
joins:
- to: customers
on: orders.customer_id = customers.customer_id
relationship: many_to_one
```
Use `many_to_one` for dimensions such as customer, account, product, or plan.
Use `one_to_many` only when the target can fan out the source rows, such as
orders to order items.
## Validate and query
Validation checks source YAML against the live database schema:
```bash
ktx sl validate orders --connection-id warehouse
```
It catches missing columns, invalid join targets, and table-reference problems
before an agent relies on the source.
Compile a query to inspect generated SQL:
```bash
# Compile a query to SQL
ktx sl query \
--connection-id my-postgres \
--measure total_revenue \
--measure order_count \
--dimension "order_date" \
--filter "status = 'completed'" \
--order-by order_date:desc \
--connection-id warehouse \
--measure orders.total_revenue \
--dimension orders.order_date \
--filter "orders.status = 'completed'" \
--order-by orders.order_date:desc \
--limit 10 \
--format sql
```
This outputs the compiled SQL without executing it. To run the query:
Execute only when you need live rows:
```bash
# Execute and return results
ktx sl query \
--connection-id my-postgres \
--measure total_revenue \
--dimension "order_date" \
--connection-id warehouse \
--measure orders.total_revenue \
--dimension orders.status \
--execute \
--max-rows 100
```
Query flags:
## Wiki pages
| Flag | Description |
|------|-------------|
| `--measure <name>` | Measure to query (repeatable, at least one required) |
| `--dimension <name>` | Dimension to group by (repeatable) |
| `--filter <expr>` | Filter expression (repeatable) |
| `--segment <name>` | Named segment to apply (repeatable) |
| `--order-by <field[:dir]>` | Sort field, optionally with `:asc` or `:desc` (repeatable) |
| `--limit <n>` | Maximum rows in the compiled query |
| `--format <mode>` | Output format: `json` (default) or `sql` |
| `--execute` | Execute the query against the database |
| `--max-rows <n>` | Maximum rows to return when executing |
| `--include-empty` | Include empty/null rows in results |
Wiki pages capture business context that does not belong in a single source
file: metric policies, dashboard caveats, company vocabulary, data freshness,
known issues, and source-of-truth notes.
The query planner is grain-aware - it understands the cardinality of joins and avoids chasm traps (double-counting caused by many-to-many fan-outs). When you query measures that span multiple sources, KTX generates sub-queries at the correct grain before joining.
Wiki files live under:
### Workflow: edit and validate a source
1. Open `semantic-layer/my-postgres/orders.yaml`.
2. Edit the file to add columns, measures, joins, or descriptions.
3. `ktx sl validate orders --connection-id my-postgres` - check the definition against the live schema.
4. `ktx sl query --connection-id my-postgres --measure total_revenue --dimension order_date --format sql` - compile a representative query.
If validation fails, fix the YAML before asking an agent to use the source. Common validation failures are missing columns, invalid join targets, and measure expressions that reference fields outside the source.
## Wiki Pages
Wiki pages are Markdown files that capture business context - definitions, rules, gotchas, and anything an agent needs to understand beyond what the schema tells it.
### What they are
When an agent asks "what counts as an active user?" or "why do revenue numbers differ between the dashboard and the SQL query?", the answer isn't in the schema. It's tribal knowledge that lives in Slack threads, Notion pages, or someone's head. Wiki pages make that context searchable and available to agents.
### Organization
Wiki pages are organized by scope:
```
```text
wiki/
├── global/ # Cross-cutting definitions
│ ├── order-status-definitions.md
│ ├── revenue-recognition-rules.md
│ └── data-freshness-sla.md
└── user/
└── local/ # User-scoped context
├── schema-conventions.md
└── known-data-issues.md
global/
user/<user-id>/
```
- **Global pages** apply across all connections - business definitions, metric standards, company terminology.
- **User-scoped pages** are private to a user ID - personal notes, local gotchas, or context you do not want shared globally.
Use global pages for shared business rules. Use user-scoped pages for local
notes, personal conventions, or context that should not be shared broadly.
### Editing pages
### Wiki page example
Create and edit wiki pages directly as Markdown files in the `wiki/`
directory. Ingest and memory capture also create these pages automatically.
```markdown
---
summary: Revenue recognition rules for finance reporting.
tags: [revenue, finance, reporting]
sl_refs: [orders]
external_refs:
- type: notion
id: finance-revenue-policy
---
Wiki page fields:
## Recognized Revenue
Recognized revenue includes completed orders after refunds. It excludes
cancelled orders, test orders, implementation fees, and tax.
Finance reporting uses order completion date, not invoice creation date.
```
Useful frontmatter:
| Field | Required | Description |
|-------|----------|-------------|
| Key | Yes | Stable page identifier used as the Markdown filename |
| Summary | Yes | Short text shown in search results |
| Content | Yes | Full Markdown business context |
| Scope | No | `global` for shared context or `user` for user-scoped notes |
| Tags | No | Search and organization labels |
| External refs | No | Links or identifiers for source-of-truth systems |
| Semantic-layer refs | No | Source names the page explains or constrains |
| `summary` | Yes | Short text shown in search results. |
| `tags` | No | Business terms and synonyms that improve search. |
| `sl_refs` | No | Semantic source names the page explains or constrains. |
| `external_refs` | No | Source-of-truth system links or ids. |
### Listing pages
## Add searchable business context
1. Search first.
```bash
ktx wiki search "active customer definition" --json --limit 10
```
2. If no page covers the rule, create or edit a Markdown file under
`wiki/global/`.
3. Write a compact `summary` with the wording users are likely to ask.
4. Add tags for synonyms and related business areas.
5. Add `sl_refs` for relevant semantic sources.
6. Search again with a user-like phrase.
## Review context changes
Before accepting agent-written context:
```bash
ktx wiki list
git diff -- semantic-layer wiki
ktx sl validate orders --connection-id warehouse
ktx sl search "revenue" --json
ktx wiki search "revenue recognition" --json --limit 10
```
### Searching
```bash
ktx wiki search "revenue recognition"
```
Search uses both full-text matching and semantic similarity - it finds relevant pages even when the exact terms don't match. Agents call this automatically when they need business context to answer a question.
### Workflow: add searchable business context
1. Search first: `ktx wiki search "order status definitions"`.
2. If no page already covers the rule, create or edit a Markdown file under `wiki/global/`.
3. Include concise frontmatter; agents see the summary before loading full content.
4. Add `tags` values for the business area and `sl_refs` values for related semantic sources.
5. Search again with the user's likely wording to confirm the page is discoverable.
Check that definitions are specific, hidden columns stay hidden, joins have
explicit relationships, and measures compile into the expected SQL.
## Common errors
| Error or symptom | Likely cause | Recovery |
|------------------|--------------|----------|
| `ktx sl validate` reports a missing column | YAML references a column that is absent from the scanned table | Run a fresh scan or update the YAML to match the warehouse schema |
| Query compilation double-counts a measure | Join relationship or grain is missing or wrong | Add `grain` and explicit `relationship` values, then validate and recompile |
| Agent cannot find a metric | Measure name or description does not match business terminology | Add a measure description and a wiki page with common synonyms |
| Wiki search misses a page | Summary and tags do not include likely user wording | Rewrite the summary and add relevant tags, then search again |
| Semantic-layer changes are hard to review | The YAML edit is too large or unfocused | Split the change into smaller source-file edits, then review the git diff |
| Symptom | Likely cause | Recovery |
|---------|--------------|----------|
| `ktx sl validate` reports a missing column | YAML references a column absent from the scanned table | Refresh database context or update the YAML |
| Query compilation double-counts a measure | `grain` or join `relationship` is missing or wrong | Add explicit grain and relationship values, then recompile |
| Agent cannot find a metric | Measure name and description do not match business terminology | Add a clearer measure description and a wiki page with synonyms |
| Wiki search misses a page | Summary, tags, or content do not match user wording | Rewrite the summary and add likely synonyms |
| Context diff is hard to review | One edit changed too many concepts | Split the change into focused source and wiki edits |

View file

@ -7,7 +7,46 @@ KTX integrates with coding agents through CLI skills and command files. These
files teach agents to call public `ktx` commands directly from the terminal for
semantic-layer context and wiki knowledge.
Run `ktx setup` and select your agent targets, or configure manually using the snippets below.
Run `ktx setup` and select your agent targets, or configure manually using the
snippets below. Setup pins generated skill files to the KTX CLI path that
created them, so agents do not need `ktx` on `PATH`.
## Install with setup
```bash
ktx setup --agents
```
Use `--target` for one target:
```bash
ktx setup --agents --target codex
```
Use `--global` only with `claude-code` or `codex`:
```bash
ktx setup --agents --target claude-code --global
ktx setup --agents --target codex --global
```
KTX records installed files in `.ktx/agents/install-manifest.json`. That
manifest lets status checks report agent readiness and lets future cleanup
remove only files KTX installed.
## Generated files
| Target | Project-scoped files | Global files |
|--------|----------------------|--------------|
| Claude Code | `.claude/skills/ktx/SKILL.md`, `.claude/rules/ktx.md` | `~/.claude/skills/ktx/SKILL.md`, `~/.claude/rules/ktx.md` |
| Codex | `.agents/skills/ktx/SKILL.md`, `.codex/instructions/ktx.md` | `$CODEX_HOME/skills/ktx/SKILL.md`, `$CODEX_HOME/instructions/ktx.md` |
| Cursor | `.cursor/rules/ktx.mdc` | Not supported |
| OpenCode | `.opencode/commands/ktx.md` | Not supported |
| Universal `.agents` | `.agents/skills/ktx/SKILL.md` | Not supported |
Skill files list pinned `ktx` commands. Rule files tell the agent when KTX is
appropriate, such as data schemas, metrics, dimensions, database structure, and
SQL questions.
## Claude Code
@ -15,11 +54,12 @@ Run `ktx setup` and select your agent targets, or configure manually using the s
During setup, select **Claude Code** from the agent targets. KTX writes:
| Mode | File |
|------|------|
| CLI skills | `.claude/skills/ktx/SKILL.md` |
| Scope | Files |
|-------|-------|
| Project | `.claude/skills/ktx/SKILL.md`, `.claude/rules/ktx.md` |
| Global | `~/.claude/skills/ktx/SKILL.md`, `~/.claude/rules/ktx.md` |
Both project-scoped and global installations are supported. Global installs write to `~/.claude/skills/ktx/SKILL.md`.
Both project-scoped and global installations are supported.
### Manual CLI skills configuration
@ -42,6 +82,7 @@ Available commands:
### Workflow tips
- Claude Code discovers skills automatically from `.claude/skills/`.
- Claude rules in `.claude/rules/` tell Claude when KTX should be used.
- Global installation makes KTX available in all projects without per-project setup.
- Keep generated skills committed only when your team wants project-local agent instructions in git.
@ -76,11 +117,13 @@ Create `.cursor/rules/ktx.mdc` with the same content structure as the Claude Cod
During setup, select **Codex** from the agent targets. KTX writes:
| Mode | File |
|------|------|
| CLI skills | `.agents/skills/ktx/SKILL.md` |
| Scope | Files |
|-------|-------|
| Project | `.agents/skills/ktx/SKILL.md`, `.codex/instructions/ktx.md` |
| Global | `$CODEX_HOME/skills/ktx/SKILL.md`, `$CODEX_HOME/instructions/ktx.md` |
Both project-scoped and global installations are supported. Global installs write to `$CODEX_HOME/skills/ktx/SKILL.md` (defaults to `~/.codex/skills/ktx/SKILL.md`).
Both project-scoped and global installations are supported. `CODEX_HOME`
defaults to `~/.codex`.
### Manual CLI skills configuration
@ -90,6 +133,7 @@ Create `.agents/skills/ktx/SKILL.md` with the same content structure as Claude C
- Set `CODEX_HOME` to customize the global installation directory.
- Codex shares the `.agents/` directory structure with the universal format.
- Codex instructions in `.codex/instructions/` tell Codex when KTX should be used.
- Global installation makes KTX available across all Codex sessions.
---
@ -143,4 +187,5 @@ All supported agent clients call the same KTX CLI commands:
|---|---|---|---|---|
| CLI skills | Yes | Yes (.mdc) | Yes | Yes |
| Global install | Yes | No | Yes | No |
| Config location | `.claude/skills/ktx/SKILL.md` | `.cursor/rules/ktx.mdc` | `.agents/skills/ktx/SKILL.md` | `.opencode/commands/ktx.md` |
| Rule or instruction file | `.claude/rules/ktx.md` | `.cursor/rules/ktx.mdc` | `.codex/instructions/ktx.md` | `.opencode/commands/ktx.md` |
| Skill file | `.claude/skills/ktx/SKILL.md` | Not separate | `.agents/skills/ktx/SKILL.md` | Not separate |

View file

@ -0,0 +1,70 @@
---
title: Integrations
description: Connect KTX to warehouses, analytics tools, and coding agents.
---
KTX integrations bring trusted context into an analytics project and make that
context available to coding agents through the CLI. Start with `ktx setup` when
you want the guided flow, then use the integration reference pages for exact
configuration fields, generated files, and manual setup.
## Integration types
| Type | What it connects | Start here |
|------|------------------|------------|
| Primary sources | Warehouses and databases that KTX scans for schemas, constraints, row counts, and optional query history | [Primary Sources](/docs/integrations/primary-sources) |
| Context sources | Existing analytics and knowledge tools such as dbt, MetricFlow, LookML, Metabase, Looker, and Notion | [Context Sources](/docs/integrations/context-sources) |
| Agent clients | Claude Code, Codex, Cursor, OpenCode, and universal `.agents` consumers | [Agent Clients](/docs/integrations/agent-clients) |
## Recommended setup flow
Use this order for a new project:
1. Run `ktx setup` from the analytics project directory.
2. Configure an LLM backend and embeddings so KTX can enrich and search context.
3. Add at least one primary source connection.
4. Add optional context sources that describe the same warehouse or business domain.
5. Build context during setup, or run `ktx ingest <connectionId>` later.
6. Install agent integration with `ktx setup --agents` when the context is ready.
For repeatable setup, pass `--project-dir`, `--no-input`, and the relevant
automation flags documented in [`ktx setup`](/docs/cli-reference/ktx-setup).
## What setup writes
| Path | Purpose |
|------|---------|
| `ktx.yaml` | Main project configuration for providers, embeddings, connections, source mappings, query history, and setup state |
| `.ktx/secrets/*` | Local file-backed secrets when you choose file references during setup |
| `semantic-layer/<connection-id>/` | YAML semantic sources generated by database and source ingestion |
| `wiki/` | Markdown business context, definitions, and ingested knowledge |
| `.ktx/agents/install-manifest.json` | Manifest of agent integration files installed by `ktx setup --agents` |
| Agent client files | Skills, rules, or commands that teach agents when and how to call KTX |
## Common commands
```bash
# Start or resume the guided flow
ktx setup
# Add or refresh every configured integration
ktx ingest --all
# Refresh one configured warehouse, source, or knowledge integration
ktx ingest warehouse
# Install one project-scoped agent target
ktx setup --agents --target codex
# Check whether integrations are ready
ktx status
```
## Choosing docs
Read [Primary Sources](/docs/integrations/primary-sources) when you need
database driver fields, authentication formats, query history support, or
warehouse-specific notes. Read [Context Sources](/docs/integrations/context-sources)
when you need source adapter fields, repository authentication, BI tool mapping,
or Notion crawl options. Read [Agent Clients](/docs/integrations/agent-clients)
when you need generated file locations or manual agent configuration.

View file

@ -1,5 +1,5 @@
{
"title": "Integrations",
"defaultOpen": true,
"pages": ["primary-sources", "context-sources", "agent-clients"]
"pages": ["index", "primary-sources", "context-sources", "agent-clients"]
}

View file

@ -1,6 +1,6 @@
/// <reference types="next" />
/// <reference types="next/image-types/global" />
import "./.next/types/routes.d.ts";
import "./.next/dev/types/routes.d.ts";
// NOTE: This file should not be edited
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.

View file

@ -0,0 +1,11 @@
<svg viewBox="0 0 200 200" xmlns="http://www.w3.org/2000/svg" role="img" aria-label="ktx mascot">
<g fill="none" stroke="#F5F1EA" stroke-width="16" stroke-linecap="round">
<path d="M 62 110 Q 32 130 44 152"/>
<path d="M 88 116 Q 80 152 70 174"/>
<path d="M 112 116 Q 120 152 130 174"/>
</g>
<path d="M 134 108 C 162 116, 172 96, 162 78 C 154 64, 168 56, 178 60" fill="none" stroke="#FF8A4C" stroke-width="16" stroke-linecap="round"/>
<path d="M 48 102 C 48 56, 78 30, 100 30 C 122 30, 152 56, 152 102 C 152 116, 132 120, 100 120 C 68 120, 48 116, 48 102 Z" fill="#F5F1EA"/>
<path d="M 80 84 Q 86 77 92 84" fill="none" stroke="#1B3139" stroke-width="3.5" stroke-linecap="round"/>
<path d="M 108 84 Q 114 77 120 84" fill="none" stroke="#1B3139" stroke-width="3.5" stroke-linecap="round"/>
</svg>

After

Width:  |  Height:  |  Size: 818 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.9 KiB

View file

@ -0,0 +1,11 @@
<svg viewBox="0 0 200 200" xmlns="http://www.w3.org/2000/svg" role="img" aria-label="ktx mascot">
<g fill="none" stroke="#1B3139" stroke-width="16" stroke-linecap="round">
<path d="M 62 110 Q 32 130 44 152"/>
<path d="M 88 116 Q 80 152 70 174"/>
<path d="M 112 116 Q 120 152 130 174"/>
</g>
<path d="M 134 108 C 162 116, 172 96, 162 78 C 154 64, 168 56, 178 60" fill="none" stroke="#FF8A4C" stroke-width="16" stroke-linecap="round"/>
<path d="M 48 102 C 48 56, 78 30, 100 30 C 122 30, 152 56, 152 102 C 152 116, 132 120, 100 120 C 68 120, 48 116, 48 102 Z" fill="#1B3139"/>
<path d="M 80 84 Q 86 77 92 84" fill="none" stroke="#F5F1EA" stroke-width="3.5" stroke-linecap="round"/>
<path d="M 108 84 Q 114 77 120 84" fill="none" stroke="#F5F1EA" stroke-width="3.5" stroke-linecap="round"/>
</svg>

After

Width:  |  Height:  |  Size: 818 B

View file

@ -0,0 +1,808 @@
# Connection Driver Discriminated Union Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Replace the loose `connectionSchema` in `packages/context/src/project/config.ts` with a Zod 4 discriminated union keyed on `driver`, so that every driver's documented connection fields — including the `mappings` block — appear in the JSON schema emitted by `ktx dev schema`.
**Architecture:** Add a new module `packages/context/src/project/driver-schemas.ts` that defines one `z.looseObject({ driver: z.literal('x'), ... })` per supported driver and combines them with `z.discriminatedUnion('driver', [...])`. Reuse the existing Metabase/Looker/LookML mapping shapes from `mappings-yaml-schema.ts` by exporting them. Wire the union into `config.ts`. Each per-driver shape stays `looseObject` so that existing YAML configs with extra fields keep parsing.
**Tech Stack:** TypeScript (Node 22+, ESM, `NodeNext`), Zod 4 (`^4.4.3`), Vitest, pnpm workspace.
---
## File Structure
**Create:**
- `packages/context/src/project/driver-schemas.ts` — per-driver Zod schemas + the discriminated union and exported types.
- `packages/context/src/project/driver-schemas.test.ts` — unit tests for each driver schema and the union.
**Modify:**
- `packages/context/src/project/mappings-yaml-schema.ts` — export the three mapping shapes (`metabaseMappingsSchema`, `lookerMappingsSchema`, `lookmlMappingsSchema`) with `.describe()` annotations and a small description on each field so they surface meaningfully in JSON Schema.
- `packages/context/src/project/config.ts:209-214` — replace `connectionSchema` with the discriminated union imported from `driver-schemas.ts`. Update `KtxProjectConnectionConfig` (line `272`) to be `z.infer<typeof connectionSchema>` — still works because `connectionSchema` is the union name we keep.
- `packages/context/src/project/index.ts` — re-export `KtxConnectionConfig` per-driver type aliases if useful (optional; only if tests need them).
- `packages/context/src/project/config.test.ts` — add a test that the JSON schema now describes `mappings` for metabase/looker/lookml.
**No changes needed:**
- `packages/context/src/project/mappings-yaml-schema.ts` parsing helpers (`parseMetabaseMappingBootstrap`, etc.) keep working because `KtxProjectConnectionConfig` still has loose-object semantics per driver.
- Doc files in `docs-site/` already show the `mappings` blocks correctly.
---
## Drivers In Scope
The discriminated union enumerates the drivers actually used in code, fixtures, and docs (no `fake`/test-only driver — none exist in fixtures, verified via `grep "driver:\s*fake"`).
Warehouse drivers (read `driver`, `url`; nothing else schema-modeled — kept `looseObject` so warehouse-specific overrides like `historicSql`/`context.queryHistory` pass through):
- `postgres`, `postgresql` (separate literals; KTX normalizes `postgresql` → `postgres` at runtime, but ktx.yaml accepts both)
- `mysql`
- `snowflake`
- `bigquery`
- `sqlite`
- `clickhouse`
- `sqlserver`
Context-source drivers (model documented fields):
- `metabase` — `api_url`, `api_key`, `api_key_ref`, `network_proxy`/`networkProxy`, `mappings` (metabaseMappingsSchema).
- `looker` — `base_url`, `client_id`, `client_secret`, `client_secret_ref`, `mappings` (lookerMappingsSchema).
- `lookml` — `repoUrl` (camelCase intentional — matches code at `setup-sources.ts:1466`), `branch`, `path`, `auth_token_ref`, `mappings` (lookmlMappingsSchema).
- `notion` — `auth_token`, `auth_token_ref`, `crawl_mode` (`'selected_roots' | 'all_accessible'`), `root_page_ids`, `root_database_ids`, `root_data_source_ids`, `max_pages_per_run`, `max_knowledge_creates_per_run`, `max_knowledge_updates_per_run`.
- `dbt` — `source_dir`, `repo_url`, `branch`, `path`, `auth_token_ref`, `profiles_path`, `target`, `project_name`.
- `metricflow` — `metricflow` (nested object: `repoUrl`, `branch`, `path`, `auth_token_ref`).
Why not strict-object: existing warehouse connections may carry `historicSql` / `context.queryHistory` blocks and other driver-tunable fields not modeled here. `looseObject` preserves the current pass-through behavior while still surfacing the documented fields in JSON Schema.
---
## Task 1: Export and describe mapping shapes
Make the three existing mapping schemas reusable and documented.
**Files:**
- Modify: `packages/context/src/project/mappings-yaml-schema.ts:4-31`
- Test: `packages/context/src/project/mappings-yaml-schema.test.ts` (no behavior change — existing tests must still pass)
- [ ] **Step 1: Add a failing test that imports the new exports**
In `packages/context/src/project/mappings-yaml-schema.test.ts`, add the import at the top of the file and the new test case inside the existing `describe` block:
```typescript
import {
metabaseMappingsSchema,
lookerMappingsSchema,
lookmlMappingsSchema,
} from './mappings-yaml-schema.js';
// ...inside describe(...)
it('exports mapping shapes that parse documented examples', () => {
expect(metabaseMappingsSchema.parse({ databaseMappings: { '1': 'wh' } })).toMatchObject({
databaseMappings: { '1': 'wh' },
syncMode: 'ALL',
});
expect(lookerMappingsSchema.parse({ connectionMappings: { x: 'wh' } })).toEqual({
connectionMappings: { x: 'wh' },
});
expect(lookmlMappingsSchema.parse({ expectedLookerConnectionName: 'x' })).toEqual({
expectedLookerConnectionName: 'x',
});
});
```
- [ ] **Step 2: Run test to verify it fails**
Run: `pnpm --filter @ktx/context exec vitest run src/project/mappings-yaml-schema.test.ts`
Expected: FAIL with `metabaseMappingsSchema is not exported` or equivalent module-resolution error.
- [ ] **Step 3: Add `export` and `.describe()` to the three schemas**
In `packages/context/src/project/mappings-yaml-schema.ts`, change the three internal `const` declarations:
```typescript
export const metabaseMappingsSchema = z
.object({
databaseMappings: z
.record(z.string(), stringTargetSchema)
.default({})
.describe('Map of Metabase database ID (positive integer string) to KTX connection ID. Use null to explicitly unmap.'),
syncEnabled: z
.record(z.string(), z.boolean())
.default({})
.describe('Per-Metabase-database sync toggle, keyed by Metabase database ID string.'),
syncMode: metabaseSyncModeSchema
.default('ALL')
.describe('Sync scope: ALL ingests every mapped DB; ONLY restricts to syncEnabled=true; EXCEPT excludes syncEnabled=true.'),
selections: metabaseSelectionsSchema
.default({ collections: [], items: [] })
.describe('Optional Metabase collection and item IDs to scope ingest.'),
defaultTagNames: z
.array(z.string().min(1))
.default([])
.describe('Default tag names applied to ingested Metabase artifacts.'),
})
.describe('Metabase database-to-warehouse mapping and sync configuration.');
export const lookerMappingsSchema = z
.object({
connectionMappings: z
.record(z.string().min(1), stringTargetSchema)
.default({})
.describe('Map of Looker connection name to KTX connection ID. Use null to explicitly unmap.'),
})
.describe('Looker connection-to-warehouse mapping configuration.');
export const lookmlMappingsSchema = z
.object({
expectedLookerConnectionName: z
.string()
.min(1)
.nullable()
.default(null)
.describe('Looker connection name that LookML models must declare; mismatches block sl_write_source at ingest time.'),
})
.describe('LookML connection-name expectation for ingest gating.');
```
Leave `metabaseSyncModeSchema`, `metabaseSelectionsSchema`, `stringTargetSchema`, and `positiveIntegerValueSchema` private (no need to export). Leave all parsing helpers (`parseMetabaseMappingBootstrap` etc.) unchanged — they keep working because `.describe()` does not change runtime behavior.
- [ ] **Step 4: Run test to verify it passes and existing tests still pass**
Run: `pnpm --filter @ktx/context exec vitest run src/project/mappings-yaml-schema.test.ts`
Expected: PASS for all tests including the new one.
- [ ] **Step 5: Type-check the package**
Run: `pnpm --filter @ktx/context run type-check`
Expected: PASS.
- [ ] **Step 6: Commit**
```bash
git add packages/context/src/project/mappings-yaml-schema.ts packages/context/src/project/mappings-yaml-schema.test.ts
git commit -m "refactor(context): export and describe mapping shape schemas"
```
---
## Task 2: Create the driver-schemas module — warehouse drivers
Add the new module with the eight warehouse driver schemas first. Smaller surface, easier to validate.
**Files:**
- Create: `packages/context/src/project/driver-schemas.ts`
- Test: `packages/context/src/project/driver-schemas.test.ts`
- [ ] **Step 1: Write failing tests for warehouse driver schemas**
Create `packages/context/src/project/driver-schemas.test.ts`:
```typescript
import { describe, expect, it } from 'vitest';
import { connectionConfigSchema } from './driver-schemas.js';
describe('connectionConfigSchema (driver discriminated union)', () => {
it.each([
['postgres', 'postgres://user:pass@host:5432/db'], // pragma: allowlist secret
['postgresql', 'postgresql://user:pass@host:5432/db'], // pragma: allowlist secret
['mysql', 'mysql://user:pass@host:3306/db'], // pragma: allowlist secret
['snowflake', 'snowflake://account/db'],
['bigquery', 'bigquery://project/dataset'],
['sqlite', 'sqlite:///tmp/db.sqlite'],
['clickhouse', 'clickhouse://host:8123/db'],
['sqlserver', 'sqlserver://host:1433;database=db'],
])('parses %s warehouse connection', (driver, url) => {
expect(connectionConfigSchema.parse({ driver, url })).toMatchObject({ driver, url });
});
it('preserves unknown warehouse fields via looseObject passthrough', () => {
const parsed = connectionConfigSchema.parse({
driver: 'postgres',
url: 'postgres://x',
historicSql: { enabled: true },
context: { queryHistory: { enabled: false } },
});
expect(parsed).toMatchObject({
driver: 'postgres',
historicSql: { enabled: true },
context: { queryHistory: { enabled: false } },
});
});
it('rejects an unknown driver', () => {
expect(() => connectionConfigSchema.parse({ driver: 'nope', url: 'x' })).toThrow();
});
});
```
- [ ] **Step 2: Run tests to verify they fail**
Run: `pnpm --filter @ktx/context exec vitest run src/project/driver-schemas.test.ts`
Expected: FAIL — `driver-schemas.js` not found.
- [ ] **Step 3: Create `driver-schemas.ts` with warehouse drivers only**
Create `packages/context/src/project/driver-schemas.ts`:
```typescript
import * as z from 'zod';
const warehouseDrivers = [
'postgres',
'postgresql',
'mysql',
'snowflake',
'bigquery',
'sqlite',
'clickhouse',
'sqlserver',
] as const;
/**
 * Builds the connection schema for one warehouse driver.
 *
 * The schema is a loose object: `driver` must equal the given literal, `url` is an
 * optional non-empty string, and any additional keys are kept instead of rejected.
 */
function warehouseConnectionSchema(driver: (typeof warehouseDrivers)[number]) {
  const urlField = z
    .string()
    .min(1)
    .optional()
    .describe('Warehouse connection URL or DSN; may contain environment-variable references like env:DATABASE_URL.');
  const shape = { driver: z.literal(driver), url: urlField };
  return z
    .looseObject(shape)
    .describe(`${driver} warehouse connection. Additional driver-tunable fields (e.g. historicSql, context.queryHistory) are accepted and passed through.`);
}
export const connectionConfigSchema = z.discriminatedUnion(
'driver',
warehouseDrivers.map(warehouseConnectionSchema),
);
export type KtxConnectionConfig = z.infer<typeof connectionConfigSchema>;
```
- [ ] **Step 4: Run tests to verify they pass**
Run: `pnpm --filter @ktx/context exec vitest run src/project/driver-schemas.test.ts`
Expected: PASS for all eight warehouse drivers + passthrough + unknown-driver rejection.
- [ ] **Step 5: Type-check**
Run: `pnpm --filter @ktx/context run type-check`
Expected: PASS.
- [ ] **Step 6: Commit**
```bash
git add packages/context/src/project/driver-schemas.ts packages/context/src/project/driver-schemas.test.ts
git commit -m "feat(context): add driver-schemas module with warehouse drivers"
```
---
## Task 3: Add Metabase, Looker, LookML driver schemas (the mapping-bearing ones)
These are the most important drivers — they're why we're doing this refactor.
**Files:**
- Modify: `packages/context/src/project/driver-schemas.ts`
- Modify: `packages/context/src/project/driver-schemas.test.ts`
- [ ] **Step 1: Write failing tests**
Append to `packages/context/src/project/driver-schemas.test.ts`:
```typescript
describe('connectionConfigSchema — context source drivers with mappings', () => {
it('parses a metabase connection with mappings', () => {
const parsed = connectionConfigSchema.parse({
driver: 'metabase',
api_url: 'https://metabase.example.com',
api_key_ref: 'env:METABASE_API_KEY', // pragma: allowlist secret
mappings: {
databaseMappings: { '3': 'prod-warehouse' },
syncEnabled: { '3': true },
syncMode: 'ONLY',
},
});
expect(parsed).toMatchObject({
driver: 'metabase',
api_url: 'https://metabase.example.com',
mappings: {
databaseMappings: { '3': 'prod-warehouse' },
syncMode: 'ONLY',
},
});
});
it('parses a looker connection with connectionMappings', () => {
const parsed = connectionConfigSchema.parse({
driver: 'looker',
base_url: 'https://looker.example.com',
client_id: 'abc',
client_secret_ref: 'env:LOOKER_CLIENT_SECRET', // pragma: allowlist secret
mappings: { connectionMappings: { bigquery_prod: 'wh' } },
});
expect(parsed.mappings).toEqual({ connectionMappings: { bigquery_prod: 'wh' } });
});
it('parses a lookml connection with expectedLookerConnectionName', () => {
const parsed = connectionConfigSchema.parse({
driver: 'lookml',
repoUrl: 'https://github.com/acme/looker.git',
branch: 'main',
mappings: { expectedLookerConnectionName: 'bigquery_prod' },
});
expect(parsed.mappings).toEqual({ expectedLookerConnectionName: 'bigquery_prod' });
});
it('rejects metabase mapping with non-integer database key', () => {
expect(() =>
connectionConfigSchema.parse({
driver: 'metabase',
api_url: 'https://x',
mappings: { databaseMappings: { 'abc': 'wh' } },
}),
).toThrow();
});
});
```
- [ ] **Step 2: Run tests to verify they fail**
Run: `pnpm --filter @ktx/context exec vitest run src/project/driver-schemas.test.ts`
Expected: FAIL — `driver: 'metabase'` is not in the discriminated union.
- [ ] **Step 3: Extend `driver-schemas.ts` with metabase/looker/lookml schemas**
Edit `packages/context/src/project/driver-schemas.ts` — add imports and the three new schemas, and include them in the union:
```typescript
import * as z from 'zod';
import {
lookerMappingsSchema,
lookmlMappingsSchema,
metabaseMappingsSchema,
} from './mappings-yaml-schema.js';
// ... (warehouseDrivers + warehouseConnectionSchema stay as-is) ...
const positiveIntKeyMessage = (field: string) =>
`${field} keys must be positive-integer strings (e.g. "1", "42")`;
const positiveIntKeyRegex = /^[1-9]\d*$/;
// Metabase mappings plus a key check: every key of `databaseMappings` and `syncEnabled`
// must match `positiveIntKeyRegex`; each offending key is reported as its own issue,
// with `databaseMappings` issues emitted before `syncEnabled` issues.
const metabaseMappingsStrictSchema = metabaseMappingsSchema.superRefine((mappings, ctx) => {
  for (const field of ['databaseMappings', 'syncEnabled'] as const) {
    const invalidKeys = Object.keys(mappings[field] ?? {}).filter((key) => !positiveIntKeyRegex.test(key));
    for (const key of invalidKeys) {
      ctx.addIssue({ code: 'custom', path: [field, key], message: positiveIntKeyMessage(field) });
    }
  }
});
const metabaseConnectionSchema = z
.looseObject({
driver: z.literal('metabase'),
api_url: z.string().url().describe('Metabase instance API URL (e.g. https://metabase.example.com).'),
api_key: z.string().min(1).optional().describe('Literal Metabase API key. Prefer api_key_ref for safety.'),
api_key_ref: z
.string()
.min(1)
.optional()
.describe('Reference to Metabase API key (e.g. env:METABASE_API_KEY or file:/path).'),
network_proxy: z
.looseObject({})
.optional()
.describe('Optional network proxy configuration (snake_case form).'),
networkProxy: z
.looseObject({})
.optional()
.describe('Optional network proxy configuration (camelCase form).'),
mappings: metabaseMappingsStrictSchema.optional().describe('Metabase database-to-warehouse mappings and sync configuration.'),
})
.describe('Metabase context-source connection.');
const lookerConnectionSchema = z
.looseObject({
driver: z.literal('looker'),
base_url: z.string().url().describe('Looker instance base URL (e.g. https://looker.example.com).'),
client_id: z.string().min(1).describe('Looker OAuth client ID.'),
client_secret: z.string().min(1).optional().describe('Literal Looker OAuth client secret. Prefer client_secret_ref.'),
client_secret_ref: z
.string()
.min(1)
.optional()
.describe('Reference to Looker OAuth client secret (e.g. env:LOOKER_CLIENT_SECRET).'),
mappings: lookerMappingsSchema.optional().describe('Looker connection-name to KTX warehouse mappings.'),
})
.describe('Looker context-source connection.');
const lookmlConnectionSchema = z
.looseObject({
driver: z.literal('lookml'),
repoUrl: z
.string()
.min(1)
.describe('Git URL of the LookML project (https, ssh, or file:). Field is camelCase by convention.'),
branch: z.string().min(1).optional().describe('Git branch (default "main" downstream).'),
path: z.string().optional().describe('Subdirectory within the repo when the LookML project lives in a monorepo.'),
auth_token_ref: z.string().min(1).optional().describe('Reference to Git auth token for private repos (e.g. env:GITHUB_TOKEN).'),
mappings: lookmlMappingsSchema.optional().describe('LookML expected-connection mapping for ingest gating.'),
})
.describe('LookML context-source connection.');
export const connectionConfigSchema = z.discriminatedUnion(
'driver',
[
...warehouseDrivers.map(warehouseConnectionSchema),
metabaseConnectionSchema,
lookerConnectionSchema,
lookmlConnectionSchema,
],
);
```
Important: the existing `parseMetabaseMappingBootstrap` in `mappings-yaml-schema.ts` already enforces positive-integer keys via `assertPositiveIntegerKeys`. Adding `metabaseMappingsStrictSchema` here gives the same guarantee at the top-level config parse, so a malformed ktx.yaml fails fast at `parseKtxProjectConfig` time rather than at ingest time.
- [ ] **Step 4: Run tests to verify they pass**
Run: `pnpm --filter @ktx/context exec vitest run src/project/driver-schemas.test.ts`
Expected: PASS.
- [ ] **Step 5: Type-check**
Run: `pnpm --filter @ktx/context run type-check`
Expected: PASS.
- [ ] **Step 6: Commit**
```bash
git add packages/context/src/project/driver-schemas.ts packages/context/src/project/driver-schemas.test.ts
git commit -m "feat(context): add metabase, looker, lookml driver schemas with mappings"
```
---
## Task 4: Add Notion, dbt, MetricFlow driver schemas
The remaining context-source drivers; no `mappings` for these, but plenty of driver-specific fields.
**Files:**
- Modify: `packages/context/src/project/driver-schemas.ts`
- Modify: `packages/context/src/project/driver-schemas.test.ts`
- [ ] **Step 1: Write failing tests**
Append to `packages/context/src/project/driver-schemas.test.ts`:
```typescript
describe('connectionConfigSchema — notion / dbt / metricflow', () => {
it('parses a notion connection with selected_roots crawl', () => {
const parsed = connectionConfigSchema.parse({
driver: 'notion',
auth_token_ref: 'env:NOTION_TOKEN',
crawl_mode: 'selected_roots',
root_page_ids: ['abc', 'def'],
max_pages_per_run: 500,
});
expect(parsed).toMatchObject({
driver: 'notion',
crawl_mode: 'selected_roots',
root_page_ids: ['abc', 'def'],
max_pages_per_run: 500,
});
});
it('rejects notion with unknown crawl_mode', () => {
expect(() =>
connectionConfigSchema.parse({
driver: 'notion',
auth_token_ref: 'env:NOTION_TOKEN',
crawl_mode: 'everything',
}),
).toThrow();
});
it('parses a dbt connection from a local source_dir', () => {
const parsed = connectionConfigSchema.parse({
driver: 'dbt',
source_dir: '/tmp/dbt-project',
target: 'dev',
});
expect(parsed).toMatchObject({ driver: 'dbt', source_dir: '/tmp/dbt-project', target: 'dev' });
});
it('parses a metricflow connection with nested config', () => {
const parsed = connectionConfigSchema.parse({
driver: 'metricflow',
metricflow: {
repoUrl: 'https://github.com/acme/sl.git',
branch: 'main',
},
});
expect(parsed).toMatchObject({
driver: 'metricflow',
metricflow: { repoUrl: 'https://github.com/acme/sl.git' },
});
});
});
```
- [ ] **Step 2: Run tests to verify they fail**
Run: `pnpm --filter @ktx/context exec vitest run src/project/driver-schemas.test.ts`
Expected: FAIL — `driver: 'notion'` etc. not in union.
- [ ] **Step 3: Extend `driver-schemas.ts`**
Add to `packages/context/src/project/driver-schemas.ts` before the final `connectionConfigSchema` export:
```typescript
const notionConnectionSchema = z
.looseObject({
driver: z.literal('notion'),
auth_token: z.string().min(1).optional().describe('Literal Notion integration token. Prefer auth_token_ref.'),
auth_token_ref: z
.string()
.min(1)
.optional()
.describe('Reference to Notion integration token (e.g. env:NOTION_TOKEN).'),
crawl_mode: z
.enum(['selected_roots', 'all_accessible'])
.optional()
.describe('Crawl scope. "selected_roots" requires at least one of root_page_ids, root_database_ids, root_data_source_ids.'),
root_page_ids: z.array(z.string().min(1)).optional().describe('Notion page IDs to crawl when crawl_mode is selected_roots.'),
root_database_ids: z.array(z.string().min(1)).optional().describe('Notion database IDs to crawl when crawl_mode is selected_roots.'),
root_data_source_ids: z
.array(z.string().min(1))
.optional()
.describe('Notion data source IDs to crawl when crawl_mode is selected_roots.'),
max_pages_per_run: z
.number()
.int()
.min(1)
.max(10000)
.optional()
.describe('Maximum Notion pages fetched in a single ingest run.'),
max_knowledge_creates_per_run: z
.number()
.int()
.min(0)
.max(25)
.optional()
.describe('Maximum new wiki pages created per run.'),
max_knowledge_updates_per_run: z
.number()
.int()
.min(0)
.max(100)
.optional()
.describe('Maximum existing wiki pages updated per run.'),
})
.describe('Notion context-source connection.');
const dbtConnectionSchema = z
.looseObject({
driver: z.literal('dbt'),
source_dir: z.string().min(1).optional().describe('Absolute or project-relative path to a local dbt project.'),
repo_url: z.string().min(1).optional().describe('Git URL of the dbt project (https, ssh, or file:).'),
branch: z.string().min(1).optional().describe('Git branch when using repo_url.'),
path: z.string().optional().describe('Subdirectory within the repo when the dbt project lives in a monorepo.'),
auth_token_ref: z.string().min(1).optional().describe('Reference to Git auth token for private repos.'),
profiles_path: z.string().optional().describe('Override path to dbt profiles.yml.'),
target: z.string().min(1).optional().describe('dbt target name (e.g. dev, prod).'),
project_name: z.string().min(1).optional().describe('Override auto-detected dbt project name.'),
})
.describe('dbt context-source connection.');
const metricflowConnectionSchema = z
.looseObject({
driver: z.literal('metricflow'),
metricflow: z
.looseObject({
repoUrl: z.string().min(1).describe('Git URL of the MetricFlow / SL project.'),
branch: z.string().min(1).optional().describe('Git branch (default "main").'),
path: z.string().optional().describe('Subdirectory within the repo when the SL config lives in a monorepo.'),
auth_token_ref: z.string().min(1).optional().describe('Reference to Git auth token for private repos.'),
})
.describe('Nested MetricFlow configuration block.'),
})
.describe('MetricFlow / SL context-source connection.');
```
Then update the final union:
```typescript
export const connectionConfigSchema = z.discriminatedUnion(
'driver',
[
...warehouseDrivers.map(warehouseConnectionSchema),
metabaseConnectionSchema,
lookerConnectionSchema,
lookmlConnectionSchema,
notionConnectionSchema,
dbtConnectionSchema,
metricflowConnectionSchema,
],
);
```
- [ ] **Step 4: Run tests to verify they pass**
Run: `pnpm --filter @ktx/context exec vitest run src/project/driver-schemas.test.ts`
Expected: PASS.
- [ ] **Step 5: Type-check**
Run: `pnpm --filter @ktx/context run type-check`
Expected: PASS.
- [ ] **Step 6: Commit**
```bash
git add packages/context/src/project/driver-schemas.ts packages/context/src/project/driver-schemas.test.ts
git commit -m "feat(context): add notion, dbt, metricflow driver schemas"
```
---
## Task 5: Wire the discriminated union into `config.ts`
Now switch the top-level `connectionSchema` to the new union. This is the change that flips JSON-schema output.
**Files:**
- Modify: `packages/context/src/project/config.ts:209-214, 272`
- Test: `packages/context/src/project/config.test.ts` — add a JSON-schema assertion.
- [ ] **Step 1: Write a failing test for the JSON schema output**
Append to `packages/context/src/project/config.test.ts`:
```typescript
import { generateKtxProjectConfigJsonSchema } from './config.js';
describe('generateKtxProjectConfigJsonSchema', () => {
it('emits the metabase mappings shape under connections', () => {
const schema = generateKtxProjectConfigJsonSchema();
const serialized = JSON.stringify(schema);
expect(serialized).toContain('databaseMappings');
expect(serialized).toContain('connectionMappings');
expect(serialized).toContain('expectedLookerConnectionName');
});
});
```
- [ ] **Step 2: Run test to verify it fails**
Run: `pnpm --filter @ktx/context exec vitest run src/project/config.test.ts`
Expected: FAIL — the strings are not in the emitted schema yet because `connectionSchema` is still loose.
- [ ] **Step 3: Replace `connectionSchema` in `config.ts`**
In `packages/context/src/project/config.ts`, delete lines `209-214`:
```typescript
const connectionSchema = z
.looseObject({
driver: z.string().min(1).optional().describe('Connector driver identifier (e.g. "postgres", "bigquery", "snowflake").'),
url: z.string().optional().describe('Connection URL or DSN. Format depends on the driver; may contain environment-variable references.'),
})
.describe('A single database/connector connection entry. Additional driver-specific fields are accepted and passed through.');
```
Replace with an import + re-bind at the top of the file (after the existing imports):
```typescript
import { connectionConfigSchema } from './driver-schemas.js';
const connectionSchema = connectionConfigSchema;
```
(Re-binding to the local name `connectionSchema` keeps the rest of the file unchanged, including the export of `KtxProjectConnectionConfig` at line `272`.)
- [ ] **Step 4: Run the new test plus existing config tests**
Run: `pnpm --filter @ktx/context exec vitest run src/project/`
Expected: PASS for all tests.
If any existing test fails (e.g. a fixture used an undocumented driver string), update the fixture or expand the union — do not loosen the union.
- [ ] **Step 5: Run the full context test suite to catch downstream regressions**
Run: `pnpm --filter @ktx/context run test`
Expected: PASS.
- [ ] **Step 6: Type-check the workspace**
Run: `pnpm run type-check`
Expected: PASS. `KtxProjectConnectionConfig` is now a union; any consumer that destructured fields not present on every driver branch will surface here.
If type-check fails in a consumer, the fix is usually `if (connection.driver === 'metabase')` style narrowing — or, for code that already does this dynamically (e.g. `String(connection.driver).toLowerCase() === 'metabase'`), an explicit cast at the call site is acceptable. Do not add `as any`; prefer narrowing.
- [ ] **Step 7: Commit**
```bash
git add packages/context/src/project/config.ts packages/context/src/project/config.test.ts
git commit -m "refactor(context): make connectionSchema a driver-discriminated union"
```
---
## Task 6: Verify the user-visible result and CLI smoke
Confirm the original bug is fixed and the CLI behavior is unchanged.
**Files:** none modified in this task.
- [ ] **Step 1: Build the CLI**
Run: `pnpm run build`
Expected: PASS.
- [ ] **Step 2: Confirm `ktx dev schema | rg -i mapping` now returns hits**
Run: `node scripts/run-ktx.mjs -- dev schema | rg -i mapping`
Expected: multiple lines, including the `databaseMappings`, `connectionMappings`, `expectedLookerConnectionName` keys and their descriptions.
- [ ] **Step 3: Run the CLI smoke**
Run: `pnpm --filter @ktx/cli run smoke`
Expected: PASS.
- [ ] **Step 4: Run the broader workspace test suite**
Run: `pnpm run test 2>&1 | tee /tmp/ktx-test-output.log`
Expected: PASS. Inspect `/tmp/ktx-test-output.log` if anything fails.
- [ ] **Step 5: Run pre-commit on changed files**
Run: `pnpm run check`
Expected: PASS.
- [ ] **Step 6: Knip dead-code sweep (in case we introduced unused exports)**
Run: `pnpm run dead-code`
Expected: PASS — or, if Knip flags `KtxConnectionConfig` as unused, decide whether to export it from `packages/context/src/project/index.ts` (preferred — it documents intent) or drop the export.
If exporting: add to `packages/context/src/project/index.ts`:
```typescript
export type { KtxConnectionConfig } from './driver-schemas.js';
```
- [ ] **Step 7: Final commit if any docs / index changes were made**
```bash
git status --short
# If only docs/index were touched in step 6:
git add packages/context/src/project/index.ts
git commit -m "chore(context): re-export KtxConnectionConfig from project package"
```
---
## Self-Review
**1. Spec coverage:** Original request was "I need to be able to see full schema" with chosen approach option 1 (discriminated union). Task 5 steps 1–4 verify that the generated JSON schema contains the mapping keys (`databaseMappings`, `connectionMappings`, `expectedLookerConnectionName`). Task 6 step 2 (`ktx dev schema | rg -i mapping` now returns hits) is the explicit end-to-end check. All catalogued drivers (warehouse + metabase + looker + lookml + notion + dbt + metricflow) have a schema and a test. ✅
**2. Placeholder scan:** No "TBD", "add validation", "similar to Task N", or skipped code. Every step has the actual code or command. ✅
**3. Type consistency:**
- `connectionConfigSchema` is defined in Task 2 and extended (not renamed) in Tasks 3–4. ✅
- `KtxConnectionConfig` (new type) appears only in `driver-schemas.ts` and the optional re-export in Task 6. `KtxProjectConnectionConfig` (existing type at `config.ts:272`) keeps its name. ✅
- `metabaseMappingsSchema`, `lookerMappingsSchema`, `lookmlMappingsSchema` — Task 1 exports them; Task 3 imports them by the same names. ✅
- `metabaseMappingsStrictSchema` is defined and used in Task 3 only. ✅
- The `warehouseDrivers` array and `warehouseConnectionSchema` helper are introduced in Task 2 and reused unchanged in the union extensions of Tasks 3 and 4. ✅
---
## Execution Handoff
Plan complete and saved to `docs/superpowers/plans/2026-05-14-connection-driver-discriminated-union.md`. Two execution options:
**1. Subagent-Driven (recommended)** — I dispatch a fresh subagent per task, review between tasks, fast iteration.
**2. Inline Execution** — Execute tasks in this session using executing-plans, batch execution with checkpoints.
Which approach?

View file

@ -59,6 +59,8 @@ type CommandPathNode = CommandWithGlobalOptions & {
const PROJECT_AWARE_ROOT_COMMANDS = new Set(['setup', 'connection', 'ingest', 'wiki', 'sl', 'status', 'mcp']);
const COMMANDS_THAT_CREATE_PROJECT = new Set(['setup', 'ktx dev init']);
const COMMANDS_WITH_OWN_MISSING_PROJECT_HANDLING = new Set(['status']);
const GLOBAL_OPTIONS_WITH_VALUE = new Set(['--project-dir']);
const GLOBAL_OPTIONS_WITHOUT_VALUE = new Set(['--debug', '--help', '-h', '--version', '-v']);
class KtxProjectMissingAbortError extends Error {
readonly isKtxProjectMissingAbort = true;
@ -73,24 +75,6 @@ function isKtxProjectMissingAbortError(error: unknown): error is KtxProjectMissi
(typeof error === 'object' && error !== null && (error as { isKtxProjectMissingAbort?: unknown }).isKtxProjectMissingAbort === true)
);
}
const REMOVED_COMMAND_PATHS = new Set([
'scan',
'wiki read',
'wiki write',
]);
const GLOBAL_OPTIONS_WITH_VALUE = new Set(['--project-dir']);
const OPTIONS_WITH_VALUE = new Set([
'--project-dir',
'--query-history-window-days',
'--user-id',
'--limit',
'--format',
'--connection-id',
'--source-name',
'--query-file',
'--max-rows',
]);
export interface CommandWithGlobalOptions {
opts: () => object;
optsWithGlobals?: () => object;
@ -337,43 +321,32 @@ function formatCliError(error: unknown): string {
return error instanceof Error ? error.message : String(error);
}
function commandPathFromArgv(argv: string[]): string[] {
const path: string[] = [];
for (let index = 0; index < argv.length && path.length < 2; index += 1) {
function firstTopLevelCommandToken(argv: string[]): string | null {
for (let index = 0; index < argv.length; index += 1) {
const arg = argv[index];
if (arg === undefined) {
continue;
}
if (arg === '--') {
break;
return null;
}
if ((path.length === 0 ? GLOBAL_OPTIONS_WITH_VALUE : OPTIONS_WITH_VALUE).has(arg)) {
if (GLOBAL_OPTIONS_WITH_VALUE.has(arg)) {
index += 1;
continue;
}
const optionsWithValue = path.length === 0 ? GLOBAL_OPTIONS_WITH_VALUE : OPTIONS_WITH_VALUE;
if ([...optionsWithValue].some((option) => arg.startsWith(`${option}=`))) {
if ([...GLOBAL_OPTIONS_WITH_VALUE].some((option) => arg.startsWith(`${option}=`))) {
continue;
}
if (path.length === 0 && arg === '--debug') {
if (GLOBAL_OPTIONS_WITHOUT_VALUE.has(arg) || arg.startsWith('-')) {
continue;
}
if (arg.startsWith('-')) {
continue;
}
path.push(arg);
return arg;
}
return path;
return null;
}
function removedCommandName(argv: string[]): string | null {
const path = commandPathFromArgv(argv);
if (path.length === 0) {
return null;
}
const pathKey = path.join(' ');
return REMOVED_COMMAND_PATHS.has(pathKey) ? path.at(-1) ?? null : null;
/**
 * Reports whether `commandName` matches the name or one of the aliases of any
 * command registered directly on `program`.
 */
function isKnownTopLevelCommand(program: Command, commandName: string): boolean {
  for (const candidate of program.commands) {
    if (candidate.name() === commandName || candidate.aliases().includes(commandName)) {
      return true;
    }
  }
  return false;
}
async function runBareInteractiveCommand(
@ -491,9 +464,9 @@ export async function runCommanderKtxCli(
return 0;
}
const removedCommand = removedCommandName(argv);
if (removedCommand) {
io.stderr.write(`error: unknown command '${removedCommand}'\n`);
const topLevelCommand = firstTopLevelCommandToken(argv);
if (topLevelCommand && !isKnownTopLevelCommand(program, topLevelCommand)) {
io.stderr.write(`error: unknown command '${topLevelCommand}'\n`);
return 1;
}

View file

@ -489,15 +489,17 @@ describe('runKtxConnection', () => {
it('rejects unknown drivers with a helpful error', async () => {
const projectDir = join(tempDir, 'project');
await initKtxProject({ projectDir });
await writeConnections(projectDir, {
mystery: { driver: 'duckdb' },
});
await writeFile(
join(projectDir, 'ktx.yaml'),
'connections:\n mystery:\n driver: duckdb\n',
'utf-8',
);
const io = makeIo();
await expect(
runKtxConnection({ command: 'test', projectDir, connectionId: 'mystery' }, io.io),
).resolves.toBe(1);
expect(io.stderr()).toContain('uses driver "duckdb"');
expect(io.stderr()).toContain('Supported:');
expect(io.stderr()).toContain('connections.mystery.driver');
expect(io.stderr()).toContain('postgres');
});
});

View file

@ -64,6 +64,11 @@ describe('formatDoctorReport', () => {
expect(output).toContain('Node 22+ · pnpm 10.20+');
expect(output).not.toContain('v22.16.0');
expect(output).toContain('Everything ready.');
expect(output).toContain('ktx status --json');
expect(output).toContain('ktx sl list');
expect(output).toContain('ktx wiki list');
expect(output).not.toContain('ktx scan');
expect(output).not.toContain('ktx sl ask');
});
it('shows the underlying detail for a single-check group on the group line', () => {
@ -462,6 +467,7 @@ describe('runKtxDoctor', () => {
it('includes Postgres query-history readiness in project doctor output', async () => {
process.env.ANTHROPIC_API_KEY = 'test-key'; // pragma: allowlist secret
process.env.OPENAI_API_KEY = 'test-key'; // pragma: allowlist secret
process.env.WAREHOUSE_DATABASE_URL = 'postgresql://reader@example.test/warehouse';
await writeFile(
join(tempDir, 'ktx.yaml'),
[
@ -516,8 +522,14 @@ describe('runKtxDoctor', () => {
expect(out).toContain('pg_stat_statements ready (PostgreSQL 16.4)');
expect(out).toContain('info: pg_stat_statements.max is 1000');
expect(out).not.toContain('Update the Postgres parameter group or config');
expect(out).toContain('ktx status --json');
expect(out).toContain('ktx sl list');
expect(out).toContain('ktx wiki list');
expect(out).not.toContain('ktx scan');
expect(out).not.toContain('ktx sl ask');
delete process.env.ANTHROPIC_API_KEY;
delete process.env.OPENAI_API_KEY;
delete process.env.WAREHOUSE_DATABASE_URL;
});
it('returns blocked verdict when LLM is not configured', async () => {
@ -543,6 +555,7 @@ describe('runKtxDoctor', () => {
).resolves.toBe(1);
expect(testIo.stdout()).toContain('no LLM configured');
expect(testIo.stdout()).not.toContain('ktx ask');
expect(testIo.stdout()).toContain('ktx setup');
});

View file

@ -5,6 +5,7 @@ import { join, resolve } from 'node:path';
import { fileURLToPath } from 'node:url';
import { promisify } from 'node:util';
import type { KtxConfigIssue } from '@ktx/context/project';
import { KTX_NEXT_STEP_DIRECT_COMMANDS } from './next-steps.js';
import type { BuildProjectStatusOptions } from './status-project.js';
const execFileAsync = promisify(execFile);
@ -287,7 +288,7 @@ interface RenderOptions {
command?: 'setup' | 'project';
}
const NEXT_STEPS_PROJECT = ['ktx scan', 'ktx wiki', 'ktx sl ask "…"'];
const NEXT_STEPS_PROJECT = KTX_NEXT_STEP_DIRECT_COMMANDS.map((step) => step.command);
export function formatDoctorReport(report: DoctorReport, options: Partial<RenderOptions> = {}): string {
const opts: RenderOptions = {

View file

@ -109,6 +109,7 @@ export async function writeWarehouseConfig(projectDir: string): Promise<void> {
'connections:',
' prod-metabase:',
' driver: metabase',
' api_url: https://metabase.example.test',
' warehouse_a:',
' driver: postgres',
'ingest:',

View file

@ -43,13 +43,13 @@ export interface PrintListArgs<Row> {
io: KtxCliIo;
}
export interface KtxJsonResultEnvelope<T> {
interface KtxJsonResultEnvelope<T> {
kind: string;
data: T;
meta?: Record<string, unknown>;
}
export function writeJsonResult<T>(io: KtxCliIo, envelope: KtxJsonResultEnvelope<T>): void {
function writeJsonResult<T>(io: KtxCliIo, envelope: KtxJsonResultEnvelope<T>): void {
io.stdout.write(`${JSON.stringify(envelope, null, 2)}\n`);
}

View file

@ -1,8 +1,9 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { initKtxProject } from '@ktx/context/project';
import { initKtxProject, loadKtxProject } from '@ktx/context/project';
import type { KtxEmbeddingPort } from '@ktx/context';
import { writeLocalKnowledgePage } from '@ktx/context/wiki';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { runKtxKnowledge } from './knowledge.js';
@ -40,6 +41,28 @@ class FakeEmbeddingPort implements KtxEmbeddingPort {
}
}
// Optional overrides for the wiki page written by seedWikiPage.
// Any field left out falls back to the default that seedWikiPage supplies.
interface WikiPageFixture {
// Page key; seedWikiPage defaults it to 'metrics-revenue'.
key?: string;
// Page summary; seedWikiPage defaults it to 'Revenue'.
summary?: string;
// Page body text.
content?: string;
// Tags stored on the page; seedWikiPage defaults them to ['finance'].
tags?: string[];
// Passed through as `slRefs`; seedWikiPage defaults them to ['orders'].
slRefs?: string[];
}
/**
 * Test helper: loads the project at `projectDir` and writes a single wiki page
 * into it via `writeLocalKnowledgePage`, with scope 'GLOBAL', userId 'local',
 * and no refs. Fields missing from `fixture` use the revenue-page defaults below.
 */
async function seedWikiPage(projectDir: string, fixture: WikiPageFixture = {}): Promise<void> {
  const { key, summary, content, tags, slRefs } = fixture;
  const project = await loadKtxProject({ projectDir });
  await writeLocalKnowledgePage(project, {
    key: key ?? 'metrics-revenue',
    scope: 'GLOBAL',
    userId: 'local',
    summary: summary ?? 'Revenue',
    content: content ?? 'Revenue is paid order value.',
    tags: tags ?? ['finance'],
    refs: [],
    slRefs: slRefs ?? ['orders'],
  });
}
describe('runKtxKnowledge', () => {
let tempDir: string;
@ -51,36 +74,10 @@ describe('runKtxKnowledge', () => {
await rm(tempDir, { recursive: true, force: true });
});
it('writes, reads, lists, and searches wiki pages', async () => {
it('lists and searches wiki pages', async () => {
const projectDir = join(tempDir, 'project');
await initKtxProject({ projectDir });
const writeIo = makeIo();
await expect(
runKtxKnowledge(
{
command: 'write',
projectDir,
key: 'metrics-revenue',
scope: 'GLOBAL',
userId: 'local',
summary: 'Revenue',
content: 'Revenue is paid order value.',
tags: ['finance'],
refs: [],
slRefs: ['orders'],
},
writeIo.io,
),
).resolves.toBe(0);
expect(writeIo.stdout()).toContain('Wrote wiki/global/metrics-revenue.md');
const readIo = makeIo();
await expect(
runKtxKnowledge({ command: 'read', projectDir, key: 'metrics-revenue', userId: 'local' }, readIo.io),
).resolves.toBe(0);
expect(readIo.stdout()).toContain('# metrics-revenue');
expect(readIo.stdout()).toContain('Revenue is paid order value.');
await seedWikiPage(projectDir);
const listIo = makeIo();
await expect(runKtxKnowledge({ command: 'list', projectDir, userId: 'local' }, listIo.io)).resolves.toBe(0);
@ -93,27 +90,10 @@ describe('runKtxKnowledge', () => {
expect(searchIo.stdout()).toContain('metrics-revenue');
});
it('prints wiki list, search, and read as public JSON envelopes', async () => {
it('prints wiki list and search as public JSON envelopes', async () => {
const projectDir = join(tempDir, 'project');
await initKtxProject({ projectDir });
await expect(
runKtxKnowledge(
{
command: 'write',
projectDir,
key: 'metrics-revenue',
scope: 'GLOBAL',
userId: 'local',
summary: 'Revenue',
content: 'Revenue is paid order value.',
tags: ['finance'],
refs: [],
slRefs: ['orders'],
},
makeIo().io,
),
).resolves.toBe(0);
await seedWikiPage(projectDir);
const listIo = makeIo();
await expect(runKtxKnowledge({ command: 'list', projectDir, userId: 'local', json: true }, listIo.io)).resolves.toBe(
@ -137,48 +117,6 @@ describe('runKtxKnowledge', () => {
data: { items: [expect.objectContaining({ key: 'metrics-revenue', summary: 'Revenue' })] },
meta: { command: 'wiki search' },
});
const readIo = makeIo();
await expect(
runKtxKnowledge({ command: 'read', projectDir, key: 'metrics-revenue', userId: 'local', json: true }, readIo.io),
).resolves.toBe(0);
expect(JSON.parse(readIo.stdout())).toMatchObject({
kind: 'wiki.page',
data: {
key: 'metrics-revenue',
summary: 'Revenue',
content: 'Revenue is paid order value.',
},
});
});
it('rejects slash-delimited write keys with a flat-key suggestion', async () => {
const projectDir = join(tempDir, 'project');
await initKtxProject({ projectDir });
const writeIo = makeIo();
await expect(
runKtxKnowledge(
{
command: 'write',
projectDir,
key: 'orbit/company-overview',
scope: 'GLOBAL',
userId: 'local',
summary: 'Orbit',
content: 'Orbit overview.',
tags: [],
refs: [],
slRefs: [],
},
writeIo.io,
),
).resolves.toBe(1);
expect(writeIo.stderr()).toContain(
'Invalid wiki key "orbit/company-overview". Wiki keys must be flat; use "orbit-company-overview".',
);
expect(writeIo.stdout()).toBe('');
});
it('explains empty search results for a project without wiki pages', async () => {
@ -198,24 +136,13 @@ describe('runKtxKnowledge', () => {
it('uses configured embeddings for semantic wiki search', async () => {
const projectDir = join(tempDir, 'semantic-project');
await initKtxProject({ projectDir });
await expect(
runKtxKnowledge(
{
command: 'write',
projectDir,
key: 'active-contract-arr-open-tickets',
scope: 'GLOBAL',
userId: 'local',
summary: 'Active Contract ARR Ranked by Open Support Ticket Count',
content: 'Accounts ranked by annual recurring contract value and support ticket load.',
tags: ['historic-sql'],
refs: [],
slRefs: [],
},
makeIo().io,
),
).resolves.toBe(0);
await seedWikiPage(projectDir, {
key: 'active-contract-arr-open-tickets',
summary: 'Active Contract ARR Ranked by Open Support Ticket Count',
content: 'Accounts ranked by annual recurring contract value and support ticket load.',
tags: ['historic-sql'],
slRefs: [],
});
const searchIo = makeIo();
await expect(

View file

@ -5,20 +5,16 @@ import {
} from '@ktx/context';
import { loadKtxProject } from '@ktx/context/project';
import {
type LocalKnowledgeScope,
type LocalKnowledgeSearchResult,
type LocalKnowledgeSummary,
listLocalKnowledgePages,
readLocalKnowledgePage,
searchLocalKnowledgePages,
writeLocalKnowledgePage,
} from '@ktx/context/wiki';
import { resolveOutputMode } from './io/mode.js';
import { printList, type PrintListColumn, writeJsonResult } from './io/print-list.js';
import { printList, type PrintListColumn } from './io/print-list.js';
export type KtxKnowledgeArgs =
| { command: 'list'; projectDir: string; userId: string; output?: string; json?: boolean }
| { command: 'read'; projectDir: string; key: string; userId: string; json?: boolean }
| {
command: 'search';
projectDir: string;
@ -27,18 +23,6 @@ export type KtxKnowledgeArgs =
output?: string;
json?: boolean;
limit?: number;
}
| {
command: 'write';
projectDir: string;
key: string;
scope: LocalKnowledgeScope;
userId: string;
summary: string;
content: string;
tags: string[];
refs: string[];
slRefs: string[];
};
type KtxKnowledgeIo = import('./cli-runtime.js').KtxCliIo;
@ -104,25 +88,6 @@ export async function runKtxKnowledge(
});
return 0;
}
if (args.command === 'read') {
const page = await readLocalKnowledgePage(project, { key: args.key, userId: args.userId });
if (!page) {
throw new Error(`Wiki page "${args.key}" was not found`);
}
if (args.json) {
writeJsonResult(io, {
kind: 'wiki.page',
data: page,
meta: { command: 'wiki read' },
});
return 0;
}
io.stdout.write(`# ${page.key}\n\n`);
io.stdout.write(`Scope: ${page.scope}\n`);
io.stdout.write(`Summary: ${page.summary}\n\n`);
io.stdout.write(`${page.content}\n`);
return 0;
}
if (args.command === 'search') {
const results = await searchLocalKnowledgePages(project, {
query: args.query,
@ -153,18 +118,6 @@ export async function runKtxKnowledge(
});
return 0;
}
const write = await writeLocalKnowledgePage(project, {
key: args.key,
scope: args.scope,
userId: args.userId,
summary: args.summary,
content: args.content,
tags: args.tags,
refs: args.refs,
slRefs: args.slRefs,
});
io.stdout.write(`Wrote ${write.path}\n`);
return 0;
} catch (error) {
io.stderr.write(`${error instanceof Error ? error.message : String(error)}\n`);

View file

@ -92,7 +92,7 @@ describe('createKtxCliScanConnector', () => {
expect(bigQueryMock.constructorInputs[0]).not.toHaveProperty('maxBytesBilled');
});
it('throws for structural daemon-only fallback configs', async () => {
it('rejects daemon-only fallback driver configs at config parse time', async () => {
await initKtxProject({ projectDir: tempDir });
await writeFile(
join(tempDir, 'ktx.yaml'),
@ -105,14 +105,13 @@ describe('createKtxCliScanConnector', () => {
].join('\n'),
'utf-8',
);
const project = await loadKtxProject({ projectDir: tempDir });
await expect(createKtxCliScanConnector(project, 'warehouse')).rejects.toThrow(
'Connection "warehouse" uses driver "duckdb", which has no native standalone KTX scan connector',
await expect(loadKtxProject({ projectDir: tempDir })).rejects.toThrow(
/connections\.warehouse\.driver:.*Invalid discriminator value/,
);
});
it('throws a clear error when the connection block has no driver field', async () => {
it('rejects connection blocks with no driver field at config parse time', async () => {
await initKtxProject({ projectDir: tempDir });
await writeFile(
join(tempDir, 'ktx.yaml'),
@ -125,10 +124,9 @@ describe('createKtxCliScanConnector', () => {
].join('\n'),
'utf-8',
);
const project = await loadKtxProject({ projectDir: tempDir });
await expect(createKtxCliScanConnector(project, 'warehouse')).rejects.toThrow(
'Connection "warehouse" has no `driver` field in ktx.yaml',
await expect(loadKtxProject({ projectDir: tempDir })).rejects.toThrow(
/connections\.warehouse\.driver:.*Invalid discriminator value/,
);
});
});

View file

@ -85,7 +85,7 @@ describe('buildPublicIngestPlan', () => {
it('plans warehouse connections as scan targets and source connections as source ingest targets', () => {
const project = projectWithConnections({
warehouse: { driver: 'postgres' },
prod_metabase: { driver: 'metabase' },
prod_metabase: { driver: 'metabase', api_url: 'https://metabase.example.com' },
docs: { driver: 'notion' },
});
@ -745,7 +745,7 @@ describe('runKtxPublicIngest', () => {
const io = makeIo();
const project = projectWithConnections({
warehouse: { driver: 'postgres' },
prod_metabase: { driver: 'metabase' },
prod_metabase: { driver: 'metabase', api_url: 'https://metabase.example.com' },
});
const runScan = vi.fn(async () => 1);
const runIngest = vi.fn(async () => 0);

View file

@ -133,6 +133,50 @@ function warningLine(warning: KtxScanWarning): string {
return `${warning.code}: ${location}${warning.message}`;
}
/**
 * Buckets scan warnings by their `code`.
 *
 * Groups appear in the returned map in the order each code is first seen, and
 * warnings inside a group keep their original relative order.
 *
 * @param warnings - Warnings to group; the input array is not modified.
 * @returns A map from warning code to the warnings carrying that code.
 */
function groupWarningsByCode(warnings: readonly KtxScanWarning[]): Map<string, KtxScanWarning[]> {
  const byCode = new Map<string, KtxScanWarning[]>();
  warnings.forEach((entry) => {
    let bucket = byCode.get(entry.code);
    if (bucket === undefined) {
      // First warning with this code: register a fresh bucket for it.
      bucket = [];
      byCode.set(entry.code, bucket);
    }
    bucket.push(entry);
  });
  return byCode;
}
/**
 * Builds the one-line summary for a group of scan warnings that share a code.
 *
 * @param code - Warning code shared by every warning in the group.
 * @param count - Number of warnings in the group.
 * @returns A human-readable sentence for known codes; unknown codes fall back
 *   to a generic "<count> warning(s) (<code>)" line.
 */
function describeWarningGroup(code: string, count: number): string {
  switch (code) {
    case 'sampling_failed':
      return `${count} ${plural(count, 'table')} could not be sampled (retries exhausted); descriptions used metadata-only fallback or were skipped.`;
    case 'description_fallback_used':
      return `${count} ${plural(count, 'table')} got an AI description from column metadata only (no sample rows available).`;
    case 'enrichment_failed':
      return `${count} ${plural(count, 'table/column')} could not be enriched.`;
    case 'connector_capability_missing':
      return `${count} ${plural(count, 'table')} affected by missing connector capability.`;
    case 'statistics_failed':
      return `${count} statistics ${plural(count, 'lookup')} failed.`;
    // Provider-level conditions are not per-table, so the count is omitted.
    case 'llm_unavailable':
      return 'LLM provider unavailable; AI enrichment was skipped.';
    case 'embedding_unavailable':
      return 'Embedding provider unavailable; embeddings were skipped.';
    case 'relationship_validation_failed':
      return `${count} relationship ${plural(count, 'validation')} could not run.`;
    case 'relationship_llm_invalid_reference':
      return `${count} LLM-proposed ${plural(count, 'relationship')} referenced unknown columns.`;
    case 'relationship_llm_proposal_failed':
      return `${count} LLM relationship ${plural(count, 'proposal')} failed.`;
    case 'scan_enrichment_backend_not_configured':
      return 'Scan enrichment backend is not configured; AI stages were skipped.';
    case 'credential_redacted': {
      // Keep the verb in agreement with the count ("1 credential was" vs
      // "2 credentials were"). Assumes plural() yields the singular noun for a
      // count of 1 — TODO confirm against its definition.
      const verb = count === 1 ? 'was' : 'were';
      return `${count} ${plural(count, 'credential')} ${verb} redacted from scan output.`;
    }
    default:
      return `${count} ${plural(count, 'warning')} (${code})`;
  }
}
function managedDaemonOptionsForScanRun(args: Extract<KtxScanArgs, { command: 'run' }>, io: KtxCliIo) {
if (args.databaseIntrospectionUrl || !args.cliVersion || !args.runtimeInstallPolicy) {
return undefined;
@ -153,11 +197,26 @@ function writeNeedsAttention(report: KtxScanReport, io: KtxCliIo): void {
}
if (report.warnings.length > 0) {
io.stdout.write(` ${report.warnings.length} ${plural(report.warnings.length, 'warning')}\n`);
for (const warning of report.warnings.slice(0, 5)) {
io.stdout.write(` - ${warningLine(warning)}\n`);
}
if (report.warnings.length > 5) {
io.stdout.write(` - ${report.warnings.length - 5} more warnings in the JSON report\n`);
const groups = groupWarningsByCode(report.warnings);
for (const [code, warnings] of groups) {
io.stdout.write(` - ${describeWarningGroup(code, warnings.length)}\n`);
const first = warnings[0];
if (first) {
io.stdout.write(` ${warningLine(first)}\n`);
}
if (warnings.length > 1) {
const moreTables = warnings
.slice(1)
.map((warning) =>
warning.table ? (warning.column ? `${warning.table}.${warning.column}` : warning.table) : null,
)
.filter((value): value is string => value !== null)
.slice(0, 3);
if (moreTables.length > 0) {
const suffix = warnings.length - 1 > moreTables.length ? `, …` : '';
io.stdout.write(` also: ${moreTables.join(', ')}${suffix}\n`);
}
}
}
}
if (report.capabilityGaps.length > 0) {

View file

@ -1024,6 +1024,8 @@ describe('setup sources step', () => {
databaseMappings: { '1': 'warehouse' },
syncEnabled: { '1': true },
syncMode: 'ALL',
selections: { collections: [], items: [] },
defaultTagNames: [],
},
},
deps: {
@ -1181,6 +1183,8 @@ describe('setup sources step', () => {
databaseMappings: { '1': 'warehouse' },
syncEnabled: { '1': true },
syncMode: 'ALL',
selections: { collections: [], items: [] },
defaultTagNames: [],
},
});
const testPrompts = prompts({

View file

@ -451,6 +451,8 @@ function buildMetabaseConnection(args: KtxSetupSourcesArgs): KtxProjectConnectio
databaseMappings: { [String(args.metabaseDatabaseId)]: args.sourceWarehouseConnectionId },
syncEnabled: { [String(args.metabaseDatabaseId)]: true },
syncMode: 'ALL',
selections: { collections: [], items: [] },
defaultTagNames: [],
},
};
}

View file

@ -311,7 +311,7 @@ describe('setup status', () => {
' url: env:DATABASE_URL',
' metabase:',
' driver: metabase',
' url: env:METABASE_URL',
' api_url: https://metabase.example.test',
' api_key_ref: env:METABASE_API_KEY',
' warehouse_connection_id: warehouse',
'llm:',

View file

@ -213,7 +213,11 @@ export async function runKtxSl(args: KtxSlArgs, io: KtxSlIo = process, deps: Ktx
if (!source) {
throw new Error(`Semantic-layer source "${args.connectionId}/${args.sourceName}" was not found`);
}
const result = await validateLocalSlSource(source.yaml, { project, connectionId: args.connectionId });
const result = await validateLocalSlSource(source.yaml, {
project,
connectionId: args.connectionId,
sourceName: args.sourceName,
});
if (!result.valid) {
for (const error of result.errors) {
io.stderr.write(`${error}\n`);

View file

@ -9,6 +9,7 @@ import type {
} from '@ktx/context/project';
import type { PostgresPgssProbeResult } from '@ktx/context/ingest';
import type { DoctorCheck } from './doctor.js';
import { KTX_NEXT_STEP_DIRECT_COMMANDS } from './next-steps.js';
type ProjectStatusLevel = 'ok' | 'warn' | 'fail';
type ProjectVerdict = 'ready' | 'partial' | 'blocked';
@ -69,6 +70,8 @@ interface WarningItem {
fix?: string;
}
const PROJECT_READY_COMMANDS = KTX_NEXT_STEP_DIRECT_COMMANDS.map((step) => step.command);
/**
 * Type guard for plain object-like values.
 *
 * @param value - Any value.
 * @returns `true` when `value` is a non-null object that is not an array.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  // `typeof null` is 'object' and arrays are objects too, so rule both out first.
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
@ -132,7 +135,7 @@ function buildLlmStatus(config: KtxProjectLlmConfig, env: NodeJS.ProcessEnv): Ll
backend,
model,
status: 'fail',
detail: 'no LLM configured — ktx ask will not work',
detail: 'no LLM configured; research agent will not run',
fix: 'Run: ktx setup (choose an LLM provider)',
};
}
@ -571,7 +574,7 @@ function buildVerdict(
if (llm.status === 'fail') {
return {
verdict: 'blocked',
reason: 'LLM not configured — `ktx ask` will not work.',
reason: 'LLM not configured; research agent will not run.',
nextActions: ['ktx setup'],
};
}
@ -605,7 +608,7 @@ function buildVerdict(
return {
verdict: 'ready',
reason: 'Ready.',
nextActions: ['ktx scan', 'ktx wiki', 'ktx sl ask "…"'],
nextActions: [...PROJECT_READY_COMMANDS],
};
}

View file

@ -100,7 +100,7 @@ const connection = {
dataset_id: 'analytics',
credentials_json: JSON.stringify({ project_id: 'project-1', client_email: 'reader@example.test' }),
location: 'US',
};
} as const;
describe('KtxBigQueryScanConnector', () => {
it('resolves configuration safely', () => {

View file

@ -153,6 +153,7 @@
"@types/node": "^25.7.0",
"@types/pg": "^8.20.0",
"@vitest/coverage-v8": "^4.1.6",
"ajv": "8.20.0",
"typescript": "^6.0.3",
"vitest": "^4.1.6"
},

View file

@ -14,14 +14,14 @@ Use this skill for **uploaded** dbt projects (`dbt_project.yml` at stage root, `
|-----|--------|--------|
| `models:` entry with `columns:` | **Overlay** on the manifest table with the same name (after `discover_data` / `entity_details`) | One SL source per physical table; model name may differ from DB name - resolve with `read_raw_file` + warehouse context. |
| `sources:` → `tables:` | Same as models; use `identifier` when present instead of logical `name`. | Schema + name must match how the connection sees tables. |
| Column `description` | `descriptions.user` or merged `descriptions` map on the column | Do not overwrite `dbt` description keys from sync. |
| Column `description` | `column_overrides[].descriptions.user` on the overlay | Do not overwrite `dbt` description keys from sync. |
| `data_tests: not_null` / `unique` | Short hint in column `descriptions` or notes: “dbt: not null”, “dbt: unique” | Full structured metadata lands in manifest via **sync**; the skill keeps bundle-time SL text useful for the agent. |
| `accepted_values` | Add a **brief** line in the column description: allowed values (truncate long lists) | Also mention enum-like use in `discover_data` / filters. |
| `relationships` | Add or confirm `joins:` on the overlay **only** when `to` resolves to a real table via `read_raw_file` + `discover_data` / `entity_details` | If the ref cannot be resolved, capture the intent in a wiki page instead. |
## Physical schema grounding
dbt YAML is documentation and test metadata; it is not permission to invent physical columns. Before writing any table-backed SL source, confirm the real warehouse shape with `discover_data`, `sl_discover`, or `entity_details` and use only confirmed column names in `columns:`, `grain:`, `joins:`, `segments:`, and `measures[].expr`.
dbt YAML is documentation and test metadata; it is not permission to invent physical columns. Before writing any table-backed SL source, confirm the real warehouse shape with `discover_data`, `sl_discover`, or `entity_details` and use only confirmed column names in `column_overrides:`, computed-only `columns:`, `grain:`, `joins:`, `segments:`, and `measures[].expr`.
For dbt context-source ingest, the dbt connection is usually not the warehouse connection. Call `sl_discover` without `connectionId` first, then write overlays to the connection that owns the matching manifest-backed source (for example `postgres-warehouse`), not to the dbt connection (for example `dbt-main`). If no matching manifest-backed source is visible on any warehouse connection, do not call `sl_write_source`; record `emit_unmapped_fallback` and keep the fact wiki-only.
@ -61,7 +61,7 @@ SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`:
## 1.1 test hints (descriptions / meta)
When YAML shows `accepted_values` or `not_null`, add **short** hints into `columns[].descriptions` (e.g. under `user`) or freeform column notes so chat and validation see intent before the next git sync refreshes `constraints` / `enum_values` in `_schema`. Keep hints under a few words when possible.
When YAML shows `accepted_values` or `not_null`, add **short** hints into `column_overrides[].descriptions` (for example under `user`) or freeform column notes so chat and validation see intent before the next git sync refreshes `constraints` / `enum_values` in `_schema`. Keep hints under a few words when possible.
## Overlap with MetricFlow
@ -71,6 +71,6 @@ If the same bundle also has MetricFlow `semantic_models:` / `metrics:`, the **`m
- Do not run `dbt` CLI or assume `target/` / `manifest.json` exists in the upload.
- Do not invent column names, grain keys, or measure expressions from dbt model names, descriptions, tests, or common naming patterns.
- Do not write `columns:`, `grain:`, or `measures:` for a dbt model unless those exact column names are confirmed by dbt YAML columns or warehouse schema discovery.
- Do not write computed `columns:`, `column_overrides:`, `grain:`, or `measures:` for a dbt model unless those exact column names are confirmed by dbt YAML columns or warehouse schema discovery.
- Do not invent joins from `relationships` tests if the target model/table is not found in SL or the warehouse.
- Do not read `peerFileIndex` paths - use `read_raw_file` only on `rawFiles` and `dependencyPaths` from the WorkUnit.

View file

@ -12,7 +12,7 @@ LookML views map to SL sources, `measure:` to measures, `explore: { join: }` to
| LookML | KTX form | Notes |
|---|---|---|
| `view: X { sql_table_name: …; measure:/dimension:/join: }` | **Overlay** at `<connId>/X.yaml` with `measures`, `columns` (computed), `joins`, `segments` | Manifest-backed; inherit grain/columns |
| `view: X { sql_table_name: …; measure:/dimension:/join: }` | **Overlay** at `<connId>/X.yaml` with `measures`, computed-only `columns`, `column_overrides`, `joins`, `segments` | Manifest-backed; inherit grain/columns |
| `view: X { derived_table: { sql: … } }` | **Standalone** with top-level `sql:`, explicit `grain:` + `columns:` | No manifest entry exists |
| `view: X { sql_always_where: <p> }` | **Standalone** with `sql: SELECT * FROM <base> WHERE <p>` | Enforcement, not opt-in |
| `explore: { join: Y { sql_on: …; relationship: … } }` | `joins:` entry `{ to: Y, on: "<local> = Y.<col>", relationship: … }` | On the overlay or standalone |
@ -136,7 +136,8 @@ KTX overlay at `<connId>/fct_labs.yaml`:
```yaml
name: fct_labs
description: "Lab-order fact table. One row per lab order event."
descriptions:
user: "Lab-order fact table. One row per lab order event."
columns:
- name: is_byol
type: boolean

View file

@ -79,7 +79,7 @@ SL source, `tables:` frontmatter, `sl_refs`, or `emit_unmapped_fallback`:
For each card:
1. Analyze `resolvedSql` + `resultMetadata`: identify base tables, aggregations, joins, filters, column types.
2. **REQUIRED before any write**: call `sl_discover` for every candidate target source name. The response tells you whether the name is manifest-backed (`Type: table` or `Type: sql`). For manifest-backed names you MUST use the overlay shape (`name:` + `measures:`/`segments:`/`description:` only - no `sql:`, `table:`, `grain:`, or `columns:`); the tool will reject a standalone write and you'll have wasted the call. If `sl_discover` returns nothing for the name, you can write a standalone source. Also call `sl_read_source` on existing sources you intend to extend so you don't duplicate measures.
2. **REQUIRED before any write**: call `sl_discover` for every candidate target source name. The response tells you whether the name is manifest-backed (`Type: table` or `Type: sql`). For manifest-backed names you MUST use the overlay shape (`name:` plus overlay fields such as `measures:`, `segments:`, `descriptions:`, `joins:`, `disable_joins:`, `column_overrides:`, and computed-only `columns:` entries with `expr` + `type`; no `sql:`, `table:`, `grain:`, or base-table `columns:`); the tool will reject a standalone write and you'll have wasted the call. If `sl_discover` returns nothing for the name, you can write a standalone source. Also call `sl_read_source` on existing sources you intend to extend so you don't duplicate measures.
3. Include `rawPaths: ["cards/<id>.json"]` on every `sl_write_source`, `sl_edit_source`, and `wiki_write` call. If one artifact generalizes multiple near-duplicate cards, include each contributing card path and no unrelated cards.
4. Decide:
- Simple aggregation on a table that already has a source → `sl_edit_source` to add a measure.
@ -98,7 +98,7 @@ measures:
expr: "<expression>"
```
Overlay shape: `name:` plus any of `measures:`, `segments:`, `descriptions:`, `joins:`, `disable_joins:`. Never include `sql:`, `table:`, `grain:`, or `columns:` on a manifest-backed name - those would shadow the manifest's schema and drop its joins. Overlay `joins:` are merged additively with the manifest's joins (deduped by `to` + `on`); use `disable_joins: ["<on-clause>"]` to suppress a specific manifest join. After the overlay exists, use `sl_edit_source` for further tweaks. See `sl_capture` skill for the canonical overlay rule.
Overlay shape: `name:` plus any of `measures:`, `segments:`, `descriptions:`, `joins:`, `disable_joins:`, `exclude_columns:`, `column_overrides:`, or computed-only `columns:` entries with `expr` + `type`. Never include `sql:`, `table:`, `grain:`, or base-table `columns:` on a manifest-backed name — those would shadow the manifest's schema and drop its joins. Use `column_overrides:` for inherited column descriptions. Overlay `joins:` are merged additively with the manifest's joins (deduped by `to` + `on`); use `disable_joins: ["<on-clause>"]` to suppress a specific manifest join. After the overlay exists, use `sl_edit_source` for further tweaks. See `sl_capture` skill for the canonical overlay rule.
**Join discovery:** When your card's SQL references warehouse tables (e.g. in `FROM` or `JOIN` clauses), call `sl_discover({ query: '<table>' })` before writing. The matching manifest entry's `name` is the value you use in `joins: [- to: <name>]` only when the card output exposes a local key that matches the target source grain (for example `account_id = mart_account_segments.account_id`). Do not declare a KTX join just because the card SQL joins that table internally. If the output only exposes display fields such as `account_name`, keep the SQL source self-contained or project the key before adding the join. Use `many_to_one` for FK-to-dimension joins, `one_to_many` for the reverse.

View file

@ -12,7 +12,7 @@ A MetricFlow `semantic_model` maps to an SL source; MetricFlow `measures` map to
| MetricFlow | KTX form | Notes |
|---|---|---|
| `semantic_model: X { model: ref('t') }` with measures + dimensions | **Overlay** at `<connId>/X.yaml` with `measures`, `columns` (computed), `joins` | The `model:` ref resolves to a manifest table. |
| `semantic_model: X { model: ref('t') }` with measures + dimensions | **Overlay** at `<connId>/X.yaml` with `measures`, computed-only `columns`, `column_overrides`, `joins` | The `model:` ref resolves to a manifest table. |
| `semantic_model: X { model: source('s','t') }` | **Overlay** at `<connId>/X.yaml` over table `t`. | Same shape; `source()` still resolves to a physical table. |
| `semantic_model: X { model: <literal> }` with no manifest entry | **Standalone** with explicit `sql:`, `grain:`, `columns:` | Happens when the dbt manifest isn't available. |
| `semantic_model: Y { extends: X }` | **Merge** Y's measures/dimensions/entities into X's overlay, or write a single overlay named for the most-derived child (Y) containing both X's and Y's primitives | Do not emit a second overlay for X - flatten. |
@ -84,7 +84,7 @@ If `sl_discover` errors because no such table exists, use `discover_data` and
`entity_details` to find the warehouse target. If a SQL probe is still needed,
call `sql_execution` with the same warehouse connection id, for example:
`sql_execution({connectionId: "warehouse", sql: "SELECT 1 FROM analytics.orders LIMIT 0"})`.
**Never invent column names** - every column in `columns:`, `grain:`, and
**Never invent column names** - every column in computed `columns:`, `column_overrides:`, `grain:`, and
`sql:` must be sourced from raw files, `entity_details`, or a successful SQL
probe.

View file

@ -39,6 +39,10 @@ columns: # computed dimensions only
- name: is_large_order
type: boolean
expr: "amount > 1000"
column_overrides: # metadata patches for inherited columns
- name: status
descriptions:
user: "Order lifecycle status."
segments:
- name: paid_non_refunded
expr: "is_paid = true AND is_refunded = false"
@ -51,6 +55,7 @@ joins:
Rules:
- Do **not** repeat base-table columns, grain, `table`, or `source_type` in an overlay - those are inherited.
- Overlay columns MUST be computed (`expr` + `type`).
- Use `column_overrides` to add descriptions or metadata to inherited manifest columns. Do not put `type` or `expr` in `column_overrides`.
- `exclude_columns` hides specific manifest columns; `disable_joins` suppresses specific auto-detected joins.
### Standalone table sources
@ -110,7 +115,7 @@ An SQL source is a one-shot answer: the aggregation is frozen, callers cannot re
### Columns
Every standalone column requires `name` and `type`. Overlays have computed columns only.
Every standalone column requires `name` and `type`. Overlays have computed columns in `columns:` and manifest column metadata patches in `column_overrides:`.
- `type`: one of `string`, `number`, `boolean`, `time`. Map LookML `date`/`datetime`/`timestamp` → `time`. Map LookML `yesno` → `boolean`.
- `role` (optional): `time` enables time-granularity queries (month, week, day). `default` is the implicit fallback.

View file

@ -100,7 +100,33 @@ measures:
**Extract repeated filter bundles into named segments.** If the same predicate appears on multiple measures of the same source, lift it to a `segments[]` entry and have each measure reference it. One edit updates every measure that depends on it.
**Never write a standalone file on a manifest-backed name.** If `sl_discover({ query: "<table-or-source-name>" })` finds an existing schema for that name, you MUST write an overlay (`name:` + `measures:`/`segments:`/`descriptions:` only - no `sql:`, `table:`, `grain:`, `columns:`, `joins:`). A standalone with `sql:` or `table:` on a manifest-backed name clobbers the inherited columns and joins; `sl_write_source` and `sl_validate` both reject this shape with a clear fix hint. Always run `sl_discover` before your first write on any existing name.
**Never write a standalone file on a manifest-backed name.** If `sl_discover({ query: "<table-or-source-name>" })` finds an existing schema for that name, you MUST write an overlay. A standalone with `sql:` or `table:` on a manifest-backed name clobbers the inherited columns and joins; `sl_write_source` and `sl_validate` both reject this shape with a clear fix hint. Always run `sl_discover` before your first write on any existing name.
Overlay before/after examples:
```yaml
# Wrong: patches an inherited manifest column through columns:
name: fct_orders
columns:
- name: status
descriptions:
user: "Order lifecycle status."
```
```yaml
# Right: patch inherited columns with column_overrides:
name: fct_orders
column_overrides:
- name: status
descriptions:
user: "Order lifecycle status."
columns:
- name: is_large_order
type: boolean
expr: "amount > 1000"
```
Overlay YAML may include `measures:`, `segments:`, `descriptions:`, `joins:`, `disable_joins:`, `exclude_columns:`, `column_overrides:`, and computed-only `columns:` entries with `expr` and `type`. Do not include `sql:`, `table:`, `grain:`, or base-table `columns:`.
**Prefer overlay decomposition over standalone SQL sources.** Before reaching for `source_type: sql`, check whether the metric decomposes into measures on existing overlays (including cross-source derived measures). Use `source_type: sql` only when:
- The metric requires per-user/per-entity derivation that cannot be expressed as a single `expr` (e.g., `EXISTS` over a time-windowed subset), OR

View file

@ -1,4 +1,4 @@
import { KtxMessageBuilder, type KtxLlmProvider, type KtxModelRole } from '@ktx/llm';
import { KtxMessageBuilder, splitKtxSystemMessages, type KtxLlmProvider, type KtxModelRole } from '@ktx/llm';
import { generateText, stepCountIs, type TelemetrySettings, type Tool } from 'ai';
import { noopLogger, type KtxLogger } from '../core/index.js';
import { summarizeKtxLlmDebugRequest, type KtxLlmDebugRequestRecorder } from '../llm/index.js';
@ -36,14 +36,6 @@ export interface AgentRunnerServiceDeps {
logger?: KtxLogger;
}
/**
 * Separates system-role messages from the rest of a built prompt.
 *
 * @param messages - Prompt messages in their original order.
 * @returns An object whose `system` is `undefined` when there are no system
 *   messages, the single message when there is exactly one, or the array of
 *   them otherwise; `messages` holds every non-system message in order.
 */
function splitSystemPromptMessages(messages: ReturnType<KtxMessageBuilder['wrapSimple']>['messages']) {
  const systemOnly = messages.filter((entry) => entry.role === 'system');
  // Collapse a lone system message to the bare value; keep several as an array.
  const system = systemOnly.length > 1 ? systemOnly : systemOnly.length === 1 ? systemOnly[0] : undefined;
  const remaining = messages.filter((entry) => entry.role !== 'system');
  return { system, messages: remaining };
}
export class AgentRunnerService {
private readonly logger: KtxLogger;
@ -62,7 +54,7 @@ export class AgentRunnerService {
tools: params.toolSet,
model,
});
const promptMessages = splitSystemPromptMessages(built.messages);
const promptMessages = splitKtxSystemMessages(built.messages);
await this.deps.debugRequestRecorder?.record(
summarizeKtxLlmDebugRequest({

View file

@ -36,7 +36,13 @@ describe('localConnectionToWarehouseDescriptor', () => {
});
it('returns null for non-warehouse adapters', () => {
expect(localConnectionToWarehouseDescriptor('looker', { driver: 'looker' })).toBeNull();
expect(
localConnectionToWarehouseDescriptor('looker', {
driver: 'looker',
base_url: 'https://looker.example.com',
client_id: 'client',
}),
).toBeNull();
});
});
@ -48,7 +54,9 @@ describe('local connection info helpers', () => {
});
it('keeps non-warehouse adapter labels for display-only local connection surfaces', () => {
expect(localConnectionTypeForConfig('prod-metabase', { driver: 'metabase' })).toBe('metabase');
expect(localConnectionTypeForConfig('prod-metabase', { driver: 'metabase', api_url: 'https://metabase.example.com' })).toBe(
'metabase',
);
expect(localConnectionTypeForConfig('missing-driver', {} as never)).toBe('unknown');
});

View file

@ -13,7 +13,20 @@ export const KTX_NOTION_ORG_KNOWLEDGE_WARNING =
type KtxNotionCrawlMode = 'all_accessible' | 'selected_roots';
export interface KtxNotionConnectionConfig extends KtxProjectConnectionConfig {
type RawKtxNotionConnectionConfig = Extract<KtxProjectConnectionConfig, { driver: 'notion' }>;
export type KtxNotionConnectionConfig = Omit<
RawKtxNotionConnectionConfig,
| 'auth_token'
| 'auth_token_ref'
| 'crawl_mode'
| 'root_page_ids'
| 'root_database_ids'
| 'root_data_source_ids'
| 'max_pages_per_run'
| 'max_knowledge_creates_per_run'
| 'max_knowledge_updates_per_run'
> & {
driver: 'notion';
auth_token: string | null;
auth_token_ref: string | null;
@ -24,7 +37,7 @@ export interface KtxNotionConnectionConfig extends KtxProjectConnectionConfig {
max_pages_per_run: number;
max_knowledge_creates_per_run: number;
max_knowledge_updates_per_run: number;
}
};
export interface RedactedKtxNotionConnectionConfig {
driver: 'notion';

View file

@ -3,6 +3,7 @@ import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { buildDefaultKtxProjectConfig } from '../../../project/index.js';
import { connectionConfigSchema } from '../../../project/driver-schemas.js';
import { KtxYamlMetabaseSourceStateReader, LocalMetabaseDiscoveryCache } from './local-source-state-store.js';
describe('Metabase YAML source state and discovery cache', () => {
@ -23,10 +24,11 @@ describe('Metabase YAML source state and discovery cache', () => {
config: {
...buildDefaultKtxProjectConfig(),
connections: {
'prod-metabase': {
'prod-metabase': connectionConfigSchema.parse({
driver: 'metabase',
api_url: 'https://metabase.example.com',
mappings,
},
}),
},
},
};

View file

@ -38,7 +38,7 @@ describe('importMetricflowSemanticModels', () => {
const scoped = {
getManifestEntry: vi.fn().mockResolvedValue(null),
isManifestBacked: vi.fn().mockResolvedValue(false),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
loadSource: vi.fn().mockResolvedValue(null),
writeSource: vi.fn().mockResolvedValue({ warnings: [] }),
};
@ -104,7 +104,7 @@ describe('importMetricflowSemanticModels', () => {
const scoped = {
getManifestEntry: vi.fn().mockResolvedValue(null),
isManifestBacked: vi.fn().mockResolvedValue(false),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
loadSource: vi.fn().mockImplementation((connectionId: string, sourceName: string) =>
Promise.resolve(sourceName === 'orders' ? { name: 'orders' } : null),
),
@ -139,7 +139,7 @@ describe('importMetricflowSemanticModels', () => {
const scoped = {
getManifestEntry: vi.fn().mockResolvedValue(null),
isManifestBacked: vi.fn().mockResolvedValue(false),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
loadSource: vi.fn().mockResolvedValue(null),
writeSource: vi.fn().mockRejectedValueOnce(new Error('cannot write orders')).mockResolvedValue({ warnings: [] }),
};
@ -190,7 +190,7 @@ describe('importMetricflowSemanticModels', () => {
isManifestBacked: vi.fn().mockImplementation(async (_connectionId: string, sourceName: string) => {
return sourceName === 'orders';
}),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
loadSource: vi.fn().mockResolvedValue(null),
writeSource: vi.fn().mockImplementation(async (_connectionId: string, source: (typeof written)[number]) => {
written.push(source);
@ -268,7 +268,7 @@ describe('importMetricflowSemanticModels', () => {
isManifestBacked: vi.fn().mockImplementation(async (_connectionId: string, sourceName: string) => {
return sourceName === 'orders';
}),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
loadSource: vi.fn().mockResolvedValue(null),
writeSource: vi.fn().mockResolvedValue({ warnings: [] }),
};
@ -311,7 +311,7 @@ describe('importMetricflowSemanticModels', () => {
const scoped = {
getManifestEntry: vi.fn().mockResolvedValue(null),
isManifestBacked: vi.fn().mockResolvedValue(false),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
loadSource: vi.fn().mockResolvedValue(null),
writeSource: vi
.fn()

View file

@ -71,7 +71,7 @@ export async function importMetricflowSemanticModels(
let crossModelSourcesCreated = 0;
const preexistingSourceNames = new Set(
(await semanticLayerService.loadAllSources(input.connectionId)).map((source) => source.name),
(await semanticLayerService.loadAllSources(input.connectionId)).sources.map((source) => source.name),
);
const modelContexts: MetricflowSemanticModelImportContext[] = [];
const sourceNameByModelRef = new Map<string, string>();

View file

@ -187,7 +187,10 @@ const makeDeps = () => {
loadAllSources: vi
.fn()
.mockImplementation((connectionId: string) =>
Promise.resolve(connectionId === 'warehouse-2' ? [{ name: 'looker__orders' }] : []),
Promise.resolve({
sources: connectionId === 'warehouse-2' ? [{ name: 'looker__orders' }] : [],
loadErrors: [],
}),
),
};
const slSearchService = {
@ -1347,7 +1350,7 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
frontmatter: { sl_refs: ['looker__b2b__sales_pipeline.arr'] },
});
deps.semanticLayerService.loadAllSources.mockImplementation((connectionId: string) =>
Promise.resolve([{ name: `${connectionId}_source` }]),
Promise.resolve({ sources: [{ name: `${connectionId}_source` }], loadErrors: [] }),
);
deps.agentRunner.runLoop.mockImplementation(async (params: any) => {
if (params.telemetryTags.operationName === 'ingest-bundle-wu') {
@ -1447,7 +1450,7 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
parseArtifacts: { semanticModels: [{ name: 'orders' }] },
});
deps.semanticLayerService.loadAllSources.mockImplementation((connectionId: string) =>
Promise.resolve([{ name: `${connectionId}_source` }]),
Promise.resolve({ sources: [{ name: `${connectionId}_source` }], loadErrors: [] }),
);
const postProcessor = {
run: vi.fn().mockResolvedValue({
@ -1631,7 +1634,10 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
const deps = makeDeps();
deps.adapter.listTargetConnectionIds = vi.fn().mockResolvedValue(['postgres-warehouse']);
deps.semanticLayerService.loadAllSources.mockImplementation((connectionId: string) =>
Promise.resolve(connectionId === 'postgres-warehouse' ? [{ name: 'stg_accounts' }] : []),
Promise.resolve({
sources: connectionId === 'postgres-warehouse' ? [{ name: 'stg_accounts' }] : [],
loadErrors: [],
}),
);
const runner = buildRunner(deps);
@ -1659,7 +1665,10 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
it('does not resolve qualified fallback table refs by source name alone', async () => {
const deps = makeDeps();
deps.semanticLayerService.loadAllSources.mockResolvedValue([{ name: 'orders', table: 'sales.orders' }]);
deps.semanticLayerService.loadAllSources.mockResolvedValue({
sources: [{ name: 'orders', table: 'sales.orders' }],
loadErrors: [],
});
const runner = buildRunner(deps);
await expect(

View file

@ -300,7 +300,7 @@ export class IngestBundleRunner {
const blocks = await Promise.all(
connectionIds.map(async (connectionId) => {
try {
const sources = await this.deps.semanticLayerService.loadAllSources(connectionId);
const { sources } = await this.deps.semanticLayerService.loadAllSources(connectionId);
const names = sources.map((source) => source.name).sort((left, right) => left.localeCompare(right));
const body = names.length > 0 ? names.join('\n') : '(no sources yet)';
return `## ${connectionId}\n${body}`;
@ -329,7 +329,7 @@ export class IngestBundleRunner {
): Promise<boolean> {
for (const connectionId of connectionIds) {
try {
const sources = await semanticLayerService.loadAllSources(connectionId);
const { sources } = await semanticLayerService.loadAllSources(connectionId);
if (sources.some((source) => semanticSourceMatchesTableRef(source, tableRef))) {
return true;
}
@ -1211,7 +1211,7 @@ export class IngestBundleRunner {
].sort();
for (const connectionId of touchedConnections) {
try {
const allSources = await this.deps.semanticLayerService.loadAllSources(connectionId);
const { sources: allSources } = await this.deps.semanticLayerService.loadAllSources(connectionId);
await this.deps.slSearchService.indexSources(connectionId, allSources);
} catch (err) {
this.logger.warn(

View file

@ -27,11 +27,12 @@ describe('local mapping yaml reconciliation bridge', () => {
const project = projectWithConnections({
'prod-metabase': {
driver: 'metabase',
api_url: 'https://metabase.example.com',
mappings: {
databaseMappings: { '1': 'prod-warehouse' },
syncEnabled: { '1': true },
syncMode: 'ONLY',
selections: { collections: [12] },
selections: { collections: [12], items: [] },
defaultTagNames: ['ktx'],
},
},
@ -46,6 +47,8 @@ describe('local mapping yaml reconciliation bridge', () => {
const project = projectWithConnections({
'prod-looker': {
driver: 'looker',
base_url: 'https://looker.example.com',
client_id: 'client',
mappings: { connectionMappings: { analytics: 'prod-warehouse' } },
},
'prod-warehouse': { driver: 'postgres', url: 'postgresql://readonly@db.test/analytics' },

View file

@ -227,9 +227,10 @@ describe('PageTriageService', () => {
});
generateTextMock
.mockImplementationOnce((args: any) => {
const systemMessage = args.messages.find((m: { role: string }) => m.role === 'system');
const systemMessage = args.system ?? args.messages.find((m: { role: string }) => m.role === 'system');
const userMessage = args.messages.find((m: { role: string }) => m.role === 'user');
const systemText = systemMessage.content as string;
const systemText =
typeof systemMessage === 'string' ? systemMessage : (systemMessage.content as string);
const userText = userMessage.content as string;
expect(systemText).toContain(
'Reusable templates and scripts are durable knowledge regardless of subject matter.',

View file

@ -1,7 +1,7 @@
import { createHash } from 'node:crypto';
import { readdir, readFile } from 'node:fs/promises';
import { dirname, join, relative } from 'node:path';
import { KtxMessageBuilder, type KtxLlmProvider } from '@ktx/llm';
import { KtxMessageBuilder, splitKtxSystemMessages, type KtxLlmProvider } from '@ktx/llm';
import { generateText, type ToolSet } from 'ai';
import pLimit from 'p-limit';
import { z } from 'zod';
@ -346,10 +346,12 @@ export class PageTriageService {
tools: {},
model,
});
const split = splitKtxSystemMessages(built.messages);
const result = await this.runGenerateText({
model,
temperature: 0,
messages: built.messages,
...(split.system ? { system: split.system } : {}),
messages: split.messages,
tools: built.tools as ToolSet,
});
return result.text;

View file

@ -44,23 +44,26 @@ describe('repairWikiSlRefs', () => {
})),
};
const semanticLayerService = {
loadAllSources: vi.fn(async () => [
{
name: 'mart_customer_health',
grain: [],
columns: [],
joins: [],
measures: [{ name: 'high_risk_account_count', expr: 'count(*)' }],
segments: [{ name: 'high_risk', expr: "risk_level = 'high'" }],
},
{
name: 'int_procurement_qualifying_actions',
grain: [],
columns: [],
joins: [],
measures: [],
},
]),
loadAllSources: vi.fn(async () => ({
sources: [
{
name: 'mart_customer_health',
grain: [],
columns: [],
joins: [],
measures: [{ name: 'high_risk_account_count', expr: 'count(*)' }],
segments: [{ name: 'high_risk', expr: "risk_level = 'high'" }],
},
{
name: 'int_procurement_qualifying_actions',
grain: [],
columns: [],
joins: [],
measures: [],
},
],
loadErrors: [],
})),
};
const result = await repairWikiSlRefs({

View file

@ -56,7 +56,8 @@ async function loadVisibleSlRefs(
const warnings: string[] = [];
for (const connectionId of connectionIds) {
try {
for (const source of await semanticLayerService.loadAllSources(connectionId)) {
const { sources } = await semanticLayerService.loadAllSources(connectionId);
for (const source of sources) {
for (const ref of entityRefsForSource(source)) {
refs.add(ref);
}

View file

@ -1,4 +1,4 @@
import { KtxMessageBuilder, type KtxLlmProvider, type KtxModelRole } from '@ktx/llm';
import { KtxMessageBuilder, splitKtxSystemMessages, type KtxLlmProvider, type KtxModelRole } from '@ktx/llm';
import { generateText, Output, type FlexibleSchema, type ToolSet } from 'ai';
type GenerateTextInput = Parameters<typeof generateText>[0];
@ -29,10 +29,12 @@ export async function generateKtxText(input: GenerateKtxTextInput): Promise<stri
tools: input.tools ?? {},
model,
});
const split = splitKtxSystemMessages(built.messages);
const result = await (input.generateText ?? generateText)({
model,
temperature: input.temperature ?? 0,
messages: built.messages,
...(split.system ? { system: split.system } : {}),
messages: split.messages,
tools: built.tools as ToolSet,
...(hasTools(built.tools as ToolSet)
? {
@ -58,10 +60,12 @@ export async function generateKtxObject<TOutput, TSchema>(
tools: input.tools ?? {},
model,
});
const split = splitKtxSystemMessages(built.messages);
const result = await (input.generateText ?? generateText)({
model,
temperature: input.temperature ?? 0,
messages: built.messages,
...(split.system ? { system: split.system } : {}),
messages: split.messages,
tools: built.tools as ToolSet,
...(hasTools(built.tools as ToolSet)
? {

View file

@ -89,7 +89,7 @@ const buildMocks = (overrides: Partial<BuiltMocks> = {}): BuiltMocks => {
embeddingService: { computeEmbedding: vi.fn() },
semanticLayerService: {
forWorktree: vi.fn().mockReturnThis(),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
readSourceFile: vi.fn(),
},
slSearchService: { indexSources: vi.fn(), buildSearchText: vi.fn() },

View file

@ -308,7 +308,7 @@ export class MemoryAgentService {
// Reindex SL search if any SL actions actually landed on main.
if (hasSL && finalActions.some((a) => a.target === 'sl')) {
try {
const allSources = await this.deps.semanticLayerService.loadAllSources(input.connectionId!);
const { sources: allSources } = await this.deps.semanticLayerService.loadAllSources(input.connectionId!);
await this.deps.slSearchService.indexSources(input.connectionId!, allSources);
} catch (e) {
this.logger.warn(
@ -610,7 +610,7 @@ export class MemoryAgentService {
private async buildSlIndex(connectionId: string): Promise<string> {
const [sources, warehouseLine] = await Promise.all([
this.deps.semanticLayerService.loadAllSources(connectionId),
this.deps.semanticLayerService.loadAllSources(connectionId).then((result) => result.sources),
this.buildWarehouseLine(connectionId),
]);
const indexLines =

View file

@ -509,4 +509,11 @@ describe('generateKtxProjectConfigJsonSchema', () => {
const relationships = scan?.properties?.relationships as { properties?: Record<string, { description?: string }> };
expect(relationships?.properties?.acceptThreshold?.description).toMatch(/auto-accepted/);
});
it('emits the mappings shapes under connections', () => {
const serialized = JSON.stringify(schema);
expect(serialized).toContain('databaseMappings');
expect(serialized).toContain('connectionMappings');
expect(serialized).toContain('expectedLookerConnectionName');
});
});

View file

@ -1,6 +1,7 @@
import { KTX_MODEL_ROLES } from '@ktx/llm';
import YAML from 'yaml';
import * as z from 'zod';
import { connectionConfigSchema } from './driver-schemas.js';
const KTX_LLM_BACKENDS = ['none', 'anthropic', 'vertex', 'gateway'] as const;
const KTX_EMBEDDING_BACKENDS = ['none', 'deterministic', 'openai', 'sentence-transformers'] as const;
@ -206,12 +207,7 @@ const storageSchema = z
})
.describe('Storage backends and commit policy for KTX state and search indexes.');
const connectionSchema = z
.looseObject({
driver: z.string().min(1).optional().describe('Connector driver identifier (e.g. "postgres", "bigquery", "snowflake").'),
url: z.string().optional().describe('Connection URL or DSN. Format depends on the driver; may contain environment-variable references.'),
})
.describe('A single database/connector connection entry. Additional driver-specific fields are accepted and passed through.');
const connectionSchema = connectionConfigSchema;
const agentSchema = z
.strictObject({

View file

@ -0,0 +1,140 @@
import { describe, expect, it } from 'vitest';
import { connectionConfigSchema } from './driver-schemas.js';
// Warehouse-driver coverage for connectionConfigSchema: every listed driver
// literal parses, extra keys survive parsing, and an unlisted driver throws.
describe('connectionConfigSchema (driver discriminated union)', () => {
// Each row is [driver literal, sample URL]; the URLs are only checked for round-tripping.
it.each([
['postgres', 'postgres://user:pass@host:5432/db'], // pragma: allowlist secret
['postgresql', 'postgresql://user:pass@host:5432/db'], // pragma: allowlist secret
['mysql', 'mysql://user:pass@host:3306/db'], // pragma: allowlist secret
['snowflake', 'snowflake://account/db'],
['bigquery', 'bigquery://project/dataset'],
['sqlite', 'sqlite:///tmp/db.sqlite'],
['clickhouse', 'clickhouse://host:8123/db'],
['sqlserver', 'sqlserver://host:1433;database=db'],
])('parses %s warehouse connection', (driver, url) => {
expect(connectionConfigSchema.parse({ driver, url })).toMatchObject({ driver, url });
});
// historicSql and context are not declared on the warehouse schema; they must still be present after parse.
it('preserves unknown warehouse fields via looseObject passthrough', () => {
const parsed = connectionConfigSchema.parse({
driver: 'postgres',
url: 'postgres://x',
historicSql: { enabled: true },
context: { queryHistory: { enabled: false } },
});
expect(parsed).toMatchObject({
driver: 'postgres',
historicSql: { enabled: true },
context: { queryHistory: { enabled: false } },
});
});
// 'nope' matches none of the union's driver literals.
it('rejects an unknown driver', () => {
expect(() => connectionConfigSchema.parse({ driver: 'nope', url: 'x' })).toThrow();
});
});
// Covers the metabase, looker and lookml connection shapes, each with its own
// `mappings` block, plus the positive-integer key rule on Metabase mappings.
describe('connectionConfigSchema - context source drivers with mappings', () => {
it('parses a metabase connection with mappings', () => {
const parsed = connectionConfigSchema.parse({
driver: 'metabase',
api_url: 'https://metabase.example.com',
api_key_ref: 'env:METABASE_API_KEY', // pragma: allowlist secret
mappings: {
databaseMappings: { '3': 'prod-warehouse' },
syncEnabled: { '3': true },
syncMode: 'ONLY',
},
});
// toMatchObject is a subset match, so fields filled in by the schema do not fail this assertion.
expect(parsed).toMatchObject({
driver: 'metabase',
api_url: 'https://metabase.example.com',
mappings: {
databaseMappings: { '3': 'prod-warehouse' },
syncMode: 'ONLY',
},
});
});
it('parses a looker connection with connectionMappings', () => {
const parsed = connectionConfigSchema.parse({
driver: 'looker',
base_url: 'https://looker.example.com',
client_id: 'abc',
client_secret_ref: 'env:LOOKER_CLIENT_SECRET', // pragma: allowlist secret
mappings: { connectionMappings: { bigquery_prod: 'wh' } },
});
// toEqual: the parsed mappings must contain exactly the supplied connectionMappings.
expect(parsed.mappings).toEqual({ connectionMappings: { bigquery_prod: 'wh' } });
});
it('parses a lookml connection with expectedLookerConnectionName', () => {
const parsed = connectionConfigSchema.parse({
driver: 'lookml',
repoUrl: 'https://github.com/acme/looker.git',
branch: 'main',
mappings: { expectedLookerConnectionName: 'bigquery_prod' },
});
expect(parsed.mappings).toEqual({ expectedLookerConnectionName: 'bigquery_prod' });
});
// 'abc' is not a positive-integer string, so the databaseMappings key check must fail the parse.
it('rejects metabase mapping with non-integer database key', () => {
expect(() =>
connectionConfigSchema.parse({
driver: 'metabase',
api_url: 'https://x',
mappings: { databaseMappings: { abc: 'wh' } },
}),
).toThrow();
});
});
// Covers the notion, dbt and metricflow connection shapes: one accepted
// example per driver and one rejected Notion crawl_mode.
describe('connectionConfigSchema - notion / dbt / metricflow', () => {
it('parses a notion connection with selected_roots crawl', () => {
const parsed = connectionConfigSchema.parse({
driver: 'notion',
auth_token_ref: 'env:NOTION_TOKEN',
crawl_mode: 'selected_roots',
root_page_ids: ['abc', 'def'],
max_pages_per_run: 500,
});
expect(parsed).toMatchObject({
driver: 'notion',
crawl_mode: 'selected_roots',
root_page_ids: ['abc', 'def'],
max_pages_per_run: 500,
});
});
// 'everything' is outside the crawl_mode enum ('selected_roots' | 'all_accessible').
it('rejects notion with unknown crawl_mode', () => {
expect(() =>
connectionConfigSchema.parse({
driver: 'notion',
auth_token_ref: 'env:NOTION_TOKEN',
crawl_mode: 'everything',
}),
).toThrow();
});
it('parses a dbt connection from a local source_dir', () => {
const parsed = connectionConfigSchema.parse({
driver: 'dbt',
source_dir: '/tmp/dbt-project',
target: 'dev',
});
expect(parsed).toMatchObject({ driver: 'dbt', source_dir: '/tmp/dbt-project', target: 'dev' });
});
// MetricFlow settings live under a nested `metricflow` key rather than at the top level.
it('parses a metricflow connection with nested config', () => {
const parsed = connectionConfigSchema.parse({
driver: 'metricflow',
metricflow: {
repoUrl: 'https://github.com/acme/sl.git',
branch: 'main',
},
});
expect(parsed).toMatchObject({
driver: 'metricflow',
metricflow: { repoUrl: 'https://github.com/acme/sl.git' },
});
});
});

View file

@ -0,0 +1,205 @@
import * as z from 'zod';
import {
lookerMappingsSchema,
lookmlMappingsSchema,
metabaseMappingsSchema,
} from './mappings-yaml-schema.js';
// Driver identifiers treated as warehouse connections. Both 'postgres' and
// 'postgresql' are listed as separate literals. Within this file the tuple is
// only used to derive the WarehouseDriver type below.
const warehouseDrivers = [
'postgres',
'postgresql',
'mysql',
'snowflake',
'bigquery',
'sqlite',
'clickhouse',
'sqlserver',
] as const;
// Union of the string literals in warehouseDrivers.
type WarehouseDriver = (typeof warehouseDrivers)[number];
/**
 * Builds the connection schema for one warehouse driver.
 *
 * The returned loose object pins `driver` to the given literal, accepts an
 * optional non-empty `url`, and lets any other keys pass through unchanged.
 *
 * @param driver Warehouse driver literal the schema should match.
 * @returns A described loose-object schema for that driver.
 */
function warehouseConnectionSchema<const Driver extends WarehouseDriver>(driver: Driver) {
  const urlField = z
    .string()
    .min(1)
    .optional()
    .describe('Warehouse connection URL or DSN; may contain environment-variable references like env:DATABASE_URL.');
  const summary = `${driver} warehouse connection. Additional driver-tunable fields (e.g. historicSql, context.queryHistory) are accepted and passed through.`;
  const shape = { driver: z.literal(driver), url: urlField };
  return z.looseObject(shape).describe(summary);
}
// One schema per warehouse driver, spelled out as a const tuple and later
// spread into connectionConfigSchema.
// NOTE(review): presumably written out by hand (instead of mapping over
// warehouseDrivers) so each element keeps its own driver literal type —
// confirm before refactoring.
const warehouseConnectionSchemas = [
warehouseConnectionSchema('postgres'),
warehouseConnectionSchema('postgresql'),
warehouseConnectionSchema('mysql'),
warehouseConnectionSchema('snowflake'),
warehouseConnectionSchema('bigquery'),
warehouseConnectionSchema('sqlite'),
warehouseConnectionSchema('clickhouse'),
warehouseConnectionSchema('sqlserver'),
] as const;
// Matches decimal strings for integers >= 1 with no leading zero ("1", "42"; not "0", "01", "abc").
const positiveIntKeyRegex = /^[1-9]\d*$/;

/** Returns the validation message used when a key of `field` is not a positive-integer string. */
function positiveIntKeyMessage(field: string): string {
  return `${field} keys must be positive-integer strings (e.g. "1", "42")`;
}
// Metabase mappings schema with an extra check: every key of `databaseMappings`
// and of `syncEnabled` must be a positive-integer string. One custom issue is
// reported per offending key, at path [field, key].
const metabaseMappingsStrictSchema = metabaseMappingsSchema.superRefine((mappings, ctx) => {
  // Order matters for issue ordering: databaseMappings is reported before syncEnabled.
  const idKeyedFields = ['databaseMappings', 'syncEnabled'] as const;
  for (const field of idKeyedFields) {
    const invalidKeys = Object.keys(mappings[field] ?? {}).filter((key) => !positiveIntKeyRegex.test(key));
    for (const key of invalidKeys) {
      ctx.addIssue({
        code: 'custom',
        path: [field, key],
        message: positiveIntKeyMessage(field),
      });
    }
  }
});
// Connection entry for driver 'metabase'. Only `driver` and `api_url` are
// required; `api_key` and `api_key_ref` are both optional here, so this schema
// does not itself require a credential. Unknown keys are allowed (loose object).
const metabaseConnectionSchema = z
.looseObject({
driver: z.literal('metabase'),
api_url: z.string().url().describe('Metabase instance API URL (e.g. https://metabase.example.com).'),
api_key: z.string().min(1).optional().describe('Literal Metabase API key. Prefer api_key_ref for safety.'),
api_key_ref: z
.string()
.min(1)
.optional()
.describe('Reference to Metabase API key (e.g. env:METABASE_API_KEY or file:/path).'),
// Both spellings of the proxy key are accepted; their contents are not validated beyond being objects.
network_proxy: z.looseObject({}).optional().describe('Optional network proxy configuration (snake_case form).'),
networkProxy: z.looseObject({}).optional().describe('Optional network proxy configuration (camelCase form).'),
// Uses the strict variant so non-positive-integer database keys are rejected at parse time.
mappings: metabaseMappingsStrictSchema
.optional()
.describe('Metabase database-to-warehouse mappings and sync configuration.'),
})
.describe('Metabase context-source connection.');
// Connection entry for driver 'looker'. `base_url` (must be a URL) and
// `client_id` are required; both forms of the client secret are optional here.
// Unknown keys are allowed (loose object).
const lookerConnectionSchema = z
.looseObject({
driver: z.literal('looker'),
base_url: z.string().url().describe('Looker instance base URL (e.g. https://looker.example.com).'),
client_id: z.string().min(1).describe('Looker OAuth client ID.'),
client_secret: z.string().min(1).optional().describe('Literal Looker OAuth client secret. Prefer client_secret_ref.'),
client_secret_ref: z
.string()
.min(1)
.optional()
.describe('Reference to Looker OAuth client secret (e.g. env:LOOKER_CLIENT_SECRET).'),
mappings: lookerMappingsSchema.optional().describe('Looker connection-name to KTX warehouse mappings.'),
})
.describe('Looker context-source connection.');
// Connection entry for driver 'lookml'. `repoUrl` is the only required field
// besides `driver`; it is validated as a non-empty string, not as a URL.
// Unknown keys are allowed (loose object).
const lookmlConnectionSchema = z
.looseObject({
driver: z.literal('lookml'),
repoUrl: z
.string()
.min(1)
.describe('Git URL of the LookML project (https, ssh, or file:). Field is camelCase by convention.'),
// No default is applied here; the "main" default mentioned below is left to downstream code.
branch: z.string().min(1).optional().describe('Git branch (default "main" downstream).'),
path: z.string().optional().describe('Subdirectory within the repo when the LookML project lives in a monorepo.'),
auth_token_ref: z.string().min(1).optional().describe('Reference to Git auth token for private repos (e.g. env:GITHUB_TOKEN).'),
mappings: lookmlMappingsSchema.optional().describe('LookML expected-connection mapping for ingest gating.'),
})
.describe('LookML context-source connection.');
// Optional list of non-empty Notion IDs; shared by the three root_* fields.
const notionRootIdsField = (description: string) => z.array(z.string().min(1)).optional().describe(description);

// Optional integer per-run cap restricted to the inclusive range [min, max].
const notionRunLimitField = (min: number, max: number, description: string) =>
  z.number().int().min(min).max(max).optional().describe(description);

// Connection entry for driver 'notion'. Every field other than `driver` is
// optional and unknown keys are allowed (loose object). The crawl_mode
// description states a requirement on the root_* lists, but this schema has no
// cross-field check for it.
const notionConnectionSchema = z
  .looseObject({
    driver: z.literal('notion'),
    auth_token: z.string().min(1).optional().describe('Literal Notion integration token. Prefer auth_token_ref.'),
    auth_token_ref: z
      .string()
      .min(1)
      .optional()
      .describe('Reference to Notion integration token (e.g. env:NOTION_TOKEN).'),
    crawl_mode: z
      .enum(['selected_roots', 'all_accessible'])
      .optional()
      .describe(
        'Crawl scope. "selected_roots" requires at least one of root_page_ids, root_database_ids, root_data_source_ids.',
      ),
    root_page_ids: notionRootIdsField('Notion page IDs to crawl when crawl_mode is selected_roots.'),
    root_database_ids: notionRootIdsField('Notion database IDs to crawl when crawl_mode is selected_roots.'),
    root_data_source_ids: notionRootIdsField('Notion data source IDs to crawl when crawl_mode is selected_roots.'),
    max_pages_per_run: notionRunLimitField(1, 10000, 'Maximum Notion pages fetched in a single ingest run.'),
    max_knowledge_creates_per_run: notionRunLimitField(0, 25, 'Maximum new wiki pages created per run.'),
    max_knowledge_updates_per_run: notionRunLimitField(0, 100, 'Maximum existing wiki pages updated per run.'),
  })
  .describe('Notion context-source connection.');
// Connection entry for driver 'dbt'. Every field other than `driver` is
// optional: the schema accepts a local `source_dir`, a `repo_url`, both, or
// neither, and does not check that at least one project location is given.
// Unknown keys are allowed (loose object).
const dbtConnectionSchema = z
.looseObject({
driver: z.literal('dbt'),
source_dir: z.string().min(1).optional().describe('Absolute or project-relative path to a local dbt project.'),
repo_url: z.string().min(1).optional().describe('Git URL of the dbt project (https, ssh, or file:).'),
branch: z.string().min(1).optional().describe('Git branch when using repo_url.'),
path: z.string().optional().describe('Subdirectory within the repo when the dbt project lives in a monorepo.'),
auth_token_ref: z.string().min(1).optional().describe('Reference to Git auth token for private repos.'),
profiles_path: z.string().optional().describe('Override path to dbt profiles.yml.'),
target: z.string().min(1).optional().describe('dbt target name (e.g. dev, prod).'),
project_name: z.string().min(1).optional().describe('Override auto-detected dbt project name.'),
})
.describe('dbt context-source connection.');
// Settings nested under the `metricflow` key. `repoUrl` is the only required
// field; unknown keys are allowed (loose object).
const metricflowBlockSchema = z
  .looseObject({
    repoUrl: z.string().min(1).describe('Git URL of the MetricFlow / SL project.'),
    branch: z.string().min(1).optional().describe('Git branch (default "main").'),
    path: z.string().optional().describe('Subdirectory within the repo when the SL config lives in a monorepo.'),
    auth_token_ref: z.string().min(1).optional().describe('Reference to Git auth token for private repos.'),
  })
  .describe('Nested MetricFlow configuration block.');

// Connection entry for driver 'metricflow'. The nested `metricflow` block is
// required; unknown top-level keys are allowed (loose object).
const metricflowConnectionSchema = z
  .looseObject({
    driver: z.literal('metricflow'),
    metricflow: metricflowBlockSchema,
  })
  .describe('MetricFlow / SL context-source connection.');
// Schema for one connection entry, discriminated on `driver`. A value whose
// `driver` matches none of the member literals fails to parse. Members are the
// eight warehouse schemas followed by the six context-source schemas.
export const connectionConfigSchema = z.discriminatedUnion('driver', [
...warehouseConnectionSchemas,
metabaseConnectionSchema,
lookerConnectionSchema,
lookmlConnectionSchema,
notionConnectionSchema,
dbtConnectionSchema,
metricflowConnectionSchema,
]);
// Parsed (output) type of a connection entry, inferred from the schema above.
export type KtxConnectionConfig = z.infer<typeof connectionConfigSchema>;

View file

@ -15,6 +15,7 @@ export {
serializeKtxProjectConfig,
validateKtxProjectConfig,
} from './config.js';
export type { KtxConnectionConfig } from './driver-schemas.js';
export type { LocalGitFileStoreDeps } from './local-git-file-store.js';
export { LocalGitFileStore } from './local-git-file-store.js';
export { ktxLocalStateDbPath } from './local-state-db.js';

View file

@ -1,5 +1,8 @@
import { describe, expect, it } from 'vitest';
import {
lookerMappingsSchema,
lookmlMappingsSchema,
metabaseMappingsSchema,
parseConnectionMappingBootstrap,
parseLookmlMappingBootstrap,
parseLookerMappingBootstrap,
@ -82,4 +85,17 @@ describe('ktx.yaml mapping bootstrap schema', () => {
}),
).toMatchObject({ adapter: 'looker', connectionId: 'prod-looker' });
});
it('exports mapping shapes that parse documented examples', () => {
expect(metabaseMappingsSchema.parse({ databaseMappings: { '1': 'wh' } })).toMatchObject({
databaseMappings: { '1': 'wh' },
syncMode: 'ALL',
});
expect(lookerMappingsSchema.parse({ connectionMappings: { x: 'wh' } })).toEqual({
connectionMappings: { x: 'wh' },
});
expect(lookmlMappingsSchema.parse({ expectedLookerConnectionName: 'x' })).toEqual({
expectedLookerConnectionName: 'x',
});
});
});

View file

@ -1,5 +1,4 @@
import * as z from 'zod';
import type { KtxProjectConnectionConfig } from './config.js';
const metabaseSyncModeSchema = z.enum(['ALL', 'ONLY', 'EXCEPT']);
const positiveIntegerValueSchema = z.number().int().positive();
@ -11,24 +10,48 @@ const metabaseSelectionsSchema = z
items: z.array(positiveIntegerValueSchema).default([]),
});
const metabaseMappingsSchema = z
export const metabaseMappingsSchema = z
.object({
databaseMappings: z.record(z.string(), stringTargetSchema).default({}),
syncEnabled: z.record(z.string(), z.boolean()).default({}),
syncMode: metabaseSyncModeSchema.default('ALL'),
selections: metabaseSelectionsSchema.default({ collections: [], items: [] }),
defaultTagNames: z.array(z.string().min(1)).default([]),
});
databaseMappings: z
.record(z.string(), stringTargetSchema)
.default({})
.describe('Map of Metabase database ID (positive integer string) to KTX connection ID. Use null to explicitly unmap.'),
syncEnabled: z
.record(z.string(), z.boolean())
.default({})
.describe('Per-Metabase-database sync toggle, keyed by Metabase database ID string.'),
syncMode: metabaseSyncModeSchema
.default('ALL')
.describe('Sync scope: ALL ingests every mapped DB; ONLY restricts to syncEnabled=true; EXCEPT excludes syncEnabled=true.'),
selections: metabaseSelectionsSchema
.default({ collections: [], items: [] })
.describe('Optional Metabase collection and item IDs to scope ingest.'),
defaultTagNames: z
.array(z.string().min(1))
.default([])
.describe('Default tag names applied to ingested Metabase artifacts.'),
})
.describe('Metabase database-to-warehouse mapping and sync configuration.');
const lookerMappingsSchema = z
export const lookerMappingsSchema = z
.object({
connectionMappings: z.record(z.string().min(1), stringTargetSchema).default({}),
});
connectionMappings: z
.record(z.string().min(1), stringTargetSchema)
.default({})
.describe('Map of Looker connection name to KTX connection ID. Use null to explicitly unmap.'),
})
.describe('Looker connection-to-warehouse mapping configuration.');
const lookmlMappingsSchema = z
export const lookmlMappingsSchema = z
.object({
expectedLookerConnectionName: z.string().min(1).nullable().default(null),
});
expectedLookerConnectionName: z
.string()
.min(1)
.nullable()
.default(null)
.describe('Looker connection name that LookML models must declare; mismatches block sl_write_source at ingest time.'),
})
.describe('LookML connection-name expectation for ingest gating.');
export type MetabaseMappingBootstrap = {
adapter: 'metabase';
@ -54,6 +77,11 @@ export type LookmlMappingBootstrap = {
export type ConnectionMappingBootstrap = MetabaseMappingBootstrap | LookerMappingBootstrap | LookmlMappingBootstrap;
type MappingConnectionInput = Record<string, unknown> & {
driver?: unknown;
mappings?: unknown;
};
function recordValue(value: unknown): Record<string, unknown> {
return typeof value === 'object' && value !== null && !Array.isArray(value) ? (value as Record<string, unknown>) : {};
}
@ -66,13 +94,13 @@ function assertPositiveIntegerKeys(field: string, record: Record<string, unknown
}
}
function driverOf(connection: KtxProjectConnectionConfig): string {
function driverOf(connection: MappingConnectionInput): string {
return String(connection.driver ?? '').toLowerCase();
}
export function parseMetabaseMappingBootstrap(
connectionId: string,
connection: KtxProjectConnectionConfig,
connection: MappingConnectionInput,
): MetabaseMappingBootstrap {
const rawMappings = recordValue(connection.mappings);
assertPositiveIntegerKeys('databaseMappings', recordValue(rawMappings.databaseMappings));
@ -91,7 +119,7 @@ export function parseMetabaseMappingBootstrap(
export function parseLookerMappingBootstrap(
connectionId: string,
connection: KtxProjectConnectionConfig,
connection: MappingConnectionInput,
): LookerMappingBootstrap {
const parsed = lookerMappingsSchema.parse(recordValue(connection.mappings));
return {
@ -103,7 +131,7 @@ export function parseLookerMappingBootstrap(
export function parseLookmlMappingBootstrap(
connectionId: string,
connection: KtxProjectConnectionConfig,
connection: MappingConnectionInput,
): LookmlMappingBootstrap {
const parsed = lookmlMappingsSchema.parse(recordValue(connection.mappings));
return {
@ -115,7 +143,7 @@ export function parseLookmlMappingBootstrap(
export function parseConnectionMappingBootstrap(
connectionId: string,
connection: KtxProjectConnectionConfig,
connection: MappingConnectionInput,
): ConnectionMappingBootstrap | null {
if (!connection.mappings || typeof connection.mappings !== 'object' || Array.isArray(connection.mappings)) {
return null;

View file

@ -203,11 +203,11 @@ describe('KtxDescriptionGenerator', () => {
expect(generateText).toHaveBeenCalledWith(
expect.objectContaining({
temperature: 0.2,
system: expect.objectContaining({
role: 'system',
content: expect.stringContaining('Please provide a concise description in 12 words or less.'),
}),
messages: expect.arrayContaining([
expect.objectContaining({
role: 'system',
content: expect.stringContaining('Please provide a concise description in 12 words or less.'),
}),
expect.objectContaining({
role: 'user',
content: expect.stringContaining('<column_name> status </column_name>'),
@ -215,6 +215,8 @@ describe('KtxDescriptionGenerator', () => {
]),
}),
);
const lastCall = vi.mocked(generateText).mock.calls.at(-1)?.[0];
expect(lastCall?.messages?.some((message) => message.role === 'system')).toBe(false);
});
it('samples through the connector when column values are not pre-fetched', async () => {
@ -391,3 +393,289 @@ describe('KtxDescriptionGenerator', () => {
expect(cache.set).toHaveBeenCalledWith('__connection:Warehouse', 'Commerce orders');
});
});
describe('KtxDescriptionGenerator resilience', () => {
/** Builds a logger stub whose debug/info/warn/error methods are fresh `vi.fn()` spies. */
function createLogger() {
  const spyLogger = {
    debug: vi.fn(),
    info: vi.fn(),
    warn: vi.fn(),
    error: vi.fn(),
  };
  return spyLogger;
}
it('retries sampleTable on transient failure and uses sampled rows when it eventually succeeds', async () => {
const sampleTable = vi
.fn<NonNullable<KtxScanConnector['sampleTable']>>()
.mockRejectedValueOnce(new Error('pool: transient ECONNRESET'))
.mockRejectedValueOnce(new Error('pool: transient ECONNRESET'))
.mockResolvedValue({
headers: ['id', 'status'],
rows: [
[1, 'paid'],
[2, 'refunded'],
],
totalRows: 2,
});
const connector: KtxScanConnector = {
...createConnector(),
sampleTable,
};
const logger = createLogger();
const warnings: Array<{ code: string; table?: string }> = [];
const generator = new KtxDescriptionGenerator({
llmProvider: createLlmProvider('Commerce orders'),
logger,
onWarning: (warning) => warnings.push({ code: warning.code, ...(warning.table ? { table: warning.table } : {}) }),
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24, concurrencyLimit: 2 },
});
const description = await generator.generateTableDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
table: { catalog: null, db: 'public', name: 'orders' },
});
expect(description).toBe('Commerce orders');
expect(sampleTable).toHaveBeenCalledTimes(3);
expect(logger.warn).toHaveBeenCalledTimes(2);
expect(warnings).toEqual([]);
});
it('falls back to metadata-only prompt when sampleTable retries exhaust', async () => {
const sampleTable = vi
.fn<NonNullable<KtxScanConnector['sampleTable']>>()
.mockRejectedValue(new Error('pool: connection refused'));
const connector: KtxScanConnector = {
...createConnector(),
sampleTable,
};
const logger = createLogger();
const warnings: Array<{ code: string; table?: string; metadata?: Record<string, unknown> }> = [];
const generator = new KtxDescriptionGenerator({
llmProvider: createLlmProvider('Customer reference data'),
logger,
onWarning: (warning) =>
warnings.push({
code: warning.code,
...(warning.table ? { table: warning.table } : {}),
...(warning.metadata ? { metadata: warning.metadata } : {}),
}),
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24, concurrencyLimit: 2 },
});
const description = await generator.generateTableDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
table: {
catalog: null,
db: 'public',
name: 'customers',
columns: [
{ name: 'id', nativeType: 'uuid' },
{ name: 'email', nativeType: 'text', comment: 'Primary contact email' },
],
},
});
expect(description).toBe('Customer reference data');
expect(sampleTable).toHaveBeenCalledTimes(3);
expect(warnings.map((warning) => warning.code)).toEqual(['sampling_failed', 'description_fallback_used']);
expect(warnings[1]?.metadata?.reason).toBe('sampling_failed');
const userPrompt = (vi.mocked(generateText).mock.calls.at(-1)?.[0] as { messages: Array<{ role: string; content: string }> })
.messages.find((message) => message.role === 'user')?.content;
expect(userPrompt).toContain('Columns (metadata only, no sample rows)');
expect(userPrompt).toContain('email (text)');
expect(userPrompt).toContain('Primary contact email');
});
it('emits enrichment_failed and returns null when both sampling and metadata-only LLM fail', async () => {
const sampleTable = vi
.fn<NonNullable<KtxScanConnector['sampleTable']>>()
.mockRejectedValue(new Error('pool: connection refused'));
const connector: KtxScanConnector = {
...createConnector(),
sampleTable,
};
const warnings: string[] = [];
const generator = new KtxDescriptionGenerator({
llmProvider: createFailingLlmProvider(),
onWarning: (warning) => warnings.push(warning.code),
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
});
const description = await generator.generateTableDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
table: { catalog: null, db: 'public', name: 'orphan', columns: [{ name: 'id' }] },
});
expect(description).toBeNull();
expect(warnings).toEqual(['sampling_failed', 'enrichment_failed']);
});
it('uses metadata-only fallback when connector has no sampleTable', async () => {
const connector = createConnector();
const samplerWithoutTable: KtxScanConnector = {
...connector,
sampleTable: undefined,
};
const warnings: string[] = [];
const generator = new KtxDescriptionGenerator({
llmProvider: createLlmProvider('Orders mart'),
onWarning: (warning) => warnings.push(warning.code),
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
});
const description = await generator.generateTableDescription({
connectionId: 'conn-1',
connector: samplerWithoutTable,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
table: {
catalog: null,
db: 'public',
name: 'mart_orders',
columns: [{ name: 'order_id', nativeType: 'uuid' }],
},
});
expect(description).toBe('Orders mart');
expect(warnings).toEqual(['connector_capability_missing', 'description_fallback_used']);
});
it('aborts retry loop when the scan context signal fires', async () => {
const controller = new AbortController();
const sampleTable = vi.fn<NonNullable<KtxScanConnector['sampleTable']>>().mockImplementation(async () => {
controller.abort();
throw new Error('first attempt blew up');
});
const connector: KtxScanConnector = {
...createConnector(),
sampleTable,
};
const warnings: string[] = [];
const generator = new KtxDescriptionGenerator({
llmProvider: createLlmProvider('should not be called'),
onWarning: (warning) => warnings.push(warning.code),
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
});
await expect(
generator.generateTableDescription({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1', signal: controller.signal },
dataSourceType: 'POSTGRESQL',
table: { catalog: null, db: 'public', name: 'orders' },
}),
).rejects.toThrow('aborted');
expect(sampleTable).toHaveBeenCalledTimes(1);
expect(warnings).toEqual([]);
});
it('generates column descriptions from rawDescriptions when sampleColumn is unavailable', async () => {
const samplerWithoutColumn: KtxScanConnector = {
...createConnector(),
sampleColumn: undefined,
};
const logger = createLogger();
const generator = new KtxDescriptionGenerator({
llmProvider: createLlmProvider('Payment lifecycle state'),
logger,
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
});
const result = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector: samplerWithoutColumn,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [{ name: 'status', rawDescriptions: { db: 'order lifecycle state' } }],
},
});
expect(result.columnDescriptions).toEqual([['status', 'Payment lifecycle state']]);
expect(logger.warn).toHaveBeenCalled();
const userPrompt = (
vi.mocked(generateText).mock.calls.at(-1)?.[0] as { messages: Array<{ role: string; content: string }> }
).messages.find((message) => message.role === 'user')?.content;
expect(userPrompt).toContain('<sample_values> unavailable </sample_values>');
expect(userPrompt).toContain('<db_documentation> order lifecycle state </db_documentation>');
});
it('generates column descriptions from rawDescriptions when sampleColumn retries exhaust', async () => {
const sampleColumn = vi
.fn<NonNullable<KtxScanConnector['sampleColumn']>>()
.mockRejectedValue(new Error('pool: connection refused'));
const flakyConnector: KtxScanConnector = {
...createConnector(),
sampleColumn,
};
const generator = new KtxDescriptionGenerator({
llmProvider: createLlmProvider('Customer reference identifier'),
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
});
const result = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector: flakyConnector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [{ name: 'customer_id', rawDescriptions: { db: 'FK to customers.id' } }],
},
});
expect(sampleColumn).toHaveBeenCalledTimes(3);
expect(result.columnDescriptions).toEqual([['customer_id', 'Customer reference identifier']]);
});
it('skips column LLM call only when neither samples nor rawDescriptions are available', async () => {
const sampleColumn = vi
.fn<NonNullable<KtxScanConnector['sampleColumn']>>()
.mockResolvedValue({ values: [null, null], nullCount: 2, distinctCount: 0 });
const connector: KtxScanConnector = {
...createConnector(),
sampleColumn,
};
vi.mocked(generateText).mockClear();
const generator = new KtxDescriptionGenerator({
llmProvider: createLlmProvider('should not be called'),
settings: { columnMaxWords: 12, tableMaxWords: 18, dataSourceMaxWords: 24 },
});
const result = await generator.generateColumnDescriptions({
connectionId: 'conn-1',
connector,
context: { runId: 'run-1' },
dataSourceType: 'POSTGRESQL',
supportsNestedAnalysis: false,
table: {
catalog: null,
db: 'public',
name: 'orders',
columns: [{ name: 'opaque_blob' }],
},
});
expect(result.columnDescriptions).toEqual([['opaque_blob', null]]);
expect(generateText).not.toHaveBeenCalled();
});
});

View file

@ -5,11 +5,18 @@ import type {
KtxColumnSampleResult,
KtxScanContext,
KtxScanLoggerPort,
KtxScanWarning,
KtxTableRef,
KtxTableSampleInput,
KtxTableSampleResult,
} from './types.js';
/**
 * Column metadata that can accompany a table when a table description is
 * requested; the table prompt lists these columns when no sample rows are used.
 */
interface KtxDescriptionTableColumn {
  /** Column name; always rendered in the metadata-only column listing. */
  name: string;
  /** Type label; when set, the metadata-only listing shows it in parentheses after the name. */
  nativeType?: string | null;
  /** Free-text comment; when set, it is appended to the column's entry in the metadata-only listing. */
  comment?: string | null;
}
export interface KtxDescriptionCachePort {
buildTableKey(table: KtxTableRef): string;
buildColumnKey(table: KtxTableRef, columnName: string): string;
@ -53,6 +60,7 @@ export interface KtxDescriptionColumnTable extends KtxTableRef {
export interface KtxDescriptionTableInput extends KtxTableRef {
rawDescriptions?: Record<string, string>;
columns?: KtxDescriptionTableColumn[];
}
export interface KtxColumnAnalysisResult {
@ -72,7 +80,8 @@ export interface KtxColumnDescriptionPromptInput {
export interface KtxTableDescriptionPromptInput {
tableName: string;
sampleData: KtxTableSampleResult;
sampleData?: KtxTableSampleResult;
columns?: KtxDescriptionTableColumn[];
dataSourceType: string;
rawDescriptions?: Record<string, string>;
}
@ -114,6 +123,7 @@ export interface KtxDescriptionGeneratorOptions {
llmProvider: KtxLlmProvider;
cache?: KtxDescriptionCachePort;
logger?: KtxScanLoggerPort;
onWarning?: (warning: KtxScanWarning) => void;
settings: KtxDescriptionGenerationSettings;
}
@ -136,6 +146,66 @@ function errorMessage(error: unknown): string {
return error instanceof Error ? error.message : String(error);
}
/**
 * Error raised when a delay or retry loop is cut short because the caller's
 * AbortSignal fired. The message is always 'aborted'.
 */
class KtxAbortedError extends Error {
  constructor() {
    const abortMessage = 'aborted';
    super(abortMessage);
    this.name = 'KtxAbortedError';
  }
}
/**
 * Resolves after `ms` milliseconds.
 *
 * When `signal` is supplied the wait is cancellable: an already-aborted signal
 * rejects immediately, and an abort during the wait clears the timer and
 * rejects. Both rejections use `KtxAbortedError`. Without a signal this is a
 * plain timer.
 */
async function delayWithAbort(ms: number, signal?: AbortSignal): Promise<void> {
  if (signal?.aborted) {
    throw new KtxAbortedError();
  }
  await new Promise<void>((resolve, reject) => {
    if (!signal) {
      setTimeout(resolve, ms);
      return;
    }
    const handleAbort = (): void => {
      clearTimeout(pendingTimer);
      reject(new KtxAbortedError());
    };
    const pendingTimer = setTimeout(() => {
      // The wait finished normally; drop the listener so it does not linger on the signal.
      signal.removeEventListener('abort', handleAbort);
      resolve();
    }, ms);
    signal.addEventListener('abort', handleAbort, { once: true });
  });
}
/** Options accepted by `retryAsync`. */
interface RetryAsyncOptions {
  /** Maximum number of times the operation is invoked; `retryAsync` treats values below 1 as 1. */
  attempts: number;
  /** Wait after the first failure, in milliseconds; `retryAsync` doubles it after each further failure. */
  baseDelayMs: number;
  /** Optional abort signal, checked by `retryAsync` before every attempt and during the waits between attempts. */
  signal?: AbortSignal;
  /** Called after a failed attempt (other than an abort) with the thrown value and the 1-based attempt number. */
  onAttemptFailure?: (error: unknown, attempt: number) => void;
}
/**
 * Invokes `fn` until it resolves or the attempt budget is spent, waiting with
 * exponential backoff (`baseDelayMs`, then 2x, 4x, ...) between attempts.
 *
 * @param fn Operation to run; called once per attempt.
 * @param options Attempt budget, base delay, optional abort signal and an
 *   optional per-failure callback. The budget is normalised to a whole number
 *   of at least 1 (`NaN` counts as 1; fractions are rounded down).
 * @returns The value `fn` resolved with.
 * @throws KtxAbortedError when `options.signal` is aborted before an attempt
 *   or during a backoff wait. A `KtxAbortedError` thrown by `fn` itself is
 *   rethrown at once without calling `onAttemptFailure`.
 * @throws The error from the final attempt when every attempt fails.
 */
async function retryAsync<T>(fn: () => Promise<T>, options: RetryAsyncOptions): Promise<T> {
  // Math.max(1, NaN) is NaN, which would skip the loop entirely and end in
  // `throw undefined`. A fractional budget would never equal the loop counter,
  // adding a pointless backoff wait after the final attempt. Infinity is kept.
  const requestedAttempts = Number.isNaN(options.attempts) ? 1 : Math.floor(options.attempts);
  const attempts = Math.max(1, requestedAttempts);
  let lastError: unknown;
  for (let attempt = 1; attempt <= attempts; attempt += 1) {
    if (options.signal?.aborted) {
      throw new KtxAbortedError();
    }
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      // An abort is not a retryable failure: surface it immediately.
      if (error instanceof KtxAbortedError) {
        throw error;
      }
      options.onAttemptFailure?.(error, attempt);
      if (attempt === attempts) {
        break;
      }
      // Exponential backoff: baseDelayMs, 2x, 4x, ...
      const delay = options.baseDelayMs * 2 ** (attempt - 1);
      await delayWithAbort(delay, options.signal);
    }
  }
  throw lastError;
}
function toTableRef(table: KtxTableRef): KtxTableRef {
return {
catalog: table.catalog,
@ -205,11 +275,12 @@ Example:
systemParts.push(wordLimitLine(input.maxWords));
}
const sampleValuesContent = valuesStr.length > 0 ? valuesStr : 'unavailable';
let user = `<table_context> ${input.tableContext} </table_context>
<column_name> ${input.columnName} </column_name>
<sample_values> ${valuesStr} </sample_values>
<sample_values> ${sampleValuesContent} </sample_values>
`;
const sources = descriptionSources(input.rawDescriptions);
@ -228,16 +299,6 @@ Example:
export function buildKtxTableDescriptionPrompt(
input: KtxTableDescriptionPromptInput & { maxWords?: number },
): KtxDescriptionPrompt {
const columnInfo: string[] = [];
for (let index = 0; index < Math.min(input.sampleData.headers.length, 10); index += 1) {
const header = input.sampleData.headers[index];
const sampleValues = input.sampleData.rows
.slice(0, 3)
.map((row) => row[index])
.filter((value) => value !== null && value !== undefined);
columnInfo.push(`${header}: ${sampleValues.map((value) => String(value)).join(', ')}`);
}
const systemParts: string[] = [
`Analyze database tables and provide a concise description.
@ -256,9 +317,38 @@ Example: "Information about healthcare professionals used for workforce manageme
systemParts.push(wordLimitLine(input.maxWords));
}
// Describe the table's columns with the richest information available:
// sampled rows first, then column metadata, then an explicit "unavailable" marker.
const hasSamples = !!input.sampleData && input.sampleData.rows.length > 0;
let columnsLine: string;
let rowsLine: string;
if (hasSamples) {
  // hasSamples guarantees sampleData is present.
  const sampleData = input.sampleData!;
  const columnInfo: string[] = [];
  // Show at most 10 columns, each with the non-null values found in the first 3 rows.
  for (let index = 0; index < Math.min(sampleData.headers.length, 10); index += 1) {
    const header = sampleData.headers[index];
    const sampleValues = sampleData.rows
      .slice(0, 3)
      .map((row) => row[index])
      .filter((value) => value !== null && value !== undefined);
    columnInfo.push(`${header}: ${sampleValues.map((value) => String(value)).join(', ')}`);
  }
  columnsLine = `Columns and sample data: ${columnInfo.join(' | ')}`;
  rowsLine = `Total rows in sample: ${sampleData.rows.length}`;
} else if (input.columns && input.columns.length > 0) {
  // Metadata-only fallback: list at most 30 columns as `name (type): comment`.
  const columnInfo = input.columns.slice(0, 30).map((column) => {
    const typePart = column.nativeType ? ` (${column.nativeType})` : '';
    // Separate the comment from the name/type so the two do not run together
    // (previously rendered as e.g. "email (text)Primary contact email").
    const commentPart = column.comment ? `: ${column.comment}` : '';
    return `${column.name}${typePart}${commentPart}`;
  });
  columnsLine = `Columns (metadata only, no sample rows): ${columnInfo.join(' | ')}`;
  rowsLine = 'Sample rows: unavailable';
} else {
  columnsLine = 'Columns: unavailable';
  rowsLine = 'Sample rows: unavailable';
}
let user = `Table: ${input.tableName}
Columns and sample data: ${columnInfo.join(' | ')}
Total rows in sample: ${input.sampleData.rows.length}
${columnsLine}
${rowsLine}
Data source type: ${input.dataSourceType}`;
const sources = descriptionSources(input.rawDescriptions);
@ -313,12 +403,14 @@ export class KtxDescriptionGenerator {
private readonly llmProvider: KtxLlmProvider;
private readonly cache?: KtxDescriptionCachePort;
private readonly logger?: KtxScanLoggerPort;
private readonly onWarning?: (warning: KtxScanWarning) => void;
private readonly settings: ResolvedKtxDescriptionGenerationSettings;
constructor(options: KtxDescriptionGeneratorOptions) {
this.llmProvider = options.llmProvider;
this.cache = options.cache;
this.logger = options.logger;
this.onWarning = options.onWarning;
this.settings = {
columnMaxWords: options.settings.columnMaxWords,
tableMaxWords: options.settings.tableMaxWords,
@ -366,26 +458,82 @@ export class KtxDescriptionGenerator {
}
}
if (!input.connector.sampleTable) {
this.logger?.warn('KTX scan connector does not support table sampling for table description generation', {
const sampleTable = input.connector.sampleTable;
let sampleData: KtxTableSampleResult | null = null;
let fallbackReason: 'capability_missing' | 'sampling_failed' | 'empty_sample' | null = null;
if (!sampleTable) {
fallbackReason = 'capability_missing';
this.logger?.warn('KTX scan connector does not support table sampling; falling back to metadata-only prompt', {
connectorId: input.connector.id,
table: input.table.name,
});
return 'Table not found';
this.onWarning?.({
code: 'connector_capability_missing',
message: `Connector ${input.connector.id} does not support sampleTable; using metadata-only description prompt`,
table: input.table.name,
recoverable: true,
metadata: { connectorId: input.connector.id, capability: 'sampleTable' },
});
} else {
try {
sampleData = await retryAsync(
() =>
sampleTable(
{
connectionId: input.connectionId,
table: tableRef,
limit: 20,
},
input.context,
),
{
attempts: 3,
baseDelayMs: 200,
signal: input.context.signal,
onAttemptFailure: (error, attempt) => {
this.logger?.warn(
`sampleTable attempt ${attempt} failed for ${input.table.name}: ${errorMessage(error)}`,
{
connectorId: input.connector.id,
table: input.table.name,
attempt,
},
);
},
},
);
if (sampleData.rows.length === 0) {
fallbackReason = 'empty_sample';
this.logger?.warn('sampleTable returned no rows; using metadata-only prompt', {
connectorId: input.connector.id,
table: input.table.name,
});
}
} catch (error) {
if (error instanceof KtxAbortedError) {
throw error;
}
fallbackReason = 'sampling_failed';
this.logger?.error(`sampleTable exhausted retries for ${input.table.name}: ${errorMessage(error)}`, {
connectorId: input.connector.id,
table: input.table.name,
});
this.onWarning?.({
code: 'sampling_failed',
message: `Failed to sample table ${input.table.name} after retries: ${errorMessage(error)}`,
table: input.table.name,
recoverable: true,
metadata: { connectorId: input.connector.id, error: errorMessage(error) },
});
}
}
try {
const sampleData = await input.connector.sampleTable(
{
connectionId: input.connectionId,
table: tableRef,
limit: 20,
},
input.context,
);
const prompt = buildKtxTableDescriptionPrompt({
tableName: input.table.name,
sampleData,
...(fallbackReason === null && sampleData ? { sampleData } : {}),
...(input.table.columns && input.table.columns.length > 0 ? { columns: input.table.columns } : {}),
dataSourceType: input.dataSourceType,
rawDescriptions: input.table.rawDescriptions,
maxWords: this.settings.tableMaxWords,
@ -394,10 +542,38 @@ export class KtxDescriptionGenerator {
if (cacheKey && description) {
await this.cache?.set(cacheKey, description);
}
if (description && fallbackReason !== null) {
this.onWarning?.({
code: 'description_fallback_used',
message: `Generated table description without sample rows for ${input.table.name} (reason: ${fallbackReason})`,
table: input.table.name,
recoverable: true,
metadata: { connectorId: input.connector.id, reason: fallbackReason },
});
}
if (!description) {
this.onWarning?.({
code: 'enrichment_failed',
message: `Failed to generate description for table ${input.table.name}`,
table: input.table.name,
recoverable: true,
metadata: { connectorId: input.connector.id, usedFallback: fallbackReason !== null },
});
}
return description;
} catch (error) {
this.logger?.error(`Error generating table description: ${errorMessage(error)}`);
return 'Table not found';
this.logger?.error(`Error generating table description: ${errorMessage(error)}`, {
connectorId: input.connector.id,
table: input.table.name,
});
this.onWarning?.({
code: 'enrichment_failed',
message: `Failed to generate description for table ${input.table.name}: ${errorMessage(error)}`,
table: input.table.name,
recoverable: true,
metadata: { connectorId: input.connector.id },
});
return null;
}
}
@ -496,33 +672,64 @@ export class KtxDescriptionGenerator {
let columnValues = column.sampleValues;
if (!columnValues || columnValues.length === 0) {
if (!input.connector.sampleColumn) {
this.logger?.warn('KTX scan connector does not support column sampling for column description generation', {
this.logger?.warn('KTX scan connector does not support column sampling; using available metadata only', {
connectorId: input.connector.id,
table: input.table.name,
column: column.name,
});
return {
columnName: column.name,
description: null,
skipped: false,
processed: false,
};
columnValues = [];
} else {
const sampleColumn = input.connector.sampleColumn;
try {
const sample = await retryAsync(
() =>
sampleColumn(
{
connectionId: input.connectionId,
table: tableRef,
column: column.name,
limit: 50,
},
input.context,
),
{
attempts: 3,
baseDelayMs: 200,
signal: input.context.signal,
onAttemptFailure: (error, attempt) => {
this.logger?.warn(
`sampleColumn attempt ${attempt} failed for ${input.table.name}.${column.name}: ${errorMessage(error)}`,
{
connectorId: input.connector.id,
table: input.table.name,
column: column.name,
attempt,
},
);
},
},
);
columnValues = sample.values;
} catch (error) {
if (error instanceof KtxAbortedError) {
throw error;
}
this.logger?.warn(
`sampleColumn exhausted retries for ${input.table.name}.${column.name}; using available metadata only: ${errorMessage(error)}`,
{
connectorId: input.connector.id,
table: input.table.name,
column: column.name,
},
);
columnValues = [];
}
}
const sample = await input.connector.sampleColumn(
{
connectionId: input.connectionId,
table: tableRef,
column: column.name,
limit: 50,
},
input.context,
);
columnValues = sample.values;
}
const nonNullValues = (columnValues ?? []).filter((value) => value !== null && value !== undefined);
if (nonNullValues.length === 0) {
const hasRawDescriptions = descriptionSources(column.rawDescriptions).length > 0;
if (nonNullValues.length === 0 && !hasRawDescriptions) {
return {
columnName: column.name,
description: null,
@ -553,7 +760,14 @@ export class KtxDescriptionGenerator {
processed: description !== null,
};
} catch (error) {
this.logger?.error(`Error analyzing column '${column.name}': ${errorMessage(error)}`);
if (error instanceof KtxAbortedError) {
throw error;
}
this.logger?.error(`Error analyzing column '${column.name}': ${errorMessage(error)}`, {
connectorId: input.connector.id,
table: input.table.name,
column: column.name,
});
return {
columnName: column.name,
description: null,

View file

@ -404,6 +404,41 @@ describe('local scan enrichment', () => {
expect(result.resolvedRelationships).toBeNull();
});
it('forwards context.logger and emits warnings when sampleTable fails repeatedly', async () => {
const failingConnector: KtxScanConnector = {
...connector(),
sampleTable: vi.fn(async () => {
throw new Error('pool: ECONNRESET');
}),
};
const logger = {
debug: vi.fn(),
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
};
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',
mode: 'enriched',
detectRelationships: false,
connector: failingConnector,
context: { runId: 'scan-run-warnings', logger },
providers: createDeterministicLocalScanEnrichmentProviders({ embeddingDimensions: 6 }),
});
const codes = result.warnings.map((warning) => warning.code);
expect(codes).toContain('sampling_failed');
expect(codes).toContain('description_fallback_used');
expect(result.warnings.some((warning) => warning.table === 'customers')).toBe(true);
expect(logger.warn).toHaveBeenCalled();
expect(logger.error).toHaveBeenCalled();
// Each of the two tables produced sampling_failed + description_fallback_used, so 2 + 2 = 4 warnings minimum.
expect(result.warnings.length).toBeGreaterThanOrEqual(4);
// Sampling was retried 3× for each of the 2 tables = 6 calls
expect(failingConnector.sampleTable).toHaveBeenCalledTimes(6);
});
it('runs configured deterministic enrichment with descriptions and embeddings', async () => {
const result = await runLocalScanEnrichment({
connectionId: 'warehouse',

View file

@ -298,6 +298,18 @@ function descriptionTable(table: KtxSchemaTable): KtxDescriptionColumnTable {
};
}
/**
 * Reduces each column of a schema table to `{ name, nativeType, comment }`,
 * replacing a missing `nativeType` or `comment` with `null`. The result is
 * what gets passed as `columns` when a table description is generated.
 */
function tableMetadataColumns(table: KtxSchemaTable): Array<{
  name: string;
  nativeType?: string | null;
  comment?: string | null;
}> {
  return table.columns.map(({ name, nativeType, comment }) => ({
    name,
    nativeType: nativeType ?? null,
    comment: comment ?? null,
  }));
}
/** Returns `maxBatchSize` when it is a positive integer; otherwise falls back to 100. */
function embeddingBatchSize(maxBatchSize: number): number {
  const fallbackBatchSize = 100;
  if (!Number.isInteger(maxBatchSize) || maxBatchSize <= 0) {
    return fallbackBatchSize;
  }
  return maxBatchSize;
}
@ -308,9 +320,19 @@ async function generateDescriptions(input: {
context: KtxScanContext;
providers: KtxLocalScanEnrichmentProviders;
progress?: KtxProgressPort;
warnings?: KtxScanWarning[];
}): Promise<KtxLocalScanEnrichmentResult['descriptionUpdates']> {
const warningSink = input.warnings;
const generator = new KtxDescriptionGenerator({
llmProvider: input.providers.llm,
...(input.context.logger ? { logger: input.context.logger } : {}),
...(warningSink
? {
onWarning: (warning: KtxScanWarning) => {
warningSink.push(warning);
},
}
: {}),
settings: {
columnMaxWords: 16,
tableMaxWords: 24,
@ -355,6 +377,7 @@ async function generateDescriptions(input: {
db: table.db,
name: table.name,
rawDescriptions: table.comment ? { db: table.comment } : {},
columns: tableMetadataColumns(table),
},
});
return {
@ -559,6 +582,7 @@ export async function runLocalScanEnrichment(
context: input.context,
providers,
progress: descriptionProgress,
warnings,
}),
});
const embeddingProgress = progress?.startPhase(0.2);

View file

@ -166,11 +166,11 @@ describe('relationship LLM proposals', () => {
});
expect(generateText).toHaveBeenCalledWith(
expect.objectContaining({
system: expect.objectContaining({
role: 'system',
content: expect.stringContaining('You are helping KTX review possible SQL relationships'),
}),
messages: expect.arrayContaining([
expect.objectContaining({
role: 'system',
content: expect.stringContaining('You are helping KTX review possible SQL relationships'),
}),
expect.objectContaining({
role: 'user',
content: expect.stringContaining('"tables"'),
@ -178,9 +178,12 @@ describe('relationship LLM proposals', () => {
]),
}),
);
const call = (generateText.mock.calls as unknown as Array<[{ messages: Array<{ role: string; content: string }> }]>)[0]?.[0];
const call = (
generateText.mock.calls as unknown as Array<[{ messages: Array<{ role: string; content: string }> }]>
)[0]?.[0];
const userMessage = call?.messages.find((m) => m.role === 'user');
expect(userMessage?.content).not.toContain('You are helping KTX review possible SQL relationships');
expect(call?.messages.some((m) => m.role === 'system')).toBe(false);
});
it('skips deterministic providers without calling generateText', async () => {

View file

@ -345,7 +345,8 @@ export type KtxScanWarningCode =
| 'relationship_llm_invalid_reference'
| 'relationship_llm_proposal_failed'
| 'credential_redacted'
| 'enrichment_failed';
| 'enrichment_failed'
| 'description_fallback_used';
export interface KtxScanWarning {
code: KtxScanWarningCode;

View file

@ -392,6 +392,26 @@ describe('local semantic-layer helpers', () => {
).rejects.toThrow('Invalid semantic-layer source');
});
it('reports legacy overlay column patches with a file-attributed migration hint', async () => {
const invalidYaml = [
'name: orders',
'columns:',
' - name: status',
' descriptions:',
' user: Order status.',
'',
].join('\n');
await expect(
validateLocalSlSource(invalidYaml, { project, connectionId: 'warehouse', sourceName: 'orders' }),
).resolves.toEqual({
valid: false,
errors: [
"semantic-layer/warehouse/orders.yaml: column 'status' patches a manifest column but is in 'columns:' — move it to 'column_overrides:'",
],
});
});
it('rejects unsafe source paths', async () => {
await expect(
readLocalSlSource(project, {

View file

@ -12,6 +12,7 @@ import {
type ManifestTableEntry,
projectManifestEntry,
SemanticLayerService,
toResolvedWire,
} from './semantic-layer.service.js';
import type { PgliteSlSearchPrototypeOwnerOptions } from './pglite-sl-search-prototype.js';
import { loadLatestSlDictionaryEntries } from './sl-dictionary-profile.js';
@ -240,7 +241,12 @@ export async function loadLocalSlSourceRecords(
if (!base) {
continue;
}
const source = composeOverlay(base.source, parsed);
let source: SemanticLayerSource;
try {
source = composeOverlay(base.source, parsed);
} catch (error) {
throw new Error(`${path}: ${error instanceof Error ? error.message : String(error)}`);
}
sources.set(name, {
...summarizeSemanticSource({ connectionId, path, source }),
yaml: sourceToYaml(source),
@ -253,11 +259,28 @@ export async function loadLocalSlSourceRecords(
export async function validateLocalSlSource(
rawYaml: string,
options?: { project?: KtxLocalProject; connectionId?: string },
options?: { project?: KtxLocalProject; connectionId?: string; sourceName?: string },
): Promise<LocalSlValidationResult> {
try {
const parsed = parseYamlRecord(rawYaml);
const schema = parsed.table || parsed.sql ? sourceDefinitionSchema : sourceOverlaySchema;
if (schema === sourceOverlaySchema && Array.isArray(parsed.columns)) {
const sourceName = options?.sourceName ?? (typeof parsed.name === 'string' ? parsed.name : 'source');
const path =
options?.connectionId && isSafeConnectionId(options.connectionId)
? `semantic-layer/${options.connectionId}/${sourceName}.yaml`
: `${sourceName}.yaml`;
const legacyColumnPatchErrors = parsed.columns
.filter((column): column is Record<string, unknown> => isRecord(column))
.filter((column) => typeof column.name === 'string' && (!column.expr || !column.type))
.map(
(column) =>
`${path}: column '${column.name}' patches a manifest column but is in 'columns:' — move it to 'column_overrides:'`,
);
if (legacyColumnPatchErrors.length > 0) {
return { valid: false, errors: legacyColumnPatchErrors };
}
}
const result = schema.parse(parsed);
const errors: string[] = [];
@ -268,6 +291,10 @@ export async function validateLocalSlSource(
);
}
if ('table' in result || 'sql' in result) {
toResolvedWire(result as SemanticLayerSource);
}
return { valid: errors.length === 0, errors };
} catch (error) {
return { valid: false, errors: validationErrors(error) };

View file

@ -1,4 +1,4 @@
import type { SemanticLayerQueryInput, SemanticLayerSource } from './types.js';
import type { ResolvedSemanticLayerSource, SemanticLayerQueryInput } from './types.js';
export interface KtxConnectionInfo {
id: string;
@ -20,7 +20,7 @@ export interface SlConnectionCatalogPort {
export interface SlPythonPort {
validateSources(input: {
sources: SemanticLayerSource[];
sources: ResolvedSemanticLayerSource[];
dialect: string;
recently_touched?: string[];
}): Promise<{
@ -28,7 +28,7 @@ export interface SlPythonPort {
error?: unknown;
}>;
query(input: {
sources: SemanticLayerSource[];
sources: ResolvedSemanticLayerSource[];
query: SemanticLayerQueryInput;
dialect: string;
}): Promise<{ data?: { sql?: string; plan?: Record<string, unknown> } | null; error?: unknown }>;

View file

@ -0,0 +1,68 @@
import { execFileSync } from 'node:child_process';
import { Ajv2020 } from 'ajv/dist/2020.js';
import { describe, expect, it } from 'vitest';
import { resolvedSourceSchema } from './schemas.js';
import { toResolvedWire } from './semantic-layer.service.js';
import type { SemanticLayerSource } from './types.js';
/**
 * Runs `uv run python -m semantic_layer dump-schema` (working directory
 * '../../../../' resolved against this module's URL) and parses its stdout
 * as JSON.
 *
 * @returns The parsed schema object, or `null` when the command fails or its
 *   output is not valid JSON. The contract suite in this file is skipped when
 *   the result is `null`.
 */
function loadPythonSourceDefinitionSchema(): Record<string, unknown> | null {
  const dumpSchemaArgs = ['run', 'python', '-m', 'semantic_layer', 'dump-schema'];
  try {
    const workingDirectory = new URL('../../../../', import.meta.url);
    // stdin and stderr are ignored; only stdout is captured.
    const schemaJson = execFileSync('uv', dumpSchemaArgs, {
      cwd: workingDirectory,
      encoding: 'utf8',
      stdio: ['ignore', 'pipe', 'ignore'],
    });
    return JSON.parse(schemaJson) as Record<string, unknown>;
  } catch {
    // Best-effort: any failure means the schema is unavailable.
    return null;
  }
}
const sourceDefinitionJsonSchema = loadPythonSourceDefinitionSchema();
/**
 * Sources pushed through `toResolvedWire` and then checked against both the
 * Zod resolved-source schema and the Python-provided JSON Schema.
 */
const fixtures: SemanticLayerSource[] = [
  // Table-backed source exercising the optional dbt-keyed metadata fields.
  {
    name: 'orders',
    table: 'public.orders',
    grain: ['id'],
    columns: [
      { name: 'id', type: 'number' },
      {
        name: 'status',
        type: 'string',
        descriptions: { dbt: 'Order lifecycle status.' },
        constraints: { dbt: { not_null: true } },
        enum_values: { dbt: ['placed', 'shipped'] },
        tests: { dbt: [{ name: 'accepted_values', package: 'dbt' }] },
      },
    ],
    joins: [{ to: 'customers', on: 'orders.customer_id = customers.id', relationship: 'many_to_one' }],
    measures: [{ name: 'order_count', expr: 'count(id)' }],
    segments: [{ name: 'paid', expr: "status = 'paid'" }],
    default_time_dimension: { dbt: 'created_at' },
    tags: { dbt: ['mart'] },
    freshness: { dbt: { loaded_at_field: 'updated_at' } },
  },
  // SQL-backed source with only the required fields.
  {
    name: 'aav_orders',
    // 'paid' is a string literal; unquoted it would be read as a column reference.
    sql: "select id, status from public.orders where status = 'paid'",
    grain: ['id'],
    columns: [{ name: 'id', type: 'number' }],
    joins: [],
    measures: [],
  },
];
// Contract suite: skipped entirely when the Python schema could not be loaded.
describe.skipIf(sourceDefinitionJsonSchema === null)('resolved source JSON Schema contract', () => {
  it('keeps TS resolved-source fixtures accepted by the Python SourceDefinition schema', () => {
    const validator = new Ajv2020({ allErrors: true, strict: false }).compile(
      sourceDefinitionJsonSchema as Record<string, unknown>,
    );

    fixtures.forEach((fixture) => {
      const wire = toResolvedWire(fixture);

      // The wire payload must satisfy the TypeScript-side schema...
      const zodOutcome = resolvedSourceSchema.safeParse(wire);
      expect(zodOutcome.success).toBe(true);

      // ...and the JSON Schema dumped by the Python side; Ajv's errors become the failure message.
      const accepted = validator(wire);
      expect(accepted, JSON.stringify(validator.errors, null, 2)).toBe(true);
    });
  });
});

View file

@ -78,6 +78,8 @@ const joinDeclarationSchema = z.object({
alias: z.string().optional(),
});
/** Join declaration for resolved sources: the fields of `joinDeclarationSchema` with `.strict()` applied, so unrecognized keys fail validation. */
const resolvedJoinDeclarationSchema = joinDeclarationSchema.strict();
const sourceColumnSchema = z.object({
name: unqualifiedNameSchema,
// type/descriptions optional on standalone sources: compose-time enrichment fills them
@ -89,24 +91,39 @@ const sourceColumnSchema = z.object({
visibility: z.enum(columnVisibilityValues).optional(),
descriptions: descriptionsSchema.optional(),
expr: z.string().optional(),
natural_granularity: z.string().optional(),
constraints: sourceKeyedColumnConstraintsSchema.optional(),
enum_values: sourceKeyedStringArraySchema.optional(),
tests: dbtColumnTestsSchema.optional(),
});
/** Overlay column: type requires expr (structural types are inherited from manifest). */
/** Resolved column: `sourceColumnSchema` with `type` made mandatory and `.strict()` applied, so unrecognized keys fail validation. */
const resolvedSourceColumnSchema = sourceColumnSchema.extend({
type: z.enum(columnTypeValues),
}).strict();
/** Overlay column: computed columns only. Structural columns live in the manifest. */
const overlayColumnSchema = z
.object({
name: unqualifiedNameSchema,
type: z.enum(columnTypeValues).optional(),
type: z.enum(columnTypeValues),
role: z.enum(columnRoleValues).optional(),
visibility: z.enum(columnVisibilityValues).optional(),
descriptions: descriptionsSchema.optional(),
expr: z.string().optional(),
expr: z.string().min(1),
})
.refine((col) => !col.type || col.expr, {
message: "Overlay column with 'type' must also have 'expr' (only computed columns may specify a type)",
});
.strict();
/**
 * Entry of an overlay's `column_overrides` list: a metadata patch addressed by `name`.
 * It declares no `type` or `expr` field, and `.strict()` rejects any key not listed here.
 */
const columnOverrideSchema = z
.object({
name: unqualifiedNameSchema,
role: z.enum(columnRoleValues).optional(),
visibility: z.enum(columnVisibilityValues).optional(),
descriptions: descriptionsSchema.optional(),
constraints: sourceKeyedColumnConstraintsSchema.optional(),
enum_values: sourceKeyedStringArraySchema.optional(),
tests: dbtColumnTestsSchema.optional(),
})
.strict();
/** Standalone source: has `table` or `sql`, requires grain + columns. */
export const sourceDefinitionSchema = z
@ -143,6 +160,26 @@ export const sourceDefinitionSchema = z
message: "Standalone source must have exactly one of 'table' or 'sql' (not both)",
});
/**
 * Resolved source: the shape `toResolvedWire` validates before a source is handed to the Python port.
 * Requires a non-empty `grain` and `columns`, defaults `joins` and `measures` to empty arrays,
 * and rejects unrecognized keys via `.strict()`.
 */
export const resolvedSourceSchema = z
.object({
name: z.string().min(1),
descriptions: descriptionsSchema.optional(),
table: z.string().optional(),
sql: z.string().optional(),
grain: z.array(unqualifiedNameSchema).min(1),
columns: z.array(resolvedSourceColumnSchema).min(1),
joins: z.array(resolvedJoinDeclarationSchema).default([]),
measures: z.array(slMeasureDefinitionSchema).default([]),
segments: z.array(segmentDefinitionSchema).optional(),
default_time_dimension: defaultTimeDimensionDbtSchema.optional(),
tags: sourceKeyedStringArraySchema.optional(),
freshness: sourceFreshnessSchema.optional(),
})
.strict()
// Exactly one of table/sql must be truthy; an empty string counts as absent.
.refine((s) => (s.table || s.sql) && !(s.table && s.sql), {
message: "Resolved source must have exactly one of 'table' or 'sql' (not both)",
});
/** Overlay source: no table/sql, all fields optional except name. */
export const sourceOverlaySchema = z
.object({
@ -150,6 +187,7 @@ export const sourceOverlaySchema = z
descriptions: z.record(z.string(), z.string()).optional(),
grain: z.array(unqualifiedNameSchema).optional(),
columns: z.array(overlayColumnSchema).optional(),
column_overrides: z.array(columnOverrideSchema).optional(),
joins: z.array(joinDeclarationSchema).optional(),
measures: z.array(slMeasureDefinitionSchema).optional(),
segments: z.array(segmentDefinitionSchema).optional(),

View file

@ -2,13 +2,17 @@ import type { Mock } from 'vitest';
import { beforeEach, describe, expect, it, vi } from 'vitest';
import {
ColumnNameCollisionError,
composeOverlay,
ConflictingExcludeAndOverrideError,
enrichColumnsFromManifest,
findDanglingSegmentRefs,
projectManifestEntry,
SemanticLayerService,
toResolvedWire,
UnknownColumnOverrideError,
} from './semantic-layer.service.js';
import { sourceDefinitionSchema } from './schemas.js';
import { resolvedSourceSchema, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js';
import type { SemanticLayerSource } from './types.js';
const pythonPort = {
@ -139,6 +143,69 @@ describe('composeOverlay', () => {
expect(composed.measures).toHaveLength(1);
});
it('applies column_overrides to same-named manifest columns', () => {
  const composed = composeOverlay(baseTable, {
    name: 'fct_labs',
    column_overrides: [
      { name: 'lab_order_id', descriptions: { user: 'Primary key' } },
      { name: 'admin_user_id', descriptions: { user: 'FK to admin_users' } },
    ],
  });
  const findColumn = (columnName: string) => composed.columns.find((column) => column.name === columnName);

  // Overrides are merged into the existing columns, so the column count must not grow.
  expect(composed.columns).toHaveLength(3);

  const labOrder = findColumn('lab_order_id');
  expect(labOrder?.type).toBe('string');
  expect(labOrder?.descriptions).toEqual({ user: 'Primary key' });

  const adminUser = findColumn('admin_user_id');
  expect(adminUser?.type).toBe('string');
  expect(adminUser?.descriptions).toEqual({ user: 'FK to admin_users' });
});
it('appends computed columns alongside column overrides', () => {
  const composed = composeOverlay(baseTable, {
    name: 'fct_labs',
    column_overrides: [{ name: 'lab_order_id', descriptions: { user: 'PK doc' } }],
    columns: [{ name: 'is_byol', type: 'boolean', expr: "lab_type = 'byol'" }],
  });
  const findColumn = (columnName: string) => composed.columns.find((column) => column.name === columnName);

  // The computed column is added on top of the base columns; the override keeps the base type.
  expect(composed.columns).toHaveLength(4);
  expect(findColumn('is_byol')?.expr).toBe("lab_type = 'byol'");
  expect(findColumn('lab_order_id')?.type).toBe('string');
});
it('rejects column_overrides that target unknown manifest columns', () => {
  // 'missing' is not a column of the base table used by this suite.
  const overlay = {
    name: 'fct_labs',
    column_overrides: [{ name: 'missing', descriptions: { user: 'Nope' } }],
  };
  const compose = () => composeOverlay(baseTable, overlay);

  expect(compose).toThrow(UnknownColumnOverrideError);
});
it('rejects computed columns whose names collide with manifest columns', () => {
  // A computed column may not reuse the name of a column the manifest already provides.
  const overlay = {
    name: 'fct_labs',
    columns: [{ name: 'lab_order_id', type: 'string', expr: 'lab_order_id' }],
  };
  const compose = () => composeOverlay(baseTable, overlay);

  expect(compose).toThrow(ColumnNameCollisionError);
});
it('rejects exclude/override conflicts before applying exclusions', () => {
  // The same column is both excluded and overridden, which is contradictory.
  const overlay = {
    name: 'fct_labs',
    exclude_columns: ['lab_order_id'],
    column_overrides: [{ name: 'lab_order_id', descriptions: { user: 'Hidden PK' } }],
  };
  const compose = () => composeOverlay(baseTable, overlay);

  expect(compose).toThrow(ConflictingExcludeAndOverrideError);
});
it('merges overlay descriptions (plural) with base descriptions keyed by source', () => {
const baseWithDescriptions: SemanticLayerSource = {
...baseTable,
@ -418,6 +485,62 @@ describe('sourceDefinitionSchema', () => {
});
});
describe('sourceOverlaySchema', () => {
  it('accepts column_overrides and keeps columns computed-only', () => {
    const overlay = {
      name: 'orders',
      column_overrides: [{ name: 'status', descriptions: { user: 'Lifecycle status' } }],
      columns: [{ name: 'is_paid', type: 'boolean', expr: "status = 'paid'" }],
    };

    expect(sourceOverlaySchema.safeParse(overlay).success).toBe(true);
  });

  it('rejects typeless overlay columns and singular description on overrides', () => {
    const result = sourceOverlaySchema.safeParse({
      name: 'orders',
      // Singular `description` is not a key the override schema declares.
      column_overrides: [{ name: 'status', description: 'Lifecycle status' }],
      // A `columns` entry without `type` and `expr` is not a computed column.
      columns: [{ name: 'status', descriptions: { user: 'Lifecycle status' } }],
    });

    expect(result.success).toBe(false);
    const issuePaths = result.success ? [] : result.error.issues.map((issue) => issue.path.join('.'));
    for (const expectedPath of ['column_overrides.0', 'columns.0.type', 'columns.0.expr']) {
      expect(issuePaths).toContain(expectedPath);
    }
  });
});
describe('toResolvedWire', () => {
  it('strips TS-only authoring and provenance fields before the Python boundary', () => {
    // Carries the fields that must not cross the boundary:
    // `inherits_columns_from`, `usage`, and the per-join `source`.
    const authored: SemanticLayerSource = {
      name: 'orders',
      table: 'public.orders',
      inherits_columns_from: 'orders',
      grain: ['id'],
      columns: [{ name: 'id', type: 'string' }],
      joins: [{ to: 'customers', on: 'orders.customer_id = customers.id', relationship: 'many_to_one', source: 'formal' }],
      measures: [],
      usage: {
        narrative: 'Frequently queried orders.',
        frequencyTier: 'high',
        commonFilters: ['status'],
        commonJoins: [],
      },
    };
    const expectedWire = {
      name: 'orders',
      table: 'public.orders',
      grain: ['id'],
      columns: [{ name: 'id', type: 'string' }],
      joins: [{ to: 'customers', on: 'orders.customer_id = customers.id', relationship: 'many_to_one' }],
      measures: [],
    };

    const wire = toResolvedWire(authored);

    expect(wire).toEqual(expectedWire);
    // Re-parsing the wire payload must be a no-op.
    expect(resolvedSourceSchema.parse(wire)).toEqual(wire);
  });
});
describe('projectManifestEntry', () => {
it('projects manifest usage onto the semantic-layer source', () => {
const source = projectManifestEntry('orders', {
@ -537,7 +660,8 @@ describe('loadAllSources — standalone enrichment via inherits_columns_from', (
].join('\n'),
});
const sources = await service.loadAllSources('conn-1');
const { sources, loadErrors } = await service.loadAllSources('conn-1');
expect(loadErrors).toEqual([]);
expect(sources[0]).toMatchObject({
name: 'orders',
@ -601,7 +725,8 @@ describe('loadAllSources — standalone enrichment via inherits_columns_from', (
return Promise.reject(new Error(`Unexpected readFile: ${path}`));
});
const sources = await service.loadAllSources('conn-1');
const { sources, loadErrors } = await service.loadAllSources('conn-1');
expect(loadErrors).toEqual([]);
const aav = sources.find((s) => s.name === 'aav_consignments');
expect(aav).toBeDefined();
expect(aav?.columns).toEqual([
@ -646,7 +771,8 @@ describe('loadAllSources — standalone enrichment via inherits_columns_from', (
});
});
const sources = await service.loadAllSources('conn-1');
const { sources, loadErrors } = await service.loadAllSources('conn-1');
expect(loadErrors).toEqual([]);
const aav = sources.find((s) => s.name === 'aav_consignments');
expect(aav?.columns[0].type).toBe('string');
});
@ -670,7 +796,8 @@ describe('loadAllSources — standalone enrichment via inherits_columns_from', (
].join('\n'),
});
const sources = await service.loadAllSources('conn-1');
const { sources, loadErrors } = await service.loadAllSources('conn-1');
expect(loadErrors).toEqual([]);
const aav = sources.find((s) => s.name === 'aav_consignments');
expect(aav?.columns).toEqual([{ name: 'FOO', type: 'string' }]);
});
@ -693,7 +820,8 @@ describe('loadAllSources — standalone enrichment via inherits_columns_from', (
].join('\n'),
});
const sources = await service.loadAllSources('conn-1');
const { sources, loadErrors } = await service.loadAllSources('conn-1');
expect(loadErrors).toEqual([]);
expect(sources[0]).toMatchObject({
name: 'orders',
@ -701,6 +829,33 @@ describe('loadAllSources — standalone enrichment via inherits_columns_from', (
columns: [{ name: 'id', type: 'string', descriptions: { user: 'Stable order identifier.' } }],
});
});
// Scenario: a manifest shard defines `orders`, and an overlay file patches the manifest column `id`
// through `columns:` instead of `column_overrides:`. The load must report an error that names
// the overlay file and points the author at `column_overrides:`.
it('reports file-attributed errors for legacy overlay column patches', async () => {
const schemaPath = 'semantic-layer/conn-1/_schema/marts.yaml';
const overlayPath = 'semantic-layer/conn-1/orders.yaml';
configService.listFiles.mockResolvedValue({ files: [schemaPath, overlayPath] });
configService.readFile.mockImplementation((path: string) => {
// Manifest shard content.
if (path === schemaPath) {
return Promise.resolve({
content: [
'tables:',
' orders:',
' table: public.orders',
' columns:',
' - { name: id, type: string, pk: true }',
].join('\n'),
});
}
// Any other path yields the overlay that patches `id` via `columns:`.
return Promise.resolve({
content: ['name: orders', 'columns:', ' - name: id', ' descriptions: { user: "Stable id." }'].join('\n'),
});
});
const { loadErrors } = await service.loadAllSources('conn-1');
expect(loadErrors.join('\n')).toContain(overlayPath);
expect(loadErrors.join('\n')).toContain("move it to 'column_overrides:'");
});
});
describe('validateWithProposedSource', () => {

View file

@ -4,8 +4,14 @@ import { noopLogger } from '../core/index.js';
import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js';
import type { SlConnectionCatalogPort, SlPythonPort } from './ports.js';
import { normalizeSemanticLayerDescriptions } from './description-normalization.js';
import { isOverlaySource, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js';
import type { SemanticLayerQueryExecutionResult, SemanticLayerQueryInput, SemanticLayerSource } from './types.js';
import { isOverlaySource, resolvedSourceSchema, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js';
import type {
ResolvedSemanticLayerSource,
SemanticLayerColumnOverride,
SemanticLayerQueryExecutionResult,
SemanticLayerQueryInput,
SemanticLayerSource,
} from './types.js';
interface WriteSourceOptions {
skipValidation?: boolean;
@ -14,6 +20,30 @@ interface WriteSourceOptions {
const SL_DIR_PREFIX = 'semantic-layer';
const CONNECTION_ID_PATTERN = /^[a-zA-Z0-9][a-zA-Z0-9_-]*$/;
/** Return value of `SemanticLayerService.loadAllSources`. */
export interface LoadAllSourcesResult {
// Sources that were loaded (and, for overlays, composed) successfully.
sources: SemanticLayerSource[];
// One message per manifest shard or YAML file that failed to parse or compose.
loadErrors: string[];
}
/** Thrown by `composeOverlay` when a `column_overrides` entry names a column the manifest source does not have. */
export class UnknownColumnOverrideError extends Error {
  constructor(...args: ConstructorParameters<typeof Error>) {
    super(...args);
    // Without this the inherited name stays 'Error', hiding the failure kind in logs and stack traces.
    this.name = 'UnknownColumnOverrideError';
  }
}
/** Thrown by `composeOverlay` when a computed overlay column reuses the name of a manifest column. */
export class ColumnNameCollisionError extends Error {
  constructor(...args: ConstructorParameters<typeof Error>) {
    super(...args);
    // Without this the inherited name stays 'Error', hiding the failure kind in logs and stack traces.
    this.name = 'ColumnNameCollisionError';
  }
}
/** Thrown by `composeOverlay` when a column appears in both `exclude_columns` and `column_overrides`. */
export class ConflictingExcludeAndOverrideError extends Error {
  constructor(...args: ConstructorParameters<typeof Error>) {
    super(...args);
    // Without this the inherited name stays 'Error', hiding the failure kind in logs and stack traces.
    this.name = 'ConflictingExcludeAndOverrideError';
  }
}
/** Thrown when an overlay fails the authoring schema or a resolved source fails the resolved-source schema. */
class ComposeContractError extends Error {
  constructor(...args: ConstructorParameters<typeof Error>) {
    super(...args);
    // Without this the inherited name stays 'Error', hiding the failure kind in logs and stack traces.
    this.name = 'ComposeContractError';
  }
}
/**
 * Report whether `error` is an instance of one of the compose-time error classes
 * declared in this module.
 */
function isComposeError(error: unknown): boolean {
  const composeErrorTypes = [
    UnknownColumnOverrideError,
    ColumnNameCollisionError,
    ConflictingExcludeAndOverrideError,
    ComposeContractError,
  ];
  return composeErrorTypes.some((errorType) => error instanceof errorType);
}
/**
 * Prefix a failure with the file it came from, as `<filePath>: <message>`.
 *
 * @param filePath label placed before the colon
 * @param error thrown value; an `Error` contributes its `message`, anything else is stringified
 */
function formatComposeError(filePath: string, error: unknown): string {
  if (error instanceof Error) {
    return `${filePath}: ${error.message}`;
  }
  return `${filePath}: ${String(error)}`;
}
function formatPortError(error: unknown, fallback: string): string {
if (typeof error === 'string') {
return error;
@ -37,6 +67,24 @@ function formatPortError(error: unknown, fallback: string): string {
return fallback;
}
/**
 * Build the payload for a source as it is sent to the Python side.
 *
 * Works on a copy: each column is shallow-copied, the `source` key is dropped
 * from every join, and the top-level `inherits_columns_from`, `usage` and
 * `source_type` keys are removed. The result is validated against
 * `resolvedSourceSchema`.
 *
 * @param source the source to convert; it is not mutated
 * @returns the data produced by the schema parse
 * @throws ComposeContractError listing every schema issue as `path: message`, joined by `; `
 */
export function toResolvedWire(source: SemanticLayerSource): ResolvedSemanticLayerSource {
  const wireCandidate = { ...source } as Record<string, unknown>;
  wireCandidate.columns = source.columns.map((column) => ({ ...column }));
  wireCandidate.joins = source.joins.map((join) => {
    const { source: _source, ...wireJoin } = join;
    return wireJoin;
  });
  for (const tsOnlyKey of ['inherits_columns_from', 'usage', 'source_type']) {
    delete wireCandidate[tsOnlyKey];
  }

  const outcome = resolvedSourceSchema.safeParse(wireCandidate);
  if (outcome.success) {
    return outcome.data as ResolvedSemanticLayerSource;
  }

  const issues = outcome.error.issues.map((issue) => `${issue.path.join('.')}: ${issue.message}`).join('; ');
  throw new ComposeContractError(`resolved source '${source.name}' violates the TS/Python contract: ${issues}`);
}
export class SemanticLayerService {
constructor(
private readonly configService: KtxFileStorePort,
@ -158,16 +206,17 @@ export class SemanticLayerService {
}
}
async loadAllSources(connectionId: string): Promise<SemanticLayerSource[]> {
async loadAllSources(connectionId: string): Promise<LoadAllSourcesResult> {
const dir = `${SL_DIR_PREFIX}/${connectionId}`;
const schemaDir = `${dir}/_schema`;
const loadErrors: string[] = [];
let allFiles: string[];
try {
const result = await this.configService.listFiles(dir);
allFiles = result.files.filter((f) => f.endsWith('.yaml'));
} catch {
return [];
return { sources: [], loadErrors };
}
// 1. Load manifest shards from _schema/*.yaml → project to sources
@ -184,7 +233,9 @@ export class SemanticLayerService {
}
}
} catch (e) {
this.logger.warn(`Failed to parse manifest shard ${filePath}: ${e}`);
const message = `Failed to parse manifest shard ${filePath}: ${e instanceof Error ? e.message : String(e)}`;
loadErrors.push(message);
this.logger.warn(message);
}
}
@ -227,6 +278,7 @@ export class SemanticLayerService {
);
}
}
toResolvedWire(standalone);
sources.set(name, standalone);
} else {
// Overlay — compose with manifest entry if present
@ -238,11 +290,15 @@ export class SemanticLayerService {
}
}
} catch (e) {
this.logger.warn(`Failed to parse YAML file ${filePath}: ${e}`);
const message = isComposeError(e)
? formatComposeError(filePath, e)
: `Failed to parse YAML file ${filePath}: ${e instanceof Error ? e.message : String(e)}`;
loadErrors.push(message);
this.logger.warn(message);
}
}
return Array.from(sources.values());
return { sources: Array.from(sources.values()), loadErrors };
}
/**
@ -622,8 +678,10 @@ export class SemanticLayerService {
connectionId: string,
proposedSource: SemanticLayerSource,
): Promise<{ errors: string[]; warnings: string[]; perSourceWarnings: Record<string, string[]> }> {
const existing = await this.loadAllSources(connectionId);
const loaded = await this.loadAllSources(connectionId);
const existing = loaded.sources;
const merged = existing.filter((s) => s.name !== proposedSource.name);
const loadErrors = [...loaded.loadErrors];
// Overlays (no table/sql) must be composed with their manifest base before
// validation, otherwise the filter below drops them and the edited source
@ -641,11 +699,27 @@ export class SemanticLayerService {
perSourceWarnings: {},
};
}
toPush = composeOverlay(base, { ...proposedSource });
try {
toPush = composeOverlay(base, { ...proposedSource });
} catch (error) {
return {
errors: [...loadErrors, formatComposeError(`${proposedSource.name}.yaml`, error)],
warnings: [],
perSourceWarnings: {},
};
}
} else if (proposedSource.inherits_columns_from) {
const base = await this.findManifestEntryByTableRef(connectionId, proposedSource.inherits_columns_from);
if (base) {
toPush = enrichColumnsFromManifest(proposedSource, base);
try {
toPush = enrichColumnsFromManifest(proposedSource, base);
} catch (error) {
return {
errors: [...loadErrors, formatComposeError(`${proposedSource.name}.yaml`, error)],
warnings: [],
perSourceWarnings: {},
};
}
}
// Miss is non-fatal — the source ships unenriched, validator will surface
// any column-without-type errors via the warehouse probe.
@ -654,37 +728,37 @@ export class SemanticLayerService {
const validatable = merged.filter((s) => s.table != null || s.sql != null);
if (validatable.length === 0) {
return { errors: [], warnings: [], perSourceWarnings: {} };
return { errors: loadErrors, warnings: [], perSourceWarnings: {} };
}
const dialect = await this.getDialectForConnection(connectionId);
try {
const { data, error } = await this.python.validateSources({
sources: validatable,
sources: validatable.map(toResolvedWire),
dialect,
recently_touched: [proposedSource.name],
});
if (error) {
const errorMsg = formatPortError(error, 'Unknown validation error');
return { errors: [errorMsg], warnings: [], perSourceWarnings: {} };
return { errors: [...loadErrors, errorMsg], warnings: [], perSourceWarnings: {} };
}
if (!data) {
return {
errors: await this.validatePhysicalTableReferences(connectionId, validatable),
errors: [...loadErrors, ...(await this.validatePhysicalTableReferences(connectionId, validatable))],
warnings: [],
perSourceWarnings: {},
};
}
const physicalErrors = await this.validatePhysicalTableReferences(connectionId, validatable);
return {
errors: [...(data.errors ?? []), ...physicalErrors],
errors: [...loadErrors, ...(data.errors ?? []), ...physicalErrors],
warnings: data.warnings ?? [],
perSourceWarnings: data.per_source_warnings ?? {},
};
} catch (e) {
return {
errors: [`Validation call failed: ${e instanceof Error ? e.message : String(e)}`],
errors: [...loadErrors, `Validation call failed: ${e instanceof Error ? e.message : String(e)}`],
warnings: [],
perSourceWarnings: {},
};
@ -692,23 +766,23 @@ export class SemanticLayerService {
}
async validateSourcesForConnection(connectionId: string): Promise<{ errors: string[]; warnings: string[] }> {
const allSources = await this.loadAllSources(connectionId);
const { sources: allSources, loadErrors } = await this.loadAllSources(connectionId);
const sources = allSources.filter((source) => source.table != null || source.sql != null);
if (sources.length === 0) {
return { errors: [], warnings: [] };
return { errors: loadErrors, warnings: [] };
}
const dialect = await this.getDialectForConnection(connectionId);
const { data, error } = await this.python.validateSources({ sources, dialect });
const { data, error } = await this.python.validateSources({ sources: sources.map(toResolvedWire), dialect });
if (error) {
return { errors: [formatPortError(error, 'Unknown validation error')], warnings: [] };
return { errors: [...loadErrors, formatPortError(error, 'Unknown validation error')], warnings: [] };
}
if (!data) {
return { errors: await this.validatePhysicalTableReferences(connectionId, sources), warnings: [] };
return { errors: [...loadErrors, ...(await this.validatePhysicalTableReferences(connectionId, sources))], warnings: [] };
}
const physicalErrors = await this.validatePhysicalTableReferences(connectionId, sources);
return {
errors: [...(data.errors ?? []), ...physicalErrors],
errors: [...loadErrors, ...(data.errors ?? []), ...physicalErrors],
warnings: data.warnings ?? [],
};
}
@ -802,6 +876,7 @@ export class SemanticLayerService {
} else {
// Overlay — check references against manifest
const excludeColumns = (data.exclude_columns as string[]) ?? [];
const columnOverrides = (data.column_overrides as Array<{ name: string }> | undefined) ?? [];
const disableJoins = (data.disable_joins as string[]) ?? [];
const cols = manifestColumns.get(name);
const joins = manifestJoins.get(name);
@ -817,6 +892,16 @@ export class SemanticLayerService {
}
}
const excluded = new Set(excludeColumns);
for (const override of columnOverrides) {
if (!cols.has(override.name)) {
warnings.push(`${name}: column_overrides references non-existent column '${override.name}'`);
}
if (excluded.has(override.name)) {
warnings.push(`${name}: column '${override.name}' appears in both exclude_columns and column_overrides`);
}
}
for (const joinOn of disableJoins) {
const normalized = joinOn.replace(/\s+/g, ' ').trim();
if (!joins?.has(normalized)) {
@ -999,7 +1084,10 @@ export class SemanticLayerService {
*/
async executeQuery(connectionId: string, query: SemanticLayerQueryInput): Promise<SemanticLayerQueryExecutionResult> {
// 1. Load sources, filtering out sources with no table or sql
const allSources = await this.loadAllSources(connectionId);
const { sources: allSources, loadErrors } = await this.loadAllSources(connectionId);
if (loadErrors.length > 0) {
throw new Error(`Semantic layer source load failed: ${loadErrors.join('; ')}`);
}
const sources = allSources.filter((s) => {
if (!s.table && !s.sql) {
this.logger.warn(`Skipping source "${s.name}" with no table or sql defined`);
@ -1021,7 +1109,7 @@ export class SemanticLayerService {
// 3. Generate SQL via python SL engine
const { data: slResult, error: slError } = await this.python.query({
sources,
sources: sources.map(toResolvedWire),
query,
dialect,
});
@ -1092,18 +1180,20 @@ export function projectManifestEntry(name: string, entry: ManifestTableEntry): S
const grain = pkColumns.length > 0 ? pkColumns : entry.columns.map((c) => c.name);
// Table-level dbt config from manifest shards is surfaced on the source for search / tools.
return {
const source: SemanticLayerSource = {
name,
table: entry.table,
descriptions: entry.descriptions,
grain,
columns,
joins: (entry.joins ?? []).map((j) => ({ to: j.to, on: j.on, relationship: j.relationship, source: j.source })),
joins: (entry.joins ?? []).map((j) => ({ to: j.to, on: j.on, relationship: j.relationship })),
measures: [],
...(entry.tags?.dbt?.length ? { tags: entry.tags } : {}),
...(entry.freshness?.dbt ? { freshness: entry.freshness } : {}),
...(entry.usage ? { usage: entry.usage } : {}),
};
toResolvedWire(source);
return source;
}
function normalizeWs(s: string): string {
@ -1331,6 +1421,7 @@ const COMPOSE_KNOWN_KEYS = new Set([
'descriptions',
'grain',
'columns',
'column_overrides',
'joins',
'measures',
'segments',
@ -1365,14 +1456,48 @@ export function composeOverlay(base: SemanticLayerSource, overlay: Record<string
result.usage = normalizedOverlay.usage as SemanticLayerSource['usage'];
}
// Filter out excluded columns
const excluded = new Set((normalizedOverlay.exclude_columns as string[] | undefined) ?? []);
let columns = result.columns.filter((c) => !excluded.has(c.name));
const columnOverrides = (normalizedOverlay.column_overrides as SemanticLayerColumnOverride[] | undefined) ?? [];
const overrideNames = columnOverrides.map((column) => column.name);
const conflictingOverrides = overrideNames.filter((name) => excluded.has(name));
if (conflictingOverrides.length > 0) {
throw new ConflictingExcludeAndOverrideError(
`column_overrides conflict with exclude_columns for '${base.name}': ${conflictingOverrides.join(', ')}`,
);
}
// Append overlay computed columns
const overlayColumns = (normalizedOverlay.columns as SemanticLayerSource['columns'] | undefined) ?? [];
columns = [...columns, ...overlayColumns];
result.columns = columns;
const baseByLowerName = new Map(base.columns.map((column) => [column.name.toLowerCase(), column]));
const columnsByLowerName = new Map(
result.columns.filter((column) => !excluded.has(column.name)).map((column) => [column.name.toLowerCase(), column]),
);
for (const override of columnOverrides) {
const key = override.name.toLowerCase();
const baseColumn = baseByLowerName.get(key);
if (!baseColumn) {
throw new UnknownColumnOverrideError(
`column '${override.name}' in column_overrides does not exist on manifest source '${base.name}'`,
);
}
const baseDescriptions = baseColumn.descriptions ?? {};
const overrideDescriptions = override.descriptions ?? {};
const merged = { ...baseColumn, ...override };
if (Object.keys(baseDescriptions).length > 0 || Object.keys(overrideDescriptions).length > 0) {
merged.descriptions = { ...baseDescriptions, ...overrideDescriptions };
}
columnsByLowerName.set(key, merged);
}
const computedColumns = (normalizedOverlay.columns as SemanticLayerSource['columns'] | undefined) ?? [];
for (const column of computedColumns) {
if (baseByLowerName.has(column.name.toLowerCase())) {
throw new ColumnNameCollisionError(
`column '${column.name}' in columns patches a manifest column on '${base.name}' — move it to 'column_overrides:'`,
);
}
columnsByLowerName.set(column.name.toLowerCase(), column);
}
result.columns = [...columnsByLowerName.values()];
// Measures from overlay only
result.measures = (normalizedOverlay.measures as SemanticLayerSource['measures'] | undefined) ?? [];
@ -1401,6 +1526,12 @@ export function composeOverlay(base: SemanticLayerSource, overlay: Record<string
const newJoins = overlayJoins.filter((j) => !existingKeys.has(`${j.to}::${normalizeWs(j.on)}`));
result.joins = [...manifestJoins, ...newJoins];
const overlayParse = sourceOverlaySchema.safeParse(normalizedOverlay);
if (!overlayParse.success) {
const issues = overlayParse.error.issues.map((issue) => `${issue.path.join('.')}: ${issue.message}`).join('; ');
throw new ComposeContractError(`overlay for '${base.name}' violates the authoring schema: ${issues}`);
}
toResolvedWire(result);
return result;
}
@ -1464,5 +1595,7 @@ export function enrichColumnsFromManifest(
}
return merged;
});
return { ...source, columns: enrichedColumns };
const enriched = { ...source, columns: enrichedColumns };
toResolvedWire(enriched);
return enriched;
}

View file

@ -7,7 +7,7 @@ import { SlDiscoverTool } from './sl-discover.tool.js';
function makeTool() {
const semanticLayerService = {
listConnectionIdsWithNames: vi.fn(async () => [] as Array<{ id: string; name: string; connectionType: string }>),
loadAllSources: vi.fn(async () => [] as SemanticLayerSource[]),
loadAllSources: vi.fn(async () => ({ sources: [] as SemanticLayerSource[], loadErrors: [] })),
};
const slSearchService = {
search: vi.fn(async () => []),
@ -53,7 +53,8 @@ describe('SlDiscoverTool - session-scoped reads', () => {
listConnectionIdsWithNames: vi.fn().mockResolvedValue([
{ id: 'warehouse', name: 'warehouse', connectionType: 'postgres' },
]),
loadAllSources: vi.fn().mockResolvedValue([
loadAllSources: vi.fn().mockResolvedValue({
sources: [
{
name: 'orders',
table: 'public.orders',
@ -62,7 +63,9 @@ describe('SlDiscoverTool - session-scoped reads', () => {
measures: [],
joins: [],
},
]),
],
loadErrors: [],
}),
};
const result = await tool.call({}, makeContext({ session: makeSession(sessionSemanticLayerService) }));

View file

@ -101,7 +101,7 @@ Use this to understand what data is available before querying through the semant
// If inspecting a specific source — show the SL interface (columns, measures, joins)
// without the raw SQL. Use `sl_read_source` to see the full YAML including SQL.
if (sourceName) {
const sources = await semanticLayerService.loadAllSources(connectionId);
const { sources } = await semanticLayerService.loadAllSources(connectionId);
const source = sources.find((s) => s.name === sourceName);
if (!source) {
return {
@ -151,7 +151,7 @@ Use this to understand what data is available before querying through the semant
// Load sources from all connections in parallel
const results = await Promise.all(
connections.map(async (conn) => {
const sources = await semanticLayerService.loadAllSources(conn.id);
const { sources } = await semanticLayerService.loadAllSources(conn.id);
let filtered = sources;
if (query) {
filtered = await this.filterByQuery(conn.id, sources, query);
@ -213,7 +213,7 @@ Use this to understand what data is available before querying through the semant
connectionName: string,
query?: string,
): Promise<ToolOutput<SlDiscoverStructured>> {
const sources = await semanticLayerService.loadAllSources(connectionId);
const { sources } = await semanticLayerService.loadAllSources(connectionId);
if (sources.length === 0) {
return {

View file

@ -11,7 +11,7 @@ function makeTool(overrides: any = {}) {
}),
validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
deleteSource: vi.fn().mockResolvedValue(undefined),
isManifestBacked: vi.fn().mockResolvedValue(false),
...overrides.semanticLayerService,
@ -44,7 +44,7 @@ function makeSession(overrides: Partial<ToolSession> = {}): ToolSession {
}),
validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
} as any,
wikiService: {} as any,
configService: {} as any,
@ -191,9 +191,10 @@ describe('SlEditSourceTool — manifest-backed source without overlay', () => {
expect(joinedErrors).toContain('manifest');
expect(joinedErrors).toContain('sl_write_source');
expect(joinedErrors).toContain('overlay');
// Overlay shape: only name + measures/segments/description
// Overlay shape: name plus overlay-only fields.
expect(joinedErrors).toContain('measures');
expect(joinedErrors).toContain('segments');
expect(joinedErrors).toContain('column_overrides');
});
it('still returns the plain "Source not found" error for truly-missing names', async () => {

View file

@ -127,7 +127,8 @@ If no source exists yet, use sl_write_source instead — this tool will reject t
` - name: <measure_name>`,
` expr: "<expression>"`,
` description: "<what it measures>"`,
`Overlay shape: "name:" plus any of "measures:", "segments:", "descriptions:". Do NOT include "sql:", "table:", "grain:", "columns:", or "joins:" — those are inherited from the manifest.`,
`Overlay shape: "name:" plus any of "measures:", "segments:", "descriptions:", "joins:", "disable_joins:", "exclude_columns:", "column_overrides:", or computed-only "columns:" entries with expr + type.`,
`Do NOT include "sql:", "table:", "grain:", or base-table "columns:" — those are inherited from the manifest.`,
].join('\n'),
],
sourceName,
@ -181,7 +182,7 @@ If no source exists yet, use sl_write_source instead — this tool will reject t
const result = await semanticLayerService.writeSource(connectionId, source, author, authorEmail, commitMessage);
if (!skipIndex) {
const allSources = await semanticLayerService.loadAllSources(connectionId);
const { sources: allSources } = await semanticLayerService.loadAllSources(connectionId);
await this.slSearchService.indexSources(connectionId, allSources).catch(() => {});
}

View file

@ -34,7 +34,7 @@ describe('SlValidateTool — session-aware touched-set filtering', () => {
{ name: 'customers', table: 'x.customers', grain: ['id'], columns: [], joins: [], measures: [] },
];
const serviceMock = {
loadAllSources: vi.fn().mockResolvedValue(sources),
loadAllSources: vi.fn().mockResolvedValue({ sources, loadErrors: [] }),
validateSourcesForConnection: vi.fn().mockResolvedValue({
errors: ['orders: missing join target', 'customers: invalid grain'],
warnings: ['orders: disconnected-components warning'],

View file

@ -62,7 +62,7 @@ Checks: all join targets exist, grain is valid, no missing references.
const semanticLayerService = context.session?.semanticLayerService ?? this.semanticLayerService;
const sources = await semanticLayerService.loadAllSources(connectionId);
const { sources } = await semanticLayerService.loadAllSources(connectionId);
if (sources.length === 0) {
return this.buildOutput(true, [], '(all)', {
validationErrors: ['No sources found for this connection.'],

View file

@ -8,7 +8,7 @@ function makeDeps(opts: { sourceYaml: string; executeQuery: ReturnType<typeof vi
isManifestBacked: vi.fn().mockResolvedValue(false),
listManifestSourceNames: vi.fn().mockResolvedValue([]),
loadSource: vi.fn().mockResolvedValue(null),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
validatePhysicalTableReferences: vi.fn().mockResolvedValue([]),
} as never,
connections: {

View file

@ -88,8 +88,9 @@ export async function validateSingleSource(
errors.push(
`${sourceName}.yaml: standalone source shadows an existing manifest entry — ` +
`writing it as-is drops the manifest's columns and joins. ` +
`Remove "sql:", "table:", "grain:", "columns:", and "joins:" and keep only ` +
`"name:" plus "measures:"/"segments:"/"descriptions:" to write an overlay ` +
`Remove "sql:", "table:", "grain:", and base-table "columns:" and keep only ` +
`"name:" plus overlay fields such as "measures:", "segments:", "descriptions:", ` +
`"joins:", "column_overrides:", or computed-only "columns:" to write an overlay ` +
`that inherits the manifest schema. Call sl_read_source to inspect the existing source first.`,
);
return { errors, warnings };
@ -108,7 +109,7 @@ export async function validateSingleSource(
}
if (errorPaths.has('columns')) {
warnings.push(
`${sourceName}.yaml: hint — overlay columns must be computed: {name, expr, type}. Do NOT include base table columns.`,
`${sourceName}.yaml: hint — overlay columns must be computed: {name, expr, type}. Use column_overrides for manifest column descriptions or metadata.`,
);
}
if (errorPaths.has('measures')) {
@ -240,7 +241,8 @@ async function probeOverlayMeasures(
}
| undefined;
try {
const all = await deps.semanticLayerService.loadAllSources(connectionId);
const { sources: all, loadErrors } = await deps.semanticLayerService.loadAllSources(connectionId);
errors.push(...loadErrors);
composed = all.find((s) => s.name === sourceName);
} catch (e) {
errors.push(

View file

@ -8,7 +8,7 @@ function makeTool(overrides: Partial<Record<string, any>> = {}) {
listManifestSourceNames: vi.fn().mockResolvedValue(['ACCOUNTS', 'ORDERS']),
isManifestBacked: vi.fn().mockResolvedValue(false),
loadSource: vi.fn().mockResolvedValue(null),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }),
deleteSource: vi.fn().mockResolvedValue(undefined),
@ -59,7 +59,7 @@ describe('SlWriteSourceTool — session gating', () => {
actions: [],
semanticLayerService: {
loadSource: vi.fn().mockResolvedValue(null),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }),
deleteSource: vi.fn().mockResolvedValue(undefined),
@ -213,7 +213,7 @@ describe('SlWriteSourceTool — session gating', () => {
ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'metabase' },
semanticLayerService: {
loadSource: vi.fn().mockResolvedValue(null),
loadAllSources: vi.fn().mockResolvedValue([]),
loadAllSources: vi.fn().mockResolvedValue({ sources: [], loadErrors: [] }),
validateWithProposedSource: vi.fn().mockResolvedValue({ errors: [], warnings: [] }),
writeSource: vi.fn().mockResolvedValue({ commitHash: 'c1' }),
deleteSource: vi.fn().mockResolvedValue(undefined),

View file

@ -23,7 +23,9 @@ const slWriteSourceInputSchema = z.object({
.describe('Name of the source to create, edit, or delete'),
source: sourceInputSchema
.optional()
.describe('Source definition (standalone with table/sql) or overlay (measures, computed columns, etc.)'),
.describe(
'Source definition (standalone with table/sql) or overlay (measures, column_overrides, computed columns, etc.)',
),
delete: z.boolean().optional().describe('Set to true to delete this source entirely'),
rawPaths: z
.array(z.string().min(1))
@ -73,7 +75,8 @@ If the source already exists, this tool will overwrite it with the new definitio
- table: For physical table/view sources (e.g., "public.orders"). Mutually exclusive with sql.
- sql: For SQL-based sources (the SQL query). Mutually exclusive with table.
- grain: What one row represents (e.g., ["id"], ["customer_id", "product_id"])
- columns: All columns with type (string/number/time/boolean) and optional descriptions
- columns: All columns with type (string/number/time/boolean) and optional descriptions. On overlays, columns are computed-only and require expr + type.
- column_overrides: Overlay-only metadata patches for existing manifest columns (descriptions, role, visibility, constraints, enum_values, tests). Do not include type or expr.
- joins: Relationships to other sources (to, on, relationship: many_to_one/one_to_many/one_to_one)
- measures: Pre-defined aggregations (name, expr like "sum(amount)", optional filter, optional segments bare names of segments defined on the same source, optional description)
- segments: Named, reusable boolean predicates scoped to this source (name, expr a SQL boolean over this source's columns, optional description). A measure references one with \`segments: [name]\`; a query references one with the dotted form \`source.segment_name\`. Use when the same predicate appears on 3+ measures — e.g. extract \`is_paid = true and is_refunded = '0'\` as \`segments: [{name: paid_non_refunded, expr: "..."}]\` and have each measure use \`segments: [paid_non_refunded]\` instead of re-typing the predicate inside \`sum(case when ... then x end)\`. Segments are predicates only — they cannot be selected as dimensions or grouped by; if you need to group by the predicate, add a \`columns[]\` entry instead.
@ -113,7 +116,7 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co
try {
await semanticLayerService.deleteSource(connectionId, sourceName, author, authorEmail);
if (!skipIndex) {
const allSources = await semanticLayerService.loadAllSources(connectionId);
const { sources: allSources } = await semanticLayerService.loadAllSources(connectionId);
await this.slSearchService.indexSources(connectionId, allSources).catch(() => {});
}
if (context.session) {
@ -210,7 +213,7 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co
);
if (!skipIndex) {
const allSources = await semanticLayerService.loadAllSources(connectionId);
const { sources: allSources } = await semanticLayerService.loadAllSources(connectionId);
await this.slSearchService.indexSources(connectionId, allSources).catch(() => {});
}
@ -317,8 +320,9 @@ Do NOT join back to a table that the SQL already aggregates from if the grain co
`Error: cannot write "${sourceName}" as a standalone source — a manifest entry with that name already exists.`,
` Writing standalone would drop the manifest's columns and joins, leaving only what you list here.`,
`To add measures/segments on top of the manifest, rewrite this YAML as an overlay:`,
` - Remove "sql:", "table:", "grain:", "columns:", and "joins:".`,
` - Keep only "name:", plus "measures:", "segments:", and/or "descriptions:".`,
` - Remove "sql:", "table:", "grain:", and base-table "columns:".`,
` - Keep "name:" plus "measures:", "segments:", "descriptions:", "joins:", "disable_joins:",`,
` "exclude_columns:", "column_overrides:", and/or computed-only "columns:" entries with expr + type.`,
` - The manifest's schema is inherited automatically.`,
`If you really need a different base table, use a different source name.`,
].join('\n');

View file

@ -47,6 +47,32 @@ export interface SemanticLayerSource {
usage?: TableUsageOutput;
}
/** Element type of `SemanticLayerSource['columns']`. */
type SemanticLayerColumn = SemanticLayerSource['columns'][number];
/** Element type of `SemanticLayerSource['joins']`. */
type SemanticLayerJoin = SemanticLayerSource['joins'][number];
/**
 * Shape of one `column_overrides` entry: a metadata patch addressed to a
 * column by `name`.
 *
 * The interface deliberately has no `type` or `expr` field; only `name` is
 * required and every other field is an optional piece of metadata.
 * NOTE(review): the exact merge semantics live outside this declaration —
 * confirm against the overlay composition code before relying on them.
 */
export interface SemanticLayerColumnOverride {
// Name of the column this patch targets.
name: string;
// Free-form strings here; NOTE(review): the allowed role/visibility values are not visible in this declaration.
role?: string;
visibility?: string;
// String-to-string description map; presumably keyed by description origin — TODO confirm.
descriptions?: Record<string, string>;
// dbt-namespaced constraint flags.
constraints?: { dbt?: { not_null?: boolean; unique?: boolean } };
// dbt-namespaced list of accepted values.
enum_values?: { dbt?: string[] };
tests?: {
// Individual dbt test entries: test name, owning package, optional keyword arguments.
dbt?: Array<{ name: string; package: string; kwargs?: Record<string, unknown> }>;
// String lists grouped under a package key; presumably test names per package — TODO confirm.
dbt_by_package?: Record<string, string[]>;
};
}
/**
 * `SemanticLayerSource` with `inherits_columns_from`, `usage` and `joins`
 * omitted, intersected with:
 *  - optional `table` / `sql` strings,
 *  - `columns` whose elements must additionally carry a string `type`,
 *  - `joins` whose elements drop the `source` property.
 *
 * NOTE(review): the name suggests this is the post-composition form of a
 * source — confirm with the code that produces it.
 */
export type ResolvedSemanticLayerSource = Omit<
SemanticLayerSource,
'inherits_columns_from' | 'usage' | 'joins'
> & {
table?: string;
sql?: string;
// Every column element is required to have `type` here.
columns: Array<SemanticLayerColumn & { type: string }>;
// Join elements without their `source` property.
joins: Array<Omit<SemanticLayerJoin, 'source'>>;
};
export interface SemanticLayerQueryInput {
measures: Array<string | { expr: string; name: string }>;
dimensions: Array<string | { field: string; granularity?: string }>;

View file

@ -1,6 +1,7 @@
export { createKtxEmbeddingProvider } from './embedding-provider.js';
export { runKtxEmbeddingHealthCheck } from './embedding-health.js';
export { KtxMessageBuilder } from './message-builder.js';
export { KtxMessageBuilder, splitKtxSystemMessages } from './message-builder.js';
export type { KtxSplitSystemMessagesResult } from './message-builder.js';
export type { KtxEmbeddingHealthCheckOptions, KtxEmbeddingHealthCheckResult } from './embedding-health.js';
export type { KtxEmbeddingProviderDeps } from './embedding-provider.js';
export type { KtxLlmHealthCheckDeps, KtxLlmHealthCheckOptions, KtxLlmHealthCheckResult } from './model-health.js';

View file

@ -1,6 +1,6 @@
import type { ModelMessage } from 'ai';
import { describe, expect, it } from 'vitest';
import { KtxMessageBuilder } from './message-builder.js';
import { KtxMessageBuilder, splitKtxSystemMessages } from './message-builder.js';
import { createKtxLlmProvider } from './model-provider.js';
function makeBuilder(overrides: Parameters<typeof createKtxLlmProvider>[0]['promptCaching'] = {}) {
@ -111,3 +111,36 @@ describe('KtxMessageBuilder.build', () => {
expect((out.tools.z as { providerOptions: any }).providerOptions.anthropic.cacheControl.ttl).toBe('5m');
});
});
describe('splitKtxSystemMessages', () => {
  // Zero system messages: `system` is undefined and every input message is kept.
  it('returns undefined system when no system messages are present', () => {
    const { system, messages } = splitKtxSystemMessages([
      { role: 'user', content: 'hello' },
      { role: 'assistant', content: 'hi' },
    ]);

    expect(system).toBeUndefined();
    expect(messages).toHaveLength(2);
  });

  // Exactly one system message: the very same object comes back (identity, not a copy),
  // so its providerOptions travel with it.
  it('returns a single system message object when one system message is present, preserving providerOptions', () => {
    const onlySystem = {
      role: 'system' as const,
      content: 'You are helpful.',
      providerOptions: { anthropic: { cacheControl: { type: 'ephemeral' } } },
    };

    const { system, messages } = splitKtxSystemMessages([onlySystem, { role: 'user', content: 'hello' }]);

    expect(system).toBe(onlySystem);
    expect(messages).toEqual([{ role: 'user', content: 'hello' }]);
  });

  // Several system messages: they are returned together as an array.
  it('returns an array of system messages when multiple are present, in order', () => {
    const { system, messages } = splitKtxSystemMessages([
      { role: 'system', content: 'cached' },
      { role: 'system', content: 'fresh' },
      { role: 'user', content: 'hello' },
    ]);

    expect(Array.isArray(system)).toBe(true);
    expect(system).toHaveLength(2);
    expect(messages).toEqual([{ role: 'user', content: 'hello' }]);
  });
});

View file

@ -1,7 +1,29 @@
import type { LanguageModel, ModelMessage, ToolSet } from 'ai';
import type { LanguageModel, ModelMessage, SystemModelMessage, ToolSet } from 'ai';
import { isAnthropicProtocolModel } from './model-provider.js';
import type { KtxLlmProvider, KtxPromptCacheTtl, KtxPromptParts } from './types.js';
/** Return shape of `splitKtxSystemMessages`. */
export interface KtxSplitSystemMessagesResult {
// `undefined` when the input had no system messages, the single message when it had
// exactly one, otherwise all of them as an array.
system: SystemModelMessage | SystemModelMessage[] | undefined;
// Every input message whose role is not 'system'.
messages: ModelMessage[];
}
/**
 * Separate system-role messages from the rest of a conversation.
 *
 * Relative order is preserved within both groups. The `system` field of the
 * result collapses by count: `undefined` for none, the message object itself
 * for exactly one, and the full array for two or more.
 */
export function splitKtxSystemMessages(messages: readonly ModelMessage[]): KtxSplitSystemMessagesResult {
  const system: SystemModelMessage[] = [];
  const rest: ModelMessage[] = [];

  for (const entry of messages) {
    if (entry.role !== 'system') {
      rest.push(entry);
      continue;
    }
    system.push(entry);
  }

  if (system.length === 0) {
    return { system: undefined, messages: rest };
  }
  if (system.length === 1) {
    // Hand back the original object so caller-attached fields survive untouched.
    return { system: system[0], messages: rest };
  }
  return { system, messages: rest };
}
type ToolMap = ToolSet | Record<string, Record<string, unknown>>;
interface KtxMessageBuilderOptions {

3
pnpm-lock.yaml generated
View file

@ -379,6 +379,9 @@ importers:
'@vitest/coverage-v8':
specifier: ^4.1.6
version: 4.1.6(vitest@4.1.6)
ajv:
specifier: 8.20.0
version: 8.20.0
typescript:
specifier: ^6.0.3
version: 6.0.3

View file

@ -1,3 +1,22 @@
from semantic_layer.cli import main
from __future__ import annotations
main()
import json
import sys
from semantic_layer.cli import main as cli_main
from semantic_layer.models import SourceDefinition
def dump_schema() -> None:
    """Write the JSON Schema of ``SourceDefinition`` to stdout.

    The schema is pretty-printed with a two-space indent and sorted keys,
    and is followed by a single trailing newline.
    """
    schema = SourceDefinition.model_json_schema()
    json.dump(schema, sys.stdout, indent=2, sort_keys=True)
    # json.dump emits no final newline; add one so the output ends cleanly.
    print()
if __name__ == "__main__":
    # The first CLI argument selects schema dumping; anything else goes to the regular CLI.
    wants_schema = len(sys.argv) > 1 and sys.argv[1] in {"dump-schema", "schema"}
    if not wants_schema:
        cli_main()
    else:
        # Drop the sub-command token from argv before emitting the schema.
        sys.argv.pop(1)
        dump_schema()

View file

@ -87,18 +87,23 @@ class SourceLoader:
sources[name] = SourceDefinition(**data)
else:
# Overlay — validate and compose with matching manifest entry
errors = validate_overlay(data)
if errors:
raise ValueError(
f"Invalid overlay '{name}' in {path}: {'; '.join(errors)}"
)
base = sources.get(name)
if base:
errors = validate_overlay(data, {c.name for c in base.columns})
if errors:
raise ValueError(
f"Invalid overlay '{name}' in {path}: {'; '.join(errors)}"
)
(
sources[name],
description_sources[name],
) = self._compose(base, data, description_sources.get(name))
else:
errors = validate_overlay(data)
if errors:
raise ValueError(
f"Invalid overlay '{name}' in {path}: {'; '.join(errors)}"
)
logger.warning(
"Orphan overlay '%s' in %s: no matching manifest entry, skipping",
name,
@ -149,12 +154,55 @@ class SourceLoader:
description_sources or None,
)
# Filter columns
excluded = set(overlay.get("exclude_columns", []))
overrides = overlay.get("column_overrides", [])
override_names = {override.get("name") for override in overrides}
conflicts = sorted(name for name in override_names if name in excluded)
if conflicts:
raise ValueError(
"column_overrides conflict with exclude_columns: "
+ ", ".join(conflicts)
)
base_by_name = {column.name: column for column in base.columns}
for override in overrides:
name = override.get("name")
base_column = base_by_name.get(name)
if base_column is None:
raise ValueError(
f"column '{name}' in column_overrides does not exist on manifest source '{base.name}'"
)
excluded = set(overlay.get("exclude_columns", []))
source.columns = [c for c in source.columns if c.name not in excluded]
# Append computed columns (overlay columns with expr)
columns_by_name = {column.name: column for column in source.columns}
for override in overrides:
name = override["name"]
base_column = base_by_name[name]
merged = base_column.model_dump(mode="python", exclude_none=True)
base_descriptions = merged.get("descriptions") or {}
override_data = dict(override)
override_descriptions = override_data.get("descriptions") or {}
merged.update(override_data)
if base_descriptions or override_descriptions:
merged["descriptions"] = {
**base_descriptions,
**override_descriptions,
}
columns_by_name[name] = SourceColumn(**merged)
source.columns = list(columns_by_name.values())
# Append computed columns. Manifest column names cannot be reused here;
# use column_overrides for metadata patches.
for col in overlay.get("columns", []):
name = col.get("name")
if name in base_by_name:
raise ValueError(
f"column '{name}' in columns patches a manifest column on '{base.name}' — move it to 'column_overrides:'"
)
source.columns.append(SourceColumn(**col))
# Set measures
@ -181,6 +229,11 @@ class SourceLoader:
]
source.joins = manifest_joins + new_joins
if not source.table and not source.sql:
raise ValueError("resolved source must have 'table' or 'sql'")
if source.table and source.sql:
raise ValueError("'table' and 'sql' are mutually exclusive")
return source, (description_sources or None)
def _validate_cross_references(self, sources: dict[str, SourceDefinition]) -> None:

View file

@ -143,7 +143,9 @@ class Manifest(BaseModel):
# ── Projection ──────────────────────────────────────────────────────
def validate_overlay(data: dict) -> list[str]:
def validate_overlay(
data: dict, manifest_column_names: set[str] | None = None
) -> list[str]:
"""Validate that overlay data doesn't contain structural fields.
Returns a list of error messages (empty if valid).
@ -162,11 +164,26 @@ def validate_overlay(data: dict) -> list[str]:
errors.append(
f"Overlay column '{col.get('name', '?')}' must use 'descriptions'"
)
if "type" in col and "expr" not in col:
if "expr" not in col:
errors.append(
f"Overlay column '{col.get('name', '?')}' specifies 'type' without 'expr' "
f"(structural types are inherited from manifest — only computed columns may specify a type)"
f"Overlay column '{col.get('name', '?')}' in 'columns' must define "
f"'expr' and 'type' (use 'column_overrides' to patch manifest columns)"
)
if "type" not in col:
errors.append(
f"Overlay column '{col.get('name', '?')}' in 'columns' must define "
f"'type' and 'expr' (use 'column_overrides' to patch manifest columns)"
)
for col in data.get("column_overrides", []):
name = col.get("name", "?")
if "description" in col:
errors.append(f"Column override '{name}' must use 'descriptions'")
if "type" in col:
errors.append(f"Column override '{name}' must not contain 'type'")
if "expr" in col:
errors.append(f"Column override '{name}' must not contain 'expr'")
if manifest_column_names is not None and name not in manifest_column_names:
errors.append(f"Column override '{name}' does not match a manifest column")
return errors

View file

@ -3,7 +3,7 @@ from __future__ import annotations
from enum import Enum
from typing import Any, Literal
from pydantic import BaseModel, Field, model_validator
from pydantic import BaseModel, ConfigDict, Field, model_validator
# ── Source Definition Models ──────────────────────────────────────────
@ -105,6 +105,8 @@ class DefaultTimeDimensionDbt(BaseModel):
class SourceDefinition(BaseModel):
model_config = ConfigDict(extra="forbid")
name: str
description: str | None = None
descriptions: dict[str, str] | None = None
@ -123,6 +125,8 @@ class SourceDefinition(BaseModel):
def validate_source(self) -> SourceDefinition:
if self.description is None:
self.description = _resolve_description_map(self.descriptions)
if not self.table and not self.sql:
raise ValueError("resolved source must have 'table' or 'sql'")
if self.table and self.sql:
raise ValueError("'table' and 'sql' are mutually exclusive")
if not self.grain:

Some files were not shown because too many files have changed in this diff Show more