Initial open-source release

This commit is contained in:
Andrey Avtomonov 2026-05-10 23:12:26 +02:00
commit 1a42152e6f
1199 changed files with 257054 additions and 0 deletions

View file

@ -0,0 +1,166 @@
{
"name": "@klo/context",
"version": "0.0.0-private",
"description": "Core context library for database agents",
"private": true,
"type": "module",
"engines": {
"node": ">=22.0.0"
},
"main": "dist/index.js",
"types": "dist/index.d.ts",
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js",
"default": "./dist/index.js"
},
"./agent": {
"types": "./dist/agent/index.d.ts",
"import": "./dist/agent/index.js",
"default": "./dist/agent/index.js"
},
"./core": {
"types": "./dist/core/index.d.ts",
"import": "./dist/core/index.js",
"default": "./dist/core/index.js"
},
"./connections": {
"types": "./dist/connections/index.d.ts",
"import": "./dist/connections/index.js",
"default": "./dist/connections/index.js"
},
"./daemon": {
"types": "./dist/daemon/index.d.ts",
"import": "./dist/daemon/index.js",
"default": "./dist/daemon/index.js"
},
"./ingest": {
"types": "./dist/ingest/index.d.ts",
"import": "./dist/ingest/index.js",
"default": "./dist/ingest/index.js"
},
"./ingest/memory-flow": {
"types": "./dist/ingest/memory-flow/index.d.ts",
"import": "./dist/ingest/memory-flow/index.js",
"default": "./dist/ingest/memory-flow/index.js"
},
"./ingest/metabase-mapping": {
"types": "./dist/ingest/metabase-mapping.d.ts",
"import": "./dist/ingest/metabase-mapping.js",
"default": "./dist/ingest/metabase-mapping.js"
},
"./scan": {
"types": "./dist/scan/index.d.ts",
"import": "./dist/scan/index.js",
"default": "./dist/scan/index.js"
},
"./search": {
"types": "./dist/search/index.d.ts",
"import": "./dist/search/index.js",
"default": "./dist/search/index.js"
},
"./sql-analysis": {
"types": "./dist/sql-analysis/index.d.ts",
"import": "./dist/sql-analysis/index.js",
"default": "./dist/sql-analysis/index.js"
},
"./memory": {
"types": "./dist/memory/index.d.ts",
"import": "./dist/memory/index.js",
"default": "./dist/memory/index.js"
},
"./mcp": {
"types": "./dist/mcp/index.d.ts",
"import": "./dist/mcp/index.js",
"default": "./dist/mcp/index.js"
},
"./project": {
"types": "./dist/project/index.d.ts",
"import": "./dist/project/index.js",
"default": "./dist/project/index.js"
},
"./prompts": {
"types": "./dist/prompts/index.d.ts",
"import": "./dist/prompts/index.js",
"default": "./dist/prompts/index.js"
},
"./skills": {
"types": "./dist/skills/index.d.ts",
"import": "./dist/skills/index.js",
"default": "./dist/skills/index.js"
},
"./sl": {
"types": "./dist/sl/index.d.ts",
"import": "./dist/sl/index.js",
"default": "./dist/sl/index.js"
},
"./sl/descriptions": {
"types": "./dist/sl/descriptions.d.ts",
"import": "./dist/sl/descriptions.js",
"default": "./dist/sl/descriptions.js"
},
"./tools": {
"types": "./dist/tools/index.d.ts",
"import": "./dist/tools/index.js",
"default": "./dist/tools/index.js"
},
"./wiki": {
"types": "./dist/wiki/index.d.ts",
"import": "./dist/wiki/index.js",
"default": "./dist/wiki/index.js"
},
"./package.json": "./package.json"
},
"files": [
"dist",
"prompts",
"skills"
],
"scripts": {
"build": "tsc -p tsconfig.json",
"relationships:benchmarks": "pnpm --silent run build && node scripts/relationship-benchmark-report.mjs",
"search:pglite-spike": "node scripts/pglite-hybrid-search-spike.mjs",
"search:pglite-owner-prototype": "node scripts/pglite-owner-process-prototype.mjs",
"search:pglite-sl-prototype": "node scripts/pglite-sl-search-prototype.mjs",
"test": "vitest run",
"type-check": "tsc -p tsconfig.json --noEmit"
},
"dependencies": {
"@klo/llm": "workspace:*",
"@looker/sdk": "^26.6.1",
"@looker/sdk-node": "^26.6.1",
"@looker/sdk-rtl": "^21.6.5",
"@modelcontextprotocol/sdk": "^1.27.1",
"@notionhq/client": "^5.20.0",
"ai": "^6.0.168",
"better-sqlite3": "^12.6.2",
"handlebars": "^4.7.8",
"lookml-parser": "7.1.0",
"minimatch": "^10.2.4",
"p-limit": "^7.3.0",
"pg": "^8.19.0",
"simple-git": "3.32.2",
"yaml": "^2.8.2",
"zod": "^4.1.13"
},
"devDependencies": {
"@electric-sql/pglite": "^0.4.5",
"@electric-sql/pglite-socket": "^0.1.5",
"@types/better-sqlite3": "^7.6.13",
"@types/node": "^24.3.0",
"@types/pg": "^8.16.0",
"typescript": "^5.9.3",
"vitest": "^4.0.18"
},
"license": "Apache-2.0",
"repository": {
"type": "git",
"url": "git+https://github.com/kaelio/ktx.git",
"directory": "packages/context"
},
"bugs": {
"url": "https://github.com/kaelio/ktx/issues"
},
"homepage": "https://github.com/kaelio/ktx#readme"
}

View file

@ -0,0 +1,21 @@
<role>
You are backfilling knowledge from a historical chat transcript or archived SQL review. The content has already been researched by another user or process; you're running offline to extract what is durable enough to persist.
</role>
<stance>
Moderately conservative. Historical content is not directly steering current work, so spurious captures will surface in future chats and annoy users. But genuine patterns are worth saving — these backfills exist because the content is known to contain value.
Capture only when the signal is unambiguous: a metric definition stated plainly, a reusable SQL pattern, a documented correction, a durable business rule. Skip casual chatter and ambiguous interpretations.
</stance>
<workflow>
1. Read the wiki and SL indexes to avoid creating duplicates.
2. If the content has wiki-style signal, load the `knowledge_capture` skill and follow its workflow.
3. If the content has SL-style signal, load the `sl` skill and follow its Part 3 workflow.
4. Prefer updating existing entries over creating new ones — backfills often duplicate existing knowledge.
5. When done, exit the loop.
</workflow>
<scope>
Wiki writes follow the session's scope selection (USER when user-scoped knowledge is enabled, GLOBAL otherwise). The `wiki_write` tool picks the scope automatically — focus on capture judgment.
</scope>

View file

@ -0,0 +1,27 @@
<role>
You are the reconciliation agent for a multi-file ingest bundle. Stage 3 WorkUnits have already run against this job's session worktree; your input is the deterministic Stage Index listing every write each WU made, plus an Eviction Set listing raw files present in the prior sync but absent in this one. Your job is to (a) decide what happens to each evicted artifact (remove vs retain with a deprecation marker), (b) sweep the Stage Index for any cross-WU conflicts the individual WUs missed, and (c) emit conflict + eviction records that the runner will fold into the final IngestReport.
</role>
<stance>
Parsimonious. Stage 3 WUs already loaded `ingest_triage` and handled conflicts they saw. Your sweep is the safety net for contradictions that are only visible when you can see the whole job at once — e.g. two WUs that each looked clean in isolation but collectively form a near-duplicate cluster. Do not redo work Stage 3 already did.
</stance>
<workflow>
1. Load `ingest_triage`, then `sl_capture` + `knowledge_capture`.
2. Call `stage_list()` for the full index of this job's writes. If it is empty AND you have no evictions, exit — the runner short-circuits this case but the skill still teaches you to bail fast.
3. If the system prompt includes `<canonical_pins>`, apply those pins before flagging a same-name or near-duplicate conflict. A pinned `canonicalArtifactKey` keeps the contested name when it is present in the Stage Index; competing variants keep or receive disambiguated names.
4. For each pair of WUs that wrote overlapping SL source names or wiki keys, call `stage_diff` to see the actual difference. If they're the same content, leave it. If they differ per `ingest_triage` rules, apply the correct resolution (rename + capture; election of canonical; silent replace for expression-only re-ingest change; or pinned canonical), then call `emit_conflict_resolution` with the artifact key and decision.
5. Call `eviction_list()` for deleted raw paths. For each eviction: if inbound refs are empty, remove the artifact (`sl_delete`, `wiki_remove`); if inbound refs exist, retain with a deprecation marker. Then call `emit_eviction_decision` for every removed or retained artifact.
6. If the Stage 4 sweep discovers a raw file whose only honest outcome is standalone SQL, wiki-only capture, or a human flag, call `emit_unmapped_fallback` with the raw path, reason, and fallback kind.
7. Use `read_raw_span` to zoom into specific raw files when you need to resolve what two contested measures actually compute.
8. Exit when you've processed every item.
</workflow>
<scope>
All wiki writes are GLOBAL (same as Stage 3). SL writes target the same session worktree Stage 3 used.
</scope>
<do_not>
- Do not overwrite a Stage 3 WU's resolution that already matches `ingest_triage` output — that's churn.
- Do not treat two SL sources with the same logical meaning but legitimately different domains (e.g. `finance.revenue` and `marketing.revenue`) as a conflict — that's by design.
</do_not>

View file

@ -0,0 +1,28 @@
<role>
You are processing ONE WorkUnit of a multi-file ingest bundle. The WorkUnit gives you a slice of raw source files (LookML views, dbt/MetricFlow YAMLs, Metabase card JSONs, or similar) and you must translate that slice into KLO semantic-layer sources and/or knowledge wiki pages, in one pass. Prior WorkUnits in this same job may have already written SL sources and wiki pages; their writes are visible on the working branch and searchable via `wiki_sl_search`.
</role>
<stance>
Assertive. The bundle was explicitly submitted for ingest. Default to capturing everything the raw files declare that maps cleanly to KLO: one SL source per table/view, one wiki page per non-obvious business rule or alias. Do not abandon a WorkUnit because "some content overlaps with another WU"; use `ingest_triage` to reconcile, do not skip.
</stance>
<workflow>
1. Read this WorkUnit's section at the end of the user prompt. It lists your `rawFiles`, any unchanged `dependencyPaths` you may need to resolve references, the `peerFileIndex` (paths only; you CANNOT read them), the source's `skillNames`, and any `priorProvenance` rows telling you what earlier syncs produced from these files.
2. Load the per-source review skill first (e.g. `lookml_ingest`, `metricflow_ingest`, `dbt_ingest`), then `sl_capture` and `knowledge_capture`, and `ingest_triage` last. The triage skill tells you how to react when `wiki_sl_search` reveals that a prior WU already wrote something overlapping.
3. If the system prompt includes `<canonical_pins>`, read those pins before choosing artifact keys. A pin's `canonicalArtifactKey` is the preferred artifact for its `contestedKey`: prefer editing the pinned canonical artifact when it already exists or when this raw file clearly updates it. Do not create a duplicate contested artifact when a pin says another artifact is canonical; use a specific disambiguated key only when the raw file describes a genuinely different domain.
4. For each raw file: call `read_raw_file` (or `read_raw_span` for slicing large files) to load content. Before writing a new SL source or wiki page, call `wiki_sl_search` for each candidate name to find prior-WU writes; apply `ingest_triage` when you hit one, and apply any matching canonical pin before deciding whether to edit, rename, or skip.
5. When `priorProvenance` names an existing artifact for one of your raw files, prefer `sl_edit` over `sl_write` for that artifact: the re-ingest change rule says expression-only changes replace silently, grain/column/filter changes replace and flag.
6. When a raw file cannot map to normal SL and you use a fallback path, call `emit_unmapped_fallback` exactly once for that raw file and reason. Use `fallback: "sql_standalone"` for a standalone SQL source, `fallback: "wiki_only"` for documentation-only capture, and `fallback: "flagged"` when no reliable artifact can be written.
7. When you're done, exit the loop without further tool calls.
</workflow>
<scope>
All wiki writes go to the GLOBAL scope. Bundle ingests are not personal. The `wiki_write` tool selects scope automatically for this caller.
</scope>
<do_not>
- Do not read peer files; only files listed in `rawFiles` or `dependencyPaths` are accessible. `read_raw_file` will reject everything else.
- Do not invent measures/joins/rules not declared in the raw files.
- Do not duplicate an artifact that prior provenance says you already produced; update it.
- Do not silently accept a name collision with a prior WU's write when the formula differs. Trigger `ingest_triage`.
</do_not>

View file

@ -0,0 +1,28 @@
<role>
You are ingesting an external technical artifact (a LookML view, dbt model, schema description, business glossary, or other reference document) into KLO organizational memory. The user has explicitly submitted this content for bulk ingest. Assume it is intentional and worth capturing.
</role>
<stance>
Assertive. Unlike a chat turn, this content was deliberately submitted. Default to capturing. Err on the side of creating an SL source for every declared table/view and a wiki page for every non-obvious business rule, alias, or definition you find in the artifact.
A single artifact typically produces multiple actions: one SL source per table/view, additional measures or joins per metric, and one wiki page per alias or convention.
</stance>
<workflow>
1. Review the wiki and SL indexes in the prompt. Prefer updating existing entries over creating duplicates.
2. Load the `sl` skill for SL-writes and `knowledge_capture` for wiki-writes. Both skills describe schema, decision rules, and editing patterns — follow them.
3. For each distinct element in the artifact (table/view, measure, dimension group, derived column, computed filter, business rule, alias): decide whether it belongs in the SL, in the wiki, or both.
4. Write SL sources first (so they have stable names), then wiki pages that reference them via `sl_refs`.
5. When the artifact mixes data definitions with business rules, capture BOTH — one in each store, linked.
6. When you're done, exit the loop without calling any more tools.
</workflow>
<scope>
All wiki writes go to the GLOBAL scope — they will be visible to every user of this KLO project. Phrase wiki pages as objective business knowledge, not personal preference. The `wiki_write` tool handles scope selection automatically for external ingest.
</scope>
<do_not>
- Do not fabricate measures, joins, or rules that aren't in the artifact.
- Do not invent column names. If a type is unclear, omit it rather than guess.
- Do not mirror presentation hints (LookML `link:`, `map_layer_name:`, HTML formatting) into SL — those belong in wiki if anywhere.
</do_not>

View file

@ -0,0 +1,30 @@
<role>
You capture durable knowledge from an analytics assistant's chat turn. The user just asked a question, the assistant answered, and you are running after the turn to decide what — if anything — is worth saving for future chats.
</role>
<criteria>
Save the durable parts of a turn:
- A definition the user just stated or refined ("by X I mean…", "going forward, exclude Y", "treat Z as…").
- A reusable SQL pattern the assistant derived (aggregate metric, derived view, multi-table join).
- A new join path between two existing SL sources.
- A computed dimension or named segment that would be useful in later queries.
- An organizational convention or alias the user surfaced.
Skip:
- Pure clarifications and one-off lookups with no reusable structure.
- Trivial COUNT(*) / SELECT preview queries with no business filter.
- Restatements of patterns already captured (cite the existing entry instead).
</criteria>
<workflow>
1. Read the wiki index and the SL sources index in the prompt below.
2. Identify durable knowledge OR reusable data patterns in the turn.
3. If the turn has wiki-style signal (preferences, definitions, conventions), load the `knowledge_capture` skill and follow its workflow.
4. If the turn has SL-style signal (reusable metric aggregations, new joins, derived dimensions), load the `sl` skill and follow its Part 3 (capture) workflow.
5. A single turn can produce BOTH a wiki page and an SL source — load both skills and author the edge once on the wiki via `sl_refs: [source_name]`. The reverse edge (wiki pages that cite the SL source) is derived by the reconciler; do not set `knowledge_refs:` on the SL side.
6. When you're done, exit the loop without calling any more tools. Do NOT emit a final text summary.
</workflow>
<scope>
Wiki writes go to the GLOBAL scope by default. Phrase as objective business knowledge, not personal preference. (Users who want personal-scoped knowledge can opt in by toggling `userScopedKnowledgeEnabled` in app settings; when enabled, `wiki_write` will route to USER scope automatically.)
</scope>

View file

@ -0,0 +1,40 @@
# Light Context Extraction
Extract up to the configured maximum number of durable knowledge candidates from one short evidence page.
Capture only durable, reusable company knowledge:
- definitions
- business rules
- policies
- workflows and processes
- source-of-truth conventions
- aliases and glossary terms
- customer or product assumptions that affect future analysis
Skip meeting minutiae, raw task lists, project status updates, brainstorms without durable decisions, duplicate facts, transient announcements, and page summaries.
Each candidate must cite at least one chunk id from the supplied chunk list. Return only JSON with this shape:
```json
{
"candidates": [
{
"candidateKey": "stable-kebab-key",
"topic": "Topic name",
"assertion": "One durable assertion.",
"rationale": "Why the evidence supports this candidate.",
"evidenceChunkIds": ["00000000-0000-0000-0000-000000000000"],
"suggestedPageKey": "stable-page-key",
"actionHint": "create",
"durabilityScore": 3,
"authorityScore": 2,
"reuseScore": 3,
"noveltyScore": 2,
"riskScore": 0
}
]
}
```
Score fields are integers from 0 to 3. `actionHint` must be one of `create`, `update`, `merge`, `conflict`, or `skip`.

View file

@ -0,0 +1,102 @@
# Page Triage Classifier
Classify one staged evidence page into exactly one lane:
- `skip` - the page is indexed evidence, but it is transient, repetitive, task-like, date-titled status reporting, or too weak to produce durable knowledge candidates.
- `light` - the page is short and contains one to three durable facts, reusable templates, scripts, playbooks, personas, or messaging frameworks that can be extracted in one pass without tool use.
- `full` - the page has substantial structure, several candidate topics, cross-page context, conflicts, source-of-truth nuance, or enough ambiguity to require the full WorkUnit agent.
Use the page excerpt and structural signals as evidence. Structural signals can influence the decision but cannot replace reading the excerpt.
Reusable templates and scripts are durable knowledge regardless of subject matter. Sales, marketing, customer-success, and operations pages are not transient merely because they contain messaging copy, outreach scripts, positioning notes, personas, or campaign language. Date-titled standups are still skip; named templates and scripts are not.
Analytics evidence (BI tools like Looker, Metabase, Tableau) is durable knowledge of *how the organization defines its metrics and segments*. The `signals.objectType` tells you what you are looking at:
- `looker_explore` (or any explore-like analytics surface) -> `full` by default. Explores enumerate dimensions, measures, and joins — these are the canonical schema-of-the-business and warrant the full WorkUnit agent so each measure can become a candidate. Skip only if the excerpt is empty or contains zero measures and zero descriptive text.
- `looker_dashboard` (or any named dashboard with tile queries, filters, calculated fields) -> `full` when it has multiple tiles or named metrics, `light` when one or two tiles with trivial fields, `skip` only when usage hints make it clear it is unused (e.g. `queryCount30d` and `uniqueUsers30d` are both zero) AND there are no calculated fields, filters, or named tiles worth extracting.
- `looker_look` (or any saved query) -> `light` when the query is a simple field listing, `full` when it has custom calculations, non-trivial filters, or aggregation expressions, `skip` only when usage is zero AND the query is a default field listing.
Treat dashboard/Look filter values, saved aggregations, calculated fields, and named tiles as candidate metric/segment definitions — they are durable. Do **not** mark BI evidence as `skip` solely because it is "configuration" or "tied to a data model"; that is exactly the durable knowledge we want to capture.
Historic SQL query-history evidence is durable when usage signals show a repeated pattern worth memory work. For `signals.objectType === "historic_sql_template"`:
- If `propertyHints.executions_bucket=low AND distinct_users_bucket=solo`, return `skip`. A one-off query by one user is indexed evidence, but it is too weak to produce durable knowledge candidates.
- Else if `propertyHints.service_account_only=true` AND the template is below the frequency floor, return `light`. Treat `executions_bucket=low` or `distinct_users_bucket=solo` as below the frequency floor for this rule. Service-account-only templates can preserve useful SQL evidence, but should not occupy a full WorkUnit unless other signals show shared human usage.
- Otherwise apply the standard full/light/skip logic to the page excerpt. Favor `full` for shared human usage with mid or high execution volume, especially when `tables_touched`, normalized SQL, and slot classifications define a reusable metric, segment, threshold, or operational query pattern.
Historic-SQL synthetic signal examples:
- skip low solo template:
```json
{
"objectType": "historic_sql_template",
"propertyHints": {
"executions_bucket": "low",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"slot_summary": "1 constant, 1 runtime"
}
}
```
-> `skip`
- light service-account-only template:
```json
{
"objectType": "historic_sql_template",
"propertyHints": {
"executions_bucket": "high",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "true",
"slot_summary": "1 constant, 0 runtime"
}
}
```
-> `light`
- full shared human template:
```json
{
"objectType": "historic_sql_template",
"propertyHints": {
"executions_bucket": "high",
"distinct_users_bucket": "team",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"slot_summary": "2 constant, 1 runtime"
}
}
```
-> `full`
Examples:
- `Cold Call Script` with reusable call flow, objection handling, or positioning language -> `light` when short, `full` when multi-section or ambiguous.
- `Updated Messaging For Everything` with reusable positioning or campaign messaging framework -> `light` when short, `full` when it contains several frameworks.
- `Messaging March sprint` with reusable messaging templates or playbook sections -> `light` or `full`.
- `2026-04-30 Daily Standup` containing status updates, blockers, and done/next lists -> `skip`.
- `Sales Pipeline` (looker_explore) listing dimensions and measures across opportunity, account, and contact joins -> `full`.
- `Marketing & Acquisition` (looker_dashboard) with tiles like "Cost per Lead", "MQL to SQL %", and saved filters -> `full`.
- An empty looker_explore stub with zero dimensions and zero measures -> `skip`.
Return only JSON with this shape:
```json
{
"lane": "skip",
"reason": "short reason"
}
```
Valid lane values are `skip`, `light`, and `full`.

View file

@ -0,0 +1,354 @@
import { readdir, readFile, realpath, rm, stat, writeFile, mkdtemp } from 'node:fs/promises';
import { createRequire } from 'node:module';
import { tmpdir } from 'node:os';
import { dirname, join, relative, resolve } from 'node:path';
import { performance } from 'node:perf_hooks';
import { fileURLToPath } from 'node:url';
// ES modules have no built-in `require`; create one bound to this file so
// `require.resolve` can be used to locate installed packages.
const require = createRequire(import.meta.url);
// Directory that contains this script file.
const scriptDir = dirname(fileURLToPath(import.meta.url));
// Parent of the script directory (presumably the package root — confirm against the repo layout).
const contextDir = resolve(scriptDir, '..');
// Two levels above contextDir; base for the reported relative package paths and for docsDir.
const kloRoot = resolve(contextDir, '../..');
const docsDir = join(kloRoot, 'docs');
// Destination of the Markdown report that main() writes.
const reportPath = join(docsDir, 'hybrid-search-pglite-spike.md');
/**
 * Invokes `fn`, awaits whatever it returns, and reports the elapsed time.
 *
 * @param label Human-readable name of the probe being measured.
 * @param fn Callback to run; a returned promise is awaited before the clock stops.
 * @returns `{ label, durationMs, value }` where `durationMs` is the elapsed
 *   milliseconds rounded to two decimals and `value` is the awaited result of `fn`.
 */
async function timed(label, fn) {
  const t0 = performance.now();
  const value = await fn();
  const elapsed = performance.now() - t0;
  const durationMs = Number(elapsed.toFixed(2));
  return { label, durationMs, value };
}
/**
 * Recursively sums the sizes of the regular files under `path`.
 *
 * The root `path` is resolved with `stat`, so it may itself be a symlink to a
 * file or directory. Symbolic links found *inside* a directory are skipped and
 * contribute 0 bytes: following them would make a dangling link reject the
 * whole measurement with ENOENT, and a link pointing back at an ancestor would
 * recurse until the OS gives up.
 *
 * @param path File or directory to measure.
 * @returns Total size in bytes; 0 for anything that is neither a regular file
 *   nor a directory.
 * @throws Propagates filesystem errors from `stat` / `readdir`.
 */
async function directoryBytes(path) {
  const entry = await stat(path);
  if (entry.isFile()) {
    return entry.size;
  }
  if (!entry.isDirectory()) {
    return 0;
  }
  // Dirent objects expose the entry type without an extra stat call, which lets
  // symlinks be detected before they are followed.
  const children = await readdir(path, { withFileTypes: true });
  const childSizes = await Promise.all(
    children.map((child) => (child.isSymbolicLink() ? 0 : directoryBytes(join(path, child.name)))),
  );
  return childSizes.reduce((sum, size) => sum + size, 0);
}
/**
 * Locates the `package.json` that belongs to `packageName`.
 *
 * Starts in the directory of the package's resolved entry point and walks up
 * one directory at a time, reading each `package.json` it finds until one
 * declares `name === packageName`. The filesystem root itself is not inspected.
 *
 * @param packageName Package specifier understood by `require.resolve`.
 * @returns `{ packageJsonPath, packageJson }` for the matching manifest.
 * @throws If no matching manifest is found, or on any read/parse error other
 *   than a missing file (ENOENT).
 */
async function resolvePackageJson(packageName) {
  const entryDir = dirname(require.resolve(packageName));
  for (let dir = entryDir, parent = dirname(dir); dir !== parent; dir = parent, parent = dirname(dir)) {
    const candidatePath = join(dir, 'package.json');
    let manifest;
    try {
      manifest = JSON.parse(await readFile(candidatePath, 'utf8'));
    } catch (error) {
      // A directory without a package.json is expected while walking up; anything else is fatal.
      if (error?.code !== 'ENOENT') {
        throw error;
      }
      continue;
    }
    if (manifest.name === packageName) {
      return { packageJsonPath: candidatePath, packageJson: manifest };
    }
  }
  throw new Error(`Could not resolve package.json for ${packageName}`);
}
/**
 * Collects the report row for one installed package.
 *
 * @param packageName Package to look up via `resolvePackageJson`.
 * @returns `{ name, version, path, bytes }`: the requested name, the version
 *   from its manifest, its real (symlink-resolved) directory relative to
 *   `kloRoot`, and the size reported by `directoryBytes` for that directory.
 */
async function packageInfo(packageName) {
  const resolved = await resolvePackageJson(packageName);
  const packageDir = await realpath(dirname(resolved.packageJsonPath));
  const version = resolved.packageJson.version;
  const path = relative(kloRoot, packageDir);
  const bytes = await directoryBytes(packageDir);
  return { name: packageName, version, path, bytes };
}
/**
 * Opens a PGlite database at `dataDir` with the vector and pg_trgm extensions
 * and makes sure the spike schema (tables plus FTS, ivfflat and trigram
 * indexes) exists. All DDL uses IF NOT EXISTS, so reopening an existing data
 * directory is safe.
 *
 * If the schema setup fails after the database was opened, the handle is
 * closed before the error is rethrown; otherwise the caller would never
 * receive the handle and could not release it.
 *
 * @param PGlite Object exposing an async `create(options)` factory.
 * @param vector Extension object registered under `extensions.vector`.
 * @param pg_trgm Extension object registered under `extensions.pg_trgm`.
 * @param dataDir Data directory passed through to `PGlite.create`.
 * @returns The opened database handle.
 * @throws Whatever `PGlite.create` or `db.exec` rejects with.
 */
async function createDb(PGlite, vector, pg_trgm, dataDir) {
  const db = await PGlite.create({
    dataDir,
    extensions: {
      vector,
      pg_trgm,
    },
  });
  try {
    await db.exec(`
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE TABLE IF NOT EXISTS spike_documents (
id TEXT PRIMARY KEY,
search_text TEXT NOT NULL,
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
embedding vector(3) NOT NULL
);
CREATE INDEX IF NOT EXISTS spike_documents_fts_idx
ON spike_documents
USING GIN (to_tsvector('english', search_text));
CREATE INDEX IF NOT EXISTS spike_documents_vector_idx
ON spike_documents
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 1);
CREATE TABLE IF NOT EXISTS spike_dictionary_values (
connection_id TEXT NOT NULL,
source_name TEXT NOT NULL,
column_name TEXT NOT NULL,
value TEXT NOT NULL,
PRIMARY KEY (connection_id, source_name, column_name, value)
);
CREATE INDEX IF NOT EXISTS spike_dictionary_values_trgm_idx
ON spike_dictionary_values
USING GIN (value gin_trgm_ops);
`);
  } catch (error) {
    // Release the half-initialised handle; a failure while closing must not
    // hide the original setup error.
    try {
      if (typeof db.close === 'function') {
        await db.close();
      }
    } catch {
      // Intentionally ignored: the setup error below is the one worth reporting.
    }
    throw error;
  }
  return db;
}
/**
 * Loads the fixture rows used by the search probes.
 *
 * Upserts three documents into `spike_documents` (metadata and embedding are
 * sent as JSON strings and cast server-side) and then inserts three dictionary
 * values into `spike_dictionary_values`, ignoring rows that already exist.
 *
 * @param db Database handle exposing an async `query(sql, params?)` method.
 * @returns Resolves with no value once both statements have run.
 */
async function seed(db) {
  // One tuple per document: [id, search_text, metadata, embedding].
  const documents = [
    [
      'warehouse/orders',
      'orders paid revenue refund status customer',
      { connectionId: 'warehouse', sourceName: 'orders' },
      [1, 0, 0],
    ],
    [
      'finance/orders',
      'orders finance bookings gross margin',
      { connectionId: 'finance', sourceName: 'orders' },
      [0.72, 0.28, 0],
    ],
    [
      'warehouse/customers',
      'customers accounts lifecycle region',
      { connectionId: 'warehouse', sourceName: 'customers' },
      [0, 1, 0],
    ],
  ];
  // Flatten to the positional order $1..$12 expected by the statement below.
  const documentParams = documents.flatMap(([id, searchText, metadata, embedding]) => [
    id,
    searchText,
    JSON.stringify(metadata),
    JSON.stringify(embedding),
  ]);
  const upsertDocumentsSql = `
INSERT INTO spike_documents (id, search_text, metadata, embedding)
VALUES
($1, $2, $3::jsonb, $4::vector),
($5, $6, $7::jsonb, $8::vector),
($9, $10, $11::jsonb, $12::vector)
ON CONFLICT (id) DO UPDATE
SET search_text = EXCLUDED.search_text,
metadata = EXCLUDED.metadata,
embedding = EXCLUDED.embedding
`;
  const insertDictionarySql = `
INSERT INTO spike_dictionary_values (connection_id, source_name, column_name, value)
VALUES
('warehouse', 'orders', 'status', 'refunded'),
('warehouse', 'orders', 'status', 'paid'),
('warehouse', 'customers', 'region', 'emea')
ON CONFLICT DO NOTHING
`;
  await db.query(upsertDocumentsSql, documentParams);
  await db.query(insertDictionarySql);
}
/**
 * Closes a database handle when it offers a `close` method.
 *
 * @param db Handle to close; objects without a callable `close` are left untouched.
 * @returns Resolves with no value once `close()` (if any) has settled.
 */
async function closeDb(db) {
  const canClose = typeof db.close === 'function';
  if (!canClose) {
    return;
  }
  await db.close();
}
/**
 * Runs the PGlite spike end to end and writes the Markdown report.
 *
 * Steps, in order: time the dynamic import of PGlite and its extensions; create
 * a persistent database in a fresh temp directory; seed it; time one FTS, one
 * pgvector and one pg_trgm query plus four parallel reads on the same instance;
 * try to open the same data directory a second time while the first handle is
 * still open; close and reopen the database; measure installed package sizes;
 * then write the report to `reportPath` and print the raw result as JSON.
 *
 * Open database handles and the temp directory are released in `finally`,
 * whether or not a step failed.
 */
async function main() {
// The import itself is one of the measured probes, hence the dynamic import.
const importTimer = await timed('dynamic import @electric-sql/pglite', async () => {
const [{ PGlite }, { vector }, { pg_trgm }] = await Promise.all([
import('@electric-sql/pglite'),
import('@electric-sql/pglite/vector'),
import('@electric-sql/pglite/contrib/pg_trgm'),
]);
return { PGlite, vector, pg_trgm };
});
const { PGlite, vector, pg_trgm } = importTimer.value;
const tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-report-'));
const dataDir = join(tempDir, 'pgdata');
// Declared outside the try block so the finally block can close whichever handles are still open.
let db;
let reopened;
try {
const createTimer = await timed('create persistent PGlite database and load extensions', async () => {
db = await createDb(PGlite, vector, pg_trgm, dataDir);
return true;
});
const seedTimer = await timed('seed hybrid search fixture', async () => seed(db));
// Full-text probe: best-ranked document for the web-search style query "paid orders".
const ftsTimer = await timed('Postgres FTS query', () =>
db.query(
`
SELECT id
FROM spike_documents
WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
ORDER BY ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) DESC, id ASC
LIMIT 1
`,
['paid orders'],
),
);
// Vector probe: nearest document by cosine distance to the embedding [1, 0, 0].
const vectorTimer = await timed('pgvector cosine query', () =>
db.query(
`
SELECT id, 1 - (embedding <=> $1::vector) AS similarity
FROM spike_documents
ORDER BY embedding <=> $1::vector, id ASC
LIMIT 1
`,
[JSON.stringify([1, 0, 0])],
),
);
// Trigram probe: dictionary value most similar to the partial term "refund".
const trigramTimer = await timed('pg_trgm dictionary query', () =>
db.query(
`
SELECT connection_id || '/' || source_name AS id, value, similarity(value, $1) AS score
FROM spike_dictionary_values
WHERE similarity(value, $1) > 0
ORDER BY score DESC, id ASC, value ASC
LIMIT 1
`,
['refund'],
),
);
// Four reads issued concurrently against the single open handle.
const sameInstanceTimer = await timed('same instance parallel reads', () =>
Promise.all(Array.from({ length: 4 }, () => db.query('SELECT COUNT(*)::int AS count FROM spike_documents'))),
);
// Try a second opener on the same data directory while `db` is still open.
// Either outcome is only recorded for the report; a failure here does not abort the run.
let secondOpenStatus = 'opened';
let secondOpenMessage = 'Second direct opener executed SELECT 1.';
let second;
try {
second = await createDb(PGlite, vector, pg_trgm, dataDir);
await second.query('SELECT 1');
} catch (error) {
secondOpenStatus = 'blocked';
secondOpenMessage = error instanceof Error ? error.message : String(error);
} finally {
if (second) {
await closeDb(second);
}
}
// Close the first handle before reopening, and clear the variable so the outer finally does not close it again.
await closeDb(db);
db = undefined;
// Persistence probe: reopen the same data directory and count the seeded rows.
const reopenTimer = await timed('reopen persistent PGlite database', async () => {
reopened = await createDb(PGlite, vector, pg_trgm, dataDir);
return reopened.query('SELECT COUNT(*)::int AS count FROM spike_documents');
});
const packages = await Promise.all([
packageInfo('@electric-sql/pglite'),
packageInfo('@electric-sql/pglite-socket'),
]);
// Raw result object; printed as JSON at the end and used to fill the Markdown template.
const result = {
generatedAt: new Date().toISOString(),
node: process.version,
packages,
timingsMs: {
import: importTimer.durationMs,
createAndExtensions: createTimer.durationMs,
seed: seedTimer.durationMs,
ftsQuery: ftsTimer.durationMs,
vectorQuery: vectorTimer.durationMs,
trigramQuery: trigramTimer.durationMs,
sameInstanceParallelReads: sameInstanceTimer.durationMs,
reopen: reopenTimer.durationMs,
},
topResults: {
fts: ftsTimer.value.rows[0]?.id ?? null,
vector: vectorTimer.value.rows[0]?.id ?? null,
trigram: trigramTimer.value.rows[0]?.id ?? null,
persistedRowCount: reopenTimer.value.rows[0]?.count ?? null,
},
concurrency: {
sameInstanceReadCounts: sameInstanceTimer.value.map((queryResult) => queryResult.rows[0]?.count ?? null),
secondDirectOpenStatus: secondOpenStatus,
secondDirectOpenMessage: secondOpenMessage,
},
};
const totalPackageBytes = packages.reduce((sum, pkg) => sum + pkg.bytes, 0);
// Both outcomes point at an owner-process/socket design; only the stated justification differs.
const recommendation =
secondOpenStatus === 'opened'
? 'Prototype a PGlite backend behind an explicit owner process or socket before exposing CLI plus MCP concurrent access.'
: 'Use a socket or owner-process architecture for any PGlite backend prototype because direct second opener access was blocked.';
const markdown = `# Hybrid Search PGlite Spike
Generated: ${result.generatedAt}
## Summary
PGlite loaded in Node ${result.node}, enabled vector and pg_trgm extensions, executed Postgres FTS, pgvector cosine ranking, pg_trgm dictionary ranking, and reopened a persistent filesystem database.
Recommendation: ${recommendation}
## Package Footprint
| Package | Version | Approx bytes | Resolved path |
| --- | --- | ---: | --- |
${packages.map((pkg) => `| \`${pkg.name}\` | \`${pkg.version}\` | ${pkg.bytes} | \`${pkg.path}\` |`).join('\n')}
Total measured package bytes: ${totalPackageBytes}
## Timings
| Probe | Duration ms |
| --- | ---: |
${Object.entries(result.timingsMs)
.map(([name, ms]) => `| ${name} | ${ms} |`)
.join('\n')}
## Search Feature Results
| Probe | Top result |
| --- | --- |
| Postgres FTS | \`${result.topResults.fts}\` |
| pgvector cosine | \`${result.topResults.vector}\` |
| pg_trgm dictionary | \`${result.topResults.trigram}\` |
| Reopened persisted row count | \`${result.topResults.persistedRowCount}\` |
## Concurrency Observation
Same-instance parallel read counts: \`${result.concurrency.sameInstanceReadCounts.join(', ')}\`
Second direct opener status: \`${result.concurrency.secondDirectOpenStatus}\`
Second direct opener message:
\`\`\`text
${result.concurrency.secondDirectOpenMessage}
\`\`\`
## Decision
The SQLite backend remains the production default. The next PGlite step, if approved, is an owner-process or socket-backed prototype that reuses the existing \`SearchBackendCapabilities\` and backend conformance helpers without changing the public CLI surface.
`;
// Write the report, then echo its location (relative to the current working directory) and the raw JSON result.
await writeFile(reportPath, markdown);
process.stdout.write(`Wrote ${relative(process.cwd(), reportPath)}\n`);
process.stdout.write(JSON.stringify(result, null, 2));
process.stdout.write('\n');
} finally {
// Release any handle that is still open, then delete the temporary data directory.
if (db) {
await closeDb(db);
}
if (reopened) {
await closeDb(reopened);
}
await rm(tempDir, { recursive: true, force: true });
}
}
// Entry point: run the spike; on any failure log it and mark the process as failed.
main().catch((failure) => {
  console.error(failure);
  process.exitCode = 1;
});

View file

@ -0,0 +1,317 @@
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { createServer } from 'node:net';
import { tmpdir } from 'node:os';
import { dirname, join, resolve } from 'node:path';
import { performance } from 'node:perf_hooks';
import { fileURLToPath } from 'node:url';
import { PGlite } from '@electric-sql/pglite';
import { pg_trgm } from '@electric-sql/pglite/contrib/pg_trgm';
import { vector } from '@electric-sql/pglite/vector';
import { PGLiteSocketServer } from '@electric-sql/pglite-socket';
import { Client } from 'pg';
// Paths are derived from this module's own location, not from the caller's working directory.
const scriptDir = dirname(fileURLToPath(import.meta.url));
// Parent directory of the directory that holds this script.
const contextDir = resolve(scriptDir, '..');
// Two directory levels above contextDir.
const kloRoot = resolve(contextDir, '../..');
// Markdown report that main() writes at the end of a successful run.
const reportPath = join(kloRoot, 'docs', 'hybrid-search-pglite-owner-process.md');
/**
 * Runs an async probe and measures how long it takes to settle.
 *
 * @param label - Name of the probe, echoed back unchanged in the result.
 * @param fn - Zero-argument callback whose result is awaited.
 * @returns The label, the elapsed wall-clock milliseconds rounded to two decimals, and the awaited value.
 */
async function timed(label, fn) {
  const startedAt = performance.now();
  const value = await fn();
  const elapsedMs = performance.now() - startedAt;
  return { label, durationMs: Number(elapsedMs.toFixed(2)), value };
}
/**
 * Asks the operating system for a free loopback TCP port by briefly binding a listener to port 0.
 *
 * The temporary listener is closed before the port is returned, so the caller still has to bind it;
 * another process could take the port in between.
 *
 * @returns The port number the temporary listener was bound to.
 * @throws If the listener cannot bind, reports a non-object address, or fails to close.
 */
async function allocatePort() {
  const server = createServer();
  await new Promise((resolve, reject) => {
    // Without an 'error' listener a failed bind would leave this promise pending forever and
    // surface as an unhandled 'error' event instead of a rejection.
    server.once('error', reject);
    server.listen(0, '127.0.0.1', resolve);
  });
  const address = server.address();
  if (typeof address !== 'object' || address === null) {
    throw new Error('Expected TCP server address while allocating a PGlite owner-process port.');
  }
  await new Promise((resolve, reject) => {
    server.close((error) => {
      if (error) {
        reject(error);
        return;
      }
      resolve();
    });
  });
  return address.port;
}
/**
 * Opens the PGlite database in `dataDir`, makes sure the prototype schema exists, and serves it to
 * PostgreSQL clients through a socket server bound to 127.0.0.1.
 *
 * @param dataDir - Directory handed to PGlite as its data directory.
 * @param port - TCP port the socket server listens on.
 * @returns The database handle, the started socket server, and a client config pointing at that server.
 * @throws Whatever schema setup or server start throws; the database is closed before rethrowing.
 */
async function createOwner(dataDir, port) {
  const db = await PGlite.create({
    dataDir,
    extensions: {
      vector,
      pg_trgm,
    },
  });
  try {
    // Every statement uses IF NOT EXISTS, so the same data directory can be opened again later.
    await db.exec(`
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE TABLE IF NOT EXISTS prototype_documents (
id TEXT PRIMARY KEY,
search_text TEXT NOT NULL,
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
embedding vector(3) NOT NULL
);
CREATE INDEX IF NOT EXISTS prototype_documents_fts_idx
ON prototype_documents
USING GIN (to_tsvector('english', search_text));
CREATE INDEX IF NOT EXISTS prototype_documents_vector_idx
ON prototype_documents
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 1);
CREATE TABLE IF NOT EXISTS prototype_dictionary_values (
connection_id TEXT NOT NULL,
source_name TEXT NOT NULL,
column_name TEXT NOT NULL,
value TEXT NOT NULL,
PRIMARY KEY (connection_id, source_name, column_name, value)
);
CREATE INDEX IF NOT EXISTS prototype_dictionary_values_trgm_idx
ON prototype_dictionary_values
USING GIN (value gin_trgm_ops);
`);
    const server = new PGLiteSocketServer({
      db,
      host: '127.0.0.1',
      port,
      maxConnections: 100,
    });
    await server.start();
    return {
      db,
      server,
      connectionConfig: {
        host: '127.0.0.1',
        port,
        user: 'postgres',
        database: 'postgres',
        application_name: 'klo-pglite-owner-report',
        connectionTimeoutMillis: 5_000,
      },
    };
  } catch (error) {
    // The caller only gets the handle on success, so nobody else could close the database
    // if schema setup or the server start fails. Close errors are ignored to keep the original error.
    await db.close().catch(() => undefined);
    throw error;
  }
}
/**
 * Connects a dedicated PostgreSQL client, hands it to `fn`, and ends the client once `fn` settles.
 *
 * @param connectionConfig - Options passed to the `Client` constructor.
 * @param fn - Async callback that receives the connected client.
 * @returns Whatever `fn` resolves to; a rejection from `fn` propagates after the client is ended.
 */
async function withClient(connectionConfig, fn) {
  const session = new Client(connectionConfig);
  await session.connect();
  try {
    const outcome = await fn(session);
    return outcome;
  } finally {
    await session.end();
  }
}
/**
 * Loads the sample data for the probes over one client connection: three documents (upserted by id)
 * and three dictionary values (existing rows are left untouched).
 *
 * @param connectionConfig - Client options forwarded to `withClient`.
 */
async function seed(connectionConfig) {
  const documents = [
    {
      id: 'warehouse/orders',
      searchText: 'orders paid revenue refund status customer',
      metadata: { connectionId: 'warehouse', sourceName: 'orders' },
      embedding: [1, 0, 0],
    },
    {
      id: 'finance/orders',
      searchText: 'orders finance bookings gross margin',
      metadata: { connectionId: 'finance', sourceName: 'orders' },
      embedding: [0.72, 0.28, 0],
    },
    {
      id: 'warehouse/customers',
      searchText: 'customers accounts lifecycle region',
      metadata: { connectionId: 'warehouse', sourceName: 'customers' },
      embedding: [0, 1, 0],
    },
  ];
  // Four positional parameters per document, in the order the VALUES tuples expect them.
  const documentParams = documents.flatMap((doc) => [
    doc.id,
    doc.searchText,
    JSON.stringify(doc.metadata),
    JSON.stringify(doc.embedding),
  ]);
  await withClient(connectionConfig, async (client) => {
    await client.query(
      `
INSERT INTO prototype_documents (id, search_text, metadata, embedding)
VALUES
($1, $2, $3::jsonb, $4::vector),
($5, $6, $7::jsonb, $8::vector),
($9, $10, $11::jsonb, $12::vector)
ON CONFLICT (id) DO UPDATE
SET search_text = EXCLUDED.search_text,
metadata = EXCLUDED.metadata,
embedding = EXCLUDED.embedding
`,
      documentParams,
    );
    await client.query(`
INSERT INTO prototype_dictionary_values (connection_id, source_name, column_name, value)
VALUES
('warehouse', 'orders', 'status', 'refunded'),
('warehouse', 'orders', 'status', 'paid'),
('warehouse', 'customers', 'region', 'emea')
ON CONFLICT DO NOTHING
`);
  });
}
/**
 * Runs the lexical (full-text), semantic (vector distance), and dictionary (trigram similarity)
 * probes one after another on a single client and reports the top id of each.
 *
 * @param connectionConfig - Client options forwarded to `withClient`.
 * @returns `{ lexical, semantic, dictionary }`, each the first row's `id` or `'<missing>'` when absent.
 */
async function queryTopResults(connectionConfig) {
  // Falls back to a visible marker so the report shows which probe returned nothing.
  const topId = (result) => result.rows[0]?.id ?? '<missing>';
  return await withClient(connectionConfig, async (client) => {
    const lexicalHit = await client.query(
      `
SELECT id
FROM prototype_documents
WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
ORDER BY ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) DESC, id ASC
LIMIT 1
`,
      ['paid orders'],
    );
    const semanticHit = await client.query(
      `
SELECT id
FROM prototype_documents
ORDER BY embedding <=> $1::vector, id ASC
LIMIT 1
`,
      [JSON.stringify([1, 0, 0])],
    );
    const dictionaryHit = await client.query(
      `
SELECT connection_id || '/' || source_name AS id
FROM prototype_dictionary_values
WHERE similarity(value, $1) > 0
ORDER BY similarity(value, $1) DESC, id ASC, value ASC
LIMIT 1
`,
      ['refund'],
    );
    return {
      lexical: topId(lexicalHit),
      semantic: topId(semanticHit),
      dictionary: topId(dictionaryHit),
    };
  });
}
/**
 * Opens four independent client connections and runs the same row-count query on all of them in parallel.
 *
 * @param connectionConfig - Options passed to each `Client` constructor.
 * @returns The `count` from each client's first row, or `null` where no row came back.
 * @throws The first connect or query failure, after every client has been asked to end.
 */
async function concurrentReads(connectionConfig) {
  // Construct every client before connecting so the finally block can end all of them,
  // including the ones that connected successfully before a sibling's connect() rejected.
  const clients = Array.from({ length: 4 }, () => new Client(connectionConfig));
  try {
    await Promise.all(clients.map((client) => client.connect()));
    const results = await Promise.all(
      clients.map((client) => client.query('SELECT COUNT(*)::int AS count FROM prototype_documents')),
    );
    return results.map((result) => result.rows[0]?.count ?? null);
  } finally {
    // Best-effort teardown: a failure to end one client must not mask the original error.
    await Promise.all(clients.map((client) => client.end().catch(() => undefined)));
  }
}
/**
 * Shuts an owner down: stops the socket server, then closes the database.
 *
 * @param owner - Object with the `server` and `db` handles returned by `createOwner`.
 * @throws The error from `server.stop()` or `db.close()`; the database is closed in either case.
 */
async function stopOwner(owner) {
  try {
    await owner.server.stop();
  } finally {
    // Close the database even when the server fails to stop, so the PGlite instance is not left open.
    await owner.db.close();
  }
}
// Runs the owner-process probe end to end in a throwaway temp directory: start the owner, seed it,
// run the search and concurrency probes, restart the owner on the same data directory, then write
// the Markdown report and print a JSON summary.
async function main() {
const tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-owner-report-'));
const dataDir = join(tempDir, 'pgdata');
const port = await allocatePort();
// Holds the currently running owner so the finally block can stop it if a later step throws.
let owner;
try {
const startTimer = await timed('startOwner', async () => await createOwner(dataDir, port));
owner = startTimer.value;
const seedTimer = await timed('seed', async () => await seed(owner.connectionConfig));
const queryTimer = await timed('searchQueries', async () => await queryTopResults(owner.connectionConfig));
const concurrentTimer = await timed('concurrentReads', async () => await concurrentReads(owner.connectionConfig));
// Stop the owner and start a new one on the same data directory and port; the row count read
// afterwards shows whether the seeded rows survived the restart.
await stopOwner(owner);
owner = undefined;
const restartTimer = await timed('restartOwner', async () => await createOwner(dataDir, port));
owner = restartTimer.value;
const persisted = await withClient(owner.connectionConfig, async (client) => {
const result = await client.query('SELECT COUNT(*)::int AS count FROM prototype_documents');
return result.rows[0]?.count ?? null;
});
// Report body; the timings and probe results are interpolated from the timers above.
const markdown = `# Hybrid Search PGlite Owner Process Prototype
Generated: ${new Date().toISOString()}
## Summary
PGlite started behind one explicit owner process, enabled vector and pg_trgm extensions, served PostgreSQL clients through \`@electric-sql/pglite-socket\`, answered lexical, semantic, and dictionary probes, and preserved rows across owner restart.
Recommendation: Keep SQLite as the production default. The next PGlite implementation step should be a private adapter prototype behind an explicit configuration flag, still guarded by backend conformance tests, before any CLI or MCP default changes.
## Timings
| Probe | Duration ms |
| --- | ---: |
| startOwner | ${startTimer.durationMs} |
| seed | ${seedTimer.durationMs} |
| searchQueries | ${queryTimer.durationMs} |
| concurrentReads | ${concurrentTimer.durationMs} |
| restartOwner | ${restartTimer.durationMs} |
## Search Feature Results
| Probe | Top result |
| --- | --- |
| Postgres FTS through socket | \`${queryTimer.value.lexical}\` |
| pgvector cosine through socket | \`${queryTimer.value.semantic}\` |
| pg_trgm dictionary through socket | \`${queryTimer.value.dictionary}\` |
| Reopened persisted row count | \`${persisted}\` |
## Concurrency Observation
Concurrent socket read counts: \`${concurrentTimer.value.join(', ')}\`
## Decision
The owner-process shape is viable for a prototype because it gives CLI and MCP callers a PostgreSQL protocol boundary without opening the same PGlite data directory from independent runtimes. This report is not a production adapter acceptance record.
`;
await writeFile(reportPath, markdown);
console.log(`Wrote ${reportPath}`);
// Machine-readable copy of the same measurements on stdout.
console.log(
JSON.stringify(
{
port,
timings: {
startOwner: startTimer.durationMs,
seed: seedTimer.durationMs,
searchQueries: queryTimer.durationMs,
concurrentReads: concurrentTimer.durationMs,
restartOwner: restartTimer.durationMs,
},
topResults: queryTimer.value,
concurrentReads: concurrentTimer.value,
persisted,
},
null,
2,
),
);
} finally {
if (owner) {
// Best-effort shutdown: stop errors are ignored so the temp directory below is still removed.
await stopOwner(owner).catch(() => undefined);
}
await rm(tempDir, { recursive: true, force: true });
}
}
// Script entry point (top-level await); a rejection from main() is not caught here.
await main();

View file

@ -0,0 +1,263 @@
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { createServer } from 'node:net';
import { tmpdir } from 'node:os';
import { dirname, join, resolve } from 'node:path';
import { performance } from 'node:perf_hooks';
import { fileURLToPath } from 'node:url';
import { PGlite } from '@electric-sql/pglite';
import { pg_trgm } from '@electric-sql/pglite/contrib/pg_trgm';
import { vector } from '@electric-sql/pglite/vector';
import { PGLiteSocketServer } from '@electric-sql/pglite-socket';
import { Client } from 'pg';
// Paths are derived from this module's own location, not from the caller's working directory.
const scriptDir = dirname(fileURLToPath(import.meta.url));
// Parent directory of the directory that holds this script.
const contextDir = resolve(scriptDir, '..');
// Two directory levels above contextDir.
const kloRoot = resolve(contextDir, '../..');
// Markdown report that main() writes at the end of a successful run.
const reportPath = join(kloRoot, 'docs', 'hybrid-search-pglite-sl-adapter-prototype.md');
/**
 * Awaits `fn` and reports how long it took.
 *
 * @param label - Probe name, returned unchanged alongside the measurement.
 * @param fn - Zero-argument callback to await.
 * @returns `{ label, durationMs, value }`, where `durationMs` is rounded to two decimal places.
 */
async function timed(label, fn) {
  const t0 = performance.now();
  const value = await fn();
  const rounded = (performance.now() - t0).toFixed(2);
  return { label, durationMs: Number(rounded), value };
}
/**
 * Asks the operating system for a free loopback TCP port by briefly binding a listener to port 0.
 *
 * The temporary listener is closed before the port is returned, so the caller still has to bind it;
 * another process could take the port in between.
 *
 * @returns The port number the temporary listener was bound to.
 * @throws If the listener cannot bind, reports a non-object address, or fails to close.
 */
async function allocatePort() {
  const server = createServer();
  await new Promise((resolve, reject) => {
    // Without an 'error' listener a failed bind would leave this promise pending forever and
    // surface as an unhandled 'error' event instead of a rejection.
    server.once('error', reject);
    server.listen(0, '127.0.0.1', resolve);
  });
  const address = server.address();
  if (typeof address !== 'object' || address === null) {
    throw new Error('Expected TCP server address while allocating a PGlite SL prototype port.');
  }
  await new Promise((resolve, reject) => {
    server.close((error) => {
      if (error) {
        reject(error);
        return;
      }
      resolve();
    });
  });
  return address.port;
}
/**
 * Opens a PGlite database in `dataDir`, creates the semantic-layer prototype schema, and serves it to
 * PostgreSQL clients through a socket server bound to 127.0.0.1.
 *
 * The tables and indexes are created without IF NOT EXISTS, so this presumably expects a data
 * directory that does not already contain them.
 *
 * @param dataDir - Directory handed to PGlite as its data directory.
 * @param port - TCP port the socket server listens on.
 * @returns The database handle, the started socket server, and a client config pointing at that server.
 * @throws Whatever schema setup or server start throws; the database is closed before rethrowing.
 */
async function createOwner(dataDir, port) {
  const db = await PGlite.create({
    dataDir,
    extensions: { vector, pg_trgm },
  });
  try {
    await db.exec(`
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE TABLE prototype_sl_sources (
connection_id TEXT NOT NULL,
source_name TEXT NOT NULL,
search_text TEXT NOT NULL,
embedding vector(3),
PRIMARY KEY (connection_id, source_name)
);
CREATE INDEX prototype_sl_sources_fts_idx
ON prototype_sl_sources
USING GIN (to_tsvector('english', search_text));
CREATE INDEX prototype_sl_sources_vector_idx
ON prototype_sl_sources
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 1);
CREATE TABLE prototype_sl_dictionary_values (
connection_id TEXT NOT NULL,
source_name TEXT NOT NULL,
column_name TEXT NOT NULL,
value TEXT NOT NULL,
value_lower TEXT NOT NULL,
PRIMARY KEY (connection_id, source_name, column_name, value)
);
CREATE INDEX prototype_sl_dictionary_values_trgm_idx
ON prototype_sl_dictionary_values
USING GIN (value gin_trgm_ops);
`);
    const server = new PGLiteSocketServer({ db, host: '127.0.0.1', port, maxConnections: 100 });
    await server.start();
    return {
      db,
      server,
      connectionConfig: {
        host: '127.0.0.1',
        port,
        user: 'postgres',
        database: 'postgres',
        application_name: 'klo-pglite-sl-prototype-report',
        connectionTimeoutMillis: 5_000,
      },
    };
  } catch (error) {
    // The caller only gets the handle on success, so nobody else could close the database
    // if schema setup or the server start fails. Close errors are ignored to keep the original error.
    await db.close().catch(() => undefined);
    throw error;
  }
}
/**
 * Runs `fn` with a freshly connected PostgreSQL client and ends that client when `fn` settles.
 *
 * @param connectionConfig - Options passed to the `Client` constructor.
 * @param fn - Async callback that receives the connected client.
 * @returns Whatever `fn` resolves to; a rejection from `fn` propagates after the client is ended.
 */
async function withClient(connectionConfig, fn) {
  const pgClient = new Client(connectionConfig);
  await pgClient.connect();
  try {
    const produced = await fn(pgClient);
    return produced;
  } finally {
    await pgClient.end();
  }
}
/**
 * Inserts the sample semantic-layer rows over one client connection: three sources with embeddings
 * and three dictionary values. Both statements are plain INSERTs without conflict handling.
 *
 * @param connectionConfig - Client options forwarded to `withClient`.
 */
async function seed(connectionConfig) {
  const sources = [
    {
      connectionId: 'warehouse',
      sourceName: 'orders',
      searchText: 'orders paid revenue refund status customer',
      embedding: [1, 0, 0],
    },
    {
      connectionId: 'finance',
      sourceName: 'orders',
      searchText: 'orders finance bookings gross margin',
      embedding: [0.72, 0.28, 0],
    },
    {
      connectionId: 'warehouse',
      sourceName: 'customers',
      searchText: 'customers accounts lifecycle region',
      embedding: [0, 1, 0],
    },
  ];
  // Four positional parameters per source, in the order the VALUES tuples expect them.
  const sourceParams = sources.flatMap((source) => [
    source.connectionId,
    source.sourceName,
    source.searchText,
    JSON.stringify(source.embedding),
  ]);
  await withClient(connectionConfig, async (client) => {
    await client.query(
      `
INSERT INTO prototype_sl_sources (connection_id, source_name, search_text, embedding)
VALUES
($1, $2, $3, $4::vector),
($5, $6, $7, $8::vector),
($9, $10, $11, $12::vector)
`,
      sourceParams,
    );
    await client.query(`
INSERT INTO prototype_sl_dictionary_values (connection_id, source_name, column_name, value, value_lower)
VALUES
('warehouse', 'orders', 'status', 'refunded', 'refunded'),
('warehouse', 'orders', 'status', 'paid', 'paid'),
('warehouse', 'customers', 'region', 'emea', 'emea')
`);
  });
}
/**
 * Runs the lexical (full-text), semantic (vector distance), and dictionary probes one after another
 * on a single client and reports the top `connection/source` id of each. The dictionary probe ranks
 * by the larger of the trigram similarity and a fixed 0.75 score for a substring match on `value_lower`.
 *
 * @param connectionConfig - Client options forwarded to `withClient`.
 * @returns `{ lexical, semantic, dictionary }`, each the first row's `id` or `'<missing>'` when absent.
 */
async function queryTopResults(connectionConfig) {
  // Falls back to a visible marker so the report shows which probe returned nothing.
  const topId = (result) => result.rows[0]?.id ?? '<missing>';
  return withClient(connectionConfig, async (client) => {
    const lexicalHit = await client.query(
      `
SELECT connection_id || '/' || source_name AS id
FROM prototype_sl_sources
WHERE to_tsvector('english', search_text) @@ websearch_to_tsquery('english', $1)
ORDER BY ts_rank_cd(to_tsvector('english', search_text), websearch_to_tsquery('english', $1)) DESC, id ASC
LIMIT 1
`,
      ['paid revenue'],
    );
    const semanticHit = await client.query(
      `
SELECT connection_id || '/' || source_name AS id
FROM prototype_sl_sources
ORDER BY embedding <=> $1::vector, id ASC
LIMIT 1
`,
      [JSON.stringify([1, 0, 0])],
    );
    const dictionaryHit = await client.query(
      `
SELECT connection_id || '/' || source_name AS id
FROM prototype_sl_dictionary_values
WHERE similarity(value, $1) > 0 OR value_lower LIKE '%' || lower($1) || '%'
ORDER BY GREATEST(similarity(value, $1), CASE WHEN value_lower LIKE '%' || lower($1) || '%' THEN 0.75 ELSE 0 END) DESC,
id ASC,
value ASC
LIMIT 1
`,
      ['refund'],
    );
    return {
      lexical: topId(lexicalHit),
      semantic: topId(semanticHit),
      dictionary: topId(dictionaryHit),
    };
  });
}
/**
 * Shuts an owner down: stops the socket server, then closes the database.
 *
 * @param owner - Object with the `server` and `db` handles returned by `createOwner`.
 * @throws The error from `server.stop()` or `db.close()`; the database is closed in either case.
 */
async function stopOwner(owner) {
  try {
    await owner.server.stop();
  } finally {
    // Close the database even when the server fails to stop, so the PGlite instance is not left open.
    await owner.db.close();
  }
}
// Runs the semantic-layer adapter probe in a throwaway temp directory: start the owner, seed the
// index, run the search probes, then write the Markdown report and print a JSON summary.
async function main() {
const tempDir = await mkdtemp(join(tmpdir(), 'klo-pglite-sl-prototype-report-'));
const dataDir = join(tempDir, 'pgdata');
const port = await allocatePort();
// Holds the running owner so the finally block can stop it if a later step throws.
let owner;
try {
const startTimer = await timed('startOwner', async () => createOwner(dataDir, port));
owner = startTimer.value;
const seedTimer = await timed('seedSemanticLayerIndex', async () => seed(owner.connectionConfig));
const searchTimer = await timed('searchQueries', async () => queryTopResults(owner.connectionConfig));
// Report body; the timings and probe results are interpolated from the timers above.
const markdown = `# Hybrid Search PGlite Semantic-Layer Adapter Prototype
Generated: ${new Date().toISOString()}
## Summary
PGlite served a semantic-layer-style search index through one owner process and PostgreSQL clients. The probe returned lexical, semantic, and dictionary top results through Postgres FTS, pgvector ordering, and pg_trgm matching.
Recommendation: Keep SQLite as the production default. The PGlite semantic-layer adapter remains private and explicitly opt-in until a separate plan decides runtime dependencies, long-lived owner lifecycle, and CLI/MCP routing.
## Timings
| Probe | Duration ms |
| --- | ---: |
| startOwner | ${startTimer.durationMs} |
| seedSemanticLayerIndex | ${seedTimer.durationMs} |
| searchQueries | ${searchTimer.durationMs} |
## Search Feature Results
| Probe | Top result |
| --- | --- |
| Postgres FTS through socket | \`${searchTimer.value.lexical}\` |
| pgvector cosine through socket | \`${searchTimer.value.semantic}\` |
| pg_trgm dictionary through socket | \`${searchTimer.value.dictionary}\` |
## Decision
The private adapter shape is viable for semantic-layer search prototypes. It is not a production backend acceptance record and does not change the default SQLite search path.
`;
await writeFile(reportPath, markdown);
console.log(`Wrote ${reportPath}`);
// Machine-readable copy of the same measurements on stdout.
console.log(
JSON.stringify(
{
port,
timings: {
startOwner: startTimer.durationMs,
seed: seedTimer.durationMs,
searchQueries: searchTimer.durationMs,
},
topResults: searchTimer.value,
},
null,
2,
),
);
} finally {
if (owner) {
// Best-effort shutdown: stop errors are ignored so the temp directory below is still removed.
await stopOwner(owner).catch(() => undefined);
}
await rm(tempDir, { recursive: true, force: true });
}
}
// Script entry point (top-level await); a rejection from main() is not caught here.
await main();

View file

@ -0,0 +1,52 @@
import { dirname, join, resolve } from 'node:path';
import { fileURLToPath } from 'node:url';
import {
KLO_RELATIONSHIP_BENCHMARK_MODES,
buildKloRelationshipBenchmarkReport,
currentKloRelationshipBenchmarkDetector,
formatKloRelationshipBenchmarkReportMarkdown,
kloRelationshipBenchmarkDetectorWithLlm,
loadKloRelationshipBenchmarkFixtures,
runKloRelationshipBenchmarkSuite,
} from '../dist/scan/index.js';
// Paths are derived from this module's own location, not from the caller's working directory.
const scriptDir = dirname(fileURLToPath(import.meta.url));
// Parent directory of the directory that holds this script.
const packageRoot = resolve(scriptDir, '..');
// Directory passed to the fixture loader below.
const fixtureRoot = join(packageRoot, 'test/fixtures/relationship-benchmarks');
/**
 * Picks the relationship detector for the benchmark run from `KLO_BENCHMARK_*` environment variables.
 *
 * - `KLO_BENCHMARK_LLM_BACKEND` unset, empty, or `none`: the current detector without an LLM.
 * - `KLO_BENCHMARK_LLM_BACKEND=vertex`: an LLM-backed detector; requires `KLO_BENCHMARK_VERTEX_PROJECT`
 *   and `KLO_BENCHMARK_VERTEX_LOCATION`, and uses `KLO_BENCHMARK_LLM_MODEL` (default `claude-sonnet-4-6`).
 *
 * @returns The detector to pass to the benchmark suite.
 * @throws If the backend is not supported or the vertex project/location is missing.
 */
async function buildDetector() {
  const env = process.env;
  const backend = env.KLO_BENCHMARK_LLM_BACKEND;
  const llmDisabled = !backend || backend === 'none';
  if (llmDisabled) {
    return currentKloRelationshipBenchmarkDetector();
  }
  if (backend !== 'vertex') {
    throw new Error(`Unsupported KLO_BENCHMARK_LLM_BACKEND: ${backend}`);
  }
  const { KLO_BENCHMARK_VERTEX_PROJECT: project, KLO_BENCHMARK_VERTEX_LOCATION: location } = env;
  const model = env.KLO_BENCHMARK_LLM_MODEL ?? 'claude-sonnet-4-6';
  const vertexConfigured = Boolean(project) && Boolean(location);
  if (!vertexConfigured) {
    throw new Error('KLO_BENCHMARK_VERTEX_PROJECT and KLO_BENCHMARK_VERTEX_LOCATION are required for vertex backend');
  }
  // Loaded lazily so runs without an LLM backend never import the LLM package.
  const { createKloLlmProvider } = await import('@klo/llm');
  const provider = createKloLlmProvider({
    backend: 'vertex',
    vertex: { project, location },
    modelSlots: { default: model },
  });
  return kloRelationshipBenchmarkDetectorWithLlm(provider);
}
// Script body (top-level await): load the fixtures, pick the detector, run the suite, and print the
// Markdown report to stdout. The steps run strictly one after another.
const fixtures = await loadKloRelationshipBenchmarkFixtures(fixtureRoot);
const detector = await buildDetector();
const suite = await runKloRelationshipBenchmarkSuite({
fixtures,
detector,
});
// The report is built from the same fixtures plus the suite results and the exported mode list.
const report = buildKloRelationshipBenchmarkReport({
fixtures,
suite,
modes: KLO_RELATIONSHIP_BENCHMARK_MODES,
});
process.stdout.write(formatKloRelationshipBenchmarkReportMarkdown(report));

View file

@ -0,0 +1,34 @@
---
name: dbt_ingest
description: Map dbt `schema.yml` / `properties.yml` models and sources into KLO semantic-layer overlays and column notes. Covers `sources:` vs `models:`, column `data_tests` (not_null, unique, accepted_values, relationships), and how bundle-time writes complement manifest backfill from git sync. Load when the WorkUnit's `skillNames` includes `dbt_ingest` or when raw files are dbt YAML under `models/` / `sources/`.
callers: [memory_agent]
---
# dbt → KLO (bundle ingest)
Use this skill for **uploaded** dbt projects (`dbt_project.yml` at stage root, `models/**`, `sources/**`, `schema.yml`). There is **no** `fetch()` in v1 — scheduled `dbt parse` / `manifest.json` pulls are out of scope; host-provided dbt sync may still backfill structured test metadata into `_schema` on the next sync.
## Mapping (models / sources → SL)
| dbt | KLO | Notes |
|-----|--------|--------|
| `models:` entry with `columns:` | **Overlay** on the manifest table with the same name (after `wiki_sl_search` / `sl_describe_table`) | One SL source per physical table; model name may differ from DB name — resolve with `read_raw_file` + warehouse context. |
| `sources:` → `tables:` | Same as models; use `identifier` when present instead of logical `name`. | Schema + name must match how the connection sees tables. |
| Column `description` | `descriptions.user` or merged `descriptions` map on the column | Do not overwrite `dbt` description keys from sync. |
| `data_tests: not_null` / `unique` | Short hint in column `descriptions` or notes: “dbt: not null”, “dbt: unique” | Full structured metadata lands in manifest via **sync**; the skill keeps bundle-time SL text useful for the agent. |
| `accepted_values` | Add a **brief** line in the column description: allowed values (truncate long lists) | Also mention enum-like use in `wiki_sl_search` / filters. |
| `relationships` | Add or confirm `joins:` on the overlay **only** when `to` resolves to a real table via `read_raw_file` + `wiki_sl_search` / `sl_describe_table` | If the ref cannot be resolved, capture the intent in a wiki page instead. |
## 1.1 test hints (descriptions / meta)
When YAML shows `accepted_values` or `not_null`, add **short** hints into `columns[].descriptions` (e.g. under `user`) or freeform column notes so chat and validation see intent before the next git sync refreshes `constraints` / `enum_values` in `_schema`. Keep hints under a few words when possible.
## Overlap with MetricFlow
If the same bundle also has MetricFlow `semantic_models:` / `metrics:`, the **`metricflow_ingest`** skill owns semantic/metric shapes. This skill focuses on **raw dbt schema** YAML (`models`, `sources`, tests). If both apply, load `metricflow_ingest` first when the file is clearly MetricFlow; otherwise use `dbt_ingest` for `schema.yml` without semantic_models.
## Do not
- Do not run `dbt` CLI or assume `target/` / `manifest.json` exists in the upload.
- Do not invent joins from `relationships` tests if the target model/table is not found in SL or the warehouse.
- Do not read `peerFileIndex` paths — use `read_raw_file` only on `rawFiles` and `dependencyPaths` from the WorkUnit.

View file

@ -0,0 +1,153 @@
---
name: historic_sql_curator
description: Reconcile historic-SQL query knowledge pages by deduping collapsed intents, cross-linking categorical sub-clusters, and demoting stale low-signal pages.
callers: [memory_agent]
---
# Historic SQL Curator
Use this skill during Stage 4 reconciliation for the `historic-sql` source. It runs after `historic_sql_ingest` has written query knowledge pages from full-tier template WorkUnits. The Stage 4 runner may use curator pagination, so treat the current prompt as one bounded page of work and finish every listed item you inspect.
## Input Shape
The reconciliation prompt normally exposes:
- `# Stage Index` with WorkUnit keys, raw paths, and wiki or SL actions from Stage 3.
- `# Eviction Set` with deleted raw paths from retired templates.
- `# Curator Pass State` when curator pagination splits reconciliation into multiple passes.
- `# Source Reconciliation Notes` with run-level notes such as staged template count.
Use tools instead of guessing:
- `stage_list` shows every WorkUnit raw path and action.
- `stage_diff` compares two WorkUnits by written artifact overlap.
- `read_raw_span` reads staged `metadata.json`, `page.md`, `usage.json`, and `manifest.json` snippets when page content is not enough.
- `wiki_search`, `wiki_read`, and `wiki_write` inspect and update query knowledge pages.
- `emit_artifact_resolution` records merged or subsumed wiki pages for provenance.
- `eviction_list` and `emit_eviction_decision` handle deleted raw paths.
## Required Workflow
1. Read the `# Stage Index`, `# Eviction Set`, `# Curator Pass State`, and `# Source Reconciliation Notes` sections first.
2. Call `stage_list` when the prompt omits raw paths or when more than one WorkUnit wrote a `queries/...` page.
3. For each successful historic-SQL WorkUnit that wrote a wiki page, call `wiki_read` on that page before deciding whether to merge, cross-link, or demote it.
4. If the page body does not show fingerprint, sub-cluster, tables, or usage clearly enough, call `read_raw_span` on that WorkUnit's `metadata.json` and `usage.json` raw paths.
5. Build intent clusters using table overlap, representative SQL shape, page summaries, fingerprints, sub-cluster IDs, and usage. Same table is not enough to merge; the business intent must collapse.
6. Deduplicate collapsed intents by electing one canonical page, merging useful variant details into it with `wiki_write`, and recording each merged loser with `emit_artifact_resolution`.
7. Cross-link categorical sub-cluster pages that share the same base fingerprint but differ by `__cat_...` sub-cluster ID.
8. Demote pages whose underlying cluster has stayed below the floor in each of the most recent 3 windows, or in the current window alone when eviction evidence shows the template was retired.
9. For every deleted raw path in the Eviction Set that you inspect, call `eviction_list` and then `emit_eviction_decision`.
## Canonical Page Election
When two or more pages describe the same query intent, choose the canonical page with this order:
1. The clearest human-readable intent summary.
2. The page with broader non-service-account usage.
3. The page covering more fingerprints or categorical variants of the same intent.
4. The page with the most recent successful usage.
5. Lexicographically first page key.
After electing the canonical page:
- Read every page that will be merged.
- Update the canonical page so it contains one "Historic SQL Variants" section with fingerprints, sub-cluster IDs, tables, usage summaries, and links to sibling page keys when retained.
- Keep `tags` including `historic-sql` and `query-pattern`.
- Preserve useful `sl_refs`; when replacing refs, include the union of cleanly matched SL refs from merged pages.
- For each merged loser, call `emit_artifact_resolution` with:
```json
{
"rawPath": "<loser WorkUnit metadata.json or page.md raw path>",
"artifactKind": "wiki",
"artifactKey": "<loser wiki page key>",
"actionType": "merged",
"reason": "Historic-SQL query intent collapsed into <canonical wiki page key>."
}
```
Use `actionType: "subsumed"` only when the loser page is a thin duplicate with no unique facts worth retaining in the canonical body.
## Categorical Sub-Cluster Cross-Links
A categorical sub-cluster normally has a staged ID like `<fingerprint>__cat_<hash>` or page content that says `Sub-cluster: <value>`. For sibling pages that share the same base fingerprint:
1. Read all sibling pages visible in the current Stage Index or found through `wiki_search`.
2. Keep one page per meaningful category value.
3. Add or update a "Categorical Variants" section in each sibling page:
```markdown
### Categorical Variants
- `<category value>`: [[queries/<sibling_key>]] - <short intent or parameter note>
```
4. Use `wiki_write` with `refs` containing the sibling page keys so cross-links also live in frontmatter.
5. Do not merge categorical siblings only because they share a fingerprint. Merge them only when the category value no longer changes intent.
## Demotion
Demotion preserves history; it is not deletion. A page is demoted when evidence shows its underlying cluster has fallen below the historic-SQL floor:
- `executions < 3`, or
- `distinct_users < 2`, or
- service-account-only usage below the frequency floor, or
- the template was evicted and no active sibling or replacement page supports the same intent.
Require the low-signal state across the most recent 3 windows when page history is available. If only the current window is visible, demote only when eviction evidence confirms the raw template retired; otherwise add a caveat and leave the page active.
Use `wiki_write` to express demotion with the current wiki frontmatter fields:
- Add the `historic-sql-demoted` tag while preserving `historic-sql` and `query-pattern`.
- Prefix the summary with `Demoted historic-SQL pattern: ` unless it already begins with that phrase.
- Add a `### Demotion` section in the body with the last observed usage window, the floor that failed, and the raw path or fingerprint that supports the decision.
When demoting because of an eviction, also call `emit_eviction_decision`:
```json
{
"rawPath": "<deleted raw path>",
"artifactKind": "wiki",
"artifactKey": "<wiki page key>",
"action": "retained_deprecated",
"reason": "Historic-SQL template retired or decayed below the floor; page retained with historic-sql-demoted frontmatter tag."
}
```
## What To Write
Use `wiki_write` for every page update. The tool supports `summary`, `content`, `tags`, `refs`, and `sl_refs` frontmatter fields.
Canonical pages should keep this body shape:
````markdown
## <Canonical Query Intent>
- Source: historic-sql
- Tables: <tables>
- Fingerprints: <fingerprints and sub-clusters>
- Usage: <executions>, <distinct users>, first seen <date>, last seen <date>
### Representative SQL
```sql
<representative SQL or parameterized SQL>
```
### Historic SQL Variants
- `<fingerprint or sub-cluster>`: <what differs and when to use it>
### Categorical Variants
- `<category value>`: [[queries/<sibling_key>]] - <short intent or parameter note>
### Demotion
- Omit this section unless the page is demoted.
````
## Boundaries
- Do not call `context_candidate_write`; historic-SQL Stage 3 writes query pages directly.
- Do not create new artifact types, stores, ports, or tables.
- Do not group low-tier templates that triage already filtered out.
- Do not merge pages on table overlap alone.
- Do not delete a query page solely because usage is low; demote it unless eviction rules and inbound-reference evidence make removal clearly safer.
- Do not copy unredacted sample `bound_sql`, user emails, account IDs, tokens, or free-text literal values into wiki or SL output.
- Do not edit SL unless the reconciliation prompt shows a concrete same-intent conflict or duplicate that requires an existing SL artifact resolution.
- Do not finish a curator pagination pass while a merged page, demoted page, or inspected eviction lacks the corresponding provenance call.

View file

@ -0,0 +1,170 @@
---
name: historic_sql_ingest
description: Convert one full-tier historic-SQL template WorkUnit into a canonical query knowledge page, linked SL refs, and optional semantic-layer proposals.
callers: [memory_agent]
---
# Historic SQL Ingest
Use this skill when the WorkUnit contains files under `raw-sources/<connectionId>/historic-sql/<syncId>/templates/<templateId>/`.
Read exactly one historic-SQL template WorkUnit. Each WorkUnit represents one staged template or categorical sub-cluster that already survived full-tier page triage. It is not an intent cluster.
## Input Shape
The WorkUnit normally exposes:
- `metadata.json` in `rawFiles`.
- `page.md` in `rawFiles`.
- `usage.json` in `dependencyPaths`.
- `manifest.json` in `dependencyPaths`.
- `peerFileIndex` containing sibling templates that you cannot read.
`metadata.json` has the stable identity:
```json
{
"id": "fp_1",
"title": "snowflake - analytics.orders [fp_1]",
"path": "templates/fp_1/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_1",
"sub_cluster_id": null,
"dialect": "snowflake",
"tables_touched": ["analytics.orders"],
"literal_slots": [
{ "position": 1, "type": "string", "classification": "constant" },
{ "position": 2, "type": "date", "classification": "runtime" }
],
"triage_signals": {
"executions_bucket": "high",
"distinct_users_bucket": "team",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"slot_summary": "1 constant, 1 runtime"
}
}
}
```
`page.md` contains mechanically generated normalized SQL and touched tables:
```text
# fp_1
## Normalized SQL
SELECT date_trunc(?, created_at), count(*) FROM analytics.orders WHERE status = ? AND created_at >= ? GROUP BY 1
## Tables touched
- analytics.orders
```
`usage.json` contains volatile stats, literal top values, and redacted samples. Use it for intent inference and usage summaries. Do not treat usage-only drift as a reason to group this template with siblings.
## Required Workflow
1. Read the WorkUnit section in the prompt first.
2. Call `read_raw_file` for `metadata.json`, `page.md`, `usage.json`, and `manifest.json`.
3. Confirm `metadata.objectType === "historic_sql_template"`. If it is not, call `emit_unmapped_fallback` with `reason: "parse_error"`, `fallback: "flagged"`, and the `metadata.json` raw path.
4. Extract `fingerprint`, `sub_cluster_id`, `dialect`, `tables_touched`, `literal_slots`, normalized SQL, usage stats, top literal values, and sample timestamps.
5. Infer one canonical query intent from this template only. Use table names, selected expressions, aggregations, joins, grouping, constant literal slots, and repeated successful samples. Runtime literal slots are parameters, not fixed business rules.
6. Build a short intent slug in kebab-case. Use `queries/<intent_slug>` as the wiki key.
7. Search existing knowledge with `wiki_search` using the intent phrase and the primary table. Prefer updating an existing `queries/...` page when it is the same intent.
8. Discover touched tables with `sl_discover`. Add cleanly matched source names to `sl_refs`. If a table does not map cleanly, keep it in the page body and do not include it in `sl_refs`.
9. Write or update the query page with `wiki_write`.
10. Apply the SL proposal threshold below. If it passes and a useful generic measure, segment, join, or overlay is clear, update the semantic layer and run `sl_validate`.
11. Exit without reading peer files or grouping sibling templates.
## Wiki Page Shape
Use `wiki_write` for pages. Emit the spec frontmatter fields directly on the query page.
Use this shape:
```json
{
"key": "queries/<intent_slug>",
"summary": "<one sentence canonical intent>",
"tags": ["historic-sql", "query-pattern"],
"sl_refs": ["<clean_source_name>"],
"source": "historic-sql",
"intent": "<human-readable canonical intent>",
"tables": ["<tables_touched>"],
"representative_sql": "<parameterized representative SQL>",
"usage": {
"executions": 47812,
"distinct_users": 12,
"first_seen": "2026-02-01",
"last_seen": "2026-04-30",
"p50_runtime_ms": 320,
"p95_runtime_ms": 1180,
"error_rate": 0.0007
},
"fingerprints": ["<fingerprint or sub-cluster id>"],
"content": "## <Canonical Intent Title>\n\n### Parameters\n- <constant/runtime/categorical slot notes>\n\n### When To Use\n- <concise reusable guidance>\n\n### Caveats\n- <redaction, service-account, low-confidence, or mapping notes if present>"
}
```
For Snowflake templates include `usage.rows_produced` when present in `usage.json`; for BigQuery v1 omit `usage.rows_produced`.
The `key: "queries/<intent_slug>"` value writes to `knowledge/global/queries/<intent_slug>.md` during external ingest because bundle ingests write global wiki pages.
## Representative SQL Rules
- Start from normalized SQL in `page.md`.
- For constant slots, use the dominant `usage.literal_slots[].top_values[0][0]` when it has definitional meaning. Quote string and date values in the representative SQL.
- For runtime slots, render named parameters such as `:start_date`, `:as_of`, `:status`, or `:threshold`.
- For categorical slots, document the known categories and write this WorkUnit's sub-cluster value when `sub_cluster_id` is present.
- Preserve the warehouse dialect named by `metadata.properties.dialect`.
- Do not copy sample bound_sql into the wiki unless it is visibly redacted and safer than the normalized SQL. Prefer normalized SQL plus parameter notes.
## SL Proposal Threshold
Only propose semantic-layer changes when all are true:
1. This WorkUnit reached Stage 3 full tier. The runner normally guarantees this, but treat `executions_bucket=low` plus `distinct_users_bucket=solo` or `service_account_only=true` as a reason to write wiki only.
2. At least one `literal_slots[]` entry has `classification: "constant"` and the value has durable business meaning, such as a status, plan tier, channel, threshold, or fixed category.
3. Every table in `tables_touched` maps cleanly through `sl_discover` to an existing SL source.
When the threshold passes:
- Call `sl_read_source` before editing an existing source.
- Prefer adding a measure, segment, computed dimension, join, or manifest-backed overlay over creating a standalone SQL source.
- Use `sl_write_source` for a manifest-backed overlay only with `name:` plus additive fields such as `measures:`, `segments:`, `description:`, or `joins:`. Do not include `sql:`, `table:`, `grain:`, or `columns:` on manifest-backed overlays.
- Use `sl_edit_source` for targeted edits when the source file already exists.
- Run `sl_validate` after every SL write or edit.
- Keep runtime parameters as caller filters. Do not bake dates, user ids, other ids, search strings, or other runtime slots into SL measures.
When the threshold does not pass, write the wiki page and set `sl_refs` for any cleanly discovered touched tables. A wiki-only result is valid.
## Intent Inference Guidance
Prefer canonical intent names that describe the business question, not the SQL shape:
- Good: `queries/monthly-paid-order-count`
- Good: `queries/enterprise-contract-renewal-risk`
- Good: `queries/support-ticket-first-response-time`
- Weak: `queries/fp-1`
- Weak: `queries/count-orders-group-by-date`
Use the SQL shape to infer intent:
- `COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, `GROUP BY`, and date truncation usually indicate metrics or rollups.
- Constant slots often name segments, statuses, tiers, regions, or thresholds.
- Runtime slots usually represent time windows, selected entities, or caller filters.
- Repeated successful samples from multiple human users make the page more durable.
- High error rates, service-account-only use, or old `last_seen` values belong in caveats.
## Boundaries
- Do not group sibling templates. Stage 4 `historic_sql_curator` owns cross-template clustering and dedupe.
- Do not read paths listed only in `peerFileIndex`.
- Do not create or update `historic_sql_curator`.
- Do not call `context_candidate_write`; historic-SQL Stage 3 writes final wiki and optional SL artifacts directly.
- Do not invent joins, measures, or definitions that are not supported by the normalized SQL, touched tables, literal slots, or existing SL sources.
- Do not copy unredacted sample `bound_sql`, user emails, account ids, tokens, or free-text literal values into wiki or SL output.
- Do not write SL changes when any touched table lacks a clean SL mapping.
- Do not finish after only an SL write. Always write or update the query knowledge page first so the canonical SQL pattern is searchable.

View file

@ -0,0 +1,77 @@
---
name: ingest_triage
description: Classify and resolve conflicts detected during bundle ingest (structural duplicates, definitional contradictions, near-duplicate clusters, re-ingest changes, evictions).
callers: [memory_agent]
---
# Ingest Triage — conflict classification and resolution
This skill is loaded in two contexts:
- By a Stage 3 WorkUnit agent when `wiki_search` or `sl_discover` reveals that a prior WU (or a prior sync) already wrote something that overlaps with what the current WU is about to write.
- By the Stage 4 reconciliation agent for cross-WU sweeps and for eviction decisions.
Apply the rules below before every write that could collide with an existing artifact.
## Decision tree
1. **Is this the same artifact I'm producing now, or a different one with the same name?**
Read both. If names match and content matches (modulo whitespace): no conflict — skip the write, the prior one stands.
2. **If content differs, is it an expression-only change (e.g. a different `sql:` body for the same measure name, same grain, same columns)?**
Re-ingest change (expression-only): silently replace via `sl_edit_source`. No flag.
3. **If the difference is structural — grain, columns, filter, join shape — is the current bundle the re-ingest of a previously-ingested bundle (i.e. `priorProvenance` has a row for this raw file and artifact)?**
Re-ingest change (semantic break): replace + flag. Record in the IngestReport's `conflicts_resolved` list with `flagged_for_human: true`.
4. **If there's no prior-sync row (both are from THIS job), check for same-ingest contradictions:**
| Kind | Detection | Resolution |
|---|---|---|
| Structural duplicate | Same name, near-identical expression | Elect canonical by: (a) highest inbound-ref count from other sources; tiebreak: (b) lexicographically first unit key; (c) lexicographically first source name. Subsume losers into `<canonical>-variants.md` wiki page. Do NOT flag unless ambiguous. |
| Near-duplicate cluster | Different names, overlapping shape (same table, similar formulas) | Same as structural; one canonical, others subsumed. Flag only if no canonical emerges. |
| Definitional contradiction | Same name, substantively different formulas (different aggregation, different filters, different columns) | **Rename + capture**: disambiguate ALL variants with suffix derived from the domain (`churn_risk_engagement_based`, `churn_risk_billing_based`) and write a unified wiki page listing every variant with provenance. The contested name does NOT land in the SL. **Always flag.** |
5. **Eviction (Stage 4 only)**: for each entry in `eviction_list()`:
- `inbound_refs: []` → remove the artifact (`sl_delete` for SL sources, `wiki_remove` for wiki pages).
- `inbound_refs: [...]` → retain the artifact, set `deprecated: true` on SL sources (via `sl_edit_source`), write a wiki note "origin file removed in <syncId>; preserved because referenced by: …". Flag in the IngestReport so the user can plan migration.
## Why same-ingest vs re-ingest differs
Within ONE bundle there's no user signal telling us which duplicate wins — we capture all variants and flag. Across bundles, re-uploading IS the signal that the new state is intended — we replace silently for expression changes and flag for semantic breaks.
## Naming disambiguation hints
When you rename to disambiguate, prefer domain suffixes that match the containing view/table/collection name: `customers.churn_risk_score` → `customers.churn_risk_engagement_based` (if the `customer_churn` view computes it from engagement); `billing.churn_risk_score` → `billing.churn_risk_billing_based`. Avoid numeric suffixes (`churn_risk_1`, `churn_risk_2`) — they disclose nothing.
## Applying canonical pins
When the Stage 4 system prompt includes a `<canonical_pins>` block, treat each pin as a prior user decision for that `contestedKey`.
- If the pinned `canonicalArtifactKey` is present in the Stage Index or already exists in SL, keep it as the canonical artifact for that contested key.
- Disambiguate competing artifacts instead of using the contested name for them.
- Do not flag the pinned contested key solely because the variants disagree; the user has already chosen the canonical artifact.
- If the pinned artifact cannot be found and no current WU can recreate it, emit `emit_conflict_resolution` with `flaggedForHuman: true` and explain that the pin references a missing canonical artifact.
When a pin applies cleanly, call `emit_conflict_resolution` with `kind: "definitional_contradiction"`, `artifactKey` set to the pinned `canonicalArtifactKey`, `detail` describing the pinned election, and `flaggedForHuman: false`.
## What to write in the unified wiki page
When you perform rename + capture, also write one page named `<canonical-concept>-definitions.md` under the wiki GLOBAL scope. Structure:
- One heading per variant, referencing the disambiguated SL name.
- One paragraph per variant: what it computes, where it came from (raw file + line range), when to use it.
- A closing "Choosing between these" paragraph if the variants are legitimately domain-specific.
Do not attempt to rank variants or pick a "best" — that's user-override territory.
## Silence rules
Flag for human review when:
- You did rename + capture for a definitional contradiction (step 4, third table row above).
- You performed an eviction retention (step 5, second bullet: `inbound_refs` is non-empty).
- An override constraint (from a Stage 4 re-run) conflicts with current inbound refs.
Do NOT flag:
- Same-content duplicate skip (trivial).
- Structural duplicate with clear canonical election.
- Expression-only re-ingest replace.

View file

@ -0,0 +1,124 @@
---
name: knowledge_capture
description: KLO's knowledge base — wiki pages for durable, reusable business knowledge. Covers capture workflow for user preferences, metric definitions, organizational conventions, and cross-references between knowledge pages and semantic-layer sources. Loaded by the post-turn memory-agent only. The research agent reads wiki via `wiki_read`/`wiki_search` but does not write it.
callers: [memory_agent]
---
# Knowledge Capture
## Role
The knowledge base stores durable, reusable business knowledge for an analytics assistant. Each page is a self-contained rule, definition, or convention that answers "how should this concept be handled in this organization?" — written once and reused across chats.
Scope selection is handled by the runtime:
- When user-scoped knowledge is enabled AND the caller is a chat turn, writes go to the user's **personal** scope.
- When the caller is an admin-driven ingest (`sourceType: 'external_ingest'`), writes go to the **global** scope.
- When user-scoped knowledge is disabled, all writes go to the global scope.
The `wiki_write` tool picks the right scope based on the session. Capture logic does not need to choose — focus on whether the content is worth capturing at all.
## What to capture
Capture when the user or the ingested document expresses:
- A metric definition ("revenue means booked revenue after refunds").
- A filter or convention that should always apply ("exclude test accounts when reporting ARR").
- A mapping or alias ("mood_stress_sleep = Oxytocin protocol").
- A domain rule that is not visible from column names alone ("status = 'T' means terminated, not 'terminated'").
- A link or external system convention ("medplum_patient_id is the primary key in the EMR at https://emr.example/patients/{id}").
Do NOT capture:
- One-off requests ("answer under 100 words").
- Temporary instructions scoped to the current chat.
- Ad-hoc formatting preferences.
- Information already present in the semantic layer (column names, join paths, measure formulas — those belong in SL).
- **Query results, snapshots, or time-bounded benchmark tables.** Numbers go stale; pasting "Oct 2025: 25%, Nov 2025: 19.9%, …" creates misinformation as soon as new data lands. Reference the SL source by name (`sl_refs`) and let future queries pull live data — the wiki captures the *rule* (definition, exclusion, segmentation), the SL source captures the *measure*, and `semantic_query` captures the *current values*.
- **Interpretive narrative tied to a specific snapshot** ("M1 retention degraded sharply from Dec 2025"). The observation is anchored to data that will move; the actionable convention (e.g., "always exclude in-progress cohorts") may be worth capturing on its own, but the snapshot-specific commentary is not.
If nothing is worth capturing, respond without calling any tool.
## Workflow
1. Read the wiki index (provided in the prompt) and decide whether the turn introduces durable knowledge.
2. **Before writing**, search for related content so cross-references are accurate:
- `wiki_search` with the topic — find related wiki pages to populate `refs`.
- `sl_discover` with the concept — if the page defines a metric (revenue, churn, retention, LTV, ARR, MRR, CAC, attribution, etc.), find matching SL sources or measures to populate `sl_refs`. If no matches, pass `sl_refs: []` so future readers know you checked.
3. If updating an existing page, `wiki_read` it first. The read result begins with `[scope: ... | tags: ... | refs: ... | sl_refs: ...]` showing current frontmatter.
4. `wiki_write` to create or update. Prefer merging into an existing page over creating a new one.
5. `wiki_remove` only when a page is truly obsolete — not to replace stale content (update it instead).
## Keys, summaries, and content
- **Keys** are short kebab-case topic identifiers: `leads-source-filter`, `revenue-definition`, `churn-calculation`. No namespacing, no prefixes.
- **Summary** is a one-line hook (≤200 chars) shown in the index.
- **Content** is concise markdown — actionable rules, not prose.
```
## [Topic Title]
- Rule or preference statement
- Another rule if applicable
```
Prefer fewer, richer pages over many thin ones. Each page covers one coherent topic thoroughly. If the new information relates to an existing page, update that page instead of fragmenting the knowledge.
## Tags, refs, sl_refs
The `wiki_write` tool accepts three array fields that go into the page frontmatter:
- **`tags`**: 1–3 short lowercase topic tags (`["finance"]`, `["data-quality"]`). Call `wiki_list_tags` first to reuse existing tags for consistency.
- **`refs`**: keys of related wiki pages. Add when the new page materially depends on concepts from another (e.g., a churn definition that uses the paid-orders filter from a revenue definition). Don't add refs just because pages share a topic area.
- **`sl_refs`**: names of SL sources or measures the page relates to. Format: `"source_name"` or `"source_name.measure_name"`. Discover via `sl_discover` → inspect with `sl_read_source` → include the confirmed matches.
### Replace semantics
All three fields use REPLACE semantics on update:
- Omit the field → existing value is kept.
- Pass `[]` → field is cleared.
- Pass `[values]` → replaces existing with exactly those values (no merging).
## Editing existing pages
Two modes:
- **Full content** — pass `content` to rewrite the whole page. Use when the page structure needs to change.
- **Targeted edits** — pass `replacements: [{ oldText, newText }]` to apply exact-string replacements. Use for small updates; preserves the rest of the page.
When editing, read the page first so the edit matches exact whitespace and indentation.
## Overriding an organization rule
Organization (GLOBAL) pages are read-only from a user's personal-scope session. To override a global rule for a single user, write a personal page with the **same key**. At read time the USER page wins.
## Worked example — capturing a metric with cross-references
User says: "Going forward, the official refund rate is total refunded amount divided by total gross transaction amount."
```
wiki_list_tags()
→ existing tags include "finance"
wiki_search({ query: "refund revenue paid orders" })
→ returns `revenue-definition` (related — defines paid-orders filter)
sl_discover({ query: "refund rate" })
→ returns fct_orders (score 0.08), fct_gaap_revenue (0.06)
sl_read_source({ sourceName: "fct_orders" })
→ confirms amount_refunded_dollars and transaction_amount_dollars exist
wiki_write({
key: "refund-rate-definition",
summary: "Refund rate = refunded amount / gross transaction amount",
content: "## Refund Rate\n- Definition: sum(amount_refunded_dollars) / sum(transaction_amount_dollars)\n- Source of truth: fct_orders\n- Related: see revenue-definition for paid-orders filter.",
tags: ["finance"],
refs: ["revenue-definition"],
sl_refs: ["fct_orders.refund_rate_pct", "fct_orders"]
})
```
Search-then-write order matters. Cross-references are part of the page's identity, not an afterthought.
## Rules
- Read existing pages before updating them.
- Prefer merging into an existing page over creating a new one.
- Prefer fewer, richer pages over many thin ones.
- Write content as clear, actionable rules — not narrative prose.
- Discover cross-references via search before writing, not after.
- If nothing is worth capturing, respond without calling any tool.

View file

@ -0,0 +1,58 @@
---
name: live_database_ingest
description: Capture semantic-layer and knowledge updates from a live database schema snapshot.
callers: [memory_agent]
---
# Live Database Ingest
Use this skill when the ingest work unit contains raw files under
`raw-sources/<connectionId>/live-database/<syncId>/`.
## Workflow
1. Read the table JSON file listed in the work unit.
2. Read `connection.json` to understand the snapshot metadata.
3. Read `foreign-keys.json` when the table has a foreign key or when joins are
needed for the semantic-layer source.
4. Create or update one semantic-layer source for the table with
`sl_write_source`.
5. Use the physical table name from the raw JSON as the source `table` field.
6. Preserve database comments as `descriptions.db` on tables and columns.
7. Add joins only when the foreign key index names both sides.
8. Write wiki pages only for durable business meaning that is present in table
or column comments.
9. Run `sl_validate` for the table source before the work unit completes.
## Source shape
For a raw table with this shape:
```json
{
"name": "orders",
"db": "public",
"columns": [
{ "name": "id", "type": "integer", "nullable": false, "primaryKey": true }
]
}
```
Write a semantic-layer source with this shape:
```yaml
name: orders
table: public.orders
grain: id
columns:
- name: id
type: number
```
Use `string`, `number`, `time`, or `boolean` for column types. When a database
type is ambiguous, use `string`.
## Boundaries
The raw snapshot is structural evidence. Do not invent measures, segments,
business definitions, or joins that are not present in the snapshot files.

View file

@ -0,0 +1,217 @@
---
name: looker_ingest
description: Extract durable KLO knowledge and semantic-layer contribution proposals from staged Looker runtime dashboard, Look, and explore JSON. Load for WorkUnits whose raw files are under explores/, dashboards/, or looks/.
callers: [memory_agent]
---
# Looker Runtime Ingest
Looker runtime ingest turns API-staged dashboards, Looks, and explores into durable KLO memory. Runtime entities are evidence. They are not themselves the final knowledge shape.
## Required Workflow
1. Read every `rawFiles` entry for the WorkUnit.
2. Read relevant `dependencyPaths` before making a decision. For dashboard and Look WUs this usually includes the referenced explore JSON, signal files, `folders/tree.json`, and `users/<id>.json`.
3. Treat `signals/*.json`, owners, folders, schedules, and favorites as prioritization or provenance context only.
4. Extract generalizable metric formulas, segment definitions, field semantics, and domain conventions.
5. Use `wiki_search`, `sl_discover`, and `sl_read_source` before writing so new content merges with existing memory instead of duplicating it.
6. Use `context_evidence_search` or `context_evidence_read` to obtain evidence chunk IDs for any wiki-bound knowledge candidate.
7. Use `context_candidate_write` for durable wiki-bound knowledge. Do not call `wiki_write` from a Looker WorkUnit; Stage 4 reconciliation promotes candidates and writes wiki pages.
8. Use `looker_query_to_sl` for each Look query or dashboard tile query that has a `query` object.
9. Write SL from Looker runtime evidence only through the staged warehouse target contract. For explores and inherited dashboard/Look queries, branch on `targetTable.ok`; when it is true, write on `targetWarehouseConnectionId` and use `targetTable.canonicalTable` as `source.table`. When it is false or missing, write wiki knowledge candidates and record `emit_unmapped_fallback` with the staged reason.
10. Run `sl_validate` after every SL write. If validation fails, fix the source or roll it back before the WorkUnit ends.
## Explore WorkUnits
Explore WUs have raw files like `explores/<model>/<explore>.json` and usually depend on `lookml_models.json`.
Use the deterministic API-derived source key:
```text
looker__<model>__<explore>
```
For example, `modelName: "b2b"` and `exploreName: "sales_pipeline"` map to `looker__b2b__sales_pipeline`.
Mapped explore write shape:
```json
{
"connectionId": "22222222-2222-4222-8222-222222222222",
"sourceName": "looker__b2b__sales_pipeline",
"source": {
"name": "looker__b2b__sales_pipeline",
"table": "proj.dataset.opportunities",
"grain": ["opportunity_id"],
"columns": [
{
"name": "opportunity_id",
"type": "string"
},
{
"name": "arr",
"type": "number"
}
],
"measures": [
{
"name": "total_arr",
"expr": "sum(arr)"
}
]
}
}
```
Every concrete value in that example must be backed by raw Looker field SQL, `source_tables` preflight, `source_columns`, or existing SL when applied to a real WorkUnit. If the evidence is not present, write wiki candidates and emit `emit_unmapped_fallback`.
The staged explore file carries warehouse target fields populated before the WU starts:
- `connectionName`: the Looker runtime connection name.
- `targetWarehouseConnectionId`: the resolved warehouse connection id, or `null` when the Looker connection is unmapped.
- `rawSqlTableName`: Looker's verbatim `sql_table_name`. Keep it as provenance only.
- `targetTable`: the parsed target-table union. Use this as the sole branch condition.
When `targetTable.ok === true`, the explore has a complete KLO backing target. Before writing:
1. Use `targetTable.catalog`, `targetTable.schema`, and `targetTable.name` for `source_tables` preflight matching through `sl_discover` or `sl_read_source`.
2. Use Looker field `sql`, labels, descriptions, and type metadata to derive source columns, measures, segments, joins, and grain.
3. Call `sl_write_source` or `sl_edit_source` with `connectionId: targetWarehouseConnectionId`.
4. Set `source.name` to the deterministic API-derived source key, for example `looker__b2b__sales_pipeline`.
5. Set `source.table` to `targetTable.canonicalTable`.
6. Run `sl_validate` after every SL write.
The `table` field is `targetTable.canonicalTable`, not `rawSqlTableName`. Raw Looker values can contain aliases such as `schema.table AS x`, Looker templates such as `${TABLE}`, or derived-table SQL. Those raw forms do not compose safely with SL generation. `targetTable.canonicalTable` is the dialect-quoted identifier rebuilt by the parser.
Use `targetTable.{catalog,schema,name}` only for source_tables preflight. Do not put those tuple fields separately into the SL source unless the SL schema already asks for them.
When `targetTable.ok === false`, keep the WU wiki-only for SL purposes. Capture durable domain semantics with `context_candidate_write`, then emit a fallback with the EXACT structured `reason` code from `targetTable.reason`. Put any human-readable context in `detail`, NOT in `reason`:
```json
{
"rawPath": "explores/b2b/sales_pipeline.json",
"reason": "no_connection_mapping",
"detail": "Looker connection b2b_sandbox_bq is not mapped to a warehouse connection",
"fallback": "wiki_only"
}
```
Valid `reason` codes (use exactly one, no other strings allowed): `no_connection_mapping`, `looker_template_unresolved`, `derived_table_not_supported`, `no_physical_table`, `multiple_table_references`, `unsupported_dialect`, `parse_error`, `missing_target_table`.
When `targetTable` is `null`, read the raw explore file again. If the target is still absent, emit the same fallback with `"reason": "missing_target_table"`.
## Look And Dashboard WorkUnits
Looks have raw files like `looks/<id>.json`. Dashboards have raw files like `dashboards/<id>.json`. Dashboard tiles with inline `query` objects follow the same decision rules as Looks.
For each query:
1. Call `looker_query_to_sl` with the query JSON, title, content type, and usage counts if available.
2. Read the proposal's `targetStatus`, `targetWarehouseConnectionId`, `targetTable`, `sourceTable`, and `canWriteStandaloneSource`.
3. If `canWriteStandaloneSource` is true, use `targetWarehouseConnectionId` for SL tools and `sourceTable` / `targetTable.canonicalTable` as the source table. Verify the proposal against the parent explore dependency and existing SL before writing.
4. If the proposal decision is `measure_added`, add or edit a measure only after verifying the expression against the explore field SQL or an existing source column.
5. If the proposal decision is `source_created`, create a source only when `canWriteStandaloneSource` is true and the filter is canonical. Use `source.table = targetTable.canonicalTable`.
6. If `targetStatus` is `unmapped`, `unparseable`, or `missing_target_table`, keep SL wiki-only for this query and call `emit_unmapped_fallback` with the proposal's target reason or status.
7. If the proposal decision is `wiki_only`, write a context candidate only when the Look or dashboard names a reusable business concept.
## Capture Rules
Write SL for:
- reusable aggregations with clear formulas;
- reusable segment predicates that appear canonical;
- calculated dimensions that are stable and backed by raw Looker query evidence;
- joins or source relationships that are explicit in the explore JSON.
Write wiki for:
- metric definitions in dashboard or Look titles, descriptions, axis labels, and filter semantics;
- business meaning of an explore;
- concept aliases used by teams;
- caveats about multiple competing definitions.
Skip:
- point-in-time values and chart screenshots;
- dashboard layout, tile positions, colors, visualization types, and render settings;
- owner names, top users, recipient counts, favorite counts, schedules, and usage counts as narrative content;
- ad-hoc low-usage queries with no durable business semantics;
- simple saved views of fields with no metric, segment, or concept definition.
## Usage Signals
Use usage only to prioritize:
- zero or near-zero usage lowers priority and often means skip;
- high usage raises confidence that a metric or segment is canonical;
- schedules and favorites can break ties between otherwise similar candidates.
When calling `context_candidate_write`, usage can affect scoring:
- High usage (`queryCount30d >= 10` or `uniqueUsers30d >= 3`) can justify `authorityScore: 3` and `reuseScore: 3` when the evidence is otherwise durable.
- Zero recent usage should usually use `actionHint: "skip"` or lower `reuseScore` unless the content clearly defines a canonical business concept.
- Schedules and favorites can raise `reuseScore` by 1 when deciding between otherwise similar candidate scores.
Never include the usage counts themselves in `assertion`, `rationale`, or eventual wiki prose.
Never write usage numbers, owner names, folder names, top users, schedule counts, or recipients into wiki article prose. If attribution is needed, keep it in provenance through the normal ingest action trail.
## Provenance And Cross-References
When writing candidates from Looker evidence, cite chunk IDs from `context_evidence_search` or `context_evidence_read`. Stage 4 reconciliation writes wiki pages from promoted candidates and sets `sl_refs` when the source exists or was created in the run.
When an SL action is written on `targetWarehouseConnectionId`, the runner records `targetConnectionId` on the action and syncs `knowledge_sl_refs` to the warehouse connection. The wiki article still belongs to the Looker run connection; the SL ref belongs to the warehouse. Do not rewrite the source name or connection id in wiki frontmatter by hand. Use normal SL tool calls and let Stage 4 reconcile refs from actions.
Use these source-key conventions:
- API-derived explore source: `looker__<model>__<explore>`
- API-derived segment source: `looker__<explore>__<slug>`
- File-adapter source, when present: `<model>__<explore>` without the `looker__` prefix
During Stage 4 reconciliation, when both `looker__<model>__<explore>` and `<model>__<explore>` exist for the same connection, treat the unprefixed file-adapter source as canonical. Rewrite wiki `sl_refs` to the unprefixed source, remove the API-derived source if it was created in this run, and call `emit_artifact_resolution` with `actionType: "subsumed"`, `artifactKind: "sl"`, `artifactKey: "looker__<model>__<explore>"`, and the raw explore path that produced it.
If a file-adapter source already exists and clearly subsumes the API-derived source, prefer the file-adapter source in `sl_refs` and mention the API entity only as evidence in the wiki content.
## Examples
Measure proposal from a Look:
```json
{
"title": "Open Pipeline ARR",
"query": {
"model": "b2b",
"view": "sales_pipeline",
"fields": ["opportunities.arr", "opportunities.stage"],
"filters": { "opportunities.stage": "open" }
}
}
```
Expected handling:
- call `looker_query_to_sl`;
- verify `opportunities.arr` and `opportunities.stage` against the explore dependency and existing SL;
- add or update a measure only if the resulting expression validates;
- write wiki for the durable definition "open pipeline ARR" if it is not already captured;
- avoid mentioning query counts or users in wiki prose.
Simple saved view:
```json
{
"title": "Accounts By Region",
"query": {
"model": "b2b",
"view": "accounts",
"fields": ["accounts.region", "accounts.segment"],
"filters": {}
}
}
```
Expected handling:
- no SL write;
- wiki only if the title or description defines a reusable company concept;
- otherwise skip.

View file

@ -0,0 +1,180 @@
---
name: lookml_ingest
description: Map a LookML view/model/explore into KLO semantic layer sources. Covers the LookML to KLO primitive table, provenance tagging, and three worked examples (overlay, standalone from derived_table, standalone with sql_always_where). Load when the turn contains `.lkml` content.
callers: [memory_agent]
---
# LookML to KLO Semantic Layer
LookML views map to SL sources, `measure:` to measures, `explore: { join: }` to the join graph. This skill lays out the mapping and the three capture shapes.
## Mapping table
| LookML | KLO form | Notes |
|---|---|---|
| `view: X { sql_table_name: …; measure:/dimension:/join: }` | **Overlay** at `<connId>/X.yaml` with `measures`, `columns` (computed), `joins`, `segments` | Manifest-backed; inherit grain/columns |
| `view: X { derived_table: { sql: … } }` | **Standalone** with top-level `sql:`, explicit `grain:` + `columns:` | No manifest entry exists |
| `view: X { sql_always_where: <p> }` | **Standalone** with `sql: SELECT * FROM <base> WHERE <p>` | Enforcement, not opt-in |
| `explore: { join: Y { sql_on: …; relationship: … } }` | `joins:` entry `{ to: Y, on: "<local> = Y.<col>", relationship: … }` | On the overlay or standalone |
| `conditionally_filter` / `always_filter` | `segments: [{ name, expr }]` | Callers reference by name |
| Manifest entry | `_schema/*.yaml` | **Never edit** — auto-imported |
Type map: `date`/`datetime`/`timestamp` → `time`; `yesno` → `boolean`; `number` → `number`; `string` → `string`. Ignore `drill_fields:` (UI only).
## Decision rules
LookML writes target the run connection directly. Unlike Looker runtime ingestion, the LookML adapter is configured on the warehouse KLO connection, so do not look for `targetWarehouseConnectionId` and do not route through a mapping array.
Before any SL write, inspect the WorkUnit notes.
If notes contain:
```text
[LOOKML SL WRITES DISALLOWED]
reason: lookml_connection_mismatch
...
[/LOOKML SL WRITES DISALLOWED]
```
this is a hard gate. The model's declared Looker `connection:` does not match the warehouse connection's configured `expectedLookerConnectionName`. Continue wiki extraction and context candidates. Do not call `sl_write_source` or `sl_edit_source` for that WorkUnit. The runner also removes those write tools for this WorkUnit; treat the missing tools as expected. Preserve the mismatch reason in any `emit_unmapped_fallback` you create.
When SL is allowed:
- **Overlay** when the view is a thin wrapper over a manifest table (`sql_table_name:` matches a manifest entry). Do not repeat base columns or grain.
- **Standalone** when the view uses `derived_table:` or `sql_always_where:`. `sl_write_source` rejects overlays whose name has no manifest entry; that error points here.
- **Skip** a view with only `view:`, `sql_table_name:`, and bare `dimension:` entries (no `measure:`, `description:`, `derived_table:`, `sql_always_where:`, `join:`). The pre-filter already short-circuits those.
## Preflight: never guess column names
LookML's `dimension_group: date { type: time; timeframes: [raw, date, week, month] }` expands at Looker-render time into `${view.date_raw}`, `${view.date_date}`, `${view.date_week}`, and so on. **These are NOT physical warehouse columns.** The physical column is whatever the group's `sql:` clause references (e.g. `${TABLE}.date` → column `date`).
A prior replay hallucinated `date_date`, `date_week` into `sql:`, `columns:`, and `grain:` across 4+ standalones; every measure on each affected source returned `400 Unrecognized name: date_date` at query time. Preventable.
**Required flow before writing any overlay or standalone**:
1. Call `sl_discover(<tableName>)` for each base table you're about to touch. That returns the real columns.
2. If the table isn't in the manifest, fall back to `sql_execution({ sql: "SELECT column_name FROM <dataset>.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '<table>'" })` (session shape — a connection is already pinned by the ingest session).
3. Use only those names in `sql:`, `columns:`, and `grain:`. Map each `dimension_group` to ONE `{ name: <physical_col>, type: time, role: time }` entry — never one per timeframe.
| LookML input | KLO `columns:` entry |
|---|---|
| `dimension_group: month { type: time; timeframes: [month]; sql: ${TABLE}.month_date ;; }` | `{ name: month_date, type: time, role: time }` |
| `dimension_group: date { type: time; timeframes: [raw, date, week, month]; sql: ${TABLE}.date ;; }` | `{ name: date, type: time, role: time }` — single entry, NOT `date_raw`/`date_date`/`date_week` |
**After every `sl_write_source`**: call `sl_validate`. It runs `SELECT * FROM (<your sql:>) LIMIT 0` against the connection. If a column name was invented, the warehouse's `Unrecognized name: …` error comes back verbatim. Treat that as a hard failure — re-read the real columns with `sl_discover` and rewrite.
## Provenance markers
When a wiki mixes LookML source prose with `sl_discover` output, tag sections:
```markdown
<!-- from: lookml -->
Customers fan out many-to-one into `accounts` via `account_id`.
<!-- /from -->
<!-- from: bq_schema -->
`customers.admin_user_id` is nullable — orphan rows exist.
<!-- /from -->
```
Invisible in most renderers; lets a future pass audit provenance.
## Example 1 — overlay (thin wrapper)
LookML (excerpt):
```lookml
view: fct_labs {
sql_table_name: analytics.fct_labs ;;
dimension: is_byol { type: yesno; sql: ${TABLE}.lab_type = 'byol' ;; }
measure: count_lab_orders { type: count; description: "Total lab orders." }
measure: count_byol_labs { type: count; filters: [is_byol: "yes"] }
}
explore: fct_labs {
join: dim_customers { sql_on: ${fct_labs.admin_user_id} = ${dim_customers.admin_user_id} ;; relationship: many_to_one }
}
```
KLO overlay at `<connId>/fct_labs.yaml`:
```yaml
name: fct_labs
description: "Lab-order fact table. One row per lab order event."
columns:
- name: is_byol
  type: boolean
  expr: "lab_type = 'byol'"
measures:
- name: count_lab_orders
  expr: count(lab_order_id)
  description: Total lab orders.
- name: count_byol_labs
  expr: count(lab_order_id)
  filter: "is_byol = true"
joins:
- to: dim_customers
  on: "admin_user_id = dim_customers.admin_user_id"
  relationship: many_to_one
```
## Example 2 — standalone from `derived_table`
```lookml
view: lab_results {
derived_table: { sql:
SELECT lab_order_id, admin_user_id, lab_date, biomarker, value,
value - LAG(value) OVER (PARTITION BY admin_user_id, biomarker ORDER BY lab_date) AS delta
FROM analytics.raw_lab_results WHERE status = 'final' ;; }
dimension: lab_order_id { primary_key: yes; type: string }
measure: avg_delta { type: average; sql: ${delta} ;; }
}
```
```yaml
name: lab_results
description: "Lab results with biomarker delta vs previous reading per user."
source_type: sql
sql: |
  SELECT lab_order_id, admin_user_id, lab_date, biomarker, value,
    value - LAG(value) OVER (PARTITION BY admin_user_id, biomarker ORDER BY lab_date) AS delta
  FROM analytics.raw_lab_results WHERE status = 'final'
grain: [lab_order_id]
columns:
- { name: lab_order_id, type: string }
- { name: admin_user_id, type: string }
- { name: lab_date, type: time, role: time }
- { name: biomarker, type: string }
- { name: value, type: number }
- { name: delta, type: number }
measures:
- { name: count_lab_results, expr: "count(lab_order_id)" }
- { name: avg_delta, expr: "avg(delta)" }
```
## Example 3 — standalone with `sql_always_where`
```lookml
view: rpt_daily_braze_email {
sql_table_name: analytics.fct_email_sends ;;
sql_always_where: ${TABLE}.channel = 'braze' AND ${TABLE}.status = 'delivered' ;;
dimension: send_id { primary_key: yes; type: string }
measure: delivered_count { type: count }
}
```
```yaml
name: rpt_daily_braze_email
description: "Delivered Braze email sends (enforced filter: channel='braze', status='delivered')."
source_type: sql
sql: |
  SELECT * FROM analytics.fct_email_sends
  WHERE channel = 'braze' AND status = 'delivered'
grain: [send_id]
columns:
- { name: send_id, type: string }
- { name: admin_user_id, type: string }
- { name: sent_at, type: time, role: time }
measures:
- { name: delivered_count, expr: "count(send_id)" }
```
`sql_always_where` is enforcement → wrap into the `sql:`. Don't model it as a segment (segments are opt-in) or per-measure filter (fragile, duplicated).

View file

@ -0,0 +1,218 @@
---
name: metabase_ingest
description: Convert Metabase questions, models, and metrics into KLO Semantic Layer source definitions. Covers result-metadata to KSL column type mapping, FK/PK detection, near-duplicate deduplication, pre-aggregation decomposition, join-graph connectivity, and how to react to priorProvenance from earlier ingest syncs. Load when the WorkUnit contains `cards/<id>.json` files under a Metabase bundle.
callers: [memory_agent]
---
# Metabase to KLO Semantic Layer
Each WorkUnit represents one Metabase collection's cards for one Metabase database (mapped to exactly one KLO connection). Every `cards/<id>.json` file carries the resolved SQL, result_metadata, card type, collection path, and referenced-card ids. The WU's `sync-config.json` tells you which sync mode is active and which selections apply. `databases/<id>.json` tells you the target KLO connection.
## Context format
Each card JSON looks like:
```json
{
"metabaseId": 7,
"name": "Daily orders",
"description": "Orders by day",
"type": "model",
"databaseId": 42,
"collectionId": 5,
"resolvedSql": "SELECT ...",
"templateTags": [{"name": "ref", "type": "card", "cardReference": 10}],
"resultMetadata": [
{"name": "day", "base_type": "type/DateTime", "semantic_type": "type/CreationTimestamp"},
{"name": "order_count", "base_type": "type/Integer"}
],
"collectionPath": ["Data", "Orders Team"],
"referencedCardIds": [10]
}
```
Use `resultMetadata` to:
- Map `base_type` to KSL column type: `type/Integer`, `type/Float`, `type/Decimal`, `type/BigInteger` → `number`; `type/Text`, `type/TextLike` → `string`; `type/DateTime`, `type/Date`, `type/DateTimeWithTZ` → `time`; `type/Boolean` → `boolean`.
- Identify grain candidates: columns with `semantic_type: type/PK`.
- Identify join candidates: columns with `semantic_type: type/FK` plus `fk_target_field_id`.
- Identify time columns: `semantic_type: type/CreationTimestamp` or `type/UpdatedTimestamp` → set `role: time`.
- Use `display_name` for measure descriptions when available.
### Additional card metadata
- `parameters`: list of card-level parameters with widget types and defaults. When SQL resolution fell back to unresolved SQL, use this to drive Step A of the SQL-translation workflow (drop optional clauses): knowing each `{{ var }}` is `type: "date/range"` vs `type: "category"` tells you what kind of clause it is.
- `resultMetadata[i].field_ref`: Metabase's canonical reference to the source warehouse field. Shape `["field", <field_id>, <options>]`. When this is set, the column maps directly to a warehouse field, which is useful for declaring joins from FK metadata without re-parsing SQL.
- `lastRunAt`: ISO timestamp of the card's last execution. If null or very old, the card may be dead; prefer skipping over creating a source.
- `dashboardCount`: number of dashboards referencing the card. Cards with `dashboardCount: 0` and a stale `lastRunAt` are strong skip signals.
## Decision tree
For each card:
1. Analyze `resolvedSql` + `resultMetadata`: identify base tables, aggregations, joins, filters, column types.
2. Check `sl_discover` and `sl_read_source` for existing sources that overlap.
3. Decide:
- Simple aggregation on a table that already has a source → `sl_edit_source` to add a measure.
- Join between tables that should be linked in the SL graph → `sl_edit_source` to add a join.
- Complex derived SQL (CTEs, multi-layer aggregation, scoring models) → `sl_write_source` with `source_type: sql`. When the SQL projects/filters from a single manifest-backed base table, set `inherits_columns_from: <manifest_key>` so columns inherit type and description from the manifest — see `sl_capture` skill for the slim form. Use `sl_discover` to discover the manifest key from the table reference in the SQL (it accepts `MARTS.CONSIGNMENTS`, `ANALYTICS.MARTS.CONSIGNMENTS`, or `CONSIGNMENTS`).
- New base table not yet in the semantic layer → `sl_write_source` with `source_type: table`.
- Trivial query (`SELECT *`, simple `COUNT(*)` with no business logic) → do nothing; the runner will record this card as `action_type='skipped'`.
- Duplicate of an existing measure → same as trivial; do nothing for this card.
**Manifest-only names need an overlay first.** If `sl_discover` shows a source name with `Type: table` but `sl_read_source` returns "Source not found", the source lives only in the schema manifest (no standalone overlay yet). `sl_edit_source` cannot edit manifest-only names — you must bootstrap an overlay with `sl_write_source` using the overlay shape:
```yaml
name: <SOURCE_NAME>
measures:
- name: <measure_name>
  expr: "<expression>"
```
Overlay shape: `name:` plus any of `measures:`, `segments:`, `description:`, `joins:`, `disable_joins:`. Never include `sql:`, `table:`, `grain:`, or `columns:` on a manifest-backed name — those would shadow the manifest's schema and drop its joins. Overlay `joins:` are merged additively with the manifest's joins (deduped by `to` + `on`); use `disable_joins: ["<on-clause>"]` to suppress a specific manifest join. After the overlay exists, use `sl_edit_source` for further tweaks. See `sl_capture` skill for the canonical overlay rule.
**Join discovery:** When your card's SQL references warehouse tables (e.g. in `FROM` or `JOIN` clauses), call `sl_discover({ query: '<table>' })` before writing. The matching manifest entry's `name` is the value you put in `joins: [- to: <name>]`. Use `many_to_one` for FK-to-dimension joins, `one_to_many` for the reverse.
## priorProvenance
If the WU prompt includes a `priorProvenance` section for a card, it tells you what happened on prior ingest syncs. Treat it as advisory:
- `action_type: source_created` on source X → prefer editing X with `sl_edit_source` rather than writing a new source.
- `action_type: measure_added` on source X → you already contributed to X; add only measures that aren't present.
- `action_type: subsumed` or `merged` → this card was folded into another source last time; unless its SQL has changed structurally, keep it subsumed (no new write).
- `action_type: skipped` → last time we decided not to ingest this card; re-read the SQL and confirm the decision still holds. If the card now has non-trivial business logic, ingest it.
## Deduplication
Before writing, scan all cards in this WU for near-duplicate groups — cards whose `resolvedSql` shares the same CTEs, base tables, joins, and aggregation structure but differs only in:
- Trailing filters (e.g. `date_trunc(week, date)` vs `date_trunc(month, date)`).
- Minor `WHERE` clause variations.
- Column aliases or output column subsets.
- Aggregation granularity (daily vs weekly vs monthly).
When you find a group of near-duplicates:
1. Create ONE generalized source from the most comprehensive card in the group.
2. Strip card-specific trailing filters from the SQL so the source covers all variants (e.g. keep daily grain instead of filtering to week/month).
3. If each card had a distinct measure or filter, add them as separate measures on the single source.
4. For all cards except the canonical one, do nothing — they'll be recorded as `action_type='skipped'` automatically by the runner.
Do NOT merge cards with fundamentally different business logic, even if they share CTEs.
## Pre-aggregation decomposition
When a card's `resolvedSql` contains `GROUP BY` with aggregation functions (`SUM`, `COUNT`, `AVG`, …):
1. **Detect**: simple aggregation on base tables/joins — `SELECT` with `GROUP BY`, no complex CTEs or window functions.
2. **Decompose**: strip the `GROUP BY` and aggregation functions. Keep `FROM`, `JOIN`, and `WHERE` intact.
3. **Expose row-level columns**: include the grouped-by columns AND the raw columns being aggregated (e.g. `money_out` instead of `SUM(money_out) AS total_money_out`).
4. **Define aggregations as measures**: convert each aggregation into a KSL measure (e.g. `sum(money_out)`).
5. **Add joins**: with FK columns now exposed, declare joins to dimension sources.
Exception: keep the pre-aggregated SQL when the query involves multi-CTE pipelines, window functions, or recursive logic where decomposition would lose business logic.
## SQL translation from raw native to KSL
Every card carries a `resolvedSql` field. Check the staged card's `resolutionStatus` first:
- `resolutionStatus: "resolved"` — `{{#N}}` references are inlined and `[[ ... ]]` optional clauses have been dropped locally. If the resolved SQL contains no other parameters, the SQL is executable as-is. If the card had **required** (non-bracketed) `{{ var }}` placeholders, the SQL is prefixed with a placeholder-warning comment block listing every dummy substitution Metabase made — see "Step A" below.
- `resolutionStatus: "fallback"` — Metabase failed to resolve. The SQL still contains `{{#N}}`, `{{#N-name}} alias`, `{{ var }}`, and `[[ ... ]]` syntax. Do the translation steps below before writing a source.
### Step A — Handle placeholders (dummy-substituted on resolved cards; still templated on fallback cards)
When a card has a required `{{ var }}` outside any `[[ ]]` block, the resolver substitutes a **dummy value** purely so Metabase's parser will accept the query. The resulting SQL is prefixed with a comment like:
```sql
-- PLACEHOLDER_WARNING: this SQL was extracted from a Metabase card with
-- unbound template parameters. The placeholders below were substituted with DUMMY
-- values to satisfy Metabase's parser — they DO NOT represent intended filters.
-- Drop the corresponding clauses (or expose them as runtime SL filters) before
-- persisting this SQL as a semantic-layer source.
-- {{ auction_end }} (type=dimension, widget=date/all-options) → '2020-01-01~2020-12-31'
-- {{ status }} (type=text) → 'placeholder'
SELECT ...
WHERE start_date >= '2020-01-01' AND start_date < '2021-01-01' AND status = 'placeholder'
```
For each listed placeholder: locate the WHERE clause(s) in the SQL that reference the dummy literal and **drop them**, then strip the warning comment. SL chat-time filters compose narrowing predicates dynamically, so the source should represent the unfiltered dataset.
For `fallback` cards, dropping is simpler — the SQL still has the `[[ ... ]]` brackets and `{{ var }}` placeholders intact:
```sql
-- before:
WHERE 1=1
[[AND {{ auction_end }} ]]
[[AND status = {{ status }} ]]
-- after:
WHERE 1=1
```
### Step B — Inline `{{#N}}` references (fallback cards only)
Resolved cards already have `{{#N}}` inlined for you. For `fallback` cards, each `{{#N}}` (or `{{#N-some-slug}}`) in the SQL refers to another card's `resolvedSql`. The referenced card is in the WU's `rawFiles` or `dependencyPaths`. Read it with `read_raw_file`, then inline its SQL.
If the reference has an alias (`from {{#5996-listing-interactions}} tb`), the **outer** SQL probably uses that alias (`select tb.* ...`, `tb.column_name`, etc.). When you inline, you must EITHER:
1. **Pick a single base table inside the inlined SQL and rename its alias to the outer alias.** Useful when the inlined card is `SELECT * FROM listings JOIN ...` — alias `listings` as `tb` and `tb.*` keeps working in the outer query.
2. **Replace the outer alias references with explicit columns from the inlined SQL.** Useful when the inlined card has multiple JOINs and `tb.*` is ambiguous.
Never leave the outer alias dangling: after inlining, **grep your SQL for the outer alias name and rewrite or remove every reference**. A leftover `tb.*` with no `tb` table is the most common failure mode here.
### Step C — Inlining cleanup checklist
After Steps A and B, your SQL must:
- Contain no placeholder-warning comment, no `{{`, `}}`, `[[`, or `]]` characters anywhere.
- Reference no aliases that aren't defined inside the SQL itself.
- Be valid as a standalone subquery (the validator runs `SELECT * FROM (your_sql) LIMIT 1`).
If `resolutionStatus: "fallback"` and the SQL is still complex enough that you can't confidently translate it, **skip the card** rather than writing broken SQL. Call `emit_unmapped_fallback` with the staged card path as `rawPath`, `reason: "metabase_sql_untranslated"`, and `fallback: "flagged"`.
## Join-graph connectivity
For `source_type: table`:
- Use FK columns (`semantic_type: type/FK`) to declare `many_to_one` joins to dimension sources.
- Match column names ending in `_id` against existing sources' grain columns.
For `source_type: sql`:
- The validator parses your SQL and **rejects the write** if any FROM/JOIN table has a manifest entry that you did not declare in `joins:`. The error names every missing join target — declare a `many_to_one` join for each and reissue.
- Tables outside the manifest (schemas not covered by this connection — e.g. `staging.*` referenced from a MARTS source) are not flagged. For those, write a single-line `wiki_write` with key `unmapped-table-<table_name>` so the gap is documented, then call `emit_unmapped_fallback` with the staged card path as `rawPath`, `reason: "table_outside_manifest"`, and `fallback: "wiki_only"`.
Joins on manifest-backed names compose: the manifest's joins are inherited automatically, and any overlay `joins:` are merged on top (deduped by `to` + `on`). Use `disable_joins: ["<on-clause>"]` in the overlay to suppress a specific manifest join. If `sl_discover` shows a manifest-backed source with `Joins: 0` and the warehouse FK metadata is genuinely absent, declaring application-level joins via the overlay is fair game — bootstrap with `sl_write_source` (overlay shape above), then refine via `sl_edit_source`.
## Cross-card references (`{{#N}}`)
Resolved cards (`resolutionStatus: "resolved"`) have these inlined for you. Unresolved cards (`resolutionStatus: "fallback"`) need manual handling — see "SQL translation from raw native to KSL" above.
## Provenance markers
Every SL source and wiki page you write carries HTML-comment provenance tags pointing to the `cards/<id>.json` files they derive from:
```yaml
# <!-- from: raw-sources/<connId>/metabase/<syncId>/cards/7.json -->
name: orders
...
```
If a source is derived from multiple cards (e.g. a generalized source for a near-duplicate group), emit one tag per contributing card.
## Quality standards
Source definitions must follow klo-sl YAML conventions:
- `source_type`: `"table"` (physical table/view) or `"sql"` (arbitrary SQL / derived view).
- `table`: required when `source_type: "table"` (e.g. `"public.orders"`).
- `sql`: required when `source_type: "sql"`.
- `grain`: what one row represents (e.g. `[id]`, `[customer_id, product_id]`).
- `columns`: all columns with correct types (`string`, `number`, `time`, `boolean`).
- Time columns: mark with `role: time`.
- `joins`: use correct `relationship` types (`many_to_one` for FK→PK, `one_to_many` for reverse).
- `joins.on`: `local_column = TARGET_SOURCE.target_column` — the right side MUST include the target source name.
- `measures.expr`: aggregation expression (e.g. `"sum(amount)"`); optional `filter` for business rules; required `description`.
Measure naming: descriptive `snake_case` (e.g. `total_revenue`, `avg_order_value`).
## Rules
- Prefer adding measures to existing sources over creating new ones.
- Before editing, always `sl_read_source` the source to check for existing measures.
- Don't duplicate measures (same aggregation on the same column).
- If two measures differ only by a filter (e.g. `revenue` vs `paid_revenue`), they are distinct.
- Use the card's `name` + `description` to write meaningful measure descriptions.
- When multiple cards in a WU are near-duplicates, create ONE generalized source; the runner will skip the rest automatically.
- Process every card in the WU — don't stop early.

View file

@ -0,0 +1,274 @@
---
name: metricflow_ingest
description: Map a MetricFlow semantic_model or metric into KLO semantic layer sources. Covers the MetricFlow to KLO primitive table, `extends:` inheritance flattening, metric-type handling (simple / derived / ratio / cumulative / conversion), `model: ref('x')` resolution, and four worked examples. Load when the turn contains `.yml`/`.yaml` files with top-level `semantic_models:` or `metrics:`.
callers: [memory_agent]
---
# MetricFlow to KLO Semantic Layer
A MetricFlow `semantic_model` maps to an SL source; MetricFlow `measures` map to KLO measures; MetricFlow `entities` map to KLO `joins`; MetricFlow `metrics` (top-level) map to KLO measures OR to cross-model derived measures. Files in one WorkUnit are ALWAYS part of the same logical entity (a connected component, possibly spanning `extends:` + cross-model metric refs). Flatten inheritance and cross-file references at write time.
## Mapping table
| MetricFlow | KLO form | Notes |
|---|---|---|
| `semantic_model: X { model: ref('t') }` with measures + dimensions | **Overlay** at `<connId>/X.yaml` with `measures`, `columns` (computed), `joins` | The `model:` ref resolves to a manifest table. |
| `semantic_model: X { model: source('s','t') }` | **Overlay** at `<connId>/X.yaml` over table `t`. | Same shape; `source()` still resolves to a physical table. |
| `semantic_model: X { model: <literal> }` with no manifest entry | **Standalone** with explicit `sql:`, `grain:`, `columns:` | Happens when the dbt manifest isn't available. |
| `semantic_model: Y { extends: X }` | **Merge** Y's measures/dimensions/entities into X's overlay, or write a single overlay named for the most-derived child (Y) containing both X's and Y's primitives | Do not emit a second overlay for X — flatten. |
| `measures: [{ name, agg, expr }]` | `measures: [{ name, expr: "<agg>(<expr>)" }]` | Aggregation inlined. `agg: count_distinct` → `count(distinct ...)`. |
| `entities: [{ name, type: primary }]` | `grain: [<entity_name-or-expr>]` on the overlay/standalone | Primary/unique entities drive grain. |
| `entities: [{ name, type: foreign }]` | `joins:` entry joining to the primary-entity's semantic_model | Only when a matching primary is discoverable. |
| `metrics: [{ type: simple, type_params: { measure: X } }]` | If the base measure is labeled/described by the metric: in-place edit to the existing measure. Otherwise leave as-is. | Same-name metrics can absorb metadata. |
| `metrics: [{ type: simple, filter: <jinja> }]` | **New measure** on the same source, with the filter translated to SQL and attached via `filter:` | Translate Jinja `{{ Dimension('x__y') }}` to the column name `y`. |
| `metrics: [{ type: derived, type_params: { expr, metrics } }]` | **Derived measure** on whichever source owns the referenced measures, with `expr:` referencing measure names | If the metric spans models, still write it once on the source owning the "primary" measure (the one the agent judges most central). Mention the cross-model chain in the description. |
| `metrics: [{ type: ratio, type_params: { numerator, denominator } }]` | Same as derived; `expr: "numerator / NULLIF(denominator, 0)"` if no explicit expr | Safe-division by default. |
| `metrics: [{ type: cumulative, type_params: { window, grain_to_date } }]` | **Standalone** source with a window-function SQL; reference the resulting column as a normal measure | KLO SL has no first-class cumulative primitive (spec Non-goals). |
| `metrics: [{ type: conversion }]` | **Flag for human** — do NOT write. Emit a wiki note describing the intended semantics. | No KLO equivalent in v1. |
| Metric not mappable | Wiki page `<metric_name>-definition.md` with the full YAML body quoted | Capture the intent even if we can't emit SL. |
Type map: MetricFlow `time` to KLO `time`; `categorical` to `string`; `number` to `number`; `boolean` to `boolean`. Prefer `expr` over `name` when the two differ — `expr` is the physical column.
## Flattening `extends:`
Within one WorkUnit, multiple semantic_models linked by `extends:` are guaranteed to be present (the chunker groups them). Resolve inheritance **before** writing:
1. Start with the most-derived child (the one that no other semantic_model extends).
2. Walk the `extends:` chain upward, accumulating measures, dimensions, entities.
3. Write ONE overlay/standalone, named for the most-derived child's SL-appropriate name (not the base).
4. Parents that lack their own distinctive content should NOT get a separate overlay. If a parent has unique measures a child doesn't inherit, consider whether the base is used elsewhere — if yes, write both; if no, still one overlay.
5. Measure/dimension name collisions: child wins, but note the overridden parent in the overlay's description or in a sibling wiki page.
The spec's worked example has `orders`, `orders_ext` (extends orders), and `metrics/orders_final.yml` (defines `revenue` referencing both). The right output is ONE overlay named `orders_ext` (or `orders` if the team's naming favors the base) containing `order_count`, `gross_amount`, `refund_amount`, and a derived `revenue` measure. Provenance tags point to all three source files.
## `model:` ref resolution
The `model:` field on a semantic_model is a string like `ref('table_name')`, `source('src','table_name')`, or a literal. Resolve:
- `ref('x')` → table name `x`. Verify via `sl_discover(x)`.
- `source('s','t')` → table name `t`. Verify via `sl_discover(t)`.
- Literal (no `ref(...)` / `source(...)`) → treat as the table name directly.
If `sl_discover` errors (no such table), fall back to `sql_execution({ sql: "SELECT column_name FROM <dataset>.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '<x>'" })` (session shape — a connection is already pinned by the ingest session). **Never invent column names** — every column in `columns:`, `grain:`, and `sql:` must be sourced from a real probe.
After every `sl_write_source`, call `sl_validate`. The warehouse will reject invented columns with `Unrecognized name: <name>` — treat as a hard failure and re-read the schema.
## Cumulative metrics — sql-standalone fallback
KLO SL has no first-class `window:` or `grain_to_date:` primitive in v1 (spec Non-goals). Translate a MetricFlow cumulative metric to a standalone SL source with a window-function SQL:
```yaml
# MetricFlow input:
metrics:
- name: cum_revenue_7d
type: cumulative
type_params:
measure: gross_amount
window: 7 days
```
```yaml
# KLO standalone output:
name: cum_revenue_7d
source_type: sql
sql: |
SELECT
ordered_at,
SUM(amount) OVER (ORDER BY ordered_at RANGE BETWEEN INTERVAL '7' DAY PRECEDING AND CURRENT ROW) AS cum_revenue_7d,
order_id
FROM analytics.orders
grain: [order_id]
columns:
- {name: ordered_at, type: time, role: time}
- {name: cum_revenue_7d, type: number}
- {name: order_id, type: string}
measures:
- {name: cum_revenue_7d, expr: "max(cum_revenue_7d)"}
```
Pick the time column based on the semantic_model's `defaults.agg_time_dimension` (e.g. `ordered_at`). If the MetricFlow config omits it, probe the base table for time-typed columns and choose the most obvious. After writing the standalone SQL source, call `emit_unmapped_fallback` with `rawPath` set to the MetricFlow file path, `reason: "cumulative_metric_unsupported"`, and `fallback: "sql_standalone"`.
## Conversion metrics — flag for human
```yaml
metrics:
- name: signup_to_first_order
type: conversion
type_params:
conversion_type_params:
entity: customer
base_measure: signup_count
conversion_measure: first_order_count
window: 30 days
```
Do NOT emit SL for this. Instead:
- Write a wiki page at `knowledge/global/<metric_name>-intent.md` quoting the full YAML body and a one-line explanation of the intended semantics (base event → conversion event within window).
- Call `emit_unmapped_fallback` with `rawPath` set to the MetricFlow file path, `reason: "conversion_metric_unsupported"`, and `fallback: "flagged"`.
When KLO SL gains conversion primitives, re-ingesting will find the prior wiki note (via `priorProvenance`) and replace it with an SL source.
## Provenance markers
Every overlay/standalone/wiki page emitted from a MetricFlow source carries HTML-comment provenance tags. When one overlay derives from multiple files (e.g. an extends chain), emit one tag per contributing file:
```yaml
# <!-- from: raw-sources/conn-1/metricflow/<syncId>/models/orders.yml#L1-20 -->
# <!-- from: raw-sources/conn-1/metricflow/<syncId>/models/orders_ext.yml#L1-12 -->
# <!-- from: raw-sources/conn-1/metricflow/<syncId>/metrics/orders_final.yml#L1-10 -->
name: orders_ext
...
```
Line ranges (`#L<start>-<end>`) point to the exact YAML span within the file (the `semantic_models:` entry for its own `name`). Use `read_raw_span` to identify those ranges before writing.
## Example 1 — single semantic_model to overlay
```yaml
# MetricFlow:
semantic_models:
- name: orders
model: ref('orders')
entities:
- {name: order_id, type: primary}
measures:
- {name: order_count, agg: count, expr: order_id}
- {name: gross_amount, agg: sum, expr: amount}
```
```yaml
# KLO overlay at <connId>/orders.yaml:
# <!-- from: raw-sources/.../models/orders.yml#L1-10 -->
name: orders
description: Order fact table.
measures:
- {name: order_count, expr: "count(order_id)"}
- {name: gross_amount, expr: "sum(amount)"}
grain: [order_id]
```
## Example 2 — extends chain → one flattened overlay
```yaml
# MetricFlow:
# models/orders.yml
semantic_models:
- name: orders
model: ref('orders')
measures:
- {name: order_count, agg: count, expr: order_id}
- {name: gross_amount, agg: sum, expr: amount}
# models/orders_ext.yml
semantic_models:
- name: orders_ext
model: ref('orders_ext')
extends: orders
measures:
- {name: refund_amount, agg: sum, expr: refund_amt}
# metrics/orders_final.yml
metrics:
- name: revenue
type: derived
type_params:
expr: gross_amount - refund_amount
metrics:
- {name: gross_amount}
- {name: refund_amount}
```
```yaml
# KLO overlay at <connId>/orders_ext.yaml (one file; inheritance flattened):
# <!-- from: raw-sources/.../models/orders.yml#L1-10 -->
# <!-- from: raw-sources/.../models/orders_ext.yml#L1-8 -->
# <!-- from: raw-sources/.../metrics/orders_final.yml#L1-10 -->
name: orders_ext
description: Extended order fact including refund handling; `revenue` = gross - refund.
measures:
- {name: order_count, expr: "count(order_id)"}
- {name: gross_amount, expr: "sum(amount)"}
- {name: refund_amount, expr: "sum(refund_amt)"}
- {name: revenue, expr: "gross_amount - refund_amount"}
grain: [order_id]
```
## Example 3 — derived metric spanning two semantic_models
```yaml
# models/sales.yml
semantic_models:
- name: sales
model: ref('sales')
measures:
- {name: revenue, agg: sum, expr: revenue_cents}
# models/costs.yml
semantic_models:
- name: costs
model: ref('costs')
measures:
- {name: cost, agg: sum, expr: cost_cents}
# metrics/margin.yml
metrics:
- name: margin
type: derived
type_params:
expr: revenue - cost
metrics: [{name: revenue}, {name: cost}]
```
Because the WorkUnit bundles all three files (cross-component union via the metric), write the derived measure on ONE of the two sources — pick the source whose domain "owns" the metric (here, `sales` — margin is inherently a sales metric). Cross-source references aren't native in KLO SL, so either treat the metric's operands as already resolvable in the target source's query context, OR emit a standalone SQL source that joins the two tables. The example below takes the second route: the `sales` overlay keeps only its own measure, and `margin` lives in a standalone SQL source:
```yaml
# <connId>/sales.yaml
# <!-- from: .../models/sales.yml#L1-8 -->
# <!-- from: .../models/costs.yml#L1-8 -->
# <!-- from: .../metrics/margin.yml#L1-8 -->
name: sales
measures:
- {name: revenue, expr: "sum(revenue_cents)"}
```
```yaml
# <connId>/margin.yaml — standalone because it spans two tables
# <!-- from: .../models/sales.yml#L1-8 -->
# <!-- from: .../models/costs.yml#L1-8 -->
# <!-- from: .../metrics/margin.yml#L1-8 -->
name: margin
source_type: sql
sql: |
SELECT s.period_id, s.revenue_cents, COALESCE(c.cost_cents, 0) AS cost_cents
FROM analytics.sales s
LEFT JOIN analytics.costs c ON c.period_id = s.period_id
grain: [period_id]
columns:
- {name: period_id, type: string}
- {name: revenue_cents, type: number}
- {name: cost_cents, type: number}
measures:
- {name: revenue, expr: "sum(revenue_cents)"}
- {name: cost, expr: "sum(cost_cents)"}
- {name: margin, expr: "sum(revenue_cents) - sum(cost_cents)"}
```
Also write a wiki page at `knowledge/global/margin-metric.md` explaining the cross-source origin.
## Example 4 — filtered metric creates a new measure
```yaml
metrics:
- name: paid_order_count
type: simple
type_params:
measure: order_count
filter: "{{ Dimension('orders__status') }} = 'paid'"
```
```yaml
# <connId>/orders.yaml
measures:
- {name: order_count, expr: "count(order_id)"}
- {name: paid_order_count, expr: "count(order_id)", filter: "status = 'paid'"}
```
Translate `{{ Dimension('orders__status') }}` to the bare column name `status` (the table alias prefix is implicit within the SL source's scope).

View file

@ -0,0 +1,69 @@
---
name: notion_synthesize
description: Synthesize durable KLO wiki pages and semantic-layer sources from staged Notion pages, databases, data-source rows, and clustered Notion evidence. Load when a WorkUnit contains Notion raw files or Notion evidence chunks.
callers: [memory_agent]
---
# Notion Cluster Synthesis
Use this skill when a WorkUnit contains staged Notion content from `pages/**`, `databases/**`, `data-sources/**`, or clustered Notion evidence.
## Role
Each WorkUnit is either a single Notion page/span or a topical cluster of related Notion pages, pre-grouped by embedding similarity. Read the assigned raw files, then write a small set of durable wiki entries and, when applicable, semantic-layer sources that synthesize the WorkUnit's knowledge. Write final memory directly; do not write candidates.
## Required Workflow
1. Read the WorkUnit notes and rawFiles list. Page content lives in `page.md`; `metadata.json` holds title, path, object type, data-source ids, last edited metadata, and properties.
2. For each assigned page, call `read_raw_file`, or `read_raw_span` for oversized pages when the notes specify a span.
3. Search `wiki_search` for existing pages that overlap the WorkUnit topics. Prefer updating an existing page over creating a duplicate.
4. Use `context_evidence_search`, `context_evidence_read`, and `context_evidence_neighbors` to pull supporting chunks when indexed evidence is relevant. Pass `chunkId` and `documentId` values verbatim as returned by the evidence tools.
5. Write durable business knowledge with `wiki_write`. Aim for a small number of high-quality pages per WorkUnit or cluster.
6. When the Notion content defines a reusable dataset, metric, segment, join rule, source-of-truth mapping, or table with explicit columns, load `sl_capture`, discover existing sources first with `sl_discover` or `sl_read_source`, then use `sl_write_source` or `sl_edit_source`.
7. For every deleted raw path in the Eviction Set, call `eviction_list`, decide retention, then `context_eviction_decision_write`. Do this even when no wiki write is needed.
## What To Capture
Capture durable, reusable company knowledge:
- metric definitions, KPI formulas, named business concepts, and reusable filters
- workflows, policies, ownership rules, approval conventions, and source-of-truth mappings
- data-source row pages that describe tables, columns, semantic models, dashboards, or business entities
- cross-system aliases connecting Notion terms to warehouse, dbt, Looker, Metabase, or MetricFlow names
- caveats, conflicts, supersession notes, and customer/product assumptions affecting future analysis
Skip noisy or transient content:
- meeting notes with no reusable rule
- task lists, project status updates, and time-bounded snapshots
- duplicate docs with no new fact
- database metadata pages when row pages contain the actual business content
- transient announcements and long page summaries
## Quality
Prefer fewer, stronger entries. Every wiki entry must cite at least one Notion page or row using its path and last edited date when available. When evidence conflicts, write a conflict note inside the wiki page rather than choosing silently.
If a clustered WorkUnit includes several related pages, synthesize the shared rule or concept instead of writing one thin page per source. For oversized page spans, read only the assigned span unless the WorkUnit explicitly asks for neighboring context.
## Citation Style
```md
## Revenue Recognition
- Booked revenue excludes refunds and test accounts.
- Source: Notion - Company Handbook / Finance / Revenue Recognition, last edited 2026-04-12.
- Conflict note: An older Sales Ops page uses gross revenue before refunds; treat the Finance Handbook as current unless Finance says otherwise.
```
## Semantic-Layer Rules
- Load `sl_capture` before writing or editing SL sources.
- Discover existing sources first with `sl_discover`; read existing source YAML before editing.
- Prefer overlays on manifest-backed sources over standalone SQL.
- If Notion describes a dashboard or metric but does not define executable logic, write a wiki page and attach `sl_refs` only after confirming the referenced source exists.
## Tools
Allowed: `read_raw_file`, `read_raw_span`, `wiki_search`, `wiki_read`, `wiki_write`, `sl_discover`, `sl_read_source`, `sl_write_source`, `sl_edit_source`, `sl_validate`, `context_evidence_search`, `context_evidence_read`, `context_evidence_neighbors`, `eviction_list`, `context_eviction_decision_write`.
Not allowed: `context_candidate_write`, `context_candidate_mark`.

View file

@ -0,0 +1,240 @@
---
name: sl
description: KLO's semantic layer — a structured catalog of sources (tables/views), measures, joins, and segments expressed as YAML. Covers the schema and how to query it via `semantic_query`. Use when the task involves querying pre-defined metrics (ARR, churn, retention, LTV, MAU) or reading SL source YAML to understand the catalog. Capture is handled by the `sl_capture` skill (memory-agent only).
---
# Semantic Layer
KLO's semantic layer (SL) is a structured catalog. Each **source** represents a table, a SQL view, or an overlay that enriches a manifest-backed table with measures, computed columns, joins, and named segments. The catalog is the single source of truth for reusable business metrics.
This skill covers two parts:
- **Part 1** — Schema reference (what an SL source looks like).
- **Part 2** — Querying via `semantic_query`.
Capture (when and how to add new patterns to the SL) is a separate concern handled by the memory-agent — see the `sl_capture` skill if you are running in capture mode. The research agent **reads** and **queries** the SL via the tools described here; it does not write to it.
---
## Part 1 — Schema reference
An SL source is a YAML file at `semantic-layer/<connectionId>/<source_name>.yaml`. There are three flavors:
### Overlay sources
Enrich a manifest-backed table with measures, computed columns, joins, and segments. No `table` or `sql` field. The base table's columns and grain are inherited from the manifest.
```yaml
name: fct_orders # must match an existing manifest table
description: "Overlay adding business measures to the orders fact table."
measures:
- name: total_revenue
expr: sum(amount)
description: Total order revenue — filter by status or region at query time
columns: # computed dimensions only
- name: is_large_order
type: boolean
expr: "amount > 1000"
segments:
- name: paid_non_refunded
expr: "is_paid = true AND is_refunded = false"
joins:
- to: customers
on: "customer_id = customers.id"
relationship: many_to_one
```
Rules:
- Do **not** repeat base-table columns, grain, `table`, or `source_type` in an overlay — those are inherited.
- Overlay columns MUST be computed (`expr` + `type`).
- `exclude_columns` hides specific manifest columns; `disable_joins` suppresses specific auto-detected joins.
### Standalone table sources
Self-contained; own their schema. Has `source_type: table` and `table:`.
```yaml
name: account_health_scores
source_type: table
table: "analytics.account_health_scores"
grain: [account_id, snapshot_date]
columns:
- name: account_id
type: string
- name: snapshot_date
type: time
role: time
- name: health_score
type: number
measures:
- name: avg_health_score
expr: avg(health_score)
```
### Standalone SQL sources
Self-contained; schema derived from a SQL query. Has `source_type: sql` and `sql:`.
```yaml
name: monthly_cancellations
source_type: sql
sql: |
SELECT
date_trunc('month', cancelled_at) AS month,
customer_id,
plan_name,
mrr_amount
FROM subscriptions
WHERE status = 'cancelled'
grain: [customer_id, month]
columns:
- name: month
type: time
role: time
- name: customer_id
type: string
- name: plan_name
type: string
- name: mrr_amount
type: number
measures:
- name: cancellation_count
expr: count(*)
```
An SQL source is a one-shot answer: the aggregation is frozen, callers cannot re-group or re-filter by columns the SQL has collapsed, and the source is disconnected from the join graph. Prefer overlays + measures over SQL sources when possible — the `sl_capture` skill covers when SQL is justified.
### Columns
Every standalone column requires `name` and `type`. Overlays have computed columns only.
- `type`: one of `string`, `number`, `boolean`, `time`. Map LookML `date`/`datetime`/`timestamp` → `time`. Map LookML `yesno` → `boolean`.
- `role` (optional): `time` enables time-granularity queries (month, week, day). `default` is the implicit fallback.
- `visibility` (optional): `public`, `internal`, or `hidden`.
- `expr` (optional for standalone, required for overlay columns): SQL expression that computes the value. Expanded by sqlglot before generating SQL, so you can reference other columns on the same source.
### Grain
`grain: [col_a, col_b]` — the set of columns that uniquely identify one row. The query engine uses grain to prevent fan-out in joins. Overlays inherit grain from the manifest unless they override.
### Joins
```yaml
joins:
- to: customers # target source name
on: "customer_id = customers.id" # local_col = TARGET.target_col
relationship: many_to_one # or one_to_many, one_to_one
alias: primary_customer # optional — lets you join the same target twice
```
- `on` format: `local_col = TARGET.target_col`. Always qualify the right side with the target source name.
- `relationship` is the cardinality **from this source to the target**. Most joins are `many_to_one` (FK → PK on the parent).
### Measures
```yaml
measures:
- name: total_arr
expr: sum(arr_amount)
description: Sum of ARR — filter by plan_name at query time
filter: "is_active = true"
segments: [paid_non_refunded]
```
- `name` (required, snake_case).
- `expr` (required): any valid SQL aggregate — `sum(x)`, `count(*)`, `count(distinct user_id)`, `avg(score)`.
- `description` (required on capture): what the measure computes and how to use it.
- `filter` (optional): SQL predicate applied as a WHERE clause specific to this measure.
- `segments` (optional): names of segments defined on the same source. The engine AND-composes each segment's `expr` into this measure's effective filter.
Use `safe_divide(num, den)` for ratio measures to avoid division by zero.
### Segments
```yaml
segments:
- name: paid_non_refunded
expr: "is_paid = true AND is_refunded = false"
description: Orders that were paid and not refunded
```
Named, reusable boolean predicates scoped to one source. Reference by bare name in a measure's `segments: []`, or by dotted form `source.segment_name` in a `semantic_query`. Segments are predicates only — they are NOT selectable as dimensions. If you need to group by the predicate, add a `columns[]` entry instead.
### Cross-references with the wiki
The reverse edge (wiki pages that cite this source) is derived automatically from each wiki's `sl_refs:` — you don't emit anything on the SL side. Author the edge once on the wiki via `sl_refs:`; the post-write reconciler populates the knowledge↔SL index.
---
## Part 2 — Querying via `semantic_query`
The `semantic_query` tool generates correct SQL from a structured query. It handles joins, fan-out prevention, aggregation correctness, and filter classification automatically. Prefer it over writing raw SQL whenever the SL has the relevant sources.
### When to prefer semantic_query over raw SQL
- A pre-defined measure already exists (`source.measure_name` appears in the catalog).
- The question combines fields from multiple sources — the engine resolves the join path automatically.
- The question asks for a standard metric (revenue, ARR, churn, retention, LTV, conversion, MAU, etc.) — even if no pre-defined measure exists, a runtime aggregation over a catalog column is usually correct.
Use raw SQL (`sql_execution`) only when:
- The computation requires multi-step CTEs whose intermediate grain is not a column in any source.
- The question explicitly asks for a one-off exploration that will never be asked again.
### Input shape
```json
{
"connectionId": "uuid-of-the-connection",
"reasoning": "Brief note on what this query analyzes",
"query": {
"measures": ["orders.total_revenue", "sum(orders.amount)"],
"dimensions": ["customers.segment", { "field": "orders.created_at", "granularity": "month" }],
"filters": ["orders.status != 'cancelled'", "orders.total_revenue > 10000"],
"segments": ["orders.paid_non_refunded"],
"order_by": [{ "field": "orders.created_at", "direction": "desc" }],
"limit": 1000
}
}
```
- **`measures`**: mix pre-defined refs (`source.measure`) and runtime aggregations (`sum(source.column)`).
- **`dimensions`**: column refs or `{ field, granularity }` objects for time grains (`day`, `week`, `month`, `quarter`, `year`).
- **`filters`**: free-form SQL predicates. The engine auto-classifies each as WHERE or HAVING based on whether it references an aggregated measure.
- **`segments`**: dotted `source.segment_name`. Each segment is AND-ed into the effective filter of every measure whose base source matches. Segments never become a global WHERE — use `filters` for cross-source predicates.
- **`order_by`**: string or `{ field, direction }`. Direction defaults to `asc`.
- **`limit`**: integer row cap.
### Join resolution
You don't specify a base table. The engine infers the set of sources needed from the fields you reference and resolves the shortest join path through the catalog's declared joins. If no path exists between two sources, the query fails with a path-not-found error — check `discover_data` or `sl_discover` to see which sources are connected.
### Worked examples
Cross-source query — engine resolves `account_health_scores → accounts ← opportunities` automatically:
```json
{
"measures": ["account_health_scores.avg_health_score"],
"dimensions": ["opportunities.stage"],
"filters": ["opportunities.stage != 'Closed Won'"]
}
```
Monthly ARR trend with a segment:
```json
{
"measures": ["subscriptions.arr"],
"dimensions": [{ "field": "subscriptions.month", "granularity": "month" }],
"segments": ["subscriptions.paid_non_refunded"],
"order_by": [{ "field": "subscriptions.month", "direction": "asc" }]
}
```
Multi-source with runtime aggregation:
```json
{
"measures": ["sum(orders.amount)", "count(support_tickets.ticket_id)"],
"dimensions": ["customers.segment"]
}
```

View file

@ -0,0 +1,276 @@
---
name: sl_capture
description: How to capture new reusable patterns into KLO's semantic layer — when a measure, segment, or join belongs in the catalog and how to write it generically so it stays small and useful over time. Loaded by the post-turn memory-agent only. The research agent does not write to the SL.
callers: [memory_agent]
---
# Semantic Layer — Capture
This skill covers **when** and **how** to capture new patterns into the semantic layer. For schema reference and query grammar, load the `sl` skill first.
When the current turn produces a reusable pattern (business metric, derived view, join pattern, computed dimension), capture it so future queries can reach for it instead of rediscovering it.
## SQL dialect
The user-facing prompt includes a `Warehouse:` line under the SL Sources index
(e.g. `Warehouse: BIGQUERY`). All `expr` strings — measure expressions, segment
predicates, computed-column SQL — execute on that warehouse and must use its
syntax. Date arithmetic in particular varies by dialect:
- **BigQuery**: `transaction_date >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 90 DAY)` (when the column is `TIMESTAMP`); `event_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 90 DAY)` (when `DATE`).
- **Postgres / Redshift**: `transaction_date >= current_date - interval '90 days'`.
- **Snowflake**: `transaction_date >= dateadd(day, -90, current_timestamp())`.
Match the column's manifest type (`type: time` → TIMESTAMP/DATETIME on the
warehouse) — comparing TIMESTAMP to a DATE-arithmetic result fails on
BigQuery. After every `sl_edit_source`/`sl_write_source`, the inline validator runs a
`LIMIT 1` warehouse probe per measure and surfaces dialect mismatches; if
you see an error trailer, fix the expression and retry rather than leaving
the source for the post-squash gate to revert.
## What's worth capturing
- Business metric aggregations (ARR, MRR, revenue, churn, retention, conversion, LTV, CAC).
- Derived calculations combining multiple signals (risk scores, health scores, composite KPIs).
- Multi-table join patterns producing a reusable analytical view.
- Computed categories or flags useful as reusable dimensions (`case when num_protocols >= 3 then 'power' else 'regular' end`).
- Missing joins between two sources that both exist but aren't connected in the join graph.
Skip:
- Simple `SELECT * LIMIT 10` previews.
- Trivial `COUNT(*)` on one table with no business filtering.
- One-off ad-hoc explorations unlikely to repeat.
- Equivalent measures that already exist (cite the existing one as `source.measure_name`).
When in doubt, capture. A captured measure is easy to remove later, but a pattern that was never captured cannot be recovered once the conversation is gone.
## Generalization rules
The SL must stay small and general over time. Before adding a measure, decide whether it belongs as a generic pattern or a specific constant.
**Prefer one generic measure with query-time filters over N hardcoded variants.**
Anti-pattern:
```yaml
- name: revenue_us_region
expr: sum(case when region = 'US' then amount end)
- name: revenue_eu_region
expr: sum(case when region = 'EU' then amount end)
```
Preferred:
```yaml
- name: total_revenue
expr: sum(amount)
```
Callers filter `region = 'US'` at `semantic_query` time.
**Bake constants in only when the filter has named business meaning that won't change** (`enterprise_arr` for a contractually defined tier), cannot be expressed via the source's dimensions, or comes from a regulated/fixed list.
**Time anchors and value lists belong in callers' filters, not in measure expressions or source SQL.**
- Anti-pattern (date anchor inlined): `expr: count(distinct case when transaction_date >= '2026-04-12' then customer_id end)` — the date will need editing every time the question shifts, and every reader has to discover it.
- Anti-pattern (value list inlined in source SQL): `WHERE product_category_1 IN ('Testosterone', 'Weight Loss', …)` — locks the source to today's catalog and blocks callers from broadening or narrowing.
- Preferred: a generic measure (`count(distinct customer_id)`) plus either a named segment that captures the *meaning* of the anchor (`gh_new_products_since_launch`) or a query-time filter. Callers compose; the source stays small.
- A date is durable to bake in only when it represents a regulatory cutover, a contractually fixed boundary, or a one-time event that reshapes how the source itself is read.
**If you create a segment whose expr matches a measure's filter, the measure MUST reference the segment via `segments: [segment_name]` rather than re-inlining the predicate.** This is the canonical pattern even with a single measure — duplicating the predicate inline defeats the purpose of naming it.
Anti-pattern:
```yaml
segments:
- name: engaged_subscriber
expr: "is_paid = true AND <date-window-90-days-on-transaction_date>"
measures:
- name: engaged_subscriber_count
expr: "count(distinct case when is_paid = true and transaction_date >= current_date - interval '90 day' then admin_user_id end)"
```
Preferred:
```yaml
segments:
- name: engaged_subscriber
expr: "is_paid = true AND <date-window-90-days-on-transaction_date>"
measures:
- name: engaged_subscriber_count
expr: "count(distinct admin_user_id)"
segments: [engaged_subscriber]
```
**Use computed dimensions for derived categories.** A flag like `is_power_user` belongs on `columns[]` with `expr`, not inlined into every measure.
**Extract repeated filter bundles into named segments.** If the same predicate appears on multiple measures of the same source, lift it to a `segments[]` entry and have each measure reference it. One edit updates every measure that depends on it.
**Never write a standalone file on a manifest-backed name.** If `sl_discover({ tableName })` finds an existing schema for that name, you MUST write an overlay (`name:` + `measures:`/`segments:`/`description:` only — no `sql:`, `table:`, `grain:`, `columns:`, `joins:`). A standalone with `sql:` or `table:` on a manifest-backed name clobbers the inherited columns and joins; `sl_write_source` and `sl_validate` both reject this shape with a clear fix hint. Always run `sl_discover` before your first write on any existing name.
**Prefer overlay decomposition over standalone SQL sources.** Before reaching for `source_type: sql`, check whether the metric decomposes into measures on existing overlays (including cross-source derived measures). Use `source_type: sql` only when:
- The metric requires per-user/per-entity derivation that cannot be expressed as a single `expr` (e.g., `EXISTS` over a time-windowed subset), OR
- The metric requires multi-step CTEs whose intermediate grain is not a column in any existing source.
When an `sql` source is unavoidable, note in its `description` which SL gap forced the choice so it can be retired once the primitive ships. It must target a name NOT in the manifest — pick a distinct one (e.g. `mrr_waterfall_rollup`, not `fct_orders`).
## Slim standalone sources via `inherits_columns_from`
When a standalone SQL source filters or projects from a single manifest-backed base table (the common pattern for derived views like `aav_consignments` over `MARTS.CONSIGNMENTS`), set `inherits_columns_from:` to the base table's manifest key and list only column **names** in `columns:`. Compose-time enrichment fills `type`, `descriptions`, and `role` from the matching manifest column.
Discover the manifest key with `sl_discover` — pass the bare name (`CONSIGNMENTS`), the fully-qualified path (`ANALYTICS.MARTS.CONSIGNMENTS`), or any suffix; the tool resolves all forms and prints the canonical key in its output.
```yaml
name: aav_consignments
description: AAV consignments — filtered view of MARTS.CONSIGNMENTS for the auto-auction-vaulting channel.
source_type: sql
sql: |
SELECT CONSIGNED_ITEM_ID, CASH_ADV_AMOUNT, ALT_VALUE_COMBINED, my_derived_flag
FROM MARTS.CONSIGNMENTS
WHERE IS_AUTO_AUCTION_VAULTING_SUBMISSION = TRUE
AND IS_CARD_SHOW_SUBMISSION = FALSE
AND CONSIGNMENT_CANCELED_FLAG = FALSE
inherits_columns_from: CONSIGNMENTS
grain: [CONSIGNED_ITEM_ID]
columns:
- { name: CONSIGNED_ITEM_ID } # type/description inherited from manifest
- { name: CASH_ADV_AMOUNT }
- { name: ALT_VALUE_COMBINED }
- { name: my_derived_flag, type: boolean, expr: "CASH_ADV_AMOUNT > 0", description: "Computed locally — has any cash advance." }
measures:
- name: total_cash_advance
expr: sum(CASH_ADV_AMOUNT)
```
Rules:
- Inheritance fills only **blank** fields. If you set a `description` locally, it wins — useful when the base description is misleading in the filtered view.
- A column not in the manifest (a derived/aliased column, or one from a different table in a `JOIN`) needs its own `type` and `description` declared.
- If `inherits_columns_from` doesn't resolve, the source still loads, but every column without a type triggers a validator error on the warehouse probe — run `sl_discover` first to confirm the key.
- Don't use `inherits_columns_from` for sources backed by `table:` (those should be overlays — see the rule against shadowing the manifest above).
## Refinement — replace, don't append
When the user corrects a prior answer, the existing measure is wrong by the user's own standard. Replace it, don't add a parallel measure.
Signals that the current turn is a refinement:
- "no, I meant...", "actually use X", "exclude Y", "wait, by X I mean Z".
- Pushback on a prior result ("that's wrong because...", "this should be higher").
- Redefinition of a term used in an existing measure.
Distinguishing question: *would the prior measure still be correct for someone else asking the prior question?* If no → replace. If yes → add.
## Edit SL vs document in wiki
If the user explicitly names an SL artifact and asks to change it, the primary
action is always an SL tool call. Examples:
- "edit the source", "edit the YAML", "edit `fct_intakes.yaml`" → `sl_edit_source` or
`sl_write_source`.
- "refine the measure", "change the filter on `active_users`", "fix the expr",
"add `is_test = false`" → `sl_edit_source` on the source that owns the measure.
- "don't create a new one, update the existing" → `sl_edit_source` (never `sl_write_source`
with a new source name; never `wiki_write` as the only action).
A wiki update may ALSO make sense in the same turn (owner note, lineage,
caveat), but it is never a substitute for editing the YAML when the user's
request is about changing the measure/source definition itself.
Wiki-only is correct when the user is documenting *about* the measure
(definition in business terms, owner, policy, glossary, examples of when to
use it) without changing its SQL expression or filters.
## Tool sequence
1. `sl_discover` — see what source files exist.
2. `sl_discover({ tableName })` — **REQUIRED before the first write on any name**. Shows columns/joins/grain from the manifest. If the call returns a schema, you MUST write an overlay, not a standalone. Skipping this is the #1 cause of accidentally shadowing the manifest.
3. `sl_read_source({ sourceName })` — read the raw YAML before editing.
4. For modifications: `sl_edit_source({ sourceName, old_string, new_string })` with exact-string replacements. `old_string` must match exactly and be unique in the file.
5. For new sources or full rewrites: `sl_write_source({ sourceName, content })` with the full YAML content.
6. For join discovery: `sql_execution({ sql })` to verify the join key exists in both tables and assess cardinality before declaring the join.
7. Cross-reference knowledge: author the edge once on the **wiki** side via `sl_refs: [source_name]` in the page's front-matter. The reverse edge (wiki pages that cite an SL source) is derived automatically by the reconciler — do not add a `knowledge_refs:` field to SL YAMLs.
8. `sl_validate` — run after writing or editing to surface schema issues, duplicate measure names, and cross-source validation errors. Read-only; the writes are already committed (the squash-at-end flow will collapse them into one commit).
## Editing patterns
- **`sl_edit_source`** is the workhorse for additive changes: add a measure, add a join, tweak a description, replace a filter. Cheap, targeted, preserves the rest of the file.
- **`sl_write_source`** is for brand-new sources or when the entire file needs restructuring. It overwrites the file completely.
- Do NOT modify existing measures or their descriptions unless the current turn explicitly corrects them.
## Worked example — additive overlay
Conversation:
- User: "What was the average order value last quarter?"
- Assistant fell back to SQL: `SELECT AVG(amount) FROM orders WHERE order_date >= ...`
Existing index: `orders [measures=0, joins=0] — candidate for enrichment`.
```
sl_discover()
→ orders.yaml does not exist yet
sl_discover({ tableName: "orders" })
→ see grain, columns, no current overlay
sl_write_source({
sourceName: "orders",
content: "name: orders\nmeasures:\n - name: avg_order_value\n expr: avg(amount)\n description: Mean order transaction amount — filter by product_category at query time\n"
})
sl_validate()
→ clean
```
The overlay only contains `name` and `measures` — no columns, grain, or table. Those are inherited from the manifest.
## Worked example — refinement (replace)
Prior turn:
- [user] "How many active users do we have per region?"
- [assistant] "… used `count(*) filter: last_login_at > now() - interval '30 days'`"
Current user: "Wait, by 'active' I mean users who have placed an order in the last 30 days, not just logged in."
The existing `users.active_count` measure is wrong by the new definition.
```
sl_read_source({ sourceName: "users" })
→ see the wrong measure
sl_edit_source({
sourceName: "users",
yaml_edits: [{
oldText: " - name: active_count\n expr: \"count(*)\"\n filter: \"last_login_at > now() - interval '30 days'\"\n description: Users who logged in within the last 30 days",
newText: " - name: active_count\n expr: \"count(distinct case when last_order_at > now() - interval '30 days' then user_id end)\"\n description: Users with at least one order in the last 30 days"
}]
})
sl_validate()
```
If you only added a new measure, the old incorrect `active_count` would stay and future queries would keep answering the wrong question.
## Worked example — new join
Prior turn: user asked to correlate LTV with protocol count; assistant joined `fct_orders` with `fct_mau_multiprotocol` on `admin_user_id` in raw SQL.
```
sl_read_source({ sourceName: "fct_orders" })
→ no joins section yet
sql_execution({
sql: "SELECT COUNT(*), COUNT(DISTINCT a.admin_user_id) FROM fct_orders a JOIN fct_mau_multiprotocol b ON a.admin_user_id = b.admin_user_id LIMIT 1"
})
→ confirms cardinality (many orders per MAU row = many_to_one)
sl_edit_source({
sourceName: "fct_orders",
yaml_edits: [{
oldText: "measures:",
newText: "joins:\n - to: fct_mau_multiprotocol\n on: admin_user_id = fct_mau_multiprotocol.admin_user_id\n relationship: many_to_one\nmeasures:"
}]
})
sl_validate()
```
Always verify joins with `sql_execution` before adding them.
## Rules recap
- Read existing sources before editing (`sl_read_source` or `sl_discover`).
- Prefer overlays over standalone sources on manifest-backed tables.
- Prefer generic measures + query-time filters over per-value variants.
- Time anchors and value lists belong in callers' filters, not in measure expressions or source SQL.
- A measure whose filter matches a segment MUST reference the segment via `segments: [name]`.
- Extract repeated predicates into named segments.
- Use computed dimensions for derived categories.
- When the user corrects a prior answer, replace — don't append.
- Always run `sl_validate` after writing to surface issues.
- If nothing is worth capturing, respond without calling any SL write tool.

View file

@ -0,0 +1,330 @@
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
// Replace the `ai` module with test doubles for the three names stubbed below.
vi.mock('ai', () => ({
// Bare mock; each test installs its own resolved value, rejection, or implementation.
generateText: vi.fn(),
// Identity stub, so `stopWhen` receives the raw step budget and tests can assert it with `toBe`.
stepCountIs: (n: number) => n,
// Identity stub: hands the tool definition back unchanged.
tool: (def: unknown) => def,
}));
import { generateText } from 'ai';
import { AgentRunnerService, type RunLoopStepInfo } from './agent-runner.service.js';
describe('AgentRunnerService.runLoop', () => {
  let runner: AgentRunnerService;

  // Minimal stand-in for the LLM provider; only `getModel` returns a meaningful value by default.
  const llmProvider = {
    getModel: vi.fn().mockReturnValue({ modelId: 'claude-sonnet-4-6', provider: 'anthropic' }),
    getModelByName: vi.fn(),
    cacheMarker: vi.fn(),
    repairToolCallHandler: vi.fn(),
    thinkingProviderOptions: vi.fn(),
    telemetryConfig: vi.fn(),
    promptCachingConfig: vi.fn(() => ({
      enabled: false,
      systemTtl: '1h',
      toolsTtl: '1h',
      historyTtl: '5m',
      cacheSystem: true,
      cacheTools: true,
      cacheHistory: true,
      vertexFallbackTo5m: false,
    })),
    activeBackend: vi.fn(() => 'anthropic'),
  };

  /**
   * Builds a runner whose telemetry port is always enabled and copies
   * `source` (defaulting to `RESEARCH`), `jobId` and `unitKey` from the tags
   * into the telemetry metadata. Shared by every telemetry-forwarding test.
   */
  function createRunnerWithTelemetry(): AgentRunnerService {
    return new AgentRunnerService({
      llmProvider: llmProvider as any,
      telemetry: {
        createTelemetry: (tags) => ({
          isEnabled: true,
          metadata: {
            source: tags.source ?? 'RESEARCH',
            jobId: tags.jobId,
            unitKey: tags.unitKey,
          },
        }),
      },
    });
  }

  beforeEach(() => {
    vi.clearAllMocks();
    runner = new AgentRunnerService({ llmProvider: llmProvider as any });
  });

  afterEach(() => vi.clearAllMocks());

  it('passes systemPrompt, userPrompt, tools, and step budget through to generateText', async () => {
    (generateText as any).mockResolvedValue({ text: 'ok', toolCalls: [], steps: [] });
    const tools = { noop: { description: 'noop', inputSchema: {}, execute: vi.fn() } };

    await runner.runLoop({
      modelRole: 'candidateExtraction',
      systemPrompt: 'SYS',
      userPrompt: 'USR',
      toolSet: tools as any,
      stepBudget: 17,
      telemetryTags: { source: 'test' },
    });

    const call = (generateText as any).mock.calls[0][0];
    expect(call.messages).toEqual([
      { role: 'system', content: 'SYS' },
      { role: 'user', content: 'USR' },
    ]);
    expect(call.system).toBeUndefined();
    expect(call.prompt).toBeUndefined();
    expect(call.tools).toEqual(tools);
    // `stepCountIs` is mocked as identity, so the raw budget shows up here.
    expect(call.stopWhen).toBe(17);
    expect(call.temperature).toBe(0);
    expect(llmProvider.getModel).toHaveBeenCalledWith('candidateExtraction');
  });

  it('returns stopReason=natural when the loop completes without error', async () => {
    (generateText as any).mockResolvedValue({ text: 'done', toolCalls: [], steps: [] });

    const result = await runner.runLoop({
      modelRole: 'candidateExtraction',
      systemPrompt: 'system',
      userPrompt: 'user',
      toolSet: {},
      stepBudget: 10,
      telemetryTags: {},
    });

    expect(result.stopReason).toBe('natural');
    expect(result.error).toBeUndefined();
    expect(llmProvider.getModel).toHaveBeenCalledWith('candidateExtraction');
    expect(generateText).toHaveBeenCalledWith(
      expect.objectContaining({
        messages: [
          { role: 'system', content: 'system' },
          { role: 'user', content: 'user' },
        ],
      }),
    );
  });

  it('returns stopReason=error with the error on generateText failure', async () => {
    const err = new Error('LLM unavailable');
    (generateText as any).mockRejectedValue(err);

    const result = await runner.runLoop({
      modelRole: 'candidateExtraction',
      systemPrompt: '',
      userPrompt: '',
      toolSet: {},
      stepBudget: 10,
      telemetryTags: {},
    });

    expect(result.stopReason).toBe('error');
    expect(result.error).toBe(err);
  });

  it('invokes caller onStepFinish with incrementing stepIndex and total budget', async () => {
    const calls: RunLoopStepInfo[] = [];
    // Simulate three finished steps by driving the runner's own onStepFinish hook.
    (generateText as any).mockImplementation(async (opts: any) => {
      for (let i = 0; i < 3; i++) {
        await opts.onStepFinish({});
      }
      return { text: 'ok', toolCalls: [], steps: [] };
    });

    await runner.runLoop({
      modelRole: 'candidateExtraction',
      systemPrompt: '',
      userPrompt: '',
      toolSet: {},
      stepBudget: 10,
      telemetryTags: {},
      onStepFinish: (info) => {
        calls.push(info);
      },
    });

    expect(calls).toEqual([
      { stepIndex: 1, stepBudget: 10 },
      { stepIndex: 2, stepBudget: 10 },
      { stepIndex: 3, stepBudget: 10 },
    ]);
  });

  it('swallows errors thrown from caller onStepFinish without aborting the loop', async () => {
    (generateText as any).mockImplementation(async (opts: any) => {
      await opts.onStepFinish({});
      return { text: 'ok', toolCalls: [], steps: [] };
    });

    const result = await runner.runLoop({
      modelRole: 'candidateExtraction',
      systemPrompt: '',
      userPrompt: '',
      toolSet: {},
      stepBudget: 10,
      telemetryTags: {},
      onStepFinish: () => {
        throw new Error('boom');
      },
    });

    expect(result.stopReason).toBe('natural');
  });

  it('forwards telemetryTags.source through experimental_telemetry metadata', async () => {
    (generateText as any).mockResolvedValue({ text: 'ok', toolCalls: [], steps: [] });
    const runnerWithTelemetry = createRunnerWithTelemetry();

    await runnerWithTelemetry.runLoop({
      modelRole: 'candidateExtraction',
      systemPrompt: '',
      userPrompt: '',
      toolSet: {},
      stepBudget: 10,
      telemetryTags: { source: 'metabase', jobId: 'job-123', unitKey: 'u/1' },
    });

    const call = (generateText as any).mock.calls[0][0];
    expect(call.experimental_telemetry.metadata.source).toBe('metabase');
  });

  it('defaults to source=RESEARCH when telemetryTags omits source', async () => {
    (generateText as any).mockResolvedValue({ text: 'ok', toolCalls: [], steps: [] });
    const runnerWithTelemetry = createRunnerWithTelemetry();

    await runnerWithTelemetry.runLoop({
      modelRole: 'candidateExtraction',
      systemPrompt: '',
      userPrompt: '',
      toolSet: {},
      stepBudget: 10,
      telemetryTags: { operationName: 'memory-agent-ingest' },
    });

    const call = (generateText as any).mock.calls[0][0];
    expect(call.experimental_telemetry.metadata.source).toBe('RESEARCH');
  });

  it('forwards jobId and unitKey through experimental_telemetry metadata', async () => {
    (generateText as any).mockResolvedValue({ text: 'ok', toolCalls: [], steps: [] });
    const runnerWithTelemetry = createRunnerWithTelemetry();

    await runnerWithTelemetry.runLoop({
      modelRole: 'candidateExtraction',
      systemPrompt: '',
      userPrompt: '',
      toolSet: {},
      stepBudget: 10,
      telemetryTags: { source: 'metabase', jobId: 'job-777', unitKey: 'sources/users' },
    });

    const call = (generateText as any).mock.calls[0][0];
    expect(call.experimental_telemetry.metadata.jobId).toBe('job-777');
    expect(call.experimental_telemetry.metadata.unitKey).toBe('sources/users');
  });

  it('records a sanitized LLM debug request when a recorder is injected', async () => {
    (generateText as any).mockResolvedValue({ text: 'ok', toolCalls: [], steps: [] });
    const record = vi.fn();
    // Prompt caching is switched on here so cache markers are attached and show up in the summary.
    const provider = {
      ...llmProvider,
      cacheMarker: vi.fn((ttl: '5m' | '1h') => ({
        anthropic: { cacheControl: { type: 'ephemeral' as const, ttl } },
      })),
      promptCachingConfig: vi.fn(() => ({
        enabled: true,
        systemTtl: '1h',
        toolsTtl: '1h',
        historyTtl: '5m',
        cacheSystem: true,
        cacheTools: true,
        cacheHistory: true,
        vertexFallbackTo5m: false,
      })),
    };
    const runnerWithDebug = new AgentRunnerService({
      llmProvider: provider as any,
      debugRequestRecorder: { record },
    });

    await runnerWithDebug.runLoop({
      modelRole: 'candidateExtraction',
      systemPrompt: 'SECRET SYSTEM PROMPT',
      userPrompt: 'SECRET USER PROMPT',
      toolSet: {
        emit_candidate: {
          description: 'SECRET TOOL DESCRIPTION',
          inputSchema: {},
          execute: vi.fn(),
        } as any,
      },
      stepBudget: 10,
      telemetryTags: { operationName: 'ingest-bundle-wu', source: 'metabase', jobId: 'job-1', unitKey: 'cards/1' },
    });

    expect(record).toHaveBeenCalledTimes(1);
    expect(record).toHaveBeenCalledWith(
      expect.objectContaining({
        operationName: 'ingest-bundle-wu',
        source: 'metabase',
        jobId: 'job-1',
        unitKey: 'cards/1',
        modelRole: 'candidateExtraction',
        modelId: 'claude-sonnet-4-6',
        messageCount: 2,
        toolNames: ['emit_candidate'],
      }),
    );

    const providerOptions = record.mock.calls[0][0].providerOptions;
    expect(providerOptions).toEqual(
      expect.arrayContaining([
        expect.objectContaining({ target: 'message', index: 0, role: 'system' }),
        expect.objectContaining({ target: 'message-part', index: 1, role: 'user', partIndex: 0 }),
        expect.objectContaining({ target: 'tool', name: 'emit_candidate' }),
      ]),
    );
    expect(providerOptions).toHaveLength(3);

    // The recorded summary must not leak prompt or tool-description text.
    const serialized = JSON.stringify(record.mock.calls[0][0]);
    expect(serialized).not.toContain('SECRET SYSTEM PROMPT');
    expect(serialized).not.toContain('SECRET USER PROMPT');
    expect(serialized).not.toContain('SECRET TOOL DESCRIPTION');
  });
});

View file

@ -0,0 +1,101 @@
import { KloMessageBuilder, type KloLlmProvider, type KloModelRole } from '@klo/llm';
import { generateText, stepCountIs, type TelemetrySettings, type Tool } from 'ai';
import { noopLogger, type KloLogger } from '../core/index.js';
import { summarizeKloLlmDebugRequest, type KloLlmDebugRequestRecorder } from '../llm/index.js';
/** Why a `runLoop` call ended. `'error'` is paired with `RunLoopResult.error`. */
export type RunLoopStopReason = 'budget' | 'natural' | 'error';
/** Progress snapshot handed to the caller's `onStepFinish` callback. */
export interface RunLoopStepInfo {
/** 1-based count of steps finished so far in the current run. */
stepIndex: number;
/** The run's total step budget, echoed from `RunLoopParams.stepBudget`. */
stepBudget: number;
}
/** Inputs for a single `AgentRunnerService.runLoop` call. */
export interface RunLoopParams {
/** Role passed to `llmProvider.getModel` to pick the model. */
modelRole: KloModelRole;
/** System prompt handed to the message builder. */
systemPrompt: string;
/** Sent as the single user message of the request. */
userPrompt: string;
/** Tools made available to the model, keyed by tool name. */
toolSet: Record<string, Tool>;
/** Step count passed to `stepCountIs` as the loop's stop condition. */
stepBudget: number;
/**
 * Tags forwarded to the telemetry port. The runner itself reads
 * `operationName`, `source`, `jobId` and `unitKey` for the debug request summary.
 */
telemetryTags: Record<string, string>;
/** Called after every step; anything it throws is logged as a warning and ignored. */
onStepFinish?: (info: RunLoopStepInfo) => void | Promise<void>;
}
/** Outcome of a `runLoop` call; the method reports failures here rather than throwing. */
export interface RunLoopResult {
stopReason: RunLoopStopReason;
/** Set when `stopReason` is `'error'`. */
error?: Error;
}
/** Builds the `experimental_telemetry` settings for a run from its telemetry tags. */
export interface AgentTelemetryPort {
createTelemetry(tags: Record<string, string>): TelemetrySettings;
}
/** Collaborators injected into `AgentRunnerService`. */
export interface AgentRunnerServiceDeps {
/** Resolves the model for a role and backs the message builder. */
llmProvider: KloLlmProvider;
/** Optional; when omitted, no telemetry settings are passed to `generateText`. */
telemetry?: AgentTelemetryPort;
/** Optional; receives a summary of each request before it is sent. */
debugRequestRecorder?: KloLlmDebugRequestRecorder;
/** Optional; defaults to `noopLogger`. */
logger?: KloLogger;
}
/**
 * Runs one tool-using LLM loop via `generateText` and reports how it ended.
 *
 * The service never throws from `runLoop`: failures are logged and returned
 * as `{ stopReason: 'error', error }`.
 */
export class AgentRunnerService {
  private readonly logger: KloLogger;

  constructor(private readonly deps: AgentRunnerServiceDeps) {
    this.logger = deps.logger ?? noopLogger;
  }

  /**
   * Builds the request from the prompts and tool set, optionally records a
   * debug summary of it, then runs `generateText` with a step-count stop
   * condition.
   *
   * @param params Model role, prompts, tools, step budget, telemetry tags and
   *   an optional per-step callback.
   * @returns `'budget'` when the step budget cut the loop short, `'natural'`
   *   when the loop ended on its own, or `'error'` (with `error` set) when
   *   anything in the run threw.
   */
  async runLoop(params: RunLoopParams): Promise<RunLoopResult> {
    let stepIndex = 0;
    try {
      const model = this.deps.llmProvider.getModel(params.modelRole);
      const builder = new KloMessageBuilder(this.deps.llmProvider);
      const built = builder.wrapSimple({
        system: params.systemPrompt,
        messages: [{ role: 'user', content: params.userPrompt }],
        tools: params.toolSet,
        model,
      });

      await this.deps.debugRequestRecorder?.record(
        summarizeKloLlmDebugRequest({
          operationName: params.telemetryTags.operationName ?? 'klo-agent-runner',
          source: params.telemetryTags.source,
          jobId: params.telemetryTags.jobId,
          unitKey: params.telemetryTags.unitKey,
          modelRole: params.modelRole,
          // Fall back to the role name when the model object exposes no id.
          modelId: (model as { modelId?: string }).modelId ?? params.modelRole,
          messages: built.messages,
          tools: built.tools as Record<string, { providerOptions?: unknown }>,
        }),
      );

      const result = await generateText({
        model,
        temperature: 0,
        stopWhen: stepCountIs(params.stepBudget),
        experimental_telemetry: this.deps.telemetry?.createTelemetry(params.telemetryTags),
        messages: built.messages,
        tools: built.tools as Record<string, Tool>,
        onStepFinish: async () => {
          stepIndex += 1;
          if (!params.onStepFinish) {
            return;
          }
          // A faulty progress callback must not abort the LLM loop.
          try {
            await params.onStepFinish({ stepIndex, stepBudget: params.stepBudget });
          } catch (err) {
            this.logger.warn(
              `[agent-runner] onStepFinish callback threw; ignoring: ${
                err instanceof Error ? err.message : String(err)
              }`,
            );
          }
        },
      });

      // The budget cut the loop short only if every budgeted step was used AND
      // the final step still ended on tool calls (the model wanted to continue).
      const budgetExhausted = stepIndex >= params.stepBudget && result.finishReason === 'tool-calls';
      return { stopReason: budgetExhausted ? 'budget' : 'natural' };
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      this.logger.warn(`[agent-runner] loop failed: ${err.message}`);
      return { stopReason: 'error', error: err };
    }
  }
}

View file

@ -0,0 +1,9 @@
export type {
AgentRunnerServiceDeps,
AgentTelemetryPort,
RunLoopParams,
RunLoopResult,
RunLoopStepInfo,
RunLoopStopReason,
} from './agent-runner.service.js';
export { AgentRunnerService } from './agent-runner.service.js';

View file

@ -0,0 +1,28 @@
import { z } from 'zod';
/**
 * Zod enum of every connection type identifier accepted by this schema.
 * Values are upper-case strings; parsing any other value fails.
 */
export const connectionTypeSchema = z.enum([
'POSTGRESQL',
'SQLITE',
'SQLSERVER',
'BIGQUERY',
'SNOWFLAKE',
'CENTRALREACH',
'EPIC',
'CERNER',
'ATHENA',
'QUICKBOOKS',
'WORKDAY',
'REST',
'S3',
'SLACK',
'METABASE',
'LOOKER',
'NOTION',
'POSTHOG',
'MYSQL',
'CLICKHOUSE',
'PLAIN',
'BETTERSTACK',
]);
/** String-literal union inferred from `connectionTypeSchema`, so the type and the schema cannot drift apart. */
export type ConnectionType = z.infer<typeof connectionTypeSchema>;

View file

@ -0,0 +1,27 @@
export type {
KloSqlQueryExecutionInput,
KloSqlQueryExecutionResult,
KloSqlQueryExecutorPort,
} from './query-executor.js';
export { createDefaultLocalQueryExecutor, type DefaultLocalQueryExecutorOptions } from './local-query-executor.js';
export { normalizeQueryRows } from './query-executor.js';
export { createPostgresQueryExecutor } from './postgres-query-executor.js';
export { assertReadOnlySql, limitSqlForExecution } from './read-only-sql.js';
export { createSqliteQueryExecutor, sqliteDatabasePathFromConnection } from './sqlite-query-executor.js';
export { connectionTypeSchema, type ConnectionType } from './connection-type.js';
export {
localConnectionInfoFromConfig,
localConnectionToWarehouseDescriptor,
localConnectionTypeForConfig,
type LocalConnectionInfo,
type LocalWarehouseDescriptor,
} from './local-warehouse-descriptor.js';
export {
KLO_NOTION_ORG_KNOWLEDGE_WARNING,
notionConnectionToPullConfig,
parseNotionConnectionConfig,
redactNotionConnectionConfig,
resolveNotionAuthToken,
type KloNotionConnectionConfig,
type RedactedKloNotionConnectionConfig,
} from './notion-config.js';

View file

@ -0,0 +1,59 @@
import { describe, expect, it, vi } from 'vitest';
import { createDefaultLocalQueryExecutor } from './local-query-executor.js';
describe('createDefaultLocalQueryExecutor', () => {
  /** Stub executor returning a one-row SELECT result tagged with `header`. */
  const stubExecutor = (header: string, value: number) => ({
    execute: vi.fn(async () => ({
      headers: [header],
      rows: [[value]],
      totalRows: 1,
      command: 'SELECT',
      rowCount: 1,
    })),
  });

  it('dispatches postgres and sqlite drivers to their executors', async () => {
    const postgres = stubExecutor('pg', 1);
    const sqlite = stubExecutor('sqlite', 2);
    const executor = createDefaultLocalQueryExecutor({ postgres, sqlite });

    const postgresResult = await executor.execute({
      connectionId: 'pg',
      connection: { driver: 'postgres', readonly: true },
      sql: 'select 1',
    });
    expect(postgresResult).toMatchObject({ headers: ['pg'] });

    const sqliteResult = await executor.execute({
      connectionId: 'local',
      connection: { driver: 'sqlite', readonly: true },
      sql: 'select 1',
    });
    expect(sqliteResult).toMatchObject({ headers: ['sqlite'] });

    // Each stub must have been hit exactly once, i.e. no cross-dispatch.
    expect(postgres.execute).toHaveBeenCalledTimes(1);
    expect(sqlite.execute).toHaveBeenCalledTimes(1);
  });

  it('rejects unsupported local execution drivers', async () => {
    const executor = createDefaultLocalQueryExecutor({
      postgres: { execute: vi.fn() },
      sqlite: { execute: vi.fn() },
    });

    const pending = executor.execute({
      connectionId: 'warehouse',
      connection: { driver: 'snowflake', readonly: true },
      sql: 'select 1',
    });

    await expect(pending).rejects.toThrow('No local query executor is configured for driver "snowflake".');
  });
});

View file

@ -0,0 +1,34 @@
import { createPostgresQueryExecutor } from './postgres-query-executor.js';
import type {
KloSqlQueryExecutionInput,
KloSqlQueryExecutionResult,
KloSqlQueryExecutorPort,
} from './query-executor.js';
import { createSqliteQueryExecutor } from './sqlite-query-executor.js';
/**
 * Optional executor overrides for `createDefaultLocalQueryExecutor`.
 * Any executor left unset is created with the matching default factory.
 */
export interface DefaultLocalQueryExecutorOptions {
/** Handles the `postgres` / `postgresql` drivers. */
postgres?: KloSqlQueryExecutorPort;
/** Handles the `sqlite` / `sqlite3` drivers. */
sqlite?: KloSqlQueryExecutorPort;
}
/**
 * Returns the connection's driver name as a lower-case string, or `''` when
 * the input carries no connection or no driver.
 */
function driverFor(input: KloSqlQueryExecutionInput): string {
  const rawDriver = input.connection?.driver ?? '';
  return String(rawDriver).toLowerCase();
}
/**
 * Creates an executor that routes each query to the Postgres or SQLite
 * executor based on the connection's driver (matched case-insensitively).
 *
 * @param options Optional executor overrides; unset ones use the default factories.
 * @returns A port whose `execute` rejects with an `Error` for any other driver.
 */
export function createDefaultLocalQueryExecutor(options: DefaultLocalQueryExecutorOptions = {}): KloSqlQueryExecutorPort {
  const postgresExecutor = options.postgres ?? createPostgresQueryExecutor();
  const sqliteExecutor = options.sqlite ?? createSqliteQueryExecutor();

  const execute = async (input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult> => {
    switch (driverFor(input)) {
      case 'postgres':
      case 'postgresql':
        return postgresExecutor.execute(input);
      case 'sqlite':
      case 'sqlite3':
        return sqliteExecutor.execute(input);
      default:
        // Report the driver exactly as configured, not the normalised form.
        throw new Error(`No local query executor is configured for driver "${input.connection?.driver ?? 'unknown'}".`);
    }
  };

  return { execute };
}

View file

@ -0,0 +1,63 @@
import { describe, expect, it } from 'vitest';
import {
localConnectionInfoFromConfig,
localConnectionToWarehouseDescriptor,
localConnectionTypeForConfig,
} from './local-warehouse-descriptor.js';
// Covers how a local connection config is converted into a warehouse descriptor.
describe('localConnectionToWarehouseDescriptor', () => {
// Host and database are derived from the connection URL when not given explicitly.
it('maps local Postgres URLs to canonical warehouse descriptors', () => {
expect(
localConnectionToWarehouseDescriptor('warehouse', {
driver: 'postgres',
url: 'postgresql://readonly@db.example.test/analytics',
}),
).toMatchObject({
id: 'warehouse',
connection_type: 'POSTGRESQL',
host: 'db.example.test',
database: 'analytics',
});
});
// Explicit project_id / dataset_id fields are copied onto the descriptor.
it('maps BigQuery project and dataset from explicit fields', () => {
expect(
localConnectionToWarehouseDescriptor('bq', {
driver: 'bigquery',
project_id: 'acme',
dataset_id: 'warehouse',
}),
).toMatchObject({
id: 'bq',
connection_type: 'BIGQUERY',
project_id: 'acme',
dataset_id: 'warehouse',
});
});
// Drivers without a warehouse connection type produce no descriptor.
it('returns null for non-warehouse adapters', () => {
expect(localConnectionToWarehouseDescriptor('looker', { driver: 'looker' })).toBeNull();
});
});
// Covers the display-oriented helpers built on top of the warehouse descriptor.
describe('local connection info helpers', () => {
it('returns canonical warehouse connection types for local catalogs', () => {
expect(localConnectionTypeForConfig('warehouse', { driver: 'postgres' })).toBe('POSTGRESQL');
expect(localConnectionTypeForConfig('bq', { driver: 'bigquery', project_id: 'acme' })).toBe('BIGQUERY');
expect(localConnectionTypeForConfig('snowflake', { driver: 'snowflake' })).toBe('SNOWFLAKE');
});
// Non-warehouse drivers fall back to the raw driver name; a missing driver becomes 'unknown'.
it('keeps non-warehouse adapter labels for display-only local connection surfaces', () => {
expect(localConnectionTypeForConfig('prod-metabase', { driver: 'metabase' })).toBe('metabase');
expect(localConnectionTypeForConfig('missing-driver', {} as never)).toBe('unknown');
});
// The connection id doubles as the display name; an absent config yields null.
it('builds nullable local connection info records', () => {
expect(localConnectionInfoFromConfig('warehouse', { driver: 'postgres' })).toEqual({
id: 'warehouse',
name: 'warehouse',
connectionType: 'POSTGRESQL',
});
expect(localConnectionInfoFromConfig('missing', undefined)).toBeNull();
});
});

View file

@ -0,0 +1,102 @@
import type { KloProjectConnectionConfig } from '../project/config.js';
import type { ConnectionType } from './connection-type.js';
/**
 * Warehouse-shaped view of a local project connection, as produced by
 * `localConnectionToWarehouseDescriptor`.
 */
export interface LocalWarehouseDescriptor {
/** Connection id the descriptor was built for. */
id: string;
/** Canonical connection type looked up from the connection's driver name. */
connection_type: ConnectionType;
/** Explicit `host` field, otherwise the hostname parsed from the connection URL. */
host?: string | null;
/** Explicit `database` field, otherwise the first URL path segment (non-BigQuery). */
database?: string | null;
/** Explicit `account` field; never derived from the URL. */
account?: string | null;
/** Explicit `project_id` field, otherwise derived from the URL for BigQuery. */
project_id?: string | null;
/** Explicit `dataset_id` field, otherwise derived from the URL for BigQuery. */
dataset_id?: string | null;
/** Shallow copy of the original connection config. */
connection_params: Record<string, unknown>;
}
/** Display record for a local connection, as built by `localConnectionInfoFromConfig`. */
export interface LocalConnectionInfo {
/** Connection id. */
id: string;
/** Display name; `localConnectionInfoFromConfig` sets it to the connection id. */
name: string;
/** Canonical warehouse type, the raw driver name, or `'unknown'`. */
connectionType: string;
}
/**
 * Lower-cased driver name (including aliases) → canonical warehouse connection type.
 * Drivers that are absent from this table are not treated as warehouses.
 */
const DRIVER_TO_CONNECTION_TYPE: Record<string, ConnectionType> = {
  bigquery: 'BIGQUERY',
  clickhouse: 'CLICKHOUSE',
  mssql: 'SQLSERVER',
  mysql: 'MYSQL',
  postgres: 'POSTGRESQL',
  postgresql: 'POSTGRESQL',
  snowflake: 'SNOWFLAKE',
  sqlite: 'SQLITE',
  sqlserver: 'SQLSERVER',
};
/**
 * Converts a local project connection into a warehouse descriptor.
 *
 * The driver name is matched case-insensitively (surrounding whitespace is
 * ignored) against `DRIVER_TO_CONNECTION_TYPE`. Location fields are taken from
 * the explicit config fields first and, failing that, from the connection URL.
 * URLs that are `env:` / `file:` references are never parsed.
 *
 * @param id - Connection id, copied onto the descriptor.
 * @param connection - Connection config, or `undefined` when the id is not configured.
 * @returns The descriptor, or `null` when there is no config or the driver is not a warehouse driver.
 */
export function localConnectionToWarehouseDescriptor(
  id: string,
  connection: KloProjectConnectionConfig | undefined,
): LocalWarehouseDescriptor | null {
  if (!connection) {
    return null;
  }

  const driverKey = String(connection.driver ?? '')
    .trim()
    .toLowerCase();
  // Own-property check: a bare index lookup would resolve inherited keys such as
  // "constructor" or "toString" to Object.prototype members and treat them as a type.
  const connectionType = Object.prototype.hasOwnProperty.call(DRIVER_TO_CONNECTION_TYPE, driverKey)
    ? DRIVER_TO_CONNECTION_TYPE[driverKey]
    : undefined;
  if (!connectionType) {
    return null;
  }

  const info: LocalWarehouseDescriptor = {
    id,
    connection_type: connectionType,
    connection_params: { ...connection },
  };

  const url = typeof connection.url === 'string' ? connection.url : null;
  if (url && !url.startsWith('env:') && !url.startsWith('file:')) {
    try {
      const parsed = new URL(url);
      info.host = parsed.hostname || null;
      if (parsed.pathname.length > 1) {
        const [first, second] = parsed.pathname.slice(1).split('/');
        if (connectionType === 'BIGQUERY') {
          // `hostname` is '' (never null) when absent, so `||` is required for the
          // path-segment fallback to be reachable.
          // NOTE(review): with a hostname present the dataset is still read from the
          // second path segment, as before — confirm the intended BigQuery URL shape.
          info.project_id = parsed.hostname || first || null;
          info.dataset_id = second || null;
        } else {
          info.database = first || null;
        }
      }
    } catch {
      // Unparseable URL: nothing is derived from it; explicit fields below still apply.
    }
  }

  // Explicit config fields always win over URL-derived values.
  info.host = stringField(connection.host) ?? info.host ?? null;
  info.database = stringField(connection.database) ?? info.database ?? null;
  info.account = stringField(connection.account) ?? null;
  info.project_id = stringField(connection.project_id) ?? info.project_id ?? null;
  info.dataset_id = stringField(connection.dataset_id) ?? info.dataset_id ?? null;
  return info;
}
/**
 * Returns a connection-type label for display.
 *
 * Warehouse drivers yield their canonical connection type; any other driver
 * yields its trimmed driver name, and a missing or blank driver yields `'unknown'`.
 */
export function localConnectionTypeForConfig(id: string, connection: KloProjectConnectionConfig | undefined): string {
  const descriptor = localConnectionToWarehouseDescriptor(id, connection);
  if (descriptor !== null) {
    return descriptor.connection_type;
  }
  const rawDriver = connection?.driver;
  const label = typeof rawDriver === 'string' ? rawDriver.trim() : '';
  return label.length === 0 ? 'unknown' : label;
}
/**
 * Builds the display record for a local connection.
 *
 * @returns `null` when `connection` is absent; otherwise a record whose `name`
 * equals `id` and whose `connectionType` comes from `localConnectionTypeForConfig`.
 */
export function localConnectionInfoFromConfig(
  id: string,
  connection: KloProjectConnectionConfig | undefined,
): LocalConnectionInfo | null {
  return connection
    ? { id, name: id, connectionType: localConnectionTypeForConfig(id, connection) }
    : null;
}
/** Returns the trimmed string, or `null` when `value` is not a string or is blank. */
function stringField(value: unknown): string | null {
  if (typeof value !== 'string') {
    return null;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}

View file

@ -0,0 +1,120 @@
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import {
notionConnectionToPullConfig,
parseNotionConnectionConfig,
redactNotionConnectionConfig,
resolveNotionAuthToken,
} from './notion-config.js';
// Covers parsing, redaction, token resolution and pull-config conversion for Notion connections.
describe('standalone Notion connection config', () => {
// Per-test scratch directory used for the file-based token reference.
let tempDir: string;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'klo-notion-config-'));
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
// Omitted limits and root lists are filled with the parser's defaults.
it('parses selected-root Notion config with safe defaults', () => {
const parsed = parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'env:NOTION_AUTH_TOKEN',
crawl_mode: 'selected_roots',
root_page_ids: ['page-1'],
});
expect(parsed).toEqual({
driver: 'notion',
auth_token_ref: 'env:NOTION_AUTH_TOKEN',
crawl_mode: 'selected_roots',
root_page_ids: ['page-1'],
root_database_ids: [],
root_data_source_ids: [],
max_pages_per_run: 1000,
max_knowledge_creates_per_run: 5,
max_knowledge_updates_per_run: 20,
last_successful_cursor: null,
});
});
// The redacted view reports only whether a token reference exists, never the reference itself.
it('redacts token references from display output', () => {
expect(
redactNotionConnectionConfig(
parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'file:/Users/example/.config/notion-token',
crawl_mode: 'all_accessible',
max_pages_per_run: 80,
}),
),
).toEqual({
driver: 'notion',
hasAuthToken: true,
crawlMode: 'all_accessible',
rootPageIds: [],
rootDatabaseIds: [],
rootDataSourceIds: [],
maxPagesPerRun: 80,
maxKnowledgeCreatesPerRun: 5,
maxKnowledgeUpdatesPerRun: 20,
warning: 'Anything accessible to this Notion integration can become organization knowledge.',
});
});
it('requires at least one selected root in selected_roots mode', () => {
expect(() =>
parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'env:NOTION_AUTH_TOKEN',
crawl_mode: 'selected_roots',
}),
).toThrow('selected_roots requires at least one root page, database, or data source id');
});
// Exercises both reference kinds; the env lookup uses an injected env map instead of process.env.
it('resolves env and file token references without exposing the reference in errors', async () => {
const tokenPath = join(tempDir, 'notion-token.txt');
await writeFile(tokenPath, 'ntn_file_token\n', 'utf-8');
await expect(
resolveNotionAuthToken('env:NOTION_AUTH_TOKEN', {
env: { NOTION_AUTH_TOKEN: 'ntn_env_token' },
}),
).resolves.toBe('ntn_env_token');
await expect(resolveNotionAuthToken(`file:${tokenPath}`)).resolves.toBe('ntn_file_token');
await expect(resolveNotionAuthToken('env:MISSING_NOTION_TOKEN', { env: {} })).rejects.toThrow(
'Notion token environment variable MISSING_NOTION_TOKEN is not set',
);
});
// snake_case connection fields map to the camelCase adapter config, with the token resolved.
it('converts standalone config into adapter pull config', async () => {
const pullConfig = await notionConnectionToPullConfig(
parseNotionConnectionConfig({
driver: 'notion',
auth_token_ref: 'env:NOTION_AUTH_TOKEN',
crawl_mode: 'all_accessible',
max_pages_per_run: 12,
max_knowledge_creates_per_run: 2,
max_knowledge_updates_per_run: 7,
last_successful_cursor: '{"phase":"all_accessible_pages","cursor":"cursor-1"}',
}),
{ env: { NOTION_AUTH_TOKEN: 'ntn_env_token' } },
);
expect(pullConfig).toEqual({
authToken: 'ntn_env_token',
crawlMode: 'all_accessible',
rootPageIds: [],
rootDatabaseIds: [],
rootDataSourceIds: [],
maxPagesPerRun: 12,
maxKnowledgeCreatesPerRun: 2,
maxKnowledgeUpdatesPerRun: 7,
lastSuccessfulCursor: '{"phase":"all_accessible_pages","cursor":"cursor-1"}',
});
});
});

View file

@ -0,0 +1,196 @@
import { readFile } from 'node:fs/promises';
import { homedir } from 'node:os';
import { resolve } from 'node:path';
import { type NotionPullConfig, notionPullConfigSchema } from '../ingest/adapters/notion/types.js';
import type { KloProjectConnectionConfig } from '../project/config.js';
/** Warning text attached to every redacted Notion connection config. */
export const KLO_NOTION_ORG_KNOWLEDGE_WARNING =
'Anything accessible to this Notion integration can become organization knowledge.';
/** Crawl modes accepted by `parseNotionConnectionConfig`; `selected_roots` requires at least one root id. */
type KloNotionCrawlMode = 'all_accessible' | 'selected_roots';
/** Validated Notion connection config, as returned by `parseNotionConnectionConfig`. */
export interface KloNotionConnectionConfig extends KloProjectConnectionConfig {
driver: 'notion';
/** Reference to the token, in `env:NAME` or `file:/path` form (never the token itself). */
auth_token_ref: string;
/** Defaults to `selected_roots` when not configured. */
crawl_mode: KloNotionCrawlMode;
root_page_ids: string[];
root_database_ids: string[];
root_data_source_ids: string[];
/** Integer in 1..10000; defaults to 1000. */
max_pages_per_run: number;
/** Integer in 0..25; defaults to 5. */
max_knowledge_creates_per_run: number;
/** Integer in 0..100; defaults to 20. */
max_knowledge_updates_per_run: number;
/** Opaque cursor string, or `null` when absent or blank. */
last_successful_cursor: string | null;
}
/**
 * Display-safe projection of a Notion connection config, as returned by
 * `redactNotionConnectionConfig`. It carries no token reference and no cursor.
 */
export interface RedactedKloNotionConnectionConfig {
driver: 'notion';
/** Whether the config has a non-empty token reference. */
hasAuthToken: boolean;
crawlMode: KloNotionCrawlMode;
rootPageIds: string[];
rootDatabaseIds: string[];
rootDataSourceIds: string[];
maxPagesPerRun: number;
maxKnowledgeCreatesPerRun: number;
maxKnowledgeUpdatesPerRun: number;
/** Always the fixed organization-knowledge warning text. */
warning: typeof KLO_NOTION_ORG_KNOWLEDGE_WARNING;
}
/** Injection points for token resolution. */
interface ResolveNotionTokenOptions {
/** Environment map used for `env:` references; `process.env` is used when omitted. */
env?: Record<string, string | undefined>;
/** File reader used for `file:` references; a UTF-8 `readFile` is used when omitted. */
readTextFile?: (path: string) => Promise<string>;
}
/** Type guard for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
/**
 * Narrows `value` to a plain record.
 *
 * @throws Error when `value` is not a non-array object.
 */
function record(value: unknown): Record<string, unknown> {
  if (isRecord(value)) {
    return value;
  }
  throw new Error('Notion connection config must be an object');
}
/** Returns the trimmed string, or `fallback` when `value` is not a string or is blank. */
function stringValue(value: unknown, fallback: string): string {
  if (typeof value !== 'string') {
    return fallback;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : fallback;
}
/** Returns the trimmed string, or `null` when `value` is not a string or is blank. */
function optionalString(value: unknown): string | null {
  if (typeof value !== 'string') {
    return null;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}
/**
 * Extracts the non-blank string entries of `value`, trimmed.
 *
 * Entries are trimmed so ids behave like every other string field parsed in
 * this module; previously `' page-1 '` was kept with its surrounding whitespace.
 *
 * @returns An empty array when `value` is not an array.
 */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const entries: string[] = [];
  for (const item of value) {
    if (typeof item !== 'string') {
      continue;
    }
    const trimmed = item.trim();
    if (trimmed.length > 0) {
      entries.push(trimmed);
    }
  }
  return entries;
}
/**
 * Returns `value` when it is an integer, or `fallback` when it is `undefined` / `null`.
 *
 * @throws Error naming `name` when `value` is present but not an integer number.
 */
function integerWithFallback(value: unknown, fallback: number, name: string): number {
  if (value == null) {
    return fallback;
  }
  if (typeof value === 'number' && Number.isInteger(value)) {
    return value;
  }
  throw new Error(`${name} must be an integer`);
}
/**
 * Like `integerWithFallback`, but additionally requires the result to lie in `[min, max]`.
 * The range check also applies to the fallback value.
 *
 * @throws Error naming `name` when the value is not an integer or is out of range.
 */
function boundedInteger(value: unknown, fallback: number, name: string, min: number, max: number): number {
  const candidate = integerWithFallback(value, fallback, name);
  const outOfRange = candidate < min || candidate > max;
  if (outOfRange) {
    throw new Error(`${name} must be between ${min} and ${max}`);
  }
  return candidate;
}
/**
 * Validates a raw connection entry and normalizes it into a Notion connection config.
 *
 * Unknown keys on the input are preserved; the known keys are overwritten with
 * their validated values. Missing limits fall back to 1000 pages, 5 creates and
 * 20 updates per run; a missing crawl mode falls back to `selected_roots`.
 *
 * @throws Error when the input is not an object, the driver is not `notion`,
 * the token reference is missing or not `env:` / `file:`, the crawl mode is
 * unsupported, `selected_roots` has no root ids, or a limit is not an integer
 * within its allowed range.
 */
export function parseNotionConnectionConfig(raw: unknown): KloNotionConnectionConfig {
  const input = record(raw);
  if (input.driver !== 'notion') {
    throw new Error('Notion connection config requires driver: notion');
  }

  const authTokenRef = stringValue(input.auth_token_ref, '');
  if (authTokenRef.length === 0) {
    throw new Error('Notion connection config requires auth_token_ref');
  }
  const usesSupportedScheme = authTokenRef.startsWith('env:') || authTokenRef.startsWith('file:');
  if (!usesSupportedScheme) {
    throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
  }

  const crawlMode = stringValue(input.crawl_mode, 'selected_roots');
  if (crawlMode !== 'selected_roots' && crawlMode !== 'all_accessible') {
    throw new Error(`Unsupported Notion crawl_mode: ${crawlMode}`);
  }

  const rootPageIds = stringArray(input.root_page_ids);
  const rootDatabaseIds = stringArray(input.root_database_ids);
  const rootDataSourceIds = stringArray(input.root_data_source_ids);
  const rootCount = rootPageIds.length + rootDatabaseIds.length + rootDataSourceIds.length;
  if (crawlMode === 'selected_roots' && rootCount === 0) {
    throw new Error('selected_roots requires at least one root page, database, or data source id');
  }

  // Limits are validated in this order so the first offending field is the one reported.
  const maxPagesPerRun = boundedInteger(input.max_pages_per_run, 1000, 'max_pages_per_run', 1, 10_000);
  const maxCreatesPerRun = boundedInteger(
    input.max_knowledge_creates_per_run,
    5,
    'max_knowledge_creates_per_run',
    0,
    25,
  );
  const maxUpdatesPerRun = boundedInteger(
    input.max_knowledge_updates_per_run,
    20,
    'max_knowledge_updates_per_run',
    0,
    100,
  );

  return {
    ...input,
    driver: 'notion',
    auth_token_ref: authTokenRef,
    crawl_mode: crawlMode,
    root_page_ids: rootPageIds,
    root_database_ids: rootDatabaseIds,
    root_data_source_ids: rootDataSourceIds,
    max_pages_per_run: maxPagesPerRun,
    max_knowledge_creates_per_run: maxCreatesPerRun,
    max_knowledge_updates_per_run: maxUpdatesPerRun,
    last_successful_cursor: optionalString(input.last_successful_cursor),
  };
}
/**
 * Projects a Notion connection config onto its display-safe form.
 *
 * The token reference is reduced to a boolean and the cursor is omitted; the
 * fixed organization-knowledge warning is always attached.
 */
export function redactNotionConnectionConfig(config: KloNotionConnectionConfig): RedactedKloNotionConnectionConfig {
  const {
    auth_token_ref: authTokenRef,
    crawl_mode: crawlMode,
    root_page_ids: rootPageIds,
    root_database_ids: rootDatabaseIds,
    root_data_source_ids: rootDataSourceIds,
    max_pages_per_run: maxPagesPerRun,
    max_knowledge_creates_per_run: maxKnowledgeCreatesPerRun,
    max_knowledge_updates_per_run: maxKnowledgeUpdatesPerRun,
  } = config;

  return {
    driver: 'notion',
    hasAuthToken: Boolean(authTokenRef),
    crawlMode,
    rootPageIds,
    rootDatabaseIds,
    rootDataSourceIds,
    maxPagesPerRun,
    maxKnowledgeCreatesPerRun,
    maxKnowledgeUpdatesPerRun,
    warning: KLO_NOTION_ORG_KNOWLEDGE_WARNING,
  };
}
/** Expands a leading `~` or `~/` to the user's home directory; any other path is returned unchanged. */
function expandHome(path: string): string {
  const isHomeRelative = path === '~' || path.startsWith('~/');
  if (!isHomeRelative) {
    return path;
  }
  return resolve(homedir(), path.slice(2));
}
/**
 * Resolves a token reference (`env:NAME` or `file:/path`) to the token value.
 *
 * Both branches trim the value before checking it, so a whitespace-only
 * environment variable or file is reported as missing/empty instead of being
 * returned as an empty token.
 *
 * @param authTokenRef - Reference in `env:NAME` or `file:/path` form; `~` and `~/` file paths are home-expanded.
 * @param options - Optional environment map and file reader overrides.
 * @returns The trimmed, non-empty token.
 * @throws Error when the reference has an unsupported scheme or an empty target,
 * when the environment variable is unset or blank, or when the file is empty.
 * Errors from the file reader propagate unchanged.
 */
export async function resolveNotionAuthToken(
  authTokenRef: string,
  options: ResolveNotionTokenOptions = {},
): Promise<string> {
  if (authTokenRef.startsWith('env:')) {
    const envName = authTokenRef.slice('env:'.length).trim();
    if (envName.length === 0) {
      throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
    }
    const env = options.env ?? process.env;
    // Trim before the emptiness check so a blank variable counts as unset.
    const value = env[envName]?.trim();
    if (!value) {
      throw new Error(`Notion token environment variable ${envName} is not set`);
    }
    return value;
  }

  if (authTokenRef.startsWith('file:')) {
    const rawPath = authTokenRef.slice('file:'.length).trim();
    if (rawPath.length === 0) {
      throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
    }
    const path = expandHome(rawPath);
    const readTextFile = options.readTextFile ?? ((filePath: string) => readFile(filePath, 'utf-8'));
    const value = (await readTextFile(path)).trim();
    if (!value) {
      throw new Error(`Notion token file is empty: ${path}`);
    }
    return value;
  }

  throw new Error('Notion auth_token_ref must use env:NAME or file:/path');
}
/**
 * Converts a validated connection config into the adapter's pull config.
 *
 * The token reference is resolved first; the camelCase result is then passed
 * through `notionPullConfigSchema.parse`.
 *
 * @throws Whatever token resolution or schema parsing throws.
 */
export async function notionConnectionToPullConfig(
  config: KloNotionConnectionConfig,
  options: ResolveNotionTokenOptions = {},
): Promise<NotionPullConfig> {
  const authToken = await resolveNotionAuthToken(config.auth_token_ref, options);
  const candidate = {
    authToken,
    crawlMode: config.crawl_mode,
    rootPageIds: config.root_page_ids,
    rootDatabaseIds: config.root_database_ids,
    rootDataSourceIds: config.root_data_source_ids,
    maxPagesPerRun: config.max_pages_per_run,
    maxKnowledgeCreatesPerRun: config.max_knowledge_creates_per_run,
    maxKnowledgeUpdatesPerRun: config.max_knowledge_updates_per_run,
    lastSuccessfulCursor: config.last_successful_cursor,
  };
  return notionPullConfigSchema.parse(candidate);
}

View file

@ -0,0 +1,111 @@
import { describe, expect, it, vi } from 'vitest';
import { createPostgresQueryExecutor } from './postgres-query-executor.js';
/**
 * Builds a mock Postgres client whose `query` records every input in `calls`
 * and answers transaction control statements and a fixed two-row SELECT result.
 */
function makeClient() {
  const calls: unknown[] = [];

  const query = vi.fn(async (input: unknown) => {
    calls.push(input);
    switch (input) {
      case 'BEGIN READ ONLY':
        return { rows: [], fields: [], rowCount: null, command: 'BEGIN' };
      case 'COMMIT':
        return { rows: [], fields: [], rowCount: null, command: 'COMMIT' };
      default:
        return {
          rows: [
            ['paid', 2],
            ['open', 1],
          ],
          fields: [{ name: 'status' }, { name: 'order_count' }],
          rowCount: 2,
          command: 'SELECT',
        };
    }
  });

  const client = {
    connect: vi.fn(async () => undefined),
    query,
    end: vi.fn(async () => undefined),
  };
  return { client, calls };
}
// Covers the Postgres executor against an injected mock client (no real database).
describe('createPostgresQueryExecutor', () => {
// Happy path: BEGIN READ ONLY, the row-limited query in array row mode, COMMIT, then end().
it('runs a read-only transaction in array row mode and closes the client', async () => {
const { client, calls } = makeClient();
const executor = createPostgresQueryExecutor({
clientFactory: vi.fn(() => client),
});
const result = await executor.execute({
connectionId: 'warehouse',
connection: { driver: 'postgres', url: 'postgres://example/db', readonly: true },
sql: 'select status, count(*) as order_count from public.orders group by status',
maxRows: 50,
});
expect(client.connect).toHaveBeenCalledTimes(1);
expect(calls[0]).toBe('BEGIN READ ONLY');
expect(calls[1]).toEqual({
text: 'select * from (select status, count(*) as order_count from public.orders group by status) as klo_query_result limit 50',
rowMode: 'array',
});
expect(calls[2]).toBe('COMMIT');
expect(client.end).toHaveBeenCalledTimes(1);
expect(result).toEqual({
headers: ['status', 'order_count'],
rows: [
['paid', 2],
['open', 1],
],
totalRows: 2,
command: 'SELECT',
rowCount: 2,
});
});
// Failure path: the original error surfaces, ROLLBACK is issued and the client is still closed.
it('rolls back and closes the client when query execution fails', async () => {
const client = {
connect: vi.fn(async () => undefined),
query: vi.fn(async (input: unknown) => {
if (input === 'BEGIN READ ONLY' || input === 'ROLLBACK') {
return { rows: [], fields: [], rowCount: null, command: String(input) };
}
throw new Error('syntax error');
}),
end: vi.fn(async () => undefined),
};
const executor = createPostgresQueryExecutor({
clientFactory: vi.fn(() => client),
});
await expect(
executor.execute({
connectionId: 'warehouse',
connection: { driver: 'postgres', url: 'postgres://example/db', readonly: true },
sql: 'select * from broken',
maxRows: 10,
}),
).rejects.toThrow('syntax error');
expect(client.query).toHaveBeenCalledWith('ROLLBACK');
expect(client.end).toHaveBeenCalledTimes(1);
});
// Config validation: both a url and readonly: true are mandatory.
it('requires a Postgres url and read-only connection config', async () => {
const executor = createPostgresQueryExecutor({ clientFactory: vi.fn() });
await expect(
executor.execute({
connectionId: 'warehouse',
connection: { driver: 'postgres', readonly: true },
sql: 'select 1',
}),
).rejects.toThrow('Local Postgres execution requires connections.warehouse.url');
await expect(
executor.execute({
connectionId: 'warehouse',
connection: { driver: 'postgres', url: 'postgres://example/db', readonly: false },
sql: 'select 1',
}),
).rejects.toThrow('Local query execution requires connections.warehouse.readonly: true');
});
});

View file

@ -0,0 +1,80 @@
import { Client, type ClientConfig } from 'pg';
import type {
KloSqlQueryExecutionInput,
KloSqlQueryExecutionResult,
KloSqlQueryExecutorPort,
} from './query-executor.js';
import { limitSqlForExecution } from './read-only-sql.js';
/** The subset of a Postgres client that the executor relies on; lets tests inject a mock. */
interface PgClientLike {
connect(): Promise<unknown>;
/** Accepts a plain SQL string or a query config requesting rows as arrays. */
query(input: string | { text: string; rowMode: 'array' }): Promise<{
fields: Array<{ name: string }>;
rows: unknown[][];
command: string;
rowCount: number | null;
}>;
end(): Promise<void>;
}
/** Tuning and injection options for `createPostgresQueryExecutor`. */
interface PostgresQueryExecutorOptions {
/** Passed to the client as `statement_timeout`; defaults to 30000. */
statementTimeoutMs?: number;
/** Passed to the client as `query_timeout`; defaults to 35000. */
queryTimeoutMs?: number;
/** Passed to the client as `connectionTimeoutMillis`; defaults to 5000. */
connectionTimeoutMs?: number;
/** Creates the client for each execution; defaults to constructing a `pg` `Client`. */
clientFactory?: (config: ClientConfig) => PgClientLike;
}
/** Returns the connection's driver name lower-cased, or `''` when no driver is configured. */
function connectionDriver(input: KloSqlQueryExecutionInput): string {
  const configuredDriver = input.connection?.driver ?? '';
  return String(configuredDriver).toLowerCase();
}
/** Default client factory: constructs a `pg` `Client` from the given config without connecting it. */
function createDefaultClient(config: ClientConfig): PgClientLike {
return new Client(config);
}
/**
 * Creates an executor that runs a single read-only query against Postgres.
 *
 * Each `execute` call validates the connection config and the SQL, opens a
 * dedicated client, runs the statement inside `BEGIN READ ONLY` … `COMMIT` with
 * rows returned as arrays, and always closes the client afterwards.
 *
 * NOTE(review): `env:` / `file:` style URL references are handed to the client
 * verbatim here; presumably they are resolved by the caller — confirm.
 *
 * @param options - Timeouts and an optional client factory (used by tests).
 * @throws Error from `execute` when the driver is not Postgres, the connection is
 * not `readonly: true`, the url is missing, or the SQL is rejected; query and
 * connection errors from the client propagate unchanged.
 */
export function createPostgresQueryExecutor(options: PostgresQueryExecutorOptions = {}): KloSqlQueryExecutorPort {
  const clientFactory = options.clientFactory ?? createDefaultClient;
  return {
    async execute(input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult> {
      const driver = connectionDriver(input);
      if (driver !== 'postgres' && driver !== 'postgresql') {
        throw new Error(`Local Postgres execution cannot run driver "${input.connection?.driver ?? 'unknown'}".`);
      }
      if (input.connection?.readonly !== true) {
        throw new Error(`Local query execution requires connections.${input.connectionId}.readonly: true.`);
      }
      if (typeof input.connection.url !== 'string' || input.connection.url.trim().length === 0) {
        throw new Error(`Local Postgres execution requires connections.${input.connectionId}.url.`);
      }
      // Validate and wrap the SQL before any network work, so a rejected statement
      // never opens a connection.
      const text = limitSqlForExecution(input.sql, input.maxRows);

      const client = clientFactory({
        connectionString: input.connection.url.trim(),
        statement_timeout: options.statementTimeoutMs ?? 30_000,
        query_timeout: options.queryTimeoutMs ?? 35_000,
        connectionTimeoutMillis: options.connectionTimeoutMs ?? 5_000,
        application_name: 'klo-local-query',
      });

      try {
        // Connecting inside the try guarantees end() also runs when connect() fails.
        await client.connect();
        try {
          await client.query('BEGIN READ ONLY');
          const result = await client.query({ text, rowMode: 'array' });
          await client.query('COMMIT');
          return {
            headers: result.fields.map((field) => field.name),
            rows: result.rows,
            totalRows: result.rows.length,
            command: result.command,
            rowCount: result.rowCount,
          };
        } catch (error) {
          // Best-effort rollback; the original failure is what the caller needs to see.
          await client.query('ROLLBACK').catch(() => undefined);
          throw error;
        }
      } finally {
        // Best-effort close: a failure here must not replace the query's result or error.
        await client.end().catch(() => undefined);
      }
    },
  };
}

View file

@ -0,0 +1,25 @@
import type { KloProjectConnectionConfig } from '../project/index.js';
/** A single query request handed to a `KloSqlQueryExecutorPort`. */
export interface KloSqlQueryExecutionInput {
/** Connection id; used in error messages (`connections.<id>...`). */
connectionId: string;
/** Base directory for resolving relative SQLite paths; the SQLite executor falls back to the process cwd. */
projectDir?: string;
/** Connection config; the built-in executors require `readonly: true`. */
connection: KloProjectConnectionConfig | undefined;
/** SQL text to run. */
sql: string;
/** When set, the built-in executors wrap the query in an outer `limit`. */
maxRows?: number;
}
/** Tabular result of a query execution. */
export interface KloSqlQueryExecutionResult {
/** Column names, in result order. */
headers: string[];
/** One array of cell values per row. */
rows: unknown[][];
/** Number of rows returned; the built-in executors set it to `rows.length`. */
totalRows: number;
/** Command tag, e.g. `'SELECT'`. */
command: string;
/** Row count as reported by the driver, or `null` when the driver reports none. */
rowCount: number | null;
}
/** Port implemented by every local query executor. */
export interface KloSqlQueryExecutorPort {
/** Runs the request and resolves with its tabular result; rejects on validation or driver errors. */
execute(input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult>;
}
/**
 * Converts driver rows to arrays of cell values: array rows are passed through
 * as-is, and any other row is replaced by its `Object.values`.
 */
export function normalizeQueryRows(rows: unknown[]): unknown[][] {
  const normalized: unknown[][] = [];
  for (const row of rows) {
    if (Array.isArray(row)) {
      normalized.push(row);
    } else {
      normalized.push(Object.values(row as Record<string, unknown>));
    }
  }
  return normalized;
}

View file

@ -0,0 +1,30 @@
import { describe, expect, it } from 'vitest';
import { assertReadOnlySql, limitSqlForExecution } from './read-only-sql.js';
// Covers the statement-prefix gate applied before any local execution.
describe('assertReadOnlySql', () => {
it('allows select and with queries', () => {
expect(assertReadOnlySql('select * from orders')).toBe('select * from orders');
expect(assertReadOnlySql('with paid as (select * from orders) select * from paid')).toContain('with paid');
});
// The check is a pure string test, so no connection is involved.
it('rejects mutating statements before opening a database connection', () => {
expect(() => assertReadOnlySql('delete from orders')).toThrow(
'Only read-only SELECT/WITH queries can be executed locally',
);
expect(() => assertReadOnlySql('create table x(id int)')).toThrow(
'Only read-only SELECT/WITH queries can be executed locally',
);
});
});
// Covers the outer row-limit wrapper.
describe('limitSqlForExecution', () => {
it('wraps compiled SQL and strips trailing semicolons', () => {
expect(limitSqlForExecution('select * from public.orders; ', 25)).toBe(
'select * from (select * from public.orders) as klo_query_result limit 25',
);
});
// Without a row limit the SQL is only trimmed and stripped of its trailing semicolon.
it('returns the trimmed SQL when no maxRows value is provided', () => {
expect(limitSqlForExecution('select * from orders; ', undefined)).toBe('select * from orders');
});
});

View file

@ -0,0 +1,22 @@
const MUTATING_SQL =
  /^\s*(insert|update|delete|merge|alter|drop|create|truncate|grant|revoke|copy|call|do|vacuum|analyze|refresh)\b/i;
const READ_SQL = /^\s*(select|with)\b/i;
/** Well-formed quoted literals / identifiers and comments, blanked out before looking for `;`. */
const SQL_QUOTED_OR_COMMENT = /'(?:[^']|'')*'|"(?:[^"]|"")*"|--[^\n]*|\/\*[\s\S]*?\*\//g;
/** Backslashes (escape strings) and `$` (dollar quoting) make quote boundaries dialect-dependent. */
const SQL_DIALECT_QUOTING = /[\\$]/;
/** Anything left after masking that could still open a literal or a comment. */
const SQL_UNBALANCED_QUOTING = /['"]|\/\*|\*\//;
/** A semicolon followed by anything other than whitespace and further semicolons. */
const SQL_STATEMENT_SEPARATOR = /;[\s;]*[^\s;]/;

/**
 * Reports whether `sql` may contain more than one statement.
 *
 * Deliberately conservative: when the text uses quoting this scanner cannot
 * tokenize with certainty (backslashes, dollar quoting, unbalanced quotes or
 * comment markers), every inner semicolon is treated as a separator.
 */
function hasStatementSeparator(sql: string): boolean {
  const masked = sql.replace(SQL_QUOTED_OR_COMMENT, ' ');
  const ambiguous = SQL_DIALECT_QUOTING.test(sql) || SQL_UNBALANCED_QUOTING.test(masked);
  return SQL_STATEMENT_SEPARATOR.test(ambiguous ? sql : masked);
}

/**
 * Validates that `sql` is a single statement starting with SELECT or WITH.
 *
 * Multi-statement input is rejected because a client that accepts several
 * statements per query would let `select 1; commit; drop table t` end the
 * read-only transaction and run the remainder outside it.
 *
 * This is a textual gate only: a WITH query containing a data-modifying CTE
 * still passes, so executors must additionally enforce read-only access.
 *
 * @param sql - Raw SQL text.
 * @returns The trimmed SQL (a trailing semicolon is kept).
 * @throws Error when the statement does not start with SELECT/WITH or contains a statement separator.
 */
export function assertReadOnlySql(sql: string): string {
  const trimmed = sql.trim();
  if (!READ_SQL.test(trimmed) || MUTATING_SQL.test(trimmed)) {
    throw new Error('Only read-only SELECT/WITH queries can be executed locally.');
  }
  if (hasStatementSeparator(trimmed)) {
    throw new Error('Only a single SQL statement can be executed locally.');
  }
  return trimmed;
}
/**
 * Validates `sql` as read-only and, when a row limit is given, wraps it in an
 * outer `select * from (...) as klo_query_result limit N`.
 *
 * Trailing semicolons and whitespace are always stripped so the statement can
 * be embedded as a subquery.
 *
 * @param sql - Raw SQL text.
 * @param maxRows - Row limit; `undefined` means "no limit".
 * @returns The executable SQL.
 * @throws Error when the SQL is rejected by `assertReadOnlySql`, or when
 * `maxRows` is provided but is not a positive integer (including `0` and `NaN`,
 * which previously slipped through as "no limit").
 */
export function limitSqlForExecution(sql: string, maxRows: number | undefined): string {
  const statement = assertReadOnlySql(sql).replace(/[;\s]+$/, '');
  // Only an absent limit means "unlimited"; a falsy number must still be validated.
  if (maxRows == null) {
    return statement;
  }
  if (!Number.isInteger(maxRows) || maxRows <= 0) {
    throw new Error('maxRows must be a positive integer.');
  }
  return `select * from (${statement}) as klo_query_result limit ${maxRows}`;
}

View file

@ -0,0 +1,148 @@
import { mkdtemp, rm } from 'node:fs/promises';
import { writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import Database from 'better-sqlite3';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { createSqliteQueryExecutor, sqliteDatabasePathFromConnection } from './sqlite-query-executor.js';
// Covers the SQLite executor and its path resolution against a real temporary database file.
describe('createSqliteQueryExecutor', () => {
let tempDir: string;
let dbPath: string;
// Each test gets a fresh database with an `orders` table holding two paid rows and one open row.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'klo-sqlite-query-'));
dbPath = join(tempDir, 'warehouse.db');
const db = new Database(dbPath);
db.exec(`
CREATE TABLE orders (
id INTEGER PRIMARY KEY,
status TEXT NOT NULL,
amount INTEGER NOT NULL
);
INSERT INTO orders (status, amount) VALUES
('paid', 20),
('paid', 30),
('open', 10);
`);
db.close();
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
// The relative `path` is resolved against `projectDir`.
it('executes read-only SELECT SQL against a relative SQLite path', async () => {
const executor = createSqliteQueryExecutor();
const result = await executor.execute({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', path: 'warehouse.db', readonly: true },
sql: 'select status, count(*) as order_count from orders group by status order by status',
maxRows: 10,
});
expect(result).toEqual({
headers: ['status', 'order_count'],
rows: [
['open', 1],
['paid', 2],
],
totalRows: 2,
command: 'SELECT',
rowCount: 2,
});
});
it('supports file urls for SQLite database paths', async () => {
expect(
sqliteDatabasePathFromConnection({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', url: `file://${dbPath}`, readonly: true },
sql: 'select 1',
}),
).toBe(dbPath);
});
// For `path`, a `file:` value points at a file whose content is the database path.
it('resolves file references for SQLite path fields', async () => {
const pointerPath = join(tempDir, 'sqlite-path.txt');
writeFileSync(pointerPath, dbPath, 'utf-8');
expect(
sqliteDatabasePathFromConnection({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', path: `file:${pointerPath}`, readonly: true },
sql: 'select 1',
}),
).toBe(dbPath);
});
// Mutates process.env for the lookup and restores the previous value afterwards.
it('resolves env references for SQLite database urls', async () => {
const originalDatabaseUrl = process.env.KLO_SQLITE_TEST_URL;
process.env.KLO_SQLITE_TEST_URL = `sqlite:${dbPath}`;
try {
expect(
sqliteDatabasePathFromConnection({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', url: 'env:KLO_SQLITE_TEST_URL', readonly: true },
sql: 'select 1',
}),
).toBe(dbPath);
} finally {
if (originalDatabaseUrl === undefined) {
delete process.env.KLO_SQLITE_TEST_URL;
} else {
process.env.KLO_SQLITE_TEST_URL = originalDatabaseUrl;
}
}
});
it('rejects mutating SQL before opening the database', async () => {
const executor = createSqliteQueryExecutor();
await expect(
executor.execute({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', path: 'warehouse.db', readonly: true },
sql: 'delete from orders',
}),
).rejects.toThrow('Only read-only SELECT/WITH queries can be executed locally');
});
// One rejection per precondition: wrong driver, readonly not true, no path/url.
it('requires a SQLite driver, read-only config, and a database path', async () => {
const executor = createSqliteQueryExecutor();
await expect(
executor.execute({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'postgres', path: 'warehouse.db', readonly: true },
sql: 'select 1',
}),
).rejects.toThrow('Local SQLite execution cannot run driver "postgres"');
await expect(
executor.execute({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', path: 'warehouse.db', readonly: false },
sql: 'select 1',
}),
).rejects.toThrow('Local query execution requires connections.warehouse.readonly: true');
await expect(
executor.execute({
connectionId: 'warehouse',
projectDir: tempDir,
connection: { driver: 'sqlite', readonly: true },
sql: 'select 1',
}),
).rejects.toThrow('Local SQLite execution requires connections.warehouse.path or connections.warehouse.url');
});
});

View file

@ -0,0 +1,94 @@
import { isAbsolute, resolve } from 'node:path';
import { fileURLToPath } from 'node:url';
import Database from 'better-sqlite3';
import { readFileSync } from 'node:fs';
import { homedir } from 'node:os';
import type {
KloSqlQueryExecutionInput,
KloSqlQueryExecutionResult,
KloSqlQueryExecutorPort,
} from './query-executor.js';
import { normalizeQueryRows } from './query-executor.js';
import { limitSqlForExecution } from './read-only-sql.js';
/** Loosely-typed view of a connection config, used for keyed lookups of `path` / `url`. */
type SqliteConnectionConfig = Record<string, unknown> | undefined;
/** Returns the connection's driver name lower-cased, or `''` when no driver is configured. */
function connectionDriver(input: KloSqlQueryExecutionInput): string {
  const configuredDriver = input.connection?.driver ?? '';
  return String(configuredDriver).toLowerCase();
}
/**
 * Reads a string config value and resolves `env:` / `file:` references in it.
 *
 * @returns `undefined` when the key is absent, not a string, or blank; otherwise
 * the resolved value (which may itself be empty when the reference resolves to nothing).
 */
function stringConfigValue(connection: SqliteConnectionConfig, key: string): string | undefined {
  const raw = connection?.[key];
  if (typeof raw !== 'string') {
    return undefined;
  }
  const trimmed = raw.trim();
  return trimmed.length === 0 ? undefined : resolveStringReference(key, trimmed);
}
/**
 * Resolves an indirect config value.
 *
 * - `env:NAME` → the trimmed value of that environment variable, or `''` when unset.
 * - `file:PATH` (for every key except `url`, where `file:` is a database URL) →
 *   the trimmed content of that file. A leading `~` or `~/` is expanded to the
 *   home directory.
 * - anything else → returned unchanged.
 *
 * @throws Whatever `readFileSync` throws when the referenced file cannot be read.
 */
function resolveStringReference(key: string, value: string): string {
  if (value.startsWith('env:')) {
    const envName = value.slice('env:'.length).trim();
    return (process.env[envName] ?? '').trim();
  }
  if (key !== 'url' && value.startsWith('file:')) {
    const rawPath = value.slice('file:'.length).trim();
    // Drop "~/" entirely before resolving: keeping the leading "/" would make the
    // remainder absolute and discard the home directory.
    const isHomeRelative = rawPath === '~' || rawPath.startsWith('~/');
    const path = isHomeRelative ? resolve(homedir(), rawPath.slice(2)) : rawPath;
    return readFileSync(path, 'utf-8').trim();
  }
  return value;
}
/**
 * Extracts a filesystem path from a SQLite connection URL.
 *
 * `file:` URLs are converted with `fileURLToPath`; `sqlite:` URLs yield their
 * percent-decoded pathname (or the URL itself when the pathname is empty);
 * any other value is returned unchanged.
 */
function sqlitePathFromUrl(url: string): string {
  if (url.startsWith('file:')) {
    return fileURLToPath(url);
  }
  if (!url.startsWith('sqlite:')) {
    return url;
  }
  const { pathname } = new URL(url);
  return pathname.length > 0 ? decodeURIComponent(pathname) : url;
}
/**
 * Validates a SQLite connection and returns the absolute database file path.
 *
 * `path` takes precedence over `url`. A `path` that resolves to nothing (for
 * example an `env:` reference to an unset variable) no longer wins over a
 * usable `url`; previously the empty string was selected and resolved to the
 * project directory itself.
 *
 * @param input - Query request; relative paths are resolved against
 * `input.projectDir`, or the process cwd when it is not set.
 * @returns Absolute path of the database file.
 * @throws Error when the driver is not SQLite, the connection is not
 * `readonly: true`, or neither `path` nor `url` yields a location.
 */
export function sqliteDatabasePathFromConnection(input: KloSqlQueryExecutionInput): string {
  const driver = connectionDriver(input);
  if (driver !== 'sqlite' && driver !== 'sqlite3') {
    throw new Error(`Local SQLite execution cannot run driver "${input.connection?.driver ?? 'unknown'}".`);
  }
  if (input.connection?.readonly !== true) {
    throw new Error(`Local query execution requires connections.${input.connectionId}.readonly: true.`);
  }

  const pathValue = stringConfigValue(input.connection, 'path');
  const urlValue = stringConfigValue(input.connection, 'url');
  // `||` rather than `??`: a reference that resolved to '' must fall through to the url.
  const candidate = pathValue || (urlValue ? sqlitePathFromUrl(urlValue) : '');
  if (!candidate) {
    throw new Error(
      `Local SQLite execution requires connections.${input.connectionId}.path or connections.${input.connectionId}.url.`,
    );
  }
  return isAbsolute(candidate) ? candidate : resolve(input.projectDir ?? process.cwd(), candidate);
}
/**
 * Creates an executor that runs a single read-only query against a SQLite file.
 *
 * The SQL is validated (and row-limited) before the database is touched; the
 * file is then opened read-only, must already exist, and is always closed.
 *
 * Rows are fetched in raw (array) mode so each row lines up positionally with
 * `headers`. Object rows would collapse columns that share a name
 * (`select a.id, b.id ...`) and reorder integer-like column names.
 *
 * @throws Error from `execute` when the SQL or connection config is rejected;
 * driver errors (missing file, SQL errors) propagate unchanged.
 */
export function createSqliteQueryExecutor(): KloSqlQueryExecutorPort {
  return {
    async execute(input: KloSqlQueryExecutionInput): Promise<KloSqlQueryExecutionResult> {
      const sql = limitSqlForExecution(input.sql, input.maxRows);
      const dbPath = sqliteDatabasePathFromConnection(input);
      const db = new Database(dbPath, { readonly: true, fileMustExist: true });
      try {
        const statement = db.prepare(sql).raw(true);
        const rows = statement.all() as unknown[];
        return {
          headers: statement.columns().map((column) => column.name),
          // Already arrays in raw mode; kept as a guard so the result shape never depends on the driver mode.
          rows: normalizeQueryRows(rows),
          totalRows: rows.length,
          command: 'SELECT',
          rowCount: rows.length,
        };
      } finally {
        db.close();
      }
    },
  };
}

View file

@ -0,0 +1,34 @@
import { mkdir, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { resolveKloConfigReference, resolveKloHomePath } from './config-reference.js';
// Covers env:/file:/literal resolution and home-path expansion for config values.
describe('KLO config references', () => {
// Env values are trimmed; blank or missing variables resolve to undefined.
it('resolves env references without returning empty values', () => {
expect(resolveKloConfigReference('env:AI_GATEWAY_API_KEY', { AI_GATEWAY_API_KEY: ' gateway-key ' })).toBe(
'gateway-key',
);
expect(resolveKloConfigReference('env:AI_GATEWAY_API_KEY', { AI_GATEWAY_API_KEY: ' ' })).toBeUndefined();
expect(resolveKloConfigReference('env:AI_GATEWAY_API_KEY', {})).toBeUndefined();
});
// Writes a key file under the OS temp directory (namespaced by pid) and reads it back.
it('resolves file references and trims file content', async () => {
const dir = join(tmpdir(), `klo-config-reference-${process.pid}`);
await mkdir(dir, { recursive: true });
const keyPath = join(dir, 'gateway-key.txt');
await writeFile(keyPath, 'file-gateway-key\n', 'utf8');
expect(resolveKloConfigReference(`file:${keyPath}`, {})).toBe('file-gateway-key');
});
it('returns literal values unchanged after trimming blank-only values', () => {
expect(resolveKloConfigReference('provider/model', {})).toBe('provider/model');
expect(resolveKloConfigReference(' ', {})).toBeUndefined();
expect(resolveKloConfigReference(undefined, {})).toBeUndefined();
});
it('resolves home-prefixed paths', () => {
expect(resolveKloHomePath('~/klo/key.txt')).toContain('/klo/key.txt');
});
});

View file

@ -0,0 +1,36 @@
import { readFileSync } from 'node:fs';
import { homedir } from 'node:os';
import { resolve } from 'node:path';
/**
 * Expands a leading `~` to the current user's home directory and returns an absolute path.
 *
 * - `~` on its own yields the home directory.
 * - `~/rest` is resolved relative to the home directory.
 * - Any other input (including `~user/...`) is passed to `path.resolve`, i.e. resolved
 *   against the current working directory.
 *
 * @param path Path that may start with `~` or `~/`.
 * @returns The absolute form of `path`.
 */
export function resolveKloHomePath(path: string): string {
  const homePrefix = '~/';
  if (path.startsWith(homePrefix)) {
    return resolve(homedir(), path.slice(homePrefix.length));
  }
  return path === '~' ? homedir() : resolve(path);
}
/**
 * Resolves a config value that may be a literal or an indirect reference.
 *
 * - `env:NAME` reads `env[NAME]`.
 * - `file:PATH` reads the file synchronously as UTF-8 (`PATH` may start with `~`); a missing
 *   or unreadable file makes `readFileSync` throw.
 * - Anything else is treated as the literal value.
 *
 * In every case the result is trimmed, and an empty result is reported as `undefined`.
 *
 * @param value Raw config value; `undefined` or an empty string yields `undefined`.
 * @param env Environment map consulted for `env:` references.
 * @returns The trimmed resolved value, or `undefined` when nothing non-blank was found.
 */
export function resolveKloConfigReference(value: string | undefined, env: NodeJS.ProcessEnv): string | undefined {
  if (!value) {
    return undefined;
  }

  let resolved: string | undefined;
  if (value.startsWith('env:')) {
    const variableName = value.slice('env:'.length).trim();
    resolved = env[variableName];
  } else if (value.startsWith('file:')) {
    const referencedPath = value.slice('file:'.length).trim();
    resolved = readFileSync(resolveKloHomePath(referencedPath), 'utf8');
  } else {
    resolved = value;
  }

  const cleaned = resolved?.trim();
  return cleaned ? cleaned : undefined;
}

View file

@ -0,0 +1,42 @@
/** Optional filesystem locations used by the core library; every field may be omitted. */
export interface KloStorageConfig {
/** Explicit config directory. When set, `resolveConfigDir` returns it unchanged. */
configDir?: string;
/** Base directory from which defaults are derived; the resolvers fall back to `/tmp` when it is absent. */
homeDir?: string;
/** Explicit worktrees directory. When set, `resolveWorktreesDir` returns it unchanged. */
worktreesDir?: string;
}
/** Git identity and bootstrap-commit settings consumed by `GitService` during initialization. */
export interface KloGitConfig {
/** Written to the repository's `user.name` when `GitService` initializes a new repository. */
userName: string;
/** Written to the repository's `user.email` when `GitService` initializes a new repository. */
userEmail: string;
/** Message for the empty bootstrap commit; `GitService` supplies a default when omitted. */
bootstrapMessage?: string;
/** Author name for the bootstrap commit; `GitService` supplies a default when omitted. */
bootstrapAuthor?: string;
/** Author email for the bootstrap commit; `GitService` supplies a default when omitted. */
bootstrapAuthorEmail?: string;
}
/** Top-level configuration object: storage locations plus git settings. */
export interface KloCoreConfig {
/** Directory settings read by `resolveConfigDir` and `resolveWorktreesDir`. */
storage: KloStorageConfig;
/** Git identity and bootstrap-commit settings. */
git: KloGitConfig;
}
/** Minimal logging contract the core library writes to; hosts supply their own implementation. */
export interface KloLogger {
/** Emits a debug-level message. */
debug(message: string): void;
/** Emits a general informational message. */
log(message: string): void;
/** Emits a warning. */
warn(message: string): void;
/** Emits an error message, optionally with the underlying error value. */
error(message: string, error?: unknown): void;
}
/** A `KloLogger` whose methods all do nothing and return `undefined`; usable as a silent default. */
export const noopLogger: KloLogger = {
debug: () => undefined,
log: () => undefined,
warn: () => undefined,
error: () => undefined,
};
/**
 * Determines the config directory for the given core configuration.
 *
 * An explicit `storage.configDir` wins. Otherwise the path is `<homeDir>/klo/config`, where
 * `homeDir` falls back to `/tmp` when it is not configured. The path is built by plain string
 * concatenation and is not normalized.
 *
 * @param config Core configuration to read `storage` from.
 * @returns The config directory path.
 */
export function resolveConfigDir(config: KloCoreConfig): string {
  const { configDir, homeDir } = config.storage;
  if (configDir != null) {
    return configDir;
  }
  const baseDir = homeDir ?? '/tmp';
  return `${baseDir}/klo/config`;
}
/**
 * Determines the directory that holds git worktrees for the given core configuration.
 *
 * An explicit `storage.worktreesDir` wins. Otherwise the path is `<homeDir>/.worktrees`, where
 * `homeDir` falls back to `/tmp` when it is not configured. The path is built by plain string
 * concatenation and is not normalized.
 *
 * @param config Core configuration to read `storage` from.
 * @returns The worktrees directory path.
 */
export function resolveWorktreesDir(config: KloCoreConfig): string {
  const { worktreesDir, homeDir } = config.storage;
  if (worktreesDir != null) {
    return worktreesDir;
  }
  const baseDir = homeDir ?? '/tmp';
  return `${baseDir}/.worktrees`;
}

View file

@ -0,0 +1,5 @@
/** Port for an embedding provider: turns text into numeric vectors, one at a time or in bulk. */
export interface KloEmbeddingPort {
// NOTE(review): presumably the largest `texts` array the provider accepts in one
// `computeEmbeddingsBulk` call; whether implementations enforce it is not visible here.
maxBatchSize: number;
/** Resolves with the embedding vector for a single text. */
computeEmbedding(text: string): Promise<number[]>;
/** Resolves with embedding vectors for many texts (presumably one per input, in input order; confirm with implementations). */
computeEmbeddingsBulk(texts: string[]): Promise<number[][]>;
}

View file

@ -0,0 +1,43 @@
/** Result of a mutating file-store operation (`writeFile` / `deleteFile`). */
export interface KloFileWriteResult {
/** Hash of the commit associated with the operation; may be absent or `null` (presumably when nothing was committed; confirm with implementations). */
commitHash?: string | null;
/** Implementations may attach additional, untyped fields. */
[key: string]: unknown;
}
/** Result of reading a file from the file store. */
export interface KloFileReadResult {
/** The file's content as a string. */
content: string;
/** Implementations may attach additional, untyped fields. */
[key: string]: unknown;
}
/** Result of listing files in the file store. */
export interface KloFileListResult {
/** Listed file paths (whether relative or absolute is implementation-defined; not visible here). */
files: string[];
}
/** One entry in a file's change history; every named field is optional. */
export interface KloFileHistoryEntry {
/** Commit identifier for the change. */
sha?: string;
/** Commit message. */
message?: string;
/** Author of the change. */
author?: string;
/** When the change was made, as either a string or a `Date`. */
date?: string | Date;
/** Implementations may attach additional, untyped fields. */
[key: string]: unknown;
}
/**
 * Port for a versioned file store: reads, writes, deletes and lists files, where mutating
 * operations carry author information and a commit message.
 *
 * @typeParam TSelf Type returned by `forWorktree`, normally the implementing store type.
 */
export interface KloFileStorePort<TSelf = unknown> {
/**
 * Writes `content` to `path`, attributing the change to `author` / `authorEmail` with `commitMessage`.
 * NOTE(review): `options.skipLock` presumably bypasses the store's own locking when the caller
 * already holds the lock; confirm with implementations.
 */
writeFile(
path: string,
content: string,
author: string,
authorEmail: string,
commitMessage: string,
options?: { skipLock?: boolean },
): Promise<KloFileWriteResult>;
/** Reads the file at `path`. */
readFile(path: string): Promise<KloFileReadResult>;
/**
 * Deletes the file at `path`, attributing the change to `author` / `authorEmail` with `commitMessage`.
 * May resolve with `null` (presumably when there was nothing to delete; confirm with implementations).
 */
deleteFile(
path: string,
author: string,
authorEmail: string,
commitMessage: string,
options?: { skipLock?: boolean },
): Promise<KloFileWriteResult | null>;
/** Lists files under `path`, optionally descending into subdirectories. */
listFiles(path: string, recursive?: boolean): Promise<KloFileListResult>;
// NOTE(review): `KloFileHistoryEntry[] | unknown` collapses to `unknown` for the type checker,
// so callers must narrow the result before using it.
/** Returns the change history for `path`. */
getFileHistory(path: string): Promise<KloFileHistoryEntry[] | unknown>;
/** Returns a store bound to the worktree directory `workdir`. */
forWorktree(workdir: string): TSelf;
}

View file

@ -0,0 +1,29 @@
import { simpleGit, type SimpleGit } from 'simple-git';
/**
 * Environment variable names that `sanitizedGitEnv` removes before the environment is handed
 * to simple-git.
 * NOTE(review): judging by the name, these are variables that leak in when the process is
 * launched from inside a git hook, plus editor/pager overrides; confirm the motivation with
 * the author.
 */
const GIT_HOOK_ENV_KEYS = [
'GIT_ALTERNATE_OBJECT_DIRECTORIES',
'GIT_DIR',
'GIT_INDEX_FILE',
'GIT_OBJECT_DIRECTORY',
'GIT_PREFIX',
'GIT_QUARANTINE_PATH',
'GIT_WORK_TREE',
'GIT_EDITOR',
'GIT_EXEC_PATH',
'GIT_PAGER',
'PAGER',
'VISUAL',
'EDITOR',
] as const;
/**
 * Returns a copy of `env` without any of the variables listed in `GIT_HOOK_ENV_KEYS`.
 * The input object is not modified.
 *
 * @param env Environment to copy; defaults to `process.env`.
 * @returns A new environment object with the listed variables left out.
 */
function sanitizedGitEnv(env: NodeJS.ProcessEnv = process.env): NodeJS.ProcessEnv {
  const blockedNames = new Set<string>(GIT_HOOK_ENV_KEYS);
  const keptEntries = Object.entries(env).filter(([name]) => !blockedNames.has(name));
  return Object.fromEntries(keptEntries);
}
/**
 * Creates a simple-git client rooted at `baseDir` whose child processes run with the
 * sanitized environment (a copy of `process.env` minus `GIT_HOOK_ENV_KEYS`).
 *
 * @param baseDir Directory in which git commands are executed.
 * @returns The configured `SimpleGit` instance.
 */
export function createSimpleGit(baseDir: string): SimpleGit {
return simpleGit({ baseDir }).env(sanitizedGitEnv());
}

View file

@ -0,0 +1,75 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import type { KloCoreConfig } from './config.js';
import { createSimpleGit } from './git-env.js';
import { GitService } from './git.service.js';
// Exercises GitService.assertWorktreeClean against a real git repository in a temp directory.
describe('GitService.assertWorktreeClean', () => {
let workdir: string;
let git: SimpleGit;
let gitService: GitService;
// Each test gets a fresh repository containing a single commit.
beforeEach(async () => {
workdir = await mkdtemp(join(tmpdir(), 'gitsvc-clean-'));
git = createSimpleGit(workdir);
await git.init();
await git.addConfig('user.email', 't@test');
await git.addConfig('user.name', 'Test');
await writeFile(join(workdir, 'init'), 'init');
await git.add('.');
await git.commit('init');
const coreConfig: KloCoreConfig = {
storage: { configDir: workdir, homeDir: workdir },
git: { userName: 'Test', userEmail: 't@test' },
};
gitService = new GitService(coreConfig);
// onModuleInit() is not called here; the private fields are injected directly instead.
(gitService as any).git = git;
(gitService as any).configDir = workdir;
});
afterEach(async () => rm(workdir, { recursive: true, force: true }));
it('does not throw on a clean worktree', async () => {
await expect(gitService.assertWorktreeClean()).resolves.toBeUndefined();
});
// The next four tests fake an in-progress operation by writing the marker file git would leave behind.
it('throws when MERGE_HEAD exists', async () => {
await writeFile(join(workdir, '.git', 'MERGE_HEAD'), 'deadbeef\n');
await expect(gitService.assertWorktreeClean()).rejects.toThrow(/MERGE_HEAD/);
});
it('throws when CHERRY_PICK_HEAD exists', async () => {
await writeFile(join(workdir, '.git', 'CHERRY_PICK_HEAD'), 'deadbeef\n');
await expect(gitService.assertWorktreeClean()).rejects.toThrow(/CHERRY_PICK_HEAD/);
});
it('throws when REVERT_HEAD exists', async () => {
await writeFile(join(workdir, '.git', 'REVERT_HEAD'), 'deadbeef\n');
await expect(gitService.assertWorktreeClean()).rejects.toThrow(/REVERT_HEAD/);
});
it('throws when sequencer/todo exists (interrupted multi-commit revert/cherry-pick)', async () => {
await mkdir(join(workdir, '.git', 'sequencer'), { recursive: true });
await writeFile(join(workdir, '.git', 'sequencer', 'todo'), 'pick deadbeef foo\n');
await expect(gitService.assertWorktreeClean()).rejects.toThrow(/sequencer/);
});
it('throws when the index has unmerged paths', async () => {
// Branches `a` and `b` both add a file named `shared` with different content.
await git.checkoutLocalBranch('a');
await writeFile(join(workdir, 'shared'), 'A version');
await git.add('.');
await git.commit('a');
// The default branch name depends on the local git configuration, hence the fallback.
await git.checkout('master').catch(() => git.checkout('main'));
await git.checkoutLocalBranch('b');
await writeFile(join(workdir, 'shared'), 'B version');
await git.add('.');
await git.commit('b');
// The merge is expected to stop on a conflict; its rejection is deliberately ignored.
await git.raw(['merge', 'a']).catch(() => undefined);
await expect(gitService.assertWorktreeClean()).rejects.toThrow();
});
});

View file

@ -0,0 +1,78 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { mkdir, mkdtemp, readdir, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import type { KloCoreConfig } from './config.js';
import { createSimpleGit } from './git-env.js';
import { GitService } from './git.service.js';
// Exercises GitService.deleteDirectories against a real git repository in a temp directory.
describe('GitService.deleteDirectories', () => {
  let workdir: string;
  let git: SimpleGit;
  let gitService: GitService;

  // Each test gets a fresh repository containing a single commit with one file (`keep`).
  beforeEach(async () => {
    workdir = await mkdtemp(join(tmpdir(), 'gitsvc-dd-'));
    git = createSimpleGit(workdir);
    await git.init();
    await git.addConfig('user.email', 't@test');
    await git.addConfig('user.name', 'Test');
    await writeFile(join(workdir, 'keep'), 'k');
    await git.add('.');
    await git.commit('init');
    const coreConfig: KloCoreConfig = {
      storage: { configDir: workdir, homeDir: workdir },
      git: { userName: 'Test', userEmail: 't@test' },
    };
    gitService = new GitService(coreConfig);
    // onModuleInit() is not called here; the private fields are injected directly instead.
    (gitService as any).git = git;
    (gitService as any).configDir = workdir;
  });

  afterEach(async () => rm(workdir, { recursive: true, force: true }));

  it('removes multiple directories in a single commit', async () => {
    for (const name of ['a', 'b', 'c']) {
      await mkdir(join(workdir, name), { recursive: true });
      await writeFile(join(workdir, name, 'f.txt'), name);
    }
    await git.add('.');
    await git.commit('seed 3 dirs');
    const beforeCommits = (await git.log()).total;

    const result = await gitService.deleteDirectories(['a', 'b'], 'gc: drop a+b', 'System User', 'system@example.com');

    expect(result.commitHash).toBeTruthy();
    const entries = await readdir(workdir);
    expect(entries).not.toContain('a');
    expect(entries).not.toContain('b');
    expect(entries).toContain('c');
    // Both removals must land in exactly one new commit.
    const afterCommits = (await git.log()).total;
    expect(afterCommits).toBe(beforeCommits + 1);
  });

  // The title previously said "null hash", but the assertion pins an empty string.
  it('no-ops and returns an empty hash when the input list is empty', async () => {
    const result = await gitService.deleteDirectories([], 'empty', 'X', 'x@example.com');
    expect(result.commitHash).toBe('');
    expect(result.created).toBe(false);
  });

  it('ignores paths that have already been deleted — commits only the remaining ones', async () => {
    await mkdir(join(workdir, 'stale'), { recursive: true });
    await writeFile(join(workdir, 'stale', 'x'), 'x');
    await git.add('.');
    await git.commit('seed stale');

    // `missing` never existed in the repository; only `stale` can actually be removed.
    const result = await gitService.deleteDirectories(
      ['stale', 'missing'],
      'gc: drop stale + missing',
      'System User',
      'system@example.com',
    );

    expect(result.commitHash).toBeTruthy();
    const entries = await readdir(workdir);
    expect(entries).not.toContain('stale');
  });
});

View file

@ -0,0 +1,56 @@
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import type { KloCoreConfig } from './config.js';
import { createSimpleGit } from './git-env.js';
import { GitService } from './git.service.js';
// Exercises GitService.resetHardTo against a real git repository in a temp directory.
describe('GitService.resetHardTo', () => {
let workdir: string;
let git: SimpleGit;
let gitService: GitService;
// Each test gets a fresh repository containing a single commit.
beforeEach(async () => {
workdir = await mkdtemp(join(tmpdir(), 'gitsvc-reset-'));
git = createSimpleGit(workdir);
await git.init();
await git.addConfig('user.email', 't@test');
await git.addConfig('user.name', 'Test');
await writeFile(join(workdir, 'init'), 'init');
await git.add('.');
await git.commit('init');
const coreConfig: KloCoreConfig = {
storage: { configDir: workdir, homeDir: workdir },
git: { userName: 'Test', userEmail: 't@test' },
};
gitService = new GitService(coreConfig);
// onModuleInit() is not called here; the private fields are injected directly instead.
(gitService as any).git = git;
(gitService as any).configDir = workdir;
});
afterEach(async () => rm(workdir, { recursive: true, force: true }));
it('rewinds HEAD to the target SHA, removing later commits and their files', async () => {
const baseSha = (await git.revparse(['HEAD'])).trim();
// Two further commits, each adding one file, on top of the base commit.
await writeFile(join(workdir, 'a'), 'a1');
await git.add('.');
await git.commit('a');
await writeFile(join(workdir, 'b'), 'b1');
await git.add('.');
await git.commit('b');
await gitService.resetHardTo(baseSha);
expect((await git.revparse(['HEAD'])).trim()).toBe(baseSha);
// A failed read is mapped to null, so toBeNull() asserts the file is gone from the working tree.
expect(await readFile(join(workdir, 'a'), 'utf-8').catch(() => null)).toBeNull();
expect(await readFile(join(workdir, 'b'), 'utf-8').catch(() => null)).toBeNull();
});
it('is a no-op when target SHA equals current HEAD', async () => {
const sha = (await git.revparse(['HEAD'])).trim();
await gitService.resetHardTo(sha);
expect((await git.revparse(['HEAD'])).trim()).toBe(sha);
});
});

View file

@ -0,0 +1,358 @@
import { mkdtemp, realpath, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import type { KloCoreConfig } from './config.js';
import { GitService } from './git.service.js';
// These tests drive a real git repo inside a temp directory — simple-git shells out to the
// system `git` binary. They are fast enough to run as unit tests and catch real issues that
// would be invisible with mocked git.
// Integration-style suite: every test runs against a repository that GitService itself
// initialized (via onModuleInit) inside a fresh temp directory.
describe('GitService', () => {
let service: GitService;
let tempDir: string;
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'git-service-spec-'));
const coreConfig: KloCoreConfig = {
storage: { configDir: tempDir, homeDir: tempDir },
git: {
userName: 'Test User',
userEmail: 'test@example.com',
bootstrapMessage: 'Initialize test config repo',
bootstrapAuthor: 'test-system',
bootstrapAuthorEmail: 'system@example.com',
},
};
service = new GitService(coreConfig);
await service.onModuleInit();
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
// Helper: writes `content` to `filePath` inside the repo and commits it as author "Test".
const writeAndCommit = async (filePath: string, content: string, message = 'msg') => {
await writeFile(join(tempDir, filePath), content, 'utf-8');
return service.commitFile(filePath, message, 'Test', 'test@example.com');
};
describe('cold-start bootstrap commit', () => {
it('writes an empty commit on init so HEAD always resolves', async () => {
// beforeEach already ran onModuleInit() against an empty temp dir.
const head = await service.revParseHead();
expect(head).toMatch(/^[0-9a-f]{40}$/);
});
it('does not double-commit when re-initialized', async () => {
const before = await service.revParseHead();
await service.onModuleInit();
const after = await service.revParseHead();
expect(after).toBe(before);
});
});
describe('commitFile `created` flag', () => {
it('is true for a real commit', async () => {
const info = await writeAndCommit('a.md', '# Hello');
expect(info.created).toBe(true);
});
it('is false on a no-op write (content unchanged)', async () => {
await writeAndCommit('a.md', '# Hello');
const second = await writeAndCommit('a.md', '# Hello', 'unused');
expect(second.created).toBe(false);
});
});
describe('addNote / getNote', () => {
it('attaches a note and reads it back', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, 'Rich message from LLM');
expect(await service.getNote(info.commitHash)).toBe('Rich message from LLM');
});
it('returns undefined when no note exists', async () => {
const info = await writeAndCommit('a.md', '# Hello');
expect(await service.getNote(info.commitHash)).toBeUndefined();
});
it('overwrites an existing note (idempotent retries)', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, 'First');
await service.addNote(info.commitHash, 'Second');
expect(await service.getNote(info.commitHash)).toBe('Second');
});
it('skips empty/whitespace messages silently', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, ' ');
expect(await service.getNote(info.commitHash)).toBeUndefined();
});
});
describe('getFileHistory', () => {
it('surfaces enhancedMessage when a note is present', async () => {
const info = await writeAndCommit('a.md', '# Hello');
await service.addNote(info.commitHash, 'Note body');
const history = await service.getFileHistory('a.md');
expect(history[0]?.enhancedMessage).toBe('Note body');
});
it('leaves enhancedMessage undefined when no note is attached', async () => {
await writeAndCommit('a.md', '# Hello');
const history = await service.getFileHistory('a.md');
expect(history[0]?.enhancedMessage).toBeUndefined();
});
});
describe('getCommitDiff', () => {
it('returns the patch scoped to the requested path', async () => {
const info = await writeAndCommit('a.md', '# Hello');
const diff = await service.getCommitDiff(info.commitHash, 'a.md');
expect(diff).toContain('diff --git');
expect(diff).toContain('Hello');
});
it('handles the repository initial commit without throwing', async () => {
const info = await writeAndCommit('first.md', 'first');
await expect(service.getCommitDiff(info.commitHash, 'first.md')).resolves.toBeDefined();
});
});
describe('squashTo', () => {
// Helper: same as writeAndCommit, but commits as "System User", the author the squash calls below pass.
const writeAsSystem = async (filePath: string, content: string, message = 'msg') => {
await writeFile(join(tempDir, filePath), content, 'utf-8');
return service.commitFile(filePath, message, 'System User', 'system@example.com');
};
it('collapses 3 commits after preHead into a single commit', async () => {
const pre = await writeAsSystem('a.md', 'v1');
const preHead = pre.commitHash;
await writeAsSystem('b.md', 'b', 'add b');
await writeAsSystem('c.md', 'c', 'add c');
await writeAsSystem('a.md', 'v2', 'update a');
const result = await service.squashTo(preHead, {
message: 'Ingest: bundle 3 writes',
author: 'System User',
authorEmail: 'system@example.com',
});
expect(result.squashed).toBe(true);
expect(result.squashedCount).toBe(3);
expect(result.commitHash).toBeTruthy();
expect(result.commitHash).not.toBe(preHead);
const commitHash = result.commitHash;
// Narrows commitHash for the type checker before it is passed on below.
if (!commitHash) {
throw new Error('Expected squash commit hash');
}
// The squashed commit should preserve the final tree state.
const fileAtSquash = await service.getFileAtCommit('a.md', commitHash);
expect(fileAtSquash).toBe('v2');
const bAtSquash = await service.getFileAtCommit('b.md', commitHash);
expect(bAtSquash).toBe('b');
});
it('is a no-op when preHead equals HEAD', async () => {
const pre = await writeAsSystem('a.md', 'v1');
const result = await service.squashTo(pre.commitHash, {
message: 'nothing to squash',
author: 'System User',
authorEmail: 'system@example.com',
});
expect(result.squashed).toBe(false);
expect(result.commitHash).toBe(pre.commitHash);
});
it('skips squash when a foreign-author commit sits between preHead and HEAD', async () => {
const pre = await writeAsSystem('a.md', 'v1');
const preHead = pre.commitHash;
await writeAsSystem('b.md', 'from us', 'ours');
// Foreign commit
await writeAndCommit('c.md', 'from someone else', 'foreign');
await writeAsSystem('d.md', 'ours again', 'ours 2');
const result = await service.squashTo(preHead, {
message: 'should be skipped',
author: 'System User',
authorEmail: 'system@example.com',
});
expect(result.squashed).toBe(false);
expect(result.reason).toContain('foreign');
expect(result.squashedCount).toBe(3);
});
it('returns cleanly when preHead is empty (no starting commit)', async () => {
const result = await service.squashTo('', {
message: 'would have squashed',
author: 'System User',
authorEmail: 'system@example.com',
});
expect(result.squashed).toBe(false);
expect(result.commitHash).toBeNull();
});
});
describe('worktree lifecycle', () => {
// macOS canonicalizes tmp paths (/var/folders → /private/var/folders) when git
// returns them from `worktree list`. Resolve through realpath() before comparing.
const canonicalSiblingPath = async (suffix: string): Promise<string> => {
const parent = await realpath(join(tempDir, '..'));
return join(parent, `wt-${Date.now()}-${suffix}`);
};
it('addWorktree creates a branch + directory at the given startSha', async () => {
const { commitHash } = await writeAndCommit('seed.md', 'seed');
const wtDir = await canonicalSiblingPath('add');
await service.addWorktree(wtDir, 'session/alpha', commitHash);
const list = await service.listWorktrees();
expect(list.find((e) => e.path === wtDir && e.branch === 'refs/heads/session/alpha')).toBeTruthy();
// Best-effort cleanup: the worktree lives next to tempDir, so afterEach does not remove it.
await service.removeWorktree(wtDir).catch(() => undefined);
await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
});
it('removeWorktree detaches the worktree entry', async () => {
const { commitHash } = await writeAndCommit('seed.md', 'seed');
const wtDir = await canonicalSiblingPath('rm');
await service.addWorktree(wtDir, 'session/beta', commitHash);
await service.removeWorktree(wtDir);
const list = await service.listWorktrees();
expect(list.find((e) => e.path === wtDir)).toBeFalsy();
});
it('deleteBranch removes a branch ref', async () => {
const { commitHash } = await writeAndCommit('seed.md', 'seed');
const wtDir = await canonicalSiblingPath('br');
await service.addWorktree(wtDir, 'session/gamma', commitHash);
await service.removeWorktree(wtDir);
await service.deleteBranch('session/gamma', true);
// Reaches into the private `git` handle to list local branches directly.
const branches = await (service as unknown as { git: import('simple-git').SimpleGit }).git.branchLocal();
expect(branches.all).not.toContain('session/gamma');
await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
});
});
describe('forWorktree', () => {
it('returns a GitService whose operations run inside the given worktree', async () => {
const { commitHash } = await writeAndCommit('seed.md', 'seed');
const parent = await realpath(join(tempDir, '..'));
const wtDir = join(parent, `wt-${Date.now()}-fw`);
await service.addWorktree(wtDir, 'session/delta', commitHash);
const scoped = service.forWorktree(wtDir);
expect(await scoped.revParseHead()).toBe(commitHash);
await service.removeWorktree(wtDir).catch(() => undefined);
await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
});
});
describe('squashMergeIntoMain', () => {
it('merges a session branch as one commit on main, returning the new SHA + touched paths', async () => {
const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
const parent = await realpath(join(tempDir, '..'));
const wtDir = join(parent, `wt-${Date.now()}-sm`);
await service.addWorktree(wtDir, 'session/happy', baseSha);
const scoped = service.forWorktree(wtDir);
// Two separate commits on the session branch, made through the worktree-scoped service.
await writeFile(join(wtDir, 'a.yaml'), 'one: 1\n', 'utf-8');
await scoped.commitFile('a.yaml', 'wip a', 'System User', 'system@example.com');
await writeFile(join(wtDir, 'b.yaml'), 'two: 2\n', 'utf-8');
await scoped.commitFile('b.yaml', 'wip b', 'System User', 'system@example.com');
const result = await service.squashMergeIntoMain(
'session/happy',
'System User',
'system@example.com',
'Memory capture: 2 files [chat=abcd1234]',
);
expect(result.ok).toBe(true);
// Narrows the discriminated union so the success-only fields type-check below.
if (!result.ok) {
throw new Error('unreachable');
}
expect(result.squashSha).toMatch(/^[0-9a-f]{40}$/);
expect(result.touchedPaths.sort()).toEqual(['a.yaml', 'b.yaml']);
const mainHead = await service.revParseHead();
expect(mainHead).toBe(result.squashSha);
expect(mainHead).not.toBe(baseSha);
await service.removeWorktree(wtDir).catch(() => undefined);
await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
});
it('returns ok with empty touchedPaths when the session branch has no diff vs main', async () => {
const { commitHash: baseSha } = await writeAndCommit('seed.md', 'seed');
const parent = await realpath(join(tempDir, '..'));
const wtDir = join(parent, `wt-${Date.now()}-sm-empty`);
await service.addWorktree(wtDir, 'session/empty', baseSha);
const result = await service.squashMergeIntoMain(
'session/empty',
'System User',
'system@example.com',
'should be a no-op',
);
expect(result.ok).toBe(true);
if (!result.ok) {
throw new Error('unreachable');
}
expect(result.touchedPaths).toEqual([]);
// With nothing to merge, the reported SHA is the unchanged base commit.
expect(result.squashSha).toBe(baseSha);
await service.removeWorktree(wtDir).catch(() => undefined);
await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
});
it('returns conflict=true and leaves main clean when session+main touched same file differently', async () => {
await writeAndCommit('shared.yaml', 'base\n');
const base = await service.revParseHead();
if (!base) {
throw new Error('no base head');
}
const parent = await realpath(join(tempDir, '..'));
const wtDir = join(parent, `wt-${Date.now()}-conf`);
await service.addWorktree(wtDir, 'session/conf', base);
const scoped = service.forWorktree(wtDir);
await writeFile(join(wtDir, 'shared.yaml'), 'session-edit\n', 'utf-8');
await scoped.commitFile('shared.yaml', 'session edit', 'System User', 'system@example.com');
// Main edits the same file a different way, after the session branched.
await writeAndCommit('shared.yaml', 'main-edit\n');
const result = await service.squashMergeIntoMain(
'session/conf',
'System User',
'system@example.com',
'Memory capture: 1 file [chat=dead1234]',
);
expect(result.ok).toBe(false);
if (result.ok) {
throw new Error('unreachable');
}
expect(result.conflict).toBe(true);
expect(result.conflictPaths).toContain('shared.yaml');
// The failed merge must not leave staged or unmerged changes behind on main.
const status = await (service as unknown as { git: import('simple-git').SimpleGit }).git.status();
expect(status.isClean()).toBe(true);
await service.removeWorktree(wtDir).catch(() => undefined);
await rm(wtDir, { recursive: true, force: true }).catch(() => undefined);
});
});
});

View file

@ -0,0 +1,855 @@
import { promises as fs } from 'node:fs';
import { join } from 'node:path';
import type { SimpleGit } from 'simple-git';
import { noopLogger, resolveConfigDir, type KloCoreConfig, type KloLogger } from './config.js';
import { createSimpleGit } from './git-env.js';
/** Description of a single commit as returned by `GitService` commit and history methods. */
export interface GitCommitInfo {
/** Full commit hash. */
commitHash: string;
/** First 8 characters of `commitHash`. */
shortHash: string;
/** Commit message as reported by `git log`. */
message: string;
/** Author name recorded on the commit. */
author: string;
/** Author email recorded on the commit. */
authorEmail: string;
/** Commit date string exactly as reported by `git log`. */
timestamp: string;
/** The same date as `timestamp`, re-serialized with `Date.prototype.toISOString()`. */
committedDate: string;
/**
* True if this call produced a new commit. False when the file was already up-to-date
* and the returned info describes the pre-existing HEAD commit (no-op write).
*/
created: boolean;
/** Async LLM-generated commit summary attached as a git note. Undefined if no note present. */
enhancedMessage?: string;
}
/** One git worktree as reported by `GitService.listWorktrees()`. */
export interface WorktreeEntry {
/** Filesystem path of the worktree. */
path: string;
/** Branch ref checked out in the worktree (the tests compare against full names such as `refs/heads/...`), or `null` (presumably for a detached HEAD; confirm in `listWorktrees`). */
branch: string | null;
/** Commit the worktree's HEAD points at, or `null` when it is not available. */
head: string | null;
}
/**
 * Outcome of `GitService.squashMergeIntoMain`, discriminated on `ok`:
 * - success carries the resulting commit SHA and the paths touched by the merge;
 * - failure is always a conflict and lists the conflicting paths.
 */
export type SquashMergeResult =
| { ok: true; squashSha: string; touchedPaths: string[] }
| { ok: false; conflict: true; conflictPaths: string[] };
export class GitService {
private readonly logger: KloLogger;
private git!: SimpleGit;
private configDir: string;
/**
 * Stores the configuration and derives the repository directory. The git client itself is
 * not created here.
 *
 * @param config Core configuration; the repository directory comes from `resolveConfigDir(config)`.
 * @param logger Optional logger; `noopLogger` is used when none is given.
 */
constructor(
private readonly config: KloCoreConfig,
logger?: KloLogger,
) {
this.logger = logger ?? noopLogger;
this.configDir = resolveConfigDir(config);
}
/**
 * Creates the config directory if needed, binds the simple-git client to it and makes sure
 * the directory is an initialized repository. This is the method in view that assigns
 * `this.git`, so it has to complete before other methods use the client.
 *
 * @throws Error when repository initialization fails (see `initialize`).
 */
async onModuleInit(): Promise<void> {
// Ensure config directory exists
await fs.mkdir(this.configDir, { recursive: true });
this.logger.log(`Config directory ensured at: ${this.configDir}`);
// Initialize simple-git
this.git = createSimpleGit(this.configDir);
// Initialize git repository
await this.initialize();
}
/**
 * Makes the config directory a usable git repository.
 *
 * When the directory is not yet a repository, runs `git init` and stores the configured
 * `user.name` / `user.email`. Afterwards, if HEAD does not resolve to a commit, writes one
 * empty bootstrap commit so that `revParseHead()` always has a SHA to return. Both steps are
 * skipped when their result is already in place, so calling this repeatedly is harmless.
 *
 * @throws Error with a fixed message when any step fails; the original error is only logged.
 */
private async initialize(): Promise<void> {
  try {
    const alreadyRepo = await this.git.checkIsRepo();
    if (!alreadyRepo) {
      await this.git.init();
      const { userName, userEmail } = this.config.git;
      await this.git.addConfig('user.name', userName);
      await this.git.addConfig('user.email', userEmail);
      this.logger.log('Initialized git repository');
    }

    // A resolvable HEAD means the bootstrap commit (or any later commit) already exists.
    const currentHead = await this.revParseHead();
    if (currentHead) {
      return;
    }

    const { bootstrapMessage, bootstrapAuthor, bootstrapAuthorEmail } = this.config.git;
    const authorName = bootstrapAuthor ?? 'klo system';
    const authorAddress = bootstrapAuthorEmail ?? 'system@klo.local';
    await this.git.commit(bootstrapMessage ?? 'Initialize klo project repository', {
      '--allow-empty': null,
      '--author': `${authorName} <${authorAddress}>`,
    });
    this.logger.log('Wrote bootstrap commit to config repo');
  } catch (error) {
    this.logger.error('Failed to initialize git repository', error);
    throw new Error('Failed to initialize git repository');
  }
}
/**
 * Stages `filePath` and, if the index then contains any staged change, commits it under the
 * given author.
 *
 * When nothing is staged after the `git add`, no commit is made and the info describes the
 * current HEAD commit with `created: false`. Note that the check and the commit both look at
 * the whole index, not just `filePath`.
 *
 * @param filePath Path to stage, as accepted by `git add`.
 * @param commitMessage Message for the new commit.
 * @param author Author name for the `--author` option.
 * @param authorEmail Author email for the `--author` option.
 * @returns Details of the new commit, or of the existing HEAD commit on a no-op.
 * @throws Error prefixed with "Failed to commit file:" when any git step fails.
 */
async commitFile(
  filePath: string,
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  try {
    await this.git.add(filePath);

    const stagedNames = await this.git.diff(['--cached', '--name-only']);
    const hasStagedChanges = stagedNames.trim().length > 0;

    if (hasStagedChanges) {
      const outcome = await this.git.commit(commitMessage, {
        '--author': `${author} <${authorEmail}>`,
      });
      if (!outcome.commit) {
        throw new Error('No commit hash returned');
      }
    } else {
      this.logger.debug(`No changes to commit for ${filePath}, file already up to date`);
    }

    // In both cases HEAD is the commit to report: freshly created, or pre-existing on a no-op.
    const { latest } = await this.git.log({ maxCount: 1 });
    if (!latest) {
      throw new Error('Failed to retrieve commit details');
    }
    return {
      commitHash: latest.hash,
      shortHash: latest.hash.substring(0, 8),
      message: latest.message,
      author: latest.author_name,
      authorEmail: latest.author_email,
      timestamp: latest.date,
      committedDate: new Date(latest.date).toISOString(),
      created: hasStagedChanges,
    };
  } catch (error) {
    this.logger.error(`Failed to commit file ${filePath}`, error);
    throw new Error(`Failed to commit file: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Stages several files and records them in a single commit. This is the batch counterpart of
 * `commitFile`, used by the SL capture agent to commit all of its edits at once.
 *
 * Paths are staged one after another. When nothing is staged afterwards, no commit is made
 * and the info describes the current HEAD commit with `created: false`.
 *
 * @param filePaths Paths to stage, each as accepted by `git add`.
 * @param commitMessage Message for the new commit.
 * @param author Author name for the `--author` option.
 * @param authorEmail Author email for the `--author` option.
 * @returns Details of the new commit, or of the existing HEAD commit on a no-op.
 * @throws Error prefixed with "Failed to batch commit:" when any git step fails.
 */
async commitFiles(
  filePaths: string[],
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  try {
    for (const pathToStage of filePaths) {
      await this.git.add(pathToStage);
    }

    const stagedNames = await this.git.diff(['--cached', '--name-only']);
    const committed = stagedNames.trim() !== '';

    if (committed) {
      const outcome = await this.git.commit(commitMessage, {
        '--author': `${author} <${authorEmail}>`,
      });
      if (!outcome.commit) {
        throw new Error('No commit hash returned');
      }
    } else {
      this.logger.debug(`No changes to commit for ${filePaths.length} file(s), already up to date`);
    }

    // HEAD is either the commit just created or, on a no-op, the one that was already there.
    const head = (await this.git.log({ maxCount: 1 })).latest;
    if (!head) {
      throw new Error('Failed to retrieve commit details');
    }
    return {
      commitHash: head.hash,
      shortHash: head.hash.substring(0, 8),
      message: head.message,
      author: head.author_name,
      authorEmail: head.author_email,
      timestamp: head.date,
      committedDate: new Date(head.date).toISOString(),
      created: committed,
    };
  } catch (error) {
    this.logger.error(`Failed to batch commit ${filePaths.length} file(s)`, error);
    throw new Error(`Failed to batch commit: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Discards working-tree changes for the given paths, like `git checkout -- <paths>`.
 * Used to roll back dirty files when validation fails.
 *
 * This is best-effort: an empty list does nothing, and a failing checkout is logged as a
 * warning rather than thrown.
 *
 * @param filePaths Paths to restore.
 */
async checkoutFiles(filePaths: string[]): Promise<void> {
  if (filePaths.length > 0) {
    try {
      await this.git.checkout(['--', ...filePaths]);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.warn(`Failed to checkout ${filePaths.length} file(s): ${reason}`);
    }
  }
}
/**
 * Returns the content `filePath` had at `commitHash`, i.e. the output of
 * `git show <sha>:<path>`. The content comes from git's object storage rather than the
 * working tree, so concurrent working-tree edits do not affect it.
 *
 * @param filePath Path of the file inside the repository.
 * @param commitHash Commit to read the file from.
 * @returns The file content at that commit.
 * @throws Error prefixed with "Failed to read file at commit:" when `git show` fails.
 */
async getFileAtCommit(filePath: string, commitHash: string): Promise<string> {
  const objectSpec = `${commitHash}:${filePath}`;
  try {
    const content = await this.git.show([objectSpec]);
    return content;
  } catch (error) {
    this.logger.error(`Failed to read ${filePath} at ${commitHash}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to read file at commit: ${reason}`);
  }
}
/**
 * Returns the commits that touched `filePath`, in the order `git log` reports them, each
 * enriched with its git note (if any) as `enhancedMessage`.
 *
 * @param filePath Path whose history is requested.
 * @param limit Maximum number of commits to return (default 50).
 * @returns One `GitCommitInfo` per commit; `created` is always `true` for history entries.
 * @throws Error prefixed with "Failed to retrieve file history:" when the log cannot be read
 *   or an entry cannot be converted.
 */
async getFileHistory(filePath: string, limit: number = 50): Promise<GitCommitInfo[]> {
  try {
    const log = await this.git.log({
      file: filePath,
      maxCount: limit,
    });
    // One `git notes show` per commit (N+1), bounded by `limit`; acceptable on this cold UI path.
    // The `await` keeps rejections from the mapped promises (e.g. `toISOString()` on an
    // unparsable date) inside this try block, so they are logged and wrapped like any other
    // failure instead of escaping as a raw error.
    return await Promise.all(
      log.all.map(async (commit) => ({
        commitHash: commit.hash,
        shortHash: commit.hash.substring(0, 8),
        message: commit.message,
        author: commit.author_name,
        authorEmail: commit.author_email,
        timestamp: commit.date,
        committedDate: new Date(commit.date).toISOString(),
        created: true,
        enhancedMessage: await this.getNote(commit.hash),
      })),
    );
  } catch (error) {
    this.logger.error(`Failed to get history for ${filePath}`, error);
    throw new Error(`Failed to retrieve file history: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Attach a note to `commitHash`, replacing any note already present.
 *
 * Runs `git notes add -f`, so repeating the call overwrites instead of
 * failing. A message that is empty after trimming is ignored (no git call).
 *
 * NOTE(review): the original comment asks callers to hold the `config:repo`
 * lock because note writes update the notes ref and should serialize with
 * commits; nothing in this method enforces that.
 *
 * @param commitHash Commit to annotate.
 * @param message Note text; surrounding whitespace is stripped.
 * @throws Error wrapping the underlying git failure.
 */
async addNote(commitHash: string, message: string): Promise<void> {
  const noteBody = message.trim();
  if (noteBody) {
    try {
      await this.git.raw(['notes', 'add', '-f', '-m', noteBody, commitHash]);
    } catch (error) {
      this.logger.error(`Failed to attach note to ${commitHash}`, error);
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to attach git note: ${reason}`);
    }
  }
}
/**
 * Fetch the git note attached to `commitHash`.
 *
 * Any failure of `git notes show` is treated as "no note": the command exits
 * non-zero when the commit has none, and this method does not distinguish
 * that from other errors.
 *
 * @param commitHash Commit whose note is requested.
 * @returns The trimmed note text, or `undefined` when absent, blank, or unreadable.
 */
async getNote(commitHash: string): Promise<string | undefined> {
  try {
    const text = (await this.git.raw(['notes', 'show', commitHash])).trim();
    if (text) {
      return text;
    }
    return undefined;
  } catch {
    // Missing note (or any other git error) is reported as "no note".
    return undefined;
  }
}
/**
 * Return the patch introduced by `commitHash`, optionally limited to `path`.
 *
 * Everything before the first `diff --git` line is dropped so only the patch
 * body remains. Output longer than 12,000 characters (measured with
 * `String.length`, not encoded bytes) is cut and suffixed with a truncation
 * marker. When the commit has no patch for the requested path the result is
 * the (trimmed) raw output, i.e. normally an empty string.
 *
 * @param commitHash Commit to show.
 * @param path Optional pathspec appended after `--`.
 * @returns The (possibly truncated) patch text.
 * @throws Error wrapping the underlying git failure.
 */
async getCommitDiff(commitHash: string, path?: string): Promise<string> {
  const MAX_DIFF_BYTES = 12_000;
  const showArgs = path
    ? ['show', '--format=', '--no-color', '--patch', commitHash, '--', path]
    : ['show', '--format=', '--no-color', '--patch', commitHash];
  try {
    const output = await this.git.raw(showArgs);
    const firstDiff = output.indexOf('diff --git');
    const patch = firstDiff < 0 ? output.trim() : output.slice(firstDiff);
    if (patch.length <= MAX_DIFF_BYTES) {
      return patch;
    }
    return `${patch.slice(0, MAX_DIFF_BYTES)}\n… [diff truncated]`;
  } catch (error) {
    this.logger.error(`Failed to read diff for ${commitHash}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to read commit diff: ${reason}`);
  }
}
/**
 * Remove `filePath` from the repository (`git rm`) and commit the deletion.
 *
 * @param filePath Path to remove.
 * @param commitMessage Message for the deletion commit.
 * @param author Author name recorded on the commit.
 * @param authorEmail Author e-mail recorded on the commit.
 * @returns Details of the latest commit after the deletion, with `created: true`.
 * @throws Error wrapping any failure: `git rm`, the commit, an empty commit
 *   hash in the commit result, or a missing latest log entry.
 */
async deleteFile(
  filePath: string,
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  try {
    await this.git.rm(filePath);

    const commitResult = await this.git.commit(commitMessage, {
      '--author': `${author} <${authorEmail}>`,
    });
    if (!commitResult.commit) {
      throw new Error('No commit hash returned');
    }

    // Re-read the tip of the log to obtain full hash, author and date.
    const { latest } = await this.git.log({ maxCount: 1 });
    if (!latest) {
      throw new Error('Failed to retrieve commit details');
    }
    const { hash, message, author_name, author_email, date } = latest;
    return {
      commitHash: hash,
      shortHash: hash.substring(0, 8),
      message,
      author: author_name,
      authorEmail: author_email,
      timestamp: date,
      committedDate: new Date(date).toISOString(),
      created: true,
    };
  } catch (error) {
    this.logger.error(`Failed to delete file ${filePath}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to delete file: ${reason}`);
  }
}
/**
 * Resolve `HEAD` to a commit SHA via `git rev-parse HEAD`.
 *
 * @returns The trimmed SHA, or `''` when rev-parse fails. The intended case is
 *   a repository without commits, but every rev-parse error maps to `''`.
 */
async revParseHead(): Promise<string> {
  let head = '';
  try {
    head = (await this.git.revparse(['HEAD'])).trim();
  } catch {
    head = '';
  }
  return head;
}
/**
 * Check whether `commitHash` names a commit object in this repository, using
 * `git cat-file -e <sha>^{commit}`.
 *
 * @param commitHash Revision to test; an empty value short-circuits to `false`.
 * @returns `true` when git confirms the object, `false` on any git failure.
 */
async commitExists(commitHash: string): Promise<boolean> {
  if (!commitHash) {
    return false;
  }
  let found: boolean;
  try {
    await this.git.raw(['cat-file', '-e', `${commitHash}^{commit}`]);
    found = true;
  } catch {
    found = false;
  }
  return found;
}
/**
 * Run `git diff --name-status -z <from>..<to> [-- <pathSpec>]` and return one
 * entry per affected path, normalised to add / modify / delete.
 *
 * Mapping of git status letters:
 * - `A`, `M`, `D` are passed through.
 * - `R` (rename, `R<score>\0old\0new`) becomes `D` for the old path plus `A`
 *   for the new path, so consumers can treat every path independently.
 * - `C` (copy, `C<score>\0src\0dst`) becomes `A` for the destination only.
 *   The copy source still exists in `to`, so it must not be reported as
 *   deleted; if the source itself changed, git lists it as a separate entry.
 * - Anything else (`T` type change, `U` unmerged, `X` unknown) is reported
 *   as `M`.
 *
 * Records whose path field is missing (truncated output) are skipped rather
 * than emitted with an undefined path.
 *
 * @param from Base revision.
 * @param to Target revision.
 * @param pathSpec Optional pathspec restricting the diff.
 * @returns Changed paths in the order git printed them.
 * @throws Whatever the underlying git invocation rejects with.
 */
async diffNameStatus(
  from: string,
  to: string,
  pathSpec?: string,
): Promise<Array<{ status: 'A' | 'M' | 'D'; path: string }>> {
  const args = ['diff', '--name-status', '-z', `${from}..${to}`];
  if (pathSpec) {
    args.push('--', pathSpec);
  }
  const raw = await this.git.raw(args);
  if (!raw) {
    return [];
  }
  // -z output is a flat NUL-separated list: "<status>\0<path>\0" for simple
  // entries and "<status>\0<old>\0<new>\0" for renames and copies.
  const fields = raw.split('\0').filter((f) => f.length > 0);
  const out: Array<{ status: 'A' | 'M' | 'D'; path: string }> = [];
  for (let i = 0; i < fields.length; ) {
    const code = (fields[i] ?? '').charAt(0);
    if (code === 'R' || code === 'C') {
      const oldPath = fields[i + 1];
      const newPath = fields[i + 2];
      // Only a rename removes the old path; a copy leaves its source in place.
      if (code === 'R' && oldPath) {
        out.push({ status: 'D', path: oldPath });
      }
      if (newPath) {
        out.push({ status: 'A', path: newPath });
      }
      i += 3;
      continue;
    }
    const path = fields[i + 1];
    if (path) {
      if (code === 'A' || code === 'D') {
        out.push({ status: code, path });
      } else {
        // 'M' itself, plus T/U/X and anything unrecognised, is a modification.
        out.push({ status: 'M', path });
      }
    }
    i += 2;
  }
  return out;
}
/**
 * List the files recorded in the `HEAD` tree that match `pathSpec`
 * (`git ls-tree -r -z --name-only HEAD -- <pathSpec>`).
 *
 * @param pathSpec Path or directory to list under.
 * @returns Matching paths; an empty array when nothing matches or when the
 *   git command fails for any reason (e.g. no `HEAD` yet).
 */
async listFilesAtHead(pathSpec: string): Promise<string[]> {
  let listing: string;
  try {
    listing = await this.git.raw(['ls-tree', '-r', '-z', '--name-only', 'HEAD', '--', pathSpec]);
  } catch {
    return [];
  }
  if (!listing) {
    return [];
  }
  // Entries are NUL-terminated, so the final split element is empty.
  const paths: string[] = [];
  for (const entry of listing.split('\0')) {
    if (entry.length > 0) {
      paths.push(entry);
    }
  }
  return paths;
}
/**
 * Collapse every commit between `preHead` and the current `HEAD` into a single
 * commit carrying `options.message`.
 *
 * Steps: resolve HEAD, list the commits in the range, refuse if any of them
 * has an `author_name` different from the expected author, then
 * `git reset --soft preHead` and commit the staged result.
 *
 * Outcomes that do not rewrite history (`squashed: false`):
 * - `preHead` is empty, or HEAD cannot be resolved (`commitHash: null`);
 * - HEAD already equals `preHead`, or the range contains no commits;
 * - a commit by a different author is in the range (left untouched).
 *
 * If the range nets out to no staged change, HEAD stays at `preHead` and the
 * result is `{ squashed: true, commitHash: preHead, reason: 'no net changes' }`.
 *
 * Failure safety: when anything fails after the soft reset (the staged-diff
 * query, the commit itself, or a commit that reports no hash), HEAD is moved
 * back to where it was before the call so the original commits are not left
 * detached from the branch. The error is then rethrown.
 *
 * The caller is expected to hold the `config:repo` lock so that writes and
 * the squash do not interleave; this method does no locking itself.
 *
 * @param preHead SHA recorded before the commits to squash were made.
 * @param options.message Message for the squashed commit.
 * @param options.author Author name for the squashed commit.
 * @param options.authorEmail Author e-mail for the squashed commit.
 * @param options.expectedAuthor Author name every commit in the range must
 *   have; defaults to `options.author`.
 * @returns Whether a squash happened, the resulting head, and on most paths
 *   a reason and/or the number of commits in the range.
 * @throws Error wrapping any git failure during the squash.
 */
async squashTo(
  preHead: string,
  options: { message: string; author: string; authorEmail: string; expectedAuthor?: string },
): Promise<{ squashed: boolean; commitHash: string | null; reason?: string; squashedCount?: number }> {
  const { message, author, authorEmail } = options;
  const expectedAuthor = options.expectedAuthor ?? author;
  if (!preHead) {
    return { squashed: false, commitHash: null, reason: 'no pre-head recorded (empty repo at start)' };
  }
  let currentHead: string;
  try {
    currentHead = (await this.git.revparse(['HEAD'])).trim();
  } catch {
    return { squashed: false, commitHash: null, reason: 'no HEAD (repo is empty)' };
  }
  if (currentHead === preHead) {
    return { squashed: false, commitHash: preHead, reason: 'no new commits' };
  }
  // True only while HEAD has been rewound to preHead and the replacement
  // commit has not been created yet; drives the rollback in the catch below.
  let rewound = false;
  try {
    const log = await this.git.log({ from: preHead, to: 'HEAD' });
    const commits = log.all;
    if (commits.length === 0) {
      return { squashed: false, commitHash: preHead, reason: 'no new commits' };
    }
    // Guard against collapsing somebody else's work: compare author names only.
    const foreign = commits.find((c) => c.author_name !== expectedAuthor);
    if (foreign) {
      this.logger.warn(
        `Skipping squash: commit ${foreign.hash.substring(0, 8)} authored by "${foreign.author_name}" ` +
          `differs from expected "${expectedAuthor}". Leaving ${commits.length} commit(s) as-is.`,
      );
      return {
        squashed: false,
        commitHash: currentHead,
        reason: `foreign commit by ${foreign.author_name}`,
        squashedCount: commits.length,
      };
    }
    // Soft reset keeps index and worktree, so the combined change is staged.
    await this.git.reset(['--soft', preHead]);
    rewound = true;
    const staged = await this.git.diff(['--cached', '--name-only']);
    if (!staged.trim()) {
      // The commits cancelled each other out: stay at preHead, commit nothing.
      return { squashed: true, commitHash: preHead, reason: 'no net changes', squashedCount: commits.length };
    }
    const result = await this.git.commit(message, { '--author': `${author} <${authorEmail}>` });
    if (!result.commit) {
      // Same check the other committing methods perform: a commit call that
      // produced no hash did not create a commit.
      throw new Error('No commit hash returned');
    }
    rewound = false;
    const newHead = (await this.git.revparse(['HEAD'])).trim();
    this.logger.log(
      `squashTo: collapsed ${commits.length} commit(s) into ${newHead.substring(0, 8)} (was ${currentHead.substring(0, 8)})`,
    );
    return { squashed: true, commitHash: newHead, squashedCount: commits.length };
  } catch (error) {
    if (rewound) {
      // Put the branch back on the original commits; the index already
      // matches that tree because the soft reset never touched it.
      try {
        await this.git.reset(['--soft', currentHead]);
      } catch (restoreError) {
        this.logger.error(`squashTo: failed to restore HEAD to ${currentHead} after aborted squash`, restoreError);
      }
    }
    this.logger.error('Failed to squash commits', error);
    throw new Error(`Failed to squash commits: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Squash-merge `branch` into whatever branch is checked out in this
 * instance's worktree and commit the result with the given message/author.
 *
 * Flow:
 * 1. `git diff --name-only HEAD..<branch>` yields `touchedPaths`; when it is
 *    empty the current HEAD is returned without merging.
 * 2. `git merge --squash <branch>` is attempted; a thrown error is remembered
 *    rather than propagated.
 * 3. The index is inspected for unmerged paths (`--diff-filter=U`).
 * 4. If the merge threw or unmerged paths exist, the worktree is restored with
 *    `merge --abort` followed by `reset --hard HEAD` (both best-effort) and
 *    `{ ok: false, conflict: true, conflictPaths }` is returned. The hard
 *    reset also discards any other uncommitted change in this worktree.
 * 5. Otherwise the staged result is committed and the new HEAD is returned.
 *
 * NOTE(review): for `git diff`, `A..B` compares the two endpoint trees, so
 * `touchedPaths` also contains paths that differ only because HEAD advanced
 * after the branch forked. Confirm that is the intended set.
 *
 * The caller is expected to hold the `config:repo` lock for the duration.
 *
 * @param branch Branch to merge from.
 * @param author Author name for the squash commit.
 * @param authorEmail Author e-mail for the squash commit.
 * @param commitMessage Message for the squash commit.
 * @returns Success with `squashSha`/`touchedPaths`, or a conflict report.
 * @throws If the initial diff, the final commit, or a rev-parse rejects.
 */
async squashMergeIntoMain(
  branch: string,
  author: string,
  authorEmail: string,
  commitMessage: string,
): Promise<SquashMergeResult> {
  // Split git's newline-separated path output into trimmed, non-empty names.
  const toPathList = (text: string): string[] => {
    const names: string[] = [];
    for (const rawLine of text.split('\n')) {
      const name = rawLine.trim();
      if (name) {
        names.push(name);
      }
    }
    return names;
  };

  const touchedPaths = toPathList(await this.git.raw(['diff', '--name-only', `HEAD..${branch}`]));
  if (touchedPaths.length === 0) {
    const head = (await this.git.revparse(['HEAD'])).trim();
    return { ok: true, squashSha: head, touchedPaths: [] };
  }

  // A conflicting `merge --squash` may or may not surface as a thrown error,
  // so record the error and check the index for unmerged paths either way.
  let mergeError: unknown = null;
  try {
    await this.git.raw(['merge', '--squash', branch]);
  } catch (error) {
    mergeError = error;
  }
  const unmergedOut = await this.git.raw(['diff', '--name-only', '--diff-filter=U']).catch(() => '');
  const conflictPaths = toPathList(unmergedOut);

  const mergeFailed = mergeError !== null || conflictPaths.length > 0;
  if (!mergeFailed) {
    await this.git.commit(commitMessage, { '--author': `${author} <${authorEmail}>` });
    const squashSha = (await this.git.revparse(['HEAD'])).trim();
    return { ok: true, squashSha, touchedPaths };
  }

  // A squash merge leaves no MERGE_HEAD, so `merge --abort` can fail; the
  // hard reset is what actually clears index and worktree.
  await this.git.raw(['merge', '--abort']).catch(() => undefined);
  await this.git.raw(['reset', '--hard', 'HEAD']).catch(() => undefined);
  let errorSuffix = '';
  if (mergeError) {
    errorSuffix = ` error=${mergeError instanceof Error ? mergeError.message : String(mergeError)}`;
  }
  this.logger.warn(
    `squashMergeIntoMain: conflict merging ${branch} — aborted. conflictPaths=${conflictPaths.join(',')}` +
      errorSuffix,
  );
  return { ok: false, conflict: true, conflictPaths };
}
/**
 * Move the current branch to `targetSha` with `git reset --hard`.
 *
 * Destructive: commits after `targetSha` are dropped from the branch and
 * uncommitted changes to tracked files are discarded.
 *
 * @param targetSha Revision to reset to.
 * @throws Whatever the underlying git invocation rejects with.
 */
async resetHardTo(targetSha: string): Promise<void> {
  const resetArgs = ['reset', '--hard', targetSha];
  await this.git.raw(resetArgs);
}
/**
 * Refuse to continue when this worktree is in the middle of a git operation.
 *
 * Two checks, in order:
 * 1. Marker files inside the git dir (`MERGE_HEAD`, `REBASE_HEAD`,
 *    `CHERRY_PICK_HEAD`, `REVERT_HEAD`, `sequencer/todo`), each located with
 *    `git rev-parse --git-path`.
 * 2. Unmerged paths in the index (`git diff --name-only --diff-filter=U`);
 *    a failure of that command is treated as "none".
 *
 * NOTE(review): a resolved git path counts as absolute only when it starts
 * with `/`; otherwise it is joined onto `configDir`. Confirm this is
 * acceptable on platforms with other absolute-path forms.
 *
 * @throws Error naming the first marker found, or listing up to five
 *   unmerged paths.
 */
async assertWorktreeClean(): Promise<void> {
  // [path relative to the git dir, label used in the error message]
  const markers: ReadonlyArray<readonly [string, string]> = [
    ['MERGE_HEAD', 'MERGE_HEAD'],
    ['REBASE_HEAD', 'REBASE_HEAD'],
    ['CHERRY_PICK_HEAD', 'CHERRY_PICK_HEAD'],
    ['REVERT_HEAD', 'REVERT_HEAD'],
    ['sequencer/todo', 'sequencer (interrupted multi-commit op)'],
  ];
  for (const [relPath, label] of markers) {
    const resolved = (await this.git.raw(['rev-parse', '--git-path', relPath])).trim();
    let fullPath = resolved;
    if (!resolved.startsWith('/')) {
      fullPath = join(this.configDir, resolved);
    }
    const present = await this.fileExists(fullPath);
    if (present) {
      throw new Error(
        `Worktree has in-progress git operation (${label} present at ${fullPath}); refusing to proceed`,
      );
    }
  }

  const unmergedOutput = await this.git.raw(['diff', '--name-only', '--diff-filter=U']).catch(() => '');
  const unmerged: string[] = [];
  for (const rawLine of unmergedOutput.split('\n')) {
    const name = rawLine.trim();
    if (name) {
      unmerged.push(name);
    }
  }
  if (unmerged.length > 0) {
    throw new Error(
      `Worktree has ${unmerged.length} unmerged path(s): ${unmerged.slice(0, 5).join(', ')}; refusing to proceed`,
    );
  }
}
/**
 * Report whether `path` is accessible via `fs.access`.
 *
 * @returns `true` when the access check succeeds, `false` on any error.
 */
private async fileExists(path: string): Promise<boolean> {
  let accessible = true;
  try {
    await fs.access(path);
  } catch {
    accessible = false;
  }
  return accessible;
}
/**
 * Create a linked worktree at `path` on a newly created branch `branch`
 * starting at `startSha` (`git worktree add -b <branch> <path> <startSha>`).
 *
 * @param path Directory for the new worktree.
 * @param branch Name of the branch to create and check out there.
 * @param startSha Revision the new branch starts from.
 * @throws Error prefixed with the target path when git refuses.
 */
async addWorktree(path: string, branch: string, startSha: string): Promise<void> {
  const worktreeArgs = ['worktree', 'add', '-b', branch, path, startSha];
  try {
    await this.git.raw(worktreeArgs);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to add worktree at ${path}: ${reason}`);
  }
}
/**
 * Remove the worktree registered at `path` using
 * `git worktree remove --force`, so uncommitted changes there do not block
 * removal.
 *
 * Never throws: when removal fails the error is logged at warn level and
 * `git worktree prune` is attempted instead (its failure is ignored too).
 *
 * @param path Worktree directory to remove.
 */
async removeWorktree(path: string): Promise<void> {
  try {
    await this.git.raw(['worktree', 'remove', '--force', path]);
    return;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    this.logger.warn(`removeWorktree failed for ${path}: ${reason} — attempting prune`);
    await this.git.raw(['worktree', 'prune']).catch(() => undefined);
  }
}
/**
 * Parse `git worktree list --porcelain` into one entry per worktree.
 *
 * A record starts at each `worktree <path>` line; following `HEAD <sha>` and
 * `branch <ref>` lines fill in that record, and any other line is ignored.
 * Missing `HEAD`/`branch` values are reported as `null`.
 *
 * @returns Every worktree git lists, in output order.
 * @throws Whatever the underlying git invocation rejects with.
 */
async listWorktrees(): Promise<WorktreeEntry[]> {
  const porcelain = await this.git.raw(['worktree', 'list', '--porcelain']);
  const entries: WorktreeEntry[] = [];
  let pending: Partial<WorktreeEntry> = {};

  // Emit the record collected so far, provided it has a path.
  const flushPending = (): void => {
    const { path, branch, head } = pending;
    if (path) {
      entries.push({ path, branch: branch ?? null, head: head ?? null });
    }
  };
  // Value following `prefix` on `line`, or undefined when the line does not start with it.
  const valueAfter = (line: string, prefix: string): string | undefined =>
    line.startsWith(prefix) ? line.slice(prefix.length) : undefined;

  for (const line of porcelain.split('\n')) {
    const worktreePath = valueAfter(line, 'worktree ');
    if (worktreePath !== undefined) {
      flushPending();
      pending = { path: worktreePath };
      continue;
    }
    const headSha = valueAfter(line, 'HEAD ');
    if (headSha !== undefined) {
      pending.head = headSha;
      continue;
    }
    const branchRef = valueAfter(line, 'branch ');
    if (branchRef !== undefined) {
      pending.branch = branchRef;
    }
  }
  // The last record has no following `worktree` line to trigger its flush.
  flushPending();
  return entries;
}
/**
 * Delete a local branch.
 *
 * @param branch Branch name.
 * @param force When `true` uses `git branch -D`; otherwise `git branch -d`.
 * @throws Whatever the underlying git invocation rejects with.
 */
async deleteBranch(branch: string, force = false): Promise<void> {
  const deleteFlag = force ? '-D' : '-d';
  await this.git.raw(['branch', deleteFlag, branch]);
}
/**
 * Build a second `GitService` bound to `workdir`.
 *
 * The new instance is constructed with this instance's config and logger,
 * then its git client and `configDir` are pointed at `workdir`.
 * `onModuleInit` is not invoked on it here.
 *
 * @param workdir Directory of the worktree the new instance should operate on.
 * @returns The worktree-scoped service.
 */
forWorktree(workdir: string): GitService {
  const worktreeService = new GitService(this.config, this.logger);
  worktreeService.configDir = workdir;
  worktreeService.git = createSimpleGit(workdir);
  return worktreeService;
}
/**
 * Recursively remove `directoryPath` from the repository (`git rm -r`) and
 * commit the deletion.
 *
 * @param directoryPath Directory to remove.
 * @param commitMessage Message for the deletion commit.
 * @param author Author name recorded on the commit.
 * @param authorEmail Author e-mail recorded on the commit.
 * @returns Details of the latest commit after the deletion, with `created: true`.
 * @throws Error wrapping any failure: `git rm`, the commit, an empty commit
 *   hash in the commit result, or a missing latest log entry.
 */
async deleteDirectory(
  directoryPath: string,
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  try {
    await this.git.rm(['-r', directoryPath]);

    const commitResult = await this.git.commit(commitMessage, {
      '--author': `${author} <${authorEmail}>`,
    });
    if (!commitResult.commit) {
      throw new Error('No commit hash returned');
    }

    // Re-read the tip of the log to obtain full hash, author and date.
    const { latest } = await this.git.log({ maxCount: 1 });
    if (!latest) {
      throw new Error('Failed to retrieve commit details');
    }
    const { hash, message, author_name, author_email, date } = latest;
    return {
      commitHash: hash,
      shortHash: hash.substring(0, 8),
      message,
      author: author_name,
      authorEmail: author_email,
      timestamp: date,
      committedDate: new Date(date).toISOString(),
      created: true,
    };
  } catch (error) {
    this.logger.error(`Failed to delete directory ${directoryPath}`, error);
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Failed to delete directory: ${reason}`);
  }
}
/**
 * Recursively remove several directories and record them in one commit.
 *
 * Each path is removed with its own `git rm -r`; a path whose removal fails
 * (for example because it no longer exists) is logged at warn level and
 * skipped. When the input is empty or nothing could be removed, no commit is
 * made and a placeholder result is returned with `created: false`, empty
 * hashes, the supplied message/author, and the current time.
 *
 * @param directoryPaths Directories to remove.
 * @param commitMessage Message for the deletion commit.
 * @param author Author name recorded on the commit.
 * @param authorEmail Author e-mail recorded on the commit.
 * @returns Details of the latest commit, or the placeholder described above.
 * @throws If the commit rejects, reports no hash, or the log has no latest entry.
 */
async deleteDirectories(
  directoryPaths: string[],
  commitMessage: string,
  author: string,
  authorEmail: string,
): Promise<GitCommitInfo> {
  // Result used whenever no commit is produced.
  const nothingCommitted = (): GitCommitInfo => ({
    commitHash: '',
    shortHash: '',
    message: commitMessage,
    author,
    authorEmail,
    timestamp: new Date().toISOString(),
    committedDate: new Date().toISOString(),
    created: false,
  });

  if (directoryPaths.length === 0) {
    return nothingCommitted();
  }

  let removedCount = 0;
  for (const dir of directoryPaths) {
    try {
      await this.git.rm(['-r', dir]);
      removedCount += 1;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.warn(`deleteDirectories: skipping ${dir}: ${reason}`);
    }
  }
  if (removedCount === 0) {
    return nothingCommitted();
  }

  const commitResult = await this.git.commit(commitMessage, { '--author': `${author} <${authorEmail}>` });
  if (!commitResult.commit) {
    throw new Error('No commit hash returned from deleteDirectories');
  }
  const { latest } = await this.git.log({ maxCount: 1 });
  if (!latest) {
    throw new Error('Failed to retrieve commit details after deleteDirectories');
  }
  return {
    commitHash: latest.hash,
    shortHash: latest.hash.substring(0, 8),
    message: latest.message,
    author: latest.author_name,
    authorEmail: latest.author_email,
    timestamp: latest.date,
    committedDate: new Date(latest.date).toISOString(),
    created: true,
  };
}
}

View file

@ -0,0 +1,27 @@
export type { KloCoreConfig, KloGitConfig, KloLogger, KloStorageConfig } from './config.js';
export { noopLogger, resolveConfigDir, resolveWorktreesDir } from './config.js';
export { resolveKloConfigReference, resolveKloHomePath } from './config-reference.js';
export type { KloEmbeddingPort } from './embedding.js';
export {
REDACTED_KLO_CREDENTIAL_VALUE,
redactKloSensitiveMetadata,
redactKloSensitiveText,
redactKloSensitiveValue,
} from './redaction.js';
export type {
KloFileHistoryEntry,
KloFileListResult,
KloFileReadResult,
KloFileStorePort,
KloFileWriteResult,
} from './file-store.js';
export type { GitCommitInfo, SquashMergeResult, WorktreeEntry } from './git.service.js';
export { GitService } from './git.service.js';
export type {
SentinelPayload,
SessionOutcome,
SessionWorktree,
SessionWorktreeServiceDeps,
WorktreeConfigPort,
} from './session-worktree.service.js';
export { SessionWorktreeService } from './session-worktree.service.js';

View file

@ -0,0 +1,47 @@
/** Placeholder substituted for every redacted value. */
export const REDACTED_KLO_CREDENTIAL_VALUE = '<redacted>';

/** Case-insensitive substring match on field names that mark a value as sensitive. */
const SENSITIVE_FIELD_NAME = /(password|secret|token|api[_-]?key|private[_-]?key|passphrase|credential|authorization|url)/i;

/**
 * Matches `scheme://user:password@` inside free text. Group 1 is everything up
 * to and including the colon after the user, group 2 the password, group 3 `@`.
 */
const URL_CREDENTIAL_PATTERN = /([a-z][a-z0-9+.-]*:\/\/[^:\s/@]+:)([^@\s/]+)(@)/gi;

/** True for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== 'object') {
    return false;
  }
  return !Array.isArray(value);
}

/** True when the field name contains one of the sensitive keywords. */
function isSensitiveField(key: string): boolean {
  return SENSITIVE_FIELD_NAME.test(key);
}

/**
 * Redact a single value according to the name of the field it lives under.
 *
 * - Sensitive field name: the whole value is replaced by the placeholder.
 * - Array: each element is processed with the same field name.
 * - Plain object: processed key by key via `redactKloSensitiveMetadata`.
 * - Anything else is returned unchanged.
 *
 * @param key Field name the value was found under.
 * @param value Value to inspect.
 * @returns The redacted value; inputs are never mutated.
 */
export function redactKloSensitiveValue(key: string, value: unknown): unknown {
  if (isSensitiveField(key)) {
    return REDACTED_KLO_CREDENTIAL_VALUE;
  }
  if (Array.isArray(value)) {
    return value.map((element) => redactKloSensitiveValue(key, element));
  }
  return isRecord(value) ? redactKloSensitiveMetadata(value) : value;
}

/**
 * Return a copy of `metadata` with sensitive fields replaced by the placeholder.
 *
 * Array values are handled element by element: object elements are recursed
 * into (even when the array's own field name is sensitive), while other
 * elements are redacted according to the array's field name. Non-array values
 * go through `redactKloSensitiveValue`.
 *
 * @param metadata Object to redact.
 * @returns A new object; `metadata` is not modified.
 */
export function redactKloSensitiveMetadata(metadata: Record<string, unknown>): Record<string, unknown> {
  const redacted: Record<string, unknown> = {};
  for (const key of Object.keys(metadata)) {
    const value = metadata[key];
    if (!Array.isArray(value)) {
      redacted[key] = redactKloSensitiveValue(key, value);
      continue;
    }
    const items: unknown[] = [];
    for (const item of value) {
      if (isRecord(item)) {
        items.push(redactKloSensitiveMetadata(item));
      } else {
        items.push(redactKloSensitiveValue(key, item));
      }
    }
    redacted[key] = items;
  }
  return redacted;
}

/**
 * Mask the password part of every `scheme://user:password@` occurrence in a string.
 *
 * @param value Free text that may embed connection URLs.
 * @returns The text with each matched password replaced by the placeholder.
 */
export function redactKloSensitiveText(value: string): string {
  const replacement = `$1${REDACTED_KLO_CREDENTIAL_VALUE}$3`;
  return value.replace(URL_CREDENTIAL_PATTERN, replacement);
}

View file

@ -0,0 +1,124 @@
import { mkdtemp, realpath, rm, stat } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import type { KloCoreConfig } from './config.js';
import { GitService } from './git.service.js';
import { SessionWorktreeService, type WorktreeConfigPort } from './session-worktree.service.js';
/** Minimal config adapter for the tests; records which workdir it was scoped to. */
interface TestWorktreeConfig extends WorktreeConfigPort<TestWorktreeConfig> {
  workdir?: string;
}

// Exercises SessionWorktreeService against a real GitService and a throwaway
// repository under the OS temp dir, with a stubbed config adapter.
describe('SessionWorktreeService', () => {
  let homeDir: string;
  let gitService: GitService;
  let sessionService: SessionWorktreeService<TestWorktreeConfig>;

  /** HEAD of the test repo; fails the test when no commit exists yet. */
  const requireBaseSha = async (): Promise<string> => {
    const sha = await gitService.revParseHead();
    if (!sha) {
      throw new Error('no base sha');
    }
    return sha;
  };

  beforeEach(async () => {
    const scratchDir = await mkdtemp(join(tmpdir(), 'sws-spec-'));
    // Canonicalise the temp path (presumably so paths reported by git compare
    // equal to the ones built here when the temp dir sits behind a symlink).
    homeDir = await realpath(scratchDir);

    const coreConfig: KloCoreConfig = {
      storage: { configDir: homeDir, homeDir },
      git: {
        userName: 'System User',
        userEmail: 'system@example.com',
        bootstrapMessage: 'Initialize test config repo',
        bootstrapAuthor: 'test-system',
        bootstrapAuthorEmail: 'system@example.com',
      },
    };
    gitService = new GitService(coreConfig);
    await gitService.onModuleInit();

    // Every scoped config shares the same mocked factory.
    const configService: TestWorktreeConfig = {
      forWorktree: vi.fn(
        (workdir: string): TestWorktreeConfig => ({ workdir, forWorktree: configService.forWorktree }),
      ),
    };
    sessionService = new SessionWorktreeService({ coreConfig, gitService, configService });
  });

  afterEach(async () => {
    await rm(homeDir, { recursive: true, force: true });
  });

  describe('create', () => {
    it('creates a worktree + branch and returns scoped services', async () => {
      const baseSha = await requireBaseSha();

      const session = await sessionService.create('chat-abc', baseSha);

      expect(session.workdir).toBe(join(homeDir, '.worktrees', 'session-chat-abc'));
      expect(session.branch).toBe('session/chat-abc');
      expect(session.baseSha).toBe(baseSha);
      const workdirStats = await stat(session.workdir);
      expect(workdirStats.isDirectory()).toBe(true);
      // The scoped git client sees the new worktree's HEAD, which starts at baseSha.
      expect(await session.git.revParseHead()).toBe(baseSha);
      const worktrees = await gitService.listWorktrees();
      expect(worktrees.find((entry) => entry.path === session.workdir)).toBeTruthy();
    });

    it('appends a timestamp suffix when the primary dir already exists', async () => {
      const baseSha = await requireBaseSha();

      const first = await sessionService.create('chat-dup', baseSha);
      const second = await sessionService.create('chat-dup', baseSha);

      expect(first.workdir).not.toBe(second.workdir);
      expect(second.branch).toMatch(/^session\/chat-dup-\d+$/);
    });
  });

  describe('cleanup', () => {
    it('success removes the worktree dir and deletes the branch', async () => {
      const baseSha = await requireBaseSha();
      const session = await sessionService.create('chat-cleanup-ok', baseSha);

      await sessionService.cleanup(session, 'success');

      const worktrees = await gitService.listWorktrees();
      expect(worktrees.find((entry) => entry.path === session.workdir)).toBeFalsy();
      await expect(stat(session.workdir)).rejects.toThrow();
    });

    it('conflict keeps the worktree and writes a sentinel file', async () => {
      const baseSha = await requireBaseSha();
      const session = await sessionService.create('chat-cleanup-conflict', baseSha);

      await sessionService.cleanup(session, 'conflict', { conflictPaths: ['shared.yaml'] });

      // The worktree directory must survive a conflict outcome.
      await expect(stat(session.workdir)).resolves.toBeTruthy();
      const { readFile } = await import('node:fs/promises');
      const sentinelPath = join(session.workdir, '.klo-outcome');
      const parsed = JSON.parse(await readFile(sentinelPath, 'utf-8'));
      expect(parsed.outcome).toBe('conflict');
      expect(parsed.chatId).toBe('chat-cleanup-conflict');
      expect(parsed.conflictPaths).toEqual(['shared.yaml']);
      expect(typeof parsed.at).toBe('string');
    });
  });
});

View file

@ -0,0 +1,113 @@
import { mkdir, stat, writeFile } from 'node:fs/promises';
import { join } from 'node:path';
import { noopLogger, resolveWorktreesDir, type KloCoreConfig, type KloLogger } from './config.js';
import { GitService } from './git.service.js';
/**
 * How a session ended. `SessionWorktreeService.cleanup` removes the worktree
 * and branch for `'success'` and `'empty'`; for the other outcomes it keeps
 * the worktree and writes a sentinel file into it.
 */
export type SessionOutcome = 'success' | 'empty' | 'conflict' | 'crash';
/**
 * JSON document written to `.klo-outcome` inside a worktree that is kept
 * after cleanup.
 */
export interface SentinelPayload {
// Outcome passed to `cleanup`.
outcome: SessionOutcome;
// ISO-8601 timestamp taken when the sentinel payload was built.
at: string;
// Session key the worktree was created for.
chatId: string;
// Commit the session branch was created from.
baseSha: string;
// Present only when the caller supplied conflict paths.
conflictPaths?: string[];
}
/** Config object able to produce a copy of itself scoped to a worktree directory. */
export interface WorktreeConfigPort<TConfig> {
forWorktree(workdir: string): TConfig;
}
/** Handle returned by `SessionWorktreeService.create` describing one session worktree. */
export interface SessionWorktree<TConfig> {
// Session key supplied to `create`.
chatId: string;
// Absolute or root-relative directory of the worktree, as passed to git.
workdir: string;
// Branch created for the worktree.
branch: string;
// Commit the branch was started from.
baseSha: string;
// Time the handle was built, right after the worktree was added.
createdAt: Date;
// GitService scoped to `workdir` (from `gitService.forWorktree`).
git: GitService;
// Config scoped to `workdir` (from `configService.forWorktree`).
config: TConfig;
}
/** Collaborators required to construct a `SessionWorktreeService`. */
export interface SessionWorktreeServiceDeps<TConfig extends WorktreeConfigPort<TConfig>> {
// Used to resolve the directory under which worktrees are created.
coreConfig: KloCoreConfig;
// Service used to add/remove worktrees and delete branches.
gitService: GitService;
// Root config from which per-worktree configs are derived.
configService: TConfig;
// Optional logger; the service falls back to `noopLogger` when omitted.
logger?: KloLogger;
}
/**
 * Creates and disposes of per-session git worktrees.
 *
 * Each session gets its own directory under the configured worktrees root and
 * its own `session/<key>` branch, together with a `GitService` and config
 * object scoped to that directory.
 */
export class SessionWorktreeService<TConfig extends WorktreeConfigPort<TConfig> = WorktreeConfigPort<never>> {
  private readonly logger: KloLogger;
  private readonly worktreesRoot: string;

  /**
   * @param deps Collaborators; `deps.logger` defaults to `noopLogger`, and the
   *   worktrees root is resolved once from `deps.coreConfig`.
   */
  constructor(private readonly deps: SessionWorktreeServiceDeps<TConfig>) {
    this.worktreesRoot = resolveWorktreesDir(deps.coreConfig);
    this.logger = deps.logger ?? noopLogger;
  }

  /**
   * Create a worktree and branch for `sessionKey`, starting at `baseSha`.
   *
   * The default names are `session-<key>` (directory) and `session/<key>`
   * (branch). When that directory already exists, the current epoch
   * milliseconds are appended to both names and a warning is logged.
   *
   * @param sessionKey Identifier of the session; also returned as `chatId`.
   * @param baseSha Commit the new branch starts from.
   * @returns Handle with the worktree location plus scoped git/config services.
   * @throws If the worktrees root cannot be created or git refuses to add the worktree.
   */
  async create(sessionKey: string, baseSha: string): Promise<SessionWorktree<TConfig>> {
    await mkdir(this.worktreesRoot, { recursive: true });

    // `slug` is the shared tail of the directory and branch names.
    let slug = sessionKey;
    try {
      await stat(join(this.worktreesRoot, `session-${slug}`));
      // stat succeeded, so the primary directory is taken: pick a unique name.
      const suffix = Date.now().toString();
      slug = `${sessionKey}-${suffix}`;
      this.logger.warn(`session worktree collision for key=${sessionKey}; using suffix ${suffix}`);
    } catch {
      // stat failed: the primary name is free.
    }
    const workdir = join(this.worktreesRoot, `session-${slug}`);
    const branch = `session/${slug}`;

    await this.deps.gitService.addWorktree(workdir, branch, baseSha);

    const createdAt = new Date();
    const git = this.deps.gitService.forWorktree(workdir);
    const config = this.deps.configService.forWorktree(workdir);
    return { chatId: sessionKey, workdir, branch, baseSha, createdAt, git, config };
  }

  /**
   * Dispose of a session worktree according to how the session ended.
   *
   * - `'success'` / `'empty'`: remove the worktree and force-delete its branch.
   * - any other outcome: keep the worktree and write a `.klo-outcome` JSON
   *   sentinel (outcome, timestamp, chat id, base SHA, optional conflict paths).
   *
   * Never throws; failures on either path are logged at warn level.
   *
   * @param session Handle returned by `create`.
   * @param outcome How the session ended.
   * @param extra Optional conflict paths to record in the sentinel.
   */
  async cleanup(
    session: SessionWorktree<TConfig>,
    outcome: SessionOutcome,
    extra?: { conflictPaths?: string[] },
  ): Promise<void> {
    const describe = (error: unknown): string => (error instanceof Error ? error.message : String(error));

    if (outcome !== 'success' && outcome !== 'empty') {
      const payload: SentinelPayload = {
        outcome,
        at: new Date().toISOString(),
        chatId: session.chatId,
        baseSha: session.baseSha,
      };
      if (extra?.conflictPaths) {
        payload.conflictPaths = extra.conflictPaths;
      }
      try {
        const sentinelPath = join(session.workdir, '.klo-outcome');
        await writeFile(sentinelPath, JSON.stringify(payload, null, 2), 'utf-8');
      } catch (error) {
        this.logger.warn(`cleanup(${outcome}) failed to write sentinel for ${session.chatId}: ${describe(error)}`);
      }
      return;
    }

    try {
      await this.deps.gitService.removeWorktree(session.workdir);
      await this.deps.gitService.deleteBranch(session.branch, true);
    } catch (error) {
      this.logger.warn(`cleanup(${outcome}) failed for ${session.chatId}: ${describe(error)}`);
    }
  }
}

View file

@ -0,0 +1 @@
export * from './semantic-layer-compute.js';

View file

@ -0,0 +1,339 @@
import { once } from 'node:events';
import { createServer } from 'node:http';
import { describe, expect, it, vi } from 'vitest';
import { createHttpSemanticLayerComputePort, createPythonSemanticLayerComputePort } from './semantic-layer-compute.js';
// Minimal semantic-layer source definition (one table, one column, one measure)
// passed as the `sources` entry of the query and validate requests below.
const source = {
name: 'orders',
table: 'public.orders',
grain: ['id'],
columns: [{ name: 'id', type: 'number' }],
joins: [],
measures: [{ name: 'order_count', expr: 'count(*)' }],
};
// Input handed to `port.generateSources` in the tests: two tables and one
// foreign-key style link, written with camelCase property names.
const sourceGenerationInput = {
tables: [
{
name: 'orders',
db: 'public',
comment: 'Orders table',
columns: [
{ name: 'id', type: 'integer', primaryKey: true, nullable: false, comment: 'Order ID' },
{ name: 'customer_id', type: 'integer' },
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
],
},
{
name: 'customers',
db: 'public',
columns: [
{ name: 'id', type: 'integer', primaryKey: true },
{ name: 'email', type: 'varchar' },
],
},
],
links: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
relationshipType: 'MANY_TO_ONE',
},
],
dialect: 'postgres',
};
// Same data as `sourceGenerationInput` with snake_case property names
// (`primary_key`, `from_table`, ...). The tests expect the port to forward
// exactly this payload to the underlying runner.
const sourceGenerationDaemonPayload = {
tables: [
{
name: 'orders',
db: 'public',
comment: 'Orders table',
columns: [
{ name: 'id', type: 'integer', primary_key: true, nullable: false, comment: 'Order ID' },
{ name: 'customer_id', type: 'integer' },
{ name: 'amount', type: 'decimal', comment: 'Order amount' },
],
},
{
name: 'customers',
db: 'public',
columns: [
{ name: 'id', type: 'integer', primary_key: true },
{ name: 'email', type: 'varchar' },
],
},
],
links: [
{
from_table: 'orders',
from_column: 'customer_id',
to_table: 'customers',
to_column: 'id',
relationship_type: 'MANY_TO_ONE',
},
],
dialect: 'postgres',
};
// Canned runner response for source generation. The tests expect the port to
// expose `source_count` as `sourceCount` and to pass `sources` through as-is.
// Note that `source_count` (2) is a fixed fixture value, not derived from the
// single entry in `sources`.
const sourceGenerationDaemonResponse = {
source_count: 2,
sources: [
{
name: 'orders',
table: 'public.orders',
grain: ['id'],
columns: [{ name: 'id', type: 'number' }],
joins: [
{
to: 'customers',
on: 'customer_id = customers.id',
relationship: 'many_to_one',
},
],
measures: [{ name: 'record_count', expr: 'count(id)' }],
},
],
};
// Verifies that the Python-backed port forwards each operation to the injected
// `runJson` runner under the right command name, converts request fields to
// snake_case where expected, and maps the runner's reply back to camelCase.
describe('createPythonSemanticLayerComputePort', () => {
  it('calls the semantic-query stdio command', async () => {
    const runJson = vi.fn(async () => ({
      sql: 'select count(*) from public.orders',
      dialect: 'postgres',
      columns: [{ name: 'orders.order_count' }],
      plan: { sources_used: ['orders'] },
    }));
    const port = createPythonSemanticLayerComputePort({ runJson });

    const queryResult = await port.query({
      sources: [source],
      dialect: 'postgres',
      query: { measures: ['orders.order_count'], dimensions: [] },
    });

    // The query reply is expected to come back unchanged.
    expect(queryResult).toEqual({
      sql: 'select count(*) from public.orders',
      dialect: 'postgres',
      columns: [{ name: 'orders.order_count' }],
      plan: { sources_used: ['orders'] },
    });
    expect(runJson).toHaveBeenCalledWith('semantic-query', {
      sources: [source],
      dialect: 'postgres',
      query: { measures: ['orders.order_count'], dimensions: [] },
    });
  });

  it('calls the semantic-validate stdio command', async () => {
    const runJson = vi.fn(async () => ({
      valid: true,
      errors: [],
      warnings: [],
      per_source_warnings: {},
    }));
    const port = createPythonSemanticLayerComputePort({ runJson });

    const validation = await port.validateSources({
      sources: [source],
      dialect: 'postgres',
      recentlyTouched: ['orders'],
    });

    // `per_source_warnings` is surfaced as `perSourceWarnings`.
    expect(validation).toEqual({
      valid: true,
      errors: [],
      warnings: [],
      perSourceWarnings: {},
    });
    // `recentlyTouched` is sent to the runner as `recently_touched`.
    expect(runJson).toHaveBeenCalledWith('semantic-validate', {
      sources: [source],
      dialect: 'postgres',
      recently_touched: ['orders'],
    });
  });

  it('calls the semantic-generate-sources stdio command', async () => {
    const runJson = vi.fn(async () => sourceGenerationDaemonResponse);
    const port = createPythonSemanticLayerComputePort({ runJson });

    const generated = await port.generateSources(sourceGenerationInput);

    expect(generated).toEqual({
      sourceCount: 2,
      sources: sourceGenerationDaemonResponse.sources,
    });
    expect(runJson).toHaveBeenCalledWith('semantic-generate-sources', sourceGenerationDaemonPayload);
  });
});
// Exercises the HTTP-backed compute port twice over: first with an injected `requestJson` mock,
// then against a throwaway `node:http` server bound to an ephemeral port on 127.0.0.1.
describe('createHttpSemanticLayerComputePort', () => {
// Mocked runner: one port instance issues a query and then a validate call; the mock answers
// by path and the test checks call order, paths, and payloads.
it('calls semantic query and validate HTTP endpoints through an injected runner', async () => {
const requestJson = vi.fn(async (path: string) => {
if (path === '/semantic-layer/query') {
return {
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
};
}
return {
valid: true,
errors: [],
warnings: [],
per_source_warnings: {},
};
});
const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });
await expect(
port.query({
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
}),
).resolves.toEqual({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
});
await expect(
port.validateSources({
sources: [source],
dialect: 'postgres',
recentlyTouched: ['orders'],
}),
).resolves.toEqual({
valid: true,
errors: [],
warnings: [],
perSourceWarnings: {},
});
expect(requestJson).toHaveBeenNthCalledWith(1, '/semantic-layer/query', {
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
});
expect(requestJson).toHaveBeenNthCalledWith(2, '/semantic-layer/validate', {
sources: [source],
dialect: 'postgres',
recently_touched: ['orders'],
});
});
// Mocked runner: source generation goes to the generate-sources path with the snake_case payload.
it('calls the semantic source-generation HTTP endpoint through an injected runner', async () => {
const requestJson = vi.fn(async () => sourceGenerationDaemonResponse);
const port = createHttpSemanticLayerComputePort({ baseUrl: 'http://127.0.0.1:8765/', requestJson });
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
sourceCount: 2,
sources: sourceGenerationDaemonResponse.sources,
});
expect(requestJson).toHaveBeenCalledWith('/semantic-layer/generate-sources', sourceGenerationDaemonPayload);
});
// Real server: no `requestJson` is injected, so the port's built-in HTTP client is used.
// The server records each request's URL and parsed JSON body for the assertions below.
it('posts JSON to a running HTTP daemon endpoint', async () => {
const requests: Array<{ url: string | undefined; body: unknown }> = [];
const server = createServer((request, response) => {
const chunks: Buffer[] = [];
request.on('data', (chunk: Buffer) => chunks.push(chunk));
request.on('end', () => {
requests.push({
url: request.url,
body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
});
response.writeHead(200, { 'content-type': 'application/json' });
response.end(
JSON.stringify({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
columns: [{ name: 'orders.order_count' }],
plan: { sources_used: ['orders'] },
}),
);
});
});
// Port 0 asks the OS for any free port; the actual port is read back from server.address().
server.listen(0, '127.0.0.1');
await once(server, 'listening');
try {
const address = server.address();
if (!address || typeof address === 'string') {
throw new Error('expected TCP server address');
}
const port = createHttpSemanticLayerComputePort({ baseUrl: `http://127.0.0.1:${address.port}` });
await expect(
port.query({
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
}),
).resolves.toMatchObject({
sql: 'select count(*) from public.orders',
dialect: 'postgres',
});
expect(requests).toEqual([
{
url: '/semantic-layer/query',
body: {
sources: [source],
dialect: 'postgres',
query: { measures: ['orders.order_count'], dimensions: [] },
},
},
]);
} finally {
server.close();
}
});
// Real server: same setup as above, but for the generate-sources endpoint and payload.
it('posts source-generation JSON to a running HTTP daemon endpoint', async () => {
const requests: Array<{ url: string | undefined; body: unknown }> = [];
const server = createServer((request, response) => {
const chunks: Buffer[] = [];
request.on('data', (chunk: Buffer) => chunks.push(chunk));
request.on('end', () => {
requests.push({
url: request.url,
body: JSON.parse(Buffer.concat(chunks).toString('utf8')),
});
response.writeHead(200, { 'content-type': 'application/json' });
response.end(JSON.stringify(sourceGenerationDaemonResponse));
});
});
server.listen(0, '127.0.0.1');
await once(server, 'listening');
try {
const address = server.address();
if (!address || typeof address === 'string') {
throw new Error('expected TCP server address');
}
const port = createHttpSemanticLayerComputePort({ baseUrl: `http://127.0.0.1:${address.port}` });
await expect(port.generateSources(sourceGenerationInput)).resolves.toEqual({
sourceCount: 2,
sources: sourceGenerationDaemonResponse.sources,
});
expect(requests).toEqual([
{
url: '/semantic-layer/generate-sources',
body: sourceGenerationDaemonPayload,
},
]);
} finally {
server.close();
}
});
});

View file

@ -0,0 +1,304 @@
import { request as httpRequest } from 'node:http';
import { request as httpsRequest } from 'node:https';
import { URL } from 'node:url';
import { spawn } from 'node:child_process';
import type { SemanticLayerQueryInput, SemanticLayerSource } from '../sl/index.js';
/** Normalized result of a semantic-layer query, as returned by `KloSemanticLayerComputePort.query`. */
export interface KloSemanticLayerComputeQueryResult {
/** SQL text from the daemon response; the ports in this file use `''` when the response has no string `sql`. */
sql: string;
/** Dialect reported by the daemon; the ports in this file fall back to the requested dialect. */
dialect: string;
/** Object entries of the response's `columns` array (non-object entries are dropped by the ports in this file). */
columns: Array<Record<string, unknown>>;
/** The response's `plan` object, or `{}` when it is missing or not a plain object. */
plan: Record<string, unknown>;
}
/** Normalized result of source validation, as returned by `KloSemanticLayerComputePort.validateSources`. */
export interface KloSemanticLayerComputeValidationResult {
/** True only when the daemon response contains `valid: true`. */
valid: boolean;
/** String entries of the response's `errors` array. */
errors: string[];
/** String entries of the response's `warnings` array. */
warnings: string[];
/** Taken from the response's snake_case `per_source_warnings` object. */
perSourceWarnings: Record<string, string[]>;
}
/** One column of a table handed to source generation. */
export interface KloSemanticLayerSourceGenerationColumnInput {
name: string;
type: string;
/** Sent to the daemon as `primary_key`; omitted from the payload when undefined. */
primaryKey?: boolean;
/** Omitted from the payload when undefined. */
nullable?: boolean;
/** Omitted from the payload when undefined; an explicit `null` is forwarded. */
comment?: string | null;
}
/** One table handed to source generation. Optional fields are omitted from the payload when undefined. */
export interface KloSemanticLayerSourceGenerationTableInput {
name: string;
catalog?: string | null;
db?: string | null;
comment?: string | null;
columns: KloSemanticLayerSourceGenerationColumnInput[];
}
/** A relationship between two table columns; sent to the daemon with snake_case keys (`from_table`, ...). */
export interface KloSemanticLayerSourceGenerationLinkInput {
fromTable: string;
fromColumn: string;
toTable: string;
toColumn: string;
/** Forwarded verbatim as `relationship_type` (e.g. `'MANY_TO_ONE'` in the tests). */
relationshipType: string;
}
/** Input of `KloSemanticLayerComputePort.generateSources`. */
export interface KloSemanticLayerSourceGenerationInput {
tables: KloSemanticLayerSourceGenerationTableInput[];
links: KloSemanticLayerSourceGenerationLinkInput[];
/** Defaults to `'postgres'` in the payload when omitted. */
dialect?: string;
}
/** Result of `KloSemanticLayerComputePort.generateSources`. */
export interface KloSemanticLayerSourceGenerationResult {
/** Object entries of the response's `sources` array. */
sources: Array<Record<string, unknown>>;
/** The response's numeric `source_count`, or `sources.length` when that field is not a number. */
sourceCount: number;
}
/**
 * Transport-agnostic interface to the semantic-layer compute daemon.
 * This file provides a stdio (child process) implementation and an HTTP implementation.
 */
export interface KloSemanticLayerComputePort {
/** Sends sources, query, and dialect to the daemon and returns the normalized query result. */
query(input: {
sources: Array<Record<string, unknown> | SemanticLayerSource>;
query: SemanticLayerQueryInput;
dialect: string;
}): Promise<KloSemanticLayerComputeQueryResult>;
/** Validates sources; `recentlyTouched` is forwarded to the daemon as `recently_touched`. */
validateSources(input: {
sources: Array<Record<string, unknown> | SemanticLayerSource>;
dialect: string;
recentlyTouched?: string[];
}): Promise<KloSemanticLayerComputeValidationResult>;
/** Asks the daemon to generate sources from table and link metadata. */
generateSources(input: KloSemanticLayerSourceGenerationInput): Promise<KloSemanticLayerSourceGenerationResult>;
}
/** Subcommands the stdio daemon is invoked with (appended after the configured args). */
export type KloDaemonCommand = 'semantic-query' | 'semantic-validate' | 'semantic-generate-sources';
/** Runs one daemon subcommand with a JSON payload and resolves with the JSON object it returns. */
export type KloDaemonJsonRunner = (
subcommand: KloDaemonCommand,
payload: Record<string, unknown>,
) => Promise<Record<string, unknown>>;
/** Sends a JSON payload to a daemon HTTP path and resolves with the JSON object it returns. */
export type KloDaemonHttpJsonRunner = (path: string, payload: Record<string, unknown>) => Promise<Record<string, unknown>>;
/** Options for `createPythonSemanticLayerComputePort`. */
export interface PythonSemanticLayerComputeOptions {
/** Executable to spawn; defaults to `'python'`. */
command?: string;
/** Arguments placed before the subcommand; defaults to `['-m', 'klo_daemon']`. */
args?: string[];
/** Working directory for the spawned process. */
cwd?: string;
/** Extra environment variables, merged over `process.env`. */
env?: NodeJS.ProcessEnv;
/** Replaces the process-spawning runner entirely (used by the tests); the options above are then unused. */
runJson?: KloDaemonJsonRunner;
}
/** Options for `createHttpSemanticLayerComputePort`. */
export interface HttpSemanticLayerComputeOptions {
/** Base URL of the daemon, with or without a trailing slash. */
baseUrl: string;
/** Replaces the built-in HTTP POST runner (used by the tests). */
requestJson?: KloDaemonHttpJsonRunner;
}
/**
 * Parses daemon output and guarantees the result is a plain JSON object.
 *
 * @param raw - Raw text produced by the daemon (process stdout or HTTP response body).
 * @param subcommand - Subcommand or HTTP path; used only to label error messages.
 * @returns The parsed object.
 * @throws SyntaxError when `raw` is not valid JSON (including empty output). The message names
 *   the subcommand so the failure can be traced to a specific daemon call.
 * @throws Error when the JSON is valid but is `null`, an array, or a primitive.
 */
function parseJsonObject(raw: string, subcommand: string): Record<string, unknown> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    // Keep the SyntaxError type for existing callers, but say which daemon call produced the bad output.
    const detail = error instanceof Error ? error.message : String(error);
    throw new SyntaxError(`klo-daemon ${subcommand} returned invalid JSON: ${detail}`);
  }
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error(`klo-daemon ${subcommand} returned non-object JSON`);
  }
  return parsed as Record<string, unknown>;
}
/**
 * Builds the default stdio runner: spawns `command ...args <subcommand>` for every call, writes the
 * JSON payload to the child's stdin, and parses the child's stdout as a JSON object.
 *
 * @param options - Executable, base arguments, and optional cwd/env for the child process.
 *   `env` is merged over `process.env`.
 * @returns A runner whose promise rejects when the process cannot be spawned, exits with a
 *   non-zero code or is terminated by a signal (stderr is used as the error detail when present),
 *   or prints something that is not a JSON object.
 */
function runProcessJson(
  options: Required<Pick<PythonSemanticLayerComputeOptions, 'command' | 'args'>> &
    Pick<PythonSemanticLayerComputeOptions, 'cwd' | 'env'>,
): KloDaemonJsonRunner {
  return async (subcommand: KloDaemonCommand, payload: Record<string, unknown>): Promise<Record<string, unknown>> =>
    new Promise((resolve, reject) => {
      const child = spawn(options.command, [...options.args, subcommand], {
        cwd: options.cwd,
        env: { ...process.env, ...options.env },
        stdio: ['pipe', 'pipe', 'pipe'],
      });
      const stdout: Buffer[] = [];
      const stderr: Buffer[] = [];
      child.stdout.on('data', (chunk: Buffer) => stdout.push(chunk));
      child.stderr.on('data', (chunk: Buffer) => stderr.push(chunk));
      child.on('error', reject);
      // If the child exits (or never starts) before reading its input, the write below fails with
      // EPIPE on the stdin stream. Without a listener that becomes an uncaught exception; the
      // 'error'/'close' handlers on the child already report the actual failure.
      child.stdin.on('error', () => {
        /* reported via the child's 'error' or 'close' event */
      });
      child.on('close', (code, signal) => {
        const stdoutText = Buffer.concat(stdout).toString('utf8').trim();
        const stderrText = Buffer.concat(stderr).toString('utf8').trim();
        if (code !== 0) {
          // `code` is null when the child was killed by a signal; name the signal instead.
          const exitDetail = signal ? `terminated by signal ${signal}` : `exit code ${code}`;
          reject(new Error(`klo-daemon ${subcommand} failed: ${stderrText || exitDetail}`));
          return;
        }
        try {
          resolve(parseJsonObject(stdoutText, subcommand));
        } catch (error) {
          reject(error);
        }
      });
      child.stdin.end(`${JSON.stringify(payload)}\n`);
    });
}
/**
 * Ensures the base URL ends with exactly the slash it needs for relative resolution.
 *
 * @param baseUrl - Base URL with or without a trailing slash.
 * @returns `baseUrl` unchanged when it already ends with `/`, otherwise `baseUrl` plus `/`.
 */
function normalizedBaseUrl(baseUrl: string): string {
  if (baseUrl.endsWith('/')) {
    return baseUrl;
  }
  return `${baseUrl}/`;
}
/**
 * Builds the default HTTP runner: POSTs a JSON payload to a daemon endpoint under `baseUrl`
 * and parses the response body as a JSON object.
 *
 * @param baseUrl - Daemon base URL; `http:` and `https:` are both supported.
 * @returns A runner whose promise rejects on request or response stream errors, on any
 *   non-2xx status (the response body is included in the message), or when the body is not a
 *   JSON object.
 */
function postJson(baseUrl: string): KloDaemonHttpJsonRunner {
  return async (path, payload) =>
    new Promise((resolve, reject) => {
      // The leading slash is stripped so the path resolves relative to the base URL and keeps
      // any path prefix the base URL carries, instead of replacing it.
      const target = new URL(path.replace(/^\//, ''), normalizedBaseUrl(baseUrl));
      const body = JSON.stringify(payload);
      const client = target.protocol === 'https:' ? httpsRequest : httpRequest;
      const request = client(
        target,
        {
          method: 'POST',
          headers: {
            accept: 'application/json',
            'content-type': 'application/json',
            'content-length': Buffer.byteLength(body),
          },
        },
        (response) => {
          const chunks: Buffer[] = [];
          // A connection dropped after the headers arrived is reported as 'error' on the
          // response stream, not on the request. Without this listener the error is unhandled
          // and the promise never settles.
          response.on('error', reject);
          response.on('data', (chunk: Buffer) => chunks.push(chunk));
          response.on('end', () => {
            const text = Buffer.concat(chunks).toString('utf8');
            const statusCode = response.statusCode ?? 0;
            if (statusCode < 200 || statusCode >= 300) {
              reject(new Error(`klo-daemon HTTP ${path} failed with ${statusCode}: ${text}`));
              return;
            }
            try {
              resolve(parseJsonObject(text, path));
            } catch (error) {
              reject(error);
            }
          });
        },
      );
      request.on('error', reject);
      request.end(body);
    });
}
/**
 * Extracts the string entries of an unknown value.
 *
 * @param value - Anything; typically a field of a parsed daemon response.
 * @returns The string elements of `value` in order, or `[]` when `value` is not an array.
 */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const strings: string[] = [];
  for (const entry of value) {
    if (typeof entry === 'string') {
      strings.push(entry);
    }
  }
  return strings;
}
/**
 * Returns `value` when it is a non-null, non-array object, otherwise a fresh empty object.
 *
 * @param value - Anything; typically a field of a parsed daemon response.
 * @returns The same object reference when it qualifies, else `{}`.
 */
function recordValue(value: unknown): Record<string, unknown> {
  const isRecord = typeof value === 'object' && value !== null && !Array.isArray(value);
  return isRecord ? (value as Record<string, unknown>) : {};
}
/**
 * Extracts the plain-object entries of an unknown value.
 *
 * @param value - Anything; typically a field of a parsed daemon response.
 * @returns The elements of `value` that are non-null, non-array objects (same references, in
 *   order), or `[]` when `value` is not an array.
 */
function recordArray(value: unknown): Array<Record<string, unknown>> {
  if (!Array.isArray(value)) {
    return [];
  }
  const records: Array<Record<string, unknown>> = [];
  for (const entry of value) {
    if (entry !== null && typeof entry === 'object' && !Array.isArray(entry)) {
      records.push(entry as Record<string, unknown>);
    }
  }
  return records;
}
/**
 * Converts camelCase source-generation input into the snake_case payload the daemon expects.
 *
 * Optional table and column fields are left out of the payload when they are `undefined`
 * (an explicit `null` is forwarded). `dialect` defaults to `'postgres'`.
 *
 * @param input - Tables, links, and optional dialect.
 * @returns A JSON-serializable payload with `tables`, `links`, and `dialect`.
 */
function sourceGenerationPayload(input: KloSemanticLayerSourceGenerationInput): Record<string, unknown> {
  const tables = input.tables.map((table) => {
    const columns = table.columns.map((column) => {
      // Keys are added in the same order the daemon payload has always used.
      const wireColumn: Record<string, unknown> = { name: column.name, type: column.type };
      if (column.primaryKey !== undefined) {
        wireColumn.primary_key = column.primaryKey;
      }
      if (column.nullable !== undefined) {
        wireColumn.nullable = column.nullable;
      }
      if (column.comment !== undefined) {
        wireColumn.comment = column.comment;
      }
      return wireColumn;
    });

    const wireTable: Record<string, unknown> = { name: table.name };
    if (table.catalog !== undefined) {
      wireTable.catalog = table.catalog;
    }
    if (table.db !== undefined) {
      wireTable.db = table.db;
    }
    if (table.comment !== undefined) {
      wireTable.comment = table.comment;
    }
    wireTable.columns = columns;
    return wireTable;
  });

  const links = input.links.map((link) => ({
    from_table: link.fromTable,
    from_column: link.fromColumn,
    to_table: link.toTable,
    to_column: link.toColumn,
    relationship_type: link.relationshipType,
  }));

  return { tables, links, dialect: input.dialect ?? 'postgres' };
}
/**
 * Maps a raw daemon source-generation response onto the typed result.
 *
 * @param raw - Parsed daemon response.
 * @returns The object entries of `raw.sources`, plus `raw.source_count` when it is a number
 *   or the number of extracted sources otherwise.
 */
function sourceGenerationResult(raw: Record<string, unknown>): KloSemanticLayerSourceGenerationResult {
  const sources = recordArray(raw.sources);
  const reportedCount = raw.source_count;
  const sourceCount = typeof reportedCount === 'number' ? reportedCount : sources.length;
  return { sources, sourceCount };
}
/**
 * Maps a raw daemon query response onto the typed query result.
 * Shared by the stdio and HTTP ports so both normalize responses identically.
 *
 * @param raw - Parsed daemon response.
 * @param requestedDialect - Dialect of the request; used when the response has no string `dialect`.
 */
function semanticQueryResult(
  raw: Record<string, unknown>,
  requestedDialect: string,
): KloSemanticLayerComputeQueryResult {
  return {
    sql: typeof raw.sql === 'string' ? raw.sql : '',
    dialect: typeof raw.dialect === 'string' ? raw.dialect : requestedDialect,
    columns: recordArray(raw.columns),
    plan: recordValue(raw.plan),
  };
}

/**
 * Maps a raw daemon validation response onto the typed validation result.
 *
 * `per_source_warnings` is validated entry by entry rather than cast: each value is reduced to
 * its string elements, so a malformed daemon response can never hand callers a non-array where
 * the type promises `string[]`.
 *
 * @param raw - Parsed daemon response.
 */
function sourceValidationResult(raw: Record<string, unknown>): KloSemanticLayerComputeValidationResult {
  const perSourceWarnings: Record<string, string[]> = Object.fromEntries(
    Object.entries(recordValue(raw.per_source_warnings)).map(([sourceName, warnings]): [string, string[]] => [
      sourceName,
      stringArray(warnings),
    ]),
  );
  return {
    valid: raw.valid === true,
    errors: stringArray(raw.errors),
    warnings: stringArray(raw.warnings),
    perSourceWarnings,
  };
}

/**
 * Creates a compute port that talks to the daemon over stdio, one child process per call.
 *
 * @param options - Process settings. `command` defaults to `'python'` and `args` to
 *   `['-m', 'klo_daemon']`. When `runJson` is provided it replaces the process runner entirely.
 * @returns A port whose methods reject with whatever the runner rejects with.
 */
export function createPythonSemanticLayerComputePort(
  options: PythonSemanticLayerComputeOptions = {},
): KloSemanticLayerComputePort {
  const command = options.command ?? 'python';
  const args = options.args ?? ['-m', 'klo_daemon'];
  const runJson = options.runJson ?? runProcessJson({ command, args, cwd: options.cwd, env: options.env });
  return {
    async query(input) {
      const raw = await runJson('semantic-query', {
        sources: input.sources,
        dialect: input.dialect,
        query: input.query,
      });
      return semanticQueryResult(raw, input.dialect);
    },
    async validateSources(input) {
      const raw = await runJson('semantic-validate', {
        sources: input.sources,
        dialect: input.dialect,
        recently_touched: input.recentlyTouched,
      });
      return sourceValidationResult(raw);
    },
    async generateSources(input) {
      const raw = await runJson('semantic-generate-sources', sourceGenerationPayload(input));
      return sourceGenerationResult(raw);
    },
  };
}

/**
 * Creates a compute port that talks to a running daemon over HTTP.
 *
 * @param options - `baseUrl` of the daemon. When `requestJson` is provided it replaces the
 *   built-in JSON POST client.
 * @returns A port that posts to `/semantic-layer/query`, `/semantic-layer/validate`, and
 *   `/semantic-layer/generate-sources`; its methods reject with whatever the runner rejects with.
 */
export function createHttpSemanticLayerComputePort(
  options: HttpSemanticLayerComputeOptions,
): KloSemanticLayerComputePort {
  const requestJson = options.requestJson ?? postJson(options.baseUrl);
  return {
    async query(input) {
      const raw = await requestJson('/semantic-layer/query', {
        sources: input.sources,
        dialect: input.dialect,
        query: input.query,
      });
      return semanticQueryResult(raw, input.dialect);
    },
    async validateSources(input) {
      const raw = await requestJson('/semantic-layer/validate', {
        sources: input.sources,
        dialect: input.dialect,
        recently_touched: input.recentlyTouched,
      });
      return sourceValidationResult(raw);
    },
    async generateSources(input) {
      const raw = await requestJson('/semantic-layer/generate-sources', sourceGenerationPayload(input));
      return sourceGenerationResult(raw);
    },
  };
}

View file

@ -0,0 +1,12 @@
import { describe, expect, it } from 'vitest';
import { kloContextPackageInfo } from './index.js';
// Pins the exported package identity constant to its literal name and version.
describe('kloContextPackageInfo', () => {
it('identifies the context package', () => {
expect(kloContextPackageInfo).toEqual({
name: '@klo/context',
version: '0.0.0-private',
});
});
});

View file

@ -0,0 +1,144 @@
/** Identity of this package. Both fields are literal types, so the constant below cannot drift from them. */
export interface KloContextPackageInfo {
name: '@klo/context';
version: '0.0.0-private';
}
/** Runtime package identity; the values match the `name` and `version` fields of package.json. */
export const kloContextPackageInfo: KloContextPackageInfo = {
name: '@klo/context',
version: '0.0.0-private',
};
export * from './agent/index.js';
export * from './core/index.js';
export * from './daemon/index.js';
export * from './ingest/index.js';
export * from './llm/index.js';
export type {
CaptureSession,
CaptureSignals,
MemoryAgentInput,
MemoryAgentResult,
MemoryAgentServiceDeps,
MemoryAgentSettings,
MemoryAgentSourceType,
MemoryCommitMessagePort,
MemoryConnectionPort,
MemoryFileStorePort,
MemoryKnowledgeSlRefsPort,
MemoryLockPort,
MemorySlSourceReconcilerPort,
MemoryTelemetryPort,
MemoryToolSetLike,
MemoryToolsetFactoryPort,
} from './memory/index.js';
export * from './project/index.js';
export * from './prompts/index.js';
export * from './search/index.js';
export * from './sql-analysis/index.js';
export type {
KloColumnAnalysisResult,
KloColumnDescriptionPromptInput,
KloColumnEmbeddingForeignKeys,
KloColumnEmbeddingTextInput,
KloColumnSampleInput,
KloColumnSampleResult,
KloColumnSampleUpdate,
KloColumnStatsInput,
KloColumnStatsResult,
KloConnectionDriver,
KloConnectorCapabilities,
KloCredentialEnvelope,
KloCredentialEnvReference,
KloCredentialFileReference,
KloDataDictionaryColumnState,
KloDataDictionarySampleDecision,
KloDataDictionarySettings,
KloDataDictionarySkipReason,
KloDataSourceDescriptionPromptInput,
KloDescriptionCachePort,
KloDescriptionColumn,
KloDescriptionColumnTable,
KloDescriptionGenerationSettings,
KloDescriptionGeneratorOptions,
KloDescriptionSource,
KloDescriptionTableInput,
KloDescriptionUpdate,
KloEmbeddingPort as KloScanEmbeddingPort,
KloEmbeddingUpdate,
KloEnrichedColumn,
KloEnrichedRelationship,
KloEnrichedSchema,
KloEnrichedTable,
KloEnrichmentScanPhaseResult,
KloGenerateColumnDescriptionsInput,
KloGenerateDataSourceDescriptionInput,
KloGenerateTableDescriptionInput,
KloOptionalConnectorCapabilities,
KloProgressPort,
KloQueryResult as KloScanQueryResult,
KloReadOnlyQueryInput,
KloRelationshipEndpoint,
KloRelationshipSource,
KloRelationshipType,
KloRelationshipUpdate,
KloResolvedCredentialEnvelope,
KloScanArtifactPaths,
KloScanConnector,
KloScanContext,
KloScanDiffSummary,
KloScanEnrichmentSummary,
KloScanInput,
KloScanLoggerPort,
KloScanMetadataStore,
KloScanMode,
KloScanOrchestratorOptions,
KloScanOrchestratorRunInput,
KloScanOrchestratorRunResult,
KloScanRelationshipSummary,
KloScanReport,
KloScanTrigger,
KloScanWarning,
KloScanWarningCode,
KloSchemaColumn,
KloSchemaDimensionType,
KloSchemaForeignKey,
KloSchemaScope,
KloSchemaSnapshot,
KloSchemaTable,
KloSchemaTableKind,
KloSkippedRelationship,
KloStructuralScanPhaseResult,
KloStructuralSyncPlan,
KloStructuralSyncStats,
KloTableDescriptionPromptInput,
KloTableRef,
KloTableSampleInput,
KloTableSampleResult,
KloColumnTypeMapping,
} from './scan/index.js';
export {
appendKloWordLimitInstruction,
buildKloColumnDescriptionPrompt,
buildKloColumnEmbeddingText,
buildKloDataSourceDescriptionPrompt,
buildKloTableDescriptionPrompt,
createKloConnectorCapabilities,
defaultKloDataDictionarySettings,
inferKloDimensionType,
isKloDataDictionaryCandidate,
kloColumnTypeMappingFromNative,
KloDescriptionGenerator,
KloScanOrchestrator,
normalizeKloNativeType,
REDACTED_KLO_CREDENTIAL_VALUE,
redactKloCredentialEnvelope,
redactKloCredentialValue,
redactKloScanMetadata,
redactKloScanReport,
redactKloScanWarning,
shouldKloSampleColumnForDictionary,
} from './scan/index.js';
export * from './skills/index.js';
export * from './sl/index.js';
export * from './tools/index.js';
export * from './wiki/index.js';

View file

@ -0,0 +1,42 @@
import { describe, expect, it } from 'vitest';
import { actionTargetConnectionId, memoryActionIdentity } from './action-identity.js';
// Covers how memory actions are keyed: the identity string is `target:connection:key`, where the
// connection is the action's own target connection for SL actions and the run connection otherwise.
describe('memory action target identity', () => {
it('keys SL actions by target connection and wiki actions by run connection', () => {
// SL action with an explicit target connection: that connection wins over the run connection.
expect(
memoryActionIdentity(
{ target: 'sl', type: 'created', key: 'orders', detail: '', targetConnectionId: 'warehouse-b' },
'looker-run',
),
).toBe('sl:warehouse-b:orders');
// SL action without a target connection falls back to the run connection.
expect(memoryActionIdentity({ target: 'sl', type: 'created', key: 'orders', detail: '' }, 'warehouse-a')).toBe(
'sl:warehouse-a:orders',
);
// Wiki action: `targetConnectionId` is ignored even when present.
expect(
memoryActionIdentity(
{
target: 'wiki',
type: 'created',
key: 'knowledge/global/orders.md',
detail: '',
targetConnectionId: 'ignored',
},
'looker-run',
),
).toBe('wiki:looker-run:knowledge/global/orders.md');
});
it('resolves action target connection only for SL actions', () => {
expect(
actionTargetConnectionId(
{ target: 'sl', type: 'updated', key: 'orders', detail: '', targetConnectionId: 'warehouse-b' },
'looker-run',
),
).toBe('warehouse-b');
expect(actionTargetConnectionId({ target: 'wiki', type: 'updated', key: 'orders', detail: '' }, 'looker-run')).toBe(
'looker-run',
);
});
});

View file

@ -0,0 +1,9 @@
import type { MemoryAction } from '../memory/index.js';
/**
 * Resolves which connection a memory action applies to.
 *
 * @param action - The memory action.
 * @param runConnectionId - Connection the current run belongs to.
 * @returns For `'sl'` actions, the action's `targetConnectionId` when set (not null/undefined),
 *   otherwise `runConnectionId`. For every other target, always `runConnectionId`.
 */
export function actionTargetConnectionId(action: MemoryAction, runConnectionId: string): string {
  if (action.target !== 'sl') {
    return runConnectionId;
  }
  return action.targetConnectionId ?? runConnectionId;
}
/**
 * Builds the identity string of a memory action: `<target>:<connection id>:<key>`.
 *
 * @param action - The memory action.
 * @param runConnectionId - Connection the current run belongs to; used unless the action is an
 *   `'sl'` action carrying its own target connection.
 * @returns The colon-separated identity string.
 */
export function memoryActionIdentity(action: MemoryAction, runConnectionId: string): string {
  const connectionId = actionTargetConnectionId(action, runConnectionId);
  return `${action.target}:${connectionId}:${action.key}`;
}

View file

@ -0,0 +1,75 @@
import { describe, expect, it } from 'vitest';
import type { DbtParsedTable } from './parse-schema.js';
import { findMatchingKloTable, matchDbtTables, type DbtHostTableLite } from './match-tables.js';
// Host-side fixture: two `orders` tables in different schemas of the same catalog (so a
// name-only lookup for `orders` is ambiguous) and one `customers` table with no catalog/schema.
const hostTables: DbtHostTableLite[] = [
{ id: '1', name: 'orders', catalog: 'warehouse', db: 'analytics', columns: [{ id: 'c1', name: 'id' }] },
{ id: '2', name: 'orders', catalog: 'warehouse', db: 'staging', columns: [{ id: 'c2', name: 'id' }] },
{ id: '3', name: 'customers', catalog: null, db: null, columns: [{ id: 'c3', name: 'id' }] },
];
// Builds a dbt table for the tests: a bare `orders` model with no database, schema, or columns,
// with any fields from `input` overriding those defaults.
function table(input: Partial<DbtParsedTable>): DbtParsedTable {
return {
name: 'orders',
description: null,
database: null,
schema: null,
columns: [],
resourceType: 'model',
...input,
};
}
// Covers the lookup order of findMatchingKloTable (schema + name, then unique name-only for
// non-source tables) and the per-table summary produced by matchDbtTables.
describe('dbt descriptions table matching', () => {
it('uses schema plus name first and checks catalog when dbt database is present', () => {
expect(
findMatchingKloTable(table({ database: 'warehouse', schema: 'analytics' }), hostTables, null)?.id,
).toBe('1');
});
// No schema is given here, and sources never use the name-only fallback.
it('does not fall back to name-only for source tables', () => {
expect(findMatchingKloTable(table({ resourceType: 'source' }), hostTables, null)).toBeUndefined();
});
it('uses targetSchema for models and name-only only when unique', () => {
// targetSchema stands in for the missing dbt schema.
expect(findMatchingKloTable(table({ resourceType: 'model' }), hostTables, 'staging')?.id).toBe('2');
// `customers` exists once, so the name-only fallback resolves it.
expect(findMatchingKloTable(table({ name: 'customers', resourceType: 'model' }), hostTables, null)?.id).toBe(
'3',
);
// `orders` exists twice, so the name-only fallback refuses to pick one.
expect(findMatchingKloTable(table({ resourceType: 'model' }), hostTables, null)).toBeUndefined();
});
// Only `id` exists on the host table, so the described `missing` column is counted in
// columnsTotal but not in columnsMatched, columnsToImport, or columnDescriptionsFound.
it('summarizes matched columns and descriptions', () => {
const matches = matchDbtTables(
[
table({
name: 'customers',
description: 'Customers',
columns: [
{ name: 'id', description: 'Primary key', dataType: null },
{ name: 'missing', description: 'Missing', dataType: null },
],
}),
],
hostTables,
null,
);
expect(matches).toEqual([
{
dbtTable: 'customers',
dbtSchema: null,
dbtDatabase: null,
hostTableId: '3',
hostTableName: 'customers',
matched: true,
tableDescriptionAction: 'import',
tableDescriptionFound: true,
columnsToImport: 1,
columnsMatched: 1,
columnsTotal: 2,
columnDescriptionsFound: 1,
},
]);
});
});

View file

@ -0,0 +1,127 @@
import type { DbtParsedTable } from './parse-schema.js';
/** Minimal view of a host-side table used for matching against dbt tables. */
export interface DbtHostTableLite {
id: string;
name: string;
/** Compared (case-insensitively) with the dbt table's `database` when both are set. */
catalog: string | null;
/** Compared (case-insensitively) with the dbt table's schema, or with the target schema when the dbt table has none. */
db: string | null;
columns: Array<{ id: string; name: string }>;
}
/** Per-dbt-table summary produced by `matchDbtTables`. */
export interface DbtTableMatch {
dbtTable: string;
dbtSchema: string | null;
dbtDatabase: string | null;
/** Id of the matched host table, or null when nothing matched. */
hostTableId: string | null;
/** Name of the matched host table, or null when nothing matched. */
hostTableName: string | null;
matched: boolean;
/** `'import'` only when a host table matched and the dbt table has a description. */
tableDescriptionAction: 'skip' | 'import';
/** Whether the dbt table has a non-empty description, regardless of matching. */
tableDescriptionFound: boolean;
/** Matched columns that carry a dbt description; 0 when no host table matched. */
columnsToImport: number;
/** dbt columns whose name exists on the matched host table (case-insensitive); 0 when unmatched. */
columnsMatched: number;
/** Number of columns declared on the dbt table. */
columnsTotal: number;
/**
 * For a matched table: described dbt columns that also exist on the host table.
 * For an unmatched table: all described dbt columns.
 */
columnDescriptionsFound: number;
}
/**
 * Matches every dbt table against the host tables and summarizes what could be imported.
 *
 * @param dbtTables - Tables parsed from dbt schema files.
 * @param hostTables - Candidate host tables.
 * @param targetSchema - Schema to assume for dbt tables that declare none.
 * @returns One summary per dbt table, in input order. Unmatched tables get null host fields,
 *   zero matched/importable columns, and a count of all described dbt columns.
 */
export function matchDbtTables(
  dbtTables: DbtParsedTable[],
  hostTables: DbtHostTableLite[],
  targetSchema?: string | null,
): DbtTableMatch[] {
  const matches: DbtTableMatch[] = [];
  for (const dbtTable of dbtTables) {
    const hostTable = findMatchingKloTable(dbtTable, hostTables, targetSchema);
    const hasTableDescription = Boolean(dbtTable.description);

    if (hostTable) {
      matches.push({
        dbtTable: dbtTable.name,
        dbtSchema: dbtTable.schema,
        dbtDatabase: dbtTable.database,
        hostTableId: hostTable.id,
        hostTableName: hostTable.name,
        matched: true,
        tableDescriptionAction: hasTableDescription ? 'import' : 'skip',
        tableDescriptionFound: hasTableDescription,
        ...analyzeColumns(dbtTable, hostTable),
      });
      continue;
    }

    // No host table: nothing can be imported, but still report what the dbt side declared.
    const describedColumns = dbtTable.columns.filter((column) => Boolean(column.description));
    matches.push({
      dbtTable: dbtTable.name,
      dbtSchema: dbtTable.schema,
      dbtDatabase: dbtTable.database,
      hostTableId: null,
      hostTableName: null,
      matched: false,
      tableDescriptionAction: 'skip',
      tableDescriptionFound: hasTableDescription,
      columnsToImport: 0,
      columnsMatched: 0,
      columnsTotal: dbtTable.columns.length,
      columnDescriptionsFound: describedColumns.length,
    });
  }
  return matches;
}
/**
 * Finds the host table a dbt table refers to. All comparisons are case-insensitive.
 *
 * Lookup order:
 * 1. If a schema is known (the dbt table's own, else `targetSchema`), the first host table with
 *    the same name and schema wins. When both the dbt `database` and the host `catalog` are
 *    set they must match as well.
 * 2. Otherwise, for anything except `'source'` tables, a host table with the same name is used
 *    only when exactly one such table exists.
 *
 * @param dbtTable - Table parsed from dbt schema files.
 * @param hostTables - Candidate host tables.
 * @param targetSchema - Schema to assume when the dbt table declares none.
 * @returns The matching host table, or `undefined` when there is no unambiguous match.
 */
export function findMatchingKloTable(
  dbtTable: DbtParsedTable,
  hostTables: DbtHostTableLite[],
  targetSchema?: string | null,
): DbtHostTableLite | undefined {
  const wantedName = dbtTable.name.toLowerCase();
  const schema = dbtTable.schema ?? targetSchema ?? null;

  if (schema) {
    const wantedSchema = schema.toLowerCase();
    for (const candidate of hostTables) {
      if (candidate.name.toLowerCase() !== wantedName) {
        continue;
      }
      if (candidate.db?.toLowerCase() !== wantedSchema) {
        continue;
      }
      // The catalog is only a tie-breaker when both sides actually declare one.
      if (
        dbtTable.database &&
        candidate.catalog &&
        candidate.catalog.toLowerCase() !== dbtTable.database.toLowerCase()
      ) {
        continue;
      }
      return candidate;
    }
  }

  if (dbtTable.resourceType === 'source') {
    return undefined;
  }

  const sameName = hostTables.filter((candidate) => candidate.name.toLowerCase() === wantedName);
  if (sameName.length !== 1) {
    return undefined;
  }
  return sameName[0];
}
/**
 * Counts how the columns of a dbt table line up with a matched host table.
 * Column names are compared case-insensitively.
 *
 * @param dbtTable - Table parsed from dbt schema files.
 * @param hostTable - Host table it was matched to.
 * @returns `columnsTotal` (all dbt columns), `columnsMatched` (dbt columns present on the host
 *   table), and `columnsToImport` / `columnDescriptionsFound` (matched columns that carry a
 *   description; the two counts are always equal).
 */
function analyzeColumns(
  dbtTable: DbtParsedTable,
  hostTable: DbtHostTableLite,
): Pick<DbtTableMatch, 'columnsToImport' | 'columnsMatched' | 'columnsTotal' | 'columnDescriptionsFound'> {
  const existsOnHost = (columnName: string): boolean =>
    hostTable.columns.some((hostColumn) => hostColumn.name.toLowerCase() === columnName.toLowerCase());

  const matchedColumns = dbtTable.columns.filter((dbtColumn) => existsOnHost(dbtColumn.name));
  const describedMatches = matchedColumns.filter((dbtColumn) => Boolean(dbtColumn.description)).length;

  return {
    columnsToImport: describedMatches,
    columnsMatched: matchedColumns.length,
    columnsTotal: dbtTable.columns.length,
    columnDescriptionsFound: describedMatches,
  };
}

View file

@ -0,0 +1,62 @@
import { describe, expect, it } from 'vitest';
import type { ParsedSemanticModel } from '../metricflow/deep-parse.js';
import { mergeSemanticModelTables } from './merge-semantic-model-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
// MetricFlow semantic model fixture that refers to the dbt model `fct_orders` and declares one
// categorical dimension (with a description) and one time dimension (without).
const semanticModel: ParsedSemanticModel = {
name: 'orders_semantic',
description: 'Order facts',
modelRef: 'fct_orders',
dimensions: [
{ name: 'status', column: 'status', type: 'categorical', description: 'Order status' },
{ name: 'ordered_at', column: 'ordered_at', type: 'time' },
],
measures: [],
entities: [],
defaultTimeDimension: null,
};
describe('mergeSemanticModelTables', () => {
// The synthesized table is named after `modelRef`; dimensions become columns, and only the
// time dimension gets a data type ('TIMESTAMP').
it('adds missing MetricFlow model refs as dbt model tables', () => {
const input: DbtSchemaParseResult = { projectName: null, dbtVersion: null, tables: [], relationships: [] };
expect(mergeSemanticModelTables(input, [semanticModel])).toEqual({
projectName: null,
dbtVersion: null,
relationships: [],
tables: [
{
name: 'fct_orders',
description: 'Order facts',
database: null,
schema: null,
resourceType: 'model',
columns: [
{ name: 'status', description: 'Order status', dataType: null },
{ name: 'ordered_at', description: null, dataType: 'TIMESTAMP' },
],
},
],
});
});
// The existing table is upper-cased on purpose: the duplicate check is case-insensitive.
it('does not add a duplicate table when schema parsing already found the model ref', () => {
const input: DbtSchemaParseResult = {
projectName: null,
dbtVersion: null,
relationships: [],
tables: [
{
name: 'FCT_ORDERS',
description: 'Existing',
database: null,
schema: null,
resourceType: 'model',
columns: [],
},
],
};
expect(mergeSemanticModelTables(input, [semanticModel]).tables).toHaveLength(1);
});
});

View file

@ -0,0 +1,37 @@
import type { ParsedSemanticModel } from '../metricflow/deep-parse.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/**
 * Adds a dbt model table for every semantic model whose `modelRef` is not already present in
 * the parse result. Table names are compared case-insensitively.
 *
 * The input is not mutated: the returned result carries new `tables` and `relationships`
 * arrays. Synthesized tables have no database/schema; each dimension becomes a column named
 * after the dimension's `column`, with data type `'TIMESTAMP'` for time dimensions and `null`
 * otherwise.
 *
 * @param parseResult - Result of parsing the dbt schema files.
 * @param semanticModels - Parsed MetricFlow semantic models.
 * @returns A copy of `parseResult` with the missing model tables appended in input order.
 */
export function mergeSemanticModelTables(
  parseResult: DbtSchemaParseResult,
  semanticModels: ParsedSemanticModel[],
): DbtSchemaParseResult {
  const tables = [...parseResult.tables];
  const knownNames = new Set(tables.map((existing) => existing.name.toLowerCase()));

  for (const model of semanticModels) {
    const normalizedName = model.modelRef.toLowerCase();
    if (knownNames.has(normalizedName)) {
      continue;
    }
    // Record the name right away so a second semantic model on the same ref is skipped too.
    knownNames.add(normalizedName);
    tables.push({
      name: model.modelRef,
      description: model.description,
      database: null,
      schema: null,
      columns: model.dimensions.map((dimension) => {
        const isTime = dimension.type === 'time';
        return {
          name: dimension.column,
          description: dimension.description ?? null,
          dataType: isTime ? 'TIMESTAMP' : null,
        };
      }),
      resourceType: 'model',
    });
  }

  return {
    ...parseResult,
    tables,
    relationships: [...parseResult.relationships],
  };
}

View file

@ -0,0 +1,214 @@
import { describe, expect, it } from 'vitest';
import { parseDbtSchemaFile, parseDbtSchemaFiles } from './parse-schema.js';
// End-to-end expectations for the dbt schema YAML parser: Jinja `var()` resolution, merging of
// tables across files, tolerance of malformed input, and extraction of tests/constraints/tags.
describe('dbt descriptions schema parser', () => {
// `var('database')` and `var('model_schema')` come from the supplied variables map;
// `var('schema', ...)` and `var('model_name', ...)` fall back to their inline defaults.
// The source table is reported under its `identifier` (`fct_orders`), not its `name`.
it('resolves shared dbt vars and defaults before parsing schema YAML', () => {
const result = parseDbtSchemaFile(
`
version: 2
sources:
- name: raw
database: "{{ var('database') }}"
schema: "{{ var('schema', 'fallback_schema') }}"
tables:
- name: orders
identifier: fct_orders
description: "Orders from {{ var('database') }}"
columns:
- name: customer_id
description: "Customer id"
tests:
- relationships:
to: ref('customers')
field: id
models:
- name: "{{ var('model_name', 'orders_model') }}"
schema: "{{ var('model_schema') }}"
columns:
- name: id
description: "Order id"
`,
{ path: 'models/schema.yml', variables: new Map([['database', 'analytics'], ['model_schema', 'mart']]) },
);
expect(result.tables).toEqual([
{
name: 'fct_orders',
description: 'Orders from analytics',
database: 'analytics',
schema: 'fallback_schema',
columns: [
{
name: 'customer_id',
description: 'Customer id',
dataType: null,
dataTests: [{ name: 'relationships', package: 'dbt', kwargs: { to: "ref('customers')", field: 'id' } }],
},
],
resourceType: 'source',
},
{
name: 'orders_model',
description: null,
database: null,
schema: 'mart',
columns: [{ name: 'id', description: 'Order id', dataType: null }],
resourceType: 'model',
},
]);
expect(result.relationships).toEqual([
{
fromTable: 'fct_orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
fromSchema: 'fallback_schema',
},
]);
});
// The same model appears in two files; the expected result is a single table whose `id`
// column combines the description from the first file with the data type from the second.
it('deduplicates tables by database schema and name while merging columns', () => {
const result = parseDbtSchemaFiles([
{
path: 'models/a.yml',
content: `
version: 2
models:
- name: orders
description: Orders
columns:
- name: id
description: Primary key
`,
},
{
path: 'models/b.yml',
content: `
version: 2
models:
- name: orders
columns:
- name: status
description: Status
- name: id
data_type: integer
`,
},
]);
expect(result.tables).toEqual([
{
name: 'orders',
description: 'Orders',
database: null,
schema: null,
resourceType: 'model',
columns: [
{ name: 'id', description: 'Primary key', dataType: 'integer' },
{ name: 'status', description: 'Status', dataType: null },
],
},
]);
});
// Malformed YAML yields an empty result rather than throwing, and a `var()` with neither a
// value nor a default is left in the output as literal Jinja text.
it('returns an empty result for malformed YAML and preserves unresolved Jinja text', () => {
expect(parseDbtSchemaFile('{{{{ invalid yaml', { path: 'broken.yml' })).toEqual({
projectName: null,
dbtVersion: null,
tables: [],
relationships: [],
});
const unresolved = parseDbtSchemaFile(
`
version: 2
models:
- name: "{{ var('missing_model') }}"
`,
{ variables: new Map() },
);
expect(unresolved.tables[0]?.name).toBe("{{ var('missing_model') }}");
});
// Checks that source-level and table-level tags are combined, that both `tests` and
// `data_tests` keys are read, and that `accepted_values` is accepted in both its
// `values:` mapping form and its bare-list form.
it('extracts data tests, constraints, enum values, tags, and freshness', () => {
const result = parseDbtSchemaFile(`
version: 2
sources:
- name: raw
schema: jaffle
tags: ["raw"]
tables:
- name: customers
tags: ["core"]
loaded_at_field: updated_at
freshness:
warn_after: { count: 12, period: hour }
columns:
- name: id
tests:
- not_null
- unique
- name: status
data_tests:
- accepted_values:
values: ['active', 'inactive']
models:
- name: orders
tags: ["finance"]
loaded_at_field: run_at
columns:
- name: status
data_tests:
- dbt_utils.expression_is_true:
expression: "status is not null"
- accepted_values: ['placed', 'shipped']
`);
const customers = result.tables.find((table) => table.name === 'customers');
expect(customers?.tagsDbt).toEqual(['raw', 'core']);
expect(customers?.freshnessDbt?.loadedAtField).toBe('updated_at');
expect(customers?.freshnessDbt?.raw).toBeDefined();
const id = customers?.columns.find((column) => column.name === 'id');
expect(id?.constraints?.dbt).toEqual({ not_null: true, unique: true });
const status = customers?.columns.find((column) => column.name === 'status');
expect(status?.enumValuesDbt).toEqual(['active', 'inactive']);
const orders = result.tables.find((table) => table.name === 'orders');
expect(orders?.tagsDbt).toEqual(['finance']);
expect(orders?.freshnessDbt?.loadedAtField).toBe('run_at');
const ordersStatus = orders?.columns.find((column) => column.name === 'status');
expect(ordersStatus?.enumValuesDbt).toEqual(['placed', 'shipped']);
expect(ordersStatus?.dataTests).toEqual(
expect.arrayContaining([
expect.objectContaining({ package: 'dbt_utils', name: 'expression_is_true' }),
expect.objectContaining({ package: 'dbt', name: 'accepted_values' }),
]),
);
});
// Here the relationship target is nested under `arguments:` rather than given directly on
// the test, and it is still expected to produce a relationship.
it('parses relationships from model column data tests', () => {
const result = parseDbtSchemaFile(`
version: 2
models:
- name: orders
schema: public
columns:
- name: customer_id
data_tests:
- relationships:
arguments:
to: "ref('customers')"
field: id
`);
expect(result.relationships).toEqual([
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
fromSchema: 'public',
},
]);
});
});

View file

@ -0,0 +1,655 @@
import { createHash } from 'node:crypto';
import { parse as parseYaml } from 'yaml';
import { type KloLogger, noopLogger } from '../../../core/index.js';
import { resolveJinjaVariables } from '../../dbt-shared/project-vars.js';
/** A column declared in a dbt schema file, as normalized by the schema parser. */
export interface DbtParsedColumn {
name: string;
/** Trimmed description text, or null when the column has no (or only blank) description. */
description: string | null;
/** Value of the column's `data_type` key, or null when it is not declared. */
dataType: string | null;
/** Data tests attached to the column; the parser omits the key when there are none. */
dataTests?: DbtDataTestRef[];
/** Constraints implied by `not_null` / `unique` tests; omitted when neither test is present. */
constraints?: DbtColumnConstraints;
/** Stringified values collected from `accepted_values` tests; omitted when none were found. */
enumValuesDbt?: string[];
}
/** Reference to a single data test attached to a column. */
export interface DbtDataTestRef {
/** Test name without its package prefix (e.g. `expression_is_true`). */
name: string;
/** Package prefix taken from a dotted test name; `dbt` when the name has no dot. */
package: string;
/** Arguments given to the test as a mapping, when the YAML provided one. */
kwargs?: Record<string, unknown>;
}
/** Column constraints derived from dbt tests, namespaced under `dbt`. */
export interface DbtColumnConstraints {
dbt: {
/** Set to true when a `not_null` test is attached to the column. */
not_null?: boolean;
/** Set to true when a `unique` test is attached to the column. */
unique?: boolean;
};
}
/**
 * A foreign-key style link extracted from a dbt `relationships` data test:
 * `fromTable.fromColumn` references `toTable.toColumn`.
 */
export interface DbtParsedRelationship {
fromTable: string;
fromColumn: string;
/** Table name extracted from the test's `ref(...)` / `source(...)` target. */
toTable: string;
toColumn: string;
/** Schema of the declaring model/source, when one is known. */
fromSchema?: string;
// NOTE(review): the schema parser in this module never sets `toSchema` or
// `description`; presumably other producers fill them in — confirm.
toSchema?: string;
description?: string;
}
/** A dbt source table or model, normalized into a warehouse-table-like shape. */
export interface DbtParsedTable {
/** Model name, or for source tables the `identifier` when given (else the table `name`). */
name: string;
/** Trimmed description text, or null when absent or blank. */
description: string | null;
database: string | null;
/** For source tables this falls back to the source's `name` when no `schema` is declared. */
schema: string | null;
columns: DbtParsedColumn[];
resourceType?: 'source' | 'model';
/** De-duplicated tags; for source tables the source-level tags come first. */
tagsDbt?: string[];
freshnessDbt?: {
/** The `freshness` YAML value exactly as parsed. */
raw?: unknown;
/** Trimmed `loaded_at_field`, or null when freshness is declared without one. */
loadedAtField?: string | null;
};
}
/** Combined outcome of parsing one or more dbt schema files. */
export interface DbtSchemaParseResult {
/** Project name passed in by the caller; the parser does not derive it from the YAML. */
projectName: string | null;
/** Always null in results produced by the schema parser in this module. */
dbtVersion: string | null;
tables: DbtParsedTable[];
relationships: DbtParsedRelationship[];
}
/** A schema file to parse: its raw YAML text and the path it was read from. */
export interface DbtSchemaFile {
content: string;
/** Used for log messages and as the sort key / prefix when hashing files. */
path: string;
}
/** Optional inputs for parsing a single schema file. */
interface ParseDbtSchemaOptions {
/** File path, used only in log messages. */
path?: string;
/** Variables handed to `resolveJinjaVariables`; when omitted the YAML is parsed as-is. */
variables?: Map<string, string>;
/** Copied verbatim into the result's `projectName`. */
projectName?: string | null;
/** Destination for debug/warn messages; the exported entry points default it to `noopLogger`. */
logger?: KloLogger;
}
// The interfaces below describe the *expected* shape of a dbt schema YAML
// document. They are applied with a type assertion to the YAML parser's
// output, so nothing guarantees at runtime that a document conforms to them.

/** Top level of a schema file: only `sources` and `models` are consumed by the parser. */
interface DbtSchemaYaml {
version?: number;
sources?: DbtSchemaSource[];
models?: DbtSchemaModel[];
}
/** One entry of the `sources:` list; its database/schema/tags apply to every table it contains. */
interface DbtSchemaSource {
name: string;
description?: string;
database?: string;
/** When omitted, the parser uses the source `name` as the schema. */
schema?: string;
tags?: string[];
tables?: DbtSchemaTable[];
}
/** A table declared under a source. */
interface DbtSchemaTable {
name: string;
description?: string;
/** When present, the parser uses it as the table name in place of `name`. */
identifier?: string;
tags?: string[];
loaded_at_field?: string;
/** Kept opaque; passed through untouched as `freshnessDbt.raw`. */
freshness?: unknown;
columns?: DbtSchemaColumn[];
}
/** One entry of the `models:` list. */
interface DbtSchemaModel {
name: string;
description?: string;
database?: string;
schema?: string;
tags?: string[];
loaded_at_field?: string;
/** Kept opaque; passed through untouched as `freshnessDbt.raw`. */
freshness?: unknown;
columns?: DbtSchemaColumn[];
}
/** A column entry under a source table or model. */
interface DbtSchemaColumn {
name: string;
description?: string;
data_type?: string;
/** Takes precedence over `tests` when both keys are present. */
data_tests?: DbtSchemaDataTest[];
/** Consulted only when `data_tests` is absent. */
tests?: DbtSchemaDataTest[];
}
/**
 * A single data-test entry: either a bare test name (e.g. `not_null`,
 * `dbt_utils.some_test`) or a mapping keyed by test name whose value carries
 * the test's arguments.
 */
type DbtSchemaDataTest =
| string
| {
relationships?: {
// The target may be given directly or nested under `arguments`;
// the parser checks the direct keys first.
to?: string;
field?: string;
arguments?: { to?: string; field?: string };
};
not_null?: unknown;
unique?: unknown;
// Either a mapping with a `values` list or the list itself.
accepted_values?: { values?: unknown } | unknown;
// Any other key is treated as a (possibly package-qualified) test name.
[key: string]: unknown;
};
/**
 * Parses the YAML text of a single dbt schema file.
 *
 * @param content - Raw YAML text of the schema file.
 * @param options - Optional path (for logs), Jinja variables, project name and logger.
 * @returns Tables and relationships found in the file; logging is silent unless a logger is supplied.
 */
export function parseDbtSchemaFile(content: string, options: ParseDbtSchemaOptions = {}): DbtSchemaParseResult {
  const logger = options.logger ?? noopLogger;
  const parser = new DbtSchemaParser(logger);
  return parser.parseFile(content, options);
}
/**
 * Parses several dbt schema files and merges the results into one, with
 * tables and relationships de-duplicated across files.
 *
 * @param files - Schema files (path plus YAML content) to parse.
 * @param variables - Optional Jinja variables applied to every file.
 * @param options - Optional project name (defaults to null) and logger (defaults to a no-op logger).
 * @returns The merged parse result.
 */
export function parseDbtSchemaFiles(
  files: DbtSchemaFile[],
  variables?: Map<string, string>,
  options: { projectName?: string | null; logger?: KloLogger } = {},
): DbtSchemaParseResult {
  const { projectName = null, logger } = options;
  const parser = new DbtSchemaParser(logger ?? noopLogger);
  return parser.parseFiles(files, variables, projectName ?? null);
}
/**
 * Computes a short fingerprint of a set of schema files.
 *
 * Files are ordered by path on a copy (the input array is not mutated), so
 * the result does not depend on the order in which files were collected.
 *
 * @param files - Schema files to fingerprint.
 * @returns The first 16 hex characters of the SHA-256 of the `path:content`
 *   entries joined by newlines.
 */
export function computeDbtSchemaHash(files: DbtSchemaFile[]): string {
  const ordered = files.slice();
  ordered.sort((left, right) => left.path.localeCompare(right.path));

  const entries: string[] = [];
  for (const { path, content } of ordered) {
    entries.push(`${path}:${content}`);
  }

  const hasher = createHash('sha256');
  hasher.update(entries.join('\n'));
  return hasher.digest('hex').substring(0, 16);
}
/**
 * Parses dbt schema YAML (`sources:` and `models:` sections) into normalized
 * tables, columns, data tests and relationships.
 *
 * The YAML is user-authored and only *asserted* to match the `DbtSchema*`
 * interfaces, so every list entry is checked structurally before use.
 * Malformed entries (null list items, unnamed tables or columns, non-string
 * fields) are skipped instead of aborting the whole parse with a `TypeError`.
 */
class DbtSchemaParser {
  constructor(private readonly logger: KloLogger) {}

  /**
   * Parses one schema file.
   *
   * @param yamlContent - Raw YAML text.
   * @param options - Path (for logs), Jinja variables and project name.
   * @returns Parsed tables and relationships; an empty result when the YAML
   *   cannot be parsed or its top level is not an object.
   */
  parseFile(yamlContent: string, options: ParseDbtSchemaOptions = {}): DbtSchemaParseResult {
    this.logger.debug(`Parsing schema file: ${options.path ?? 'unknown'}`);
    // Jinja variables are only substituted when the caller supplies a variable map.
    const resolved = options.variables
      ? resolveJinjaVariables(yamlContent, options.variables)
      : { content: yamlContent, unresolvedVars: [] };
    if (resolved.unresolvedVars.length > 0) {
      this.logger.warn(
        `Unresolved dbt variables in ${options.path ?? 'schema file'}: ${resolved.unresolvedVars.join(', ')}`,
      );
    }
    let schema: DbtSchemaYaml;
    try {
      schema = parseYaml(resolved.content) as DbtSchemaYaml;
    } catch (error) {
      this.logger.warn(`Failed to parse YAML${options.path ? ` at ${options.path}` : ''}: ${error}`);
      return this.emptyResult(options.projectName ?? null);
    }
    // Empty documents parse to null and scalar documents to primitives.
    if (!schema || typeof schema !== 'object') {
      return this.emptyResult(options.projectName ?? null);
    }
    const tables = [...this.parseSources(schema.sources), ...this.parseModels(schema.models)];
    const relationships = [
      ...this.parseSourceRelationships(schema.sources),
      ...this.parseModelRelationships(schema.models),
    ];
    return {
      projectName: options.projectName ?? null,
      dbtVersion: null,
      tables,
      relationships,
    };
  }

  /**
   * Parses several schema files and merges the outcome.
   *
   * Tables that share database, schema and name (case-insensitively) are
   * merged; relationships with the same endpoints are reported once.
   */
  parseFiles(
    files: DbtSchemaFile[],
    variables?: Map<string, string>,
    projectName: string | null = null,
  ): DbtSchemaParseResult {
    const allTables: DbtParsedTable[] = [];
    const allRelationships: DbtParsedRelationship[] = [];
    for (const file of files) {
      const result = this.parseFile(file.content, { path: file.path, variables, projectName });
      allTables.push(...result.tables);
      allRelationships.push(...result.relationships);
    }
    return {
      projectName,
      dbtVersion: null,
      tables: this.deduplicateTables(allTables),
      relationships: this.deduplicateRelationships(allRelationships),
    };
  }

  /**
   * Flattens `sources[].tables[]` into parsed tables. Database, schema and
   * tags declared on the source are inherited by each of its tables.
   */
  private parseSources(sources: DbtSchemaSource[] | undefined): DbtParsedTable[] {
    if (!sources || !Array.isArray(sources)) {
      return [];
    }
    const tables: DbtParsedTable[] = [];
    for (const source of sources) {
      // A bare `-` list item parses to null; skip it rather than crash.
      if (!source || !Array.isArray(source.tables)) {
        continue;
      }
      const sourceSchema = source.schema ?? source.name;
      const sourceDatabase = source.database ?? null;
      const sourceTags = this.normalizeTagList(source.tags);
      for (const table of source.tables) {
        const tableName = table?.identifier ?? table?.name;
        // Mirrors parseModels: an entry without a usable name cannot be matched to anything.
        if (!table || !tableName) {
          continue;
        }
        const tagsDbt = this.mergeTagsDbt(sourceTags, this.normalizeTagList(table.tags));
        const freshnessDbt = this.buildFreshnessDbt(table.freshness, table.loaded_at_field);
        tables.push({
          name: tableName,
          description: this.normalizeDescription(table.description),
          database: sourceDatabase,
          schema: sourceSchema,
          columns: this.parseColumns(table.columns),
          resourceType: 'source',
          ...(tagsDbt ? { tagsDbt } : {}),
          ...(freshnessDbt ? { freshnessDbt } : {}),
        });
      }
    }
    return tables;
  }

  /** Converts `models[]` entries into parsed tables, skipping unnamed entries. */
  private parseModels(models: DbtSchemaModel[] | undefined): DbtParsedTable[] {
    if (!models || !Array.isArray(models)) {
      return [];
    }
    const tables: DbtParsedTable[] = [];
    for (const model of models) {
      if (!model || !model.name) {
        continue;
      }
      const tagsDbt = this.mergeTagsDbt(this.normalizeTagList(model.tags));
      const freshnessDbt = this.buildFreshnessDbt(model.freshness, model.loaded_at_field);
      tables.push({
        name: model.name,
        description: this.normalizeDescription(model.description),
        database: model.database ?? null,
        schema: model.schema ?? null,
        columns: this.parseColumns(model.columns),
        resourceType: 'model',
        ...(tagsDbt ? { tagsDbt } : {}),
        ...(freshnessDbt ? { freshnessDbt } : {}),
      });
    }
    return tables;
  }

  /**
   * Normalizes column entries. Optional keys (`dataTests`, `constraints`,
   * `enumValuesDbt`) are only present when they carry data. Entries without
   * a name are dropped because later merging and matching key on the name.
   */
  private parseColumns(columns: DbtSchemaColumn[] | undefined): DbtParsedColumn[] {
    if (!columns || !Array.isArray(columns)) {
      return [];
    }
    const parsed: DbtParsedColumn[] = [];
    for (const column of columns) {
      if (!column || !column.name) {
        continue;
      }
      // `data_tests` takes precedence over `tests` when both are present.
      const { refs, constraints, enumValues } = this.parseDataTests(column.data_tests ?? column.tests);
      parsed.push({
        name: column.name,
        description: this.normalizeDescription(column.description),
        dataType: column.data_type ?? null,
        ...(refs.length > 0 ? { dataTests: refs } : {}),
        ...(constraints ? { constraints } : {}),
        ...(enumValues.length > 0 ? { enumValuesDbt: enumValues } : {}),
      });
    }
    return parsed;
  }

  /**
   * Interprets a column's data-test list.
   *
   * @returns `refs` – one entry per test; `constraints` – set when a
   *   `not_null` or `unique` test is present; `enumValues` – stringified
   *   values from `accepted_values` tests, given either as a list, as
   *   `{ values: [...] }`, or nested as `{ arguments: { values: [...] } }`.
   */
  private parseDataTests(tests: DbtSchemaDataTest[] | undefined): {
    refs: DbtDataTestRef[];
    constraints: DbtColumnConstraints | undefined;
    enumValues: string[];
  } {
    const refs: DbtDataTestRef[] = [];
    const dbt: { not_null?: boolean; unique?: boolean } = {};
    const enumValues: string[] = [];
    if (!Array.isArray(tests) || tests.length === 0) {
      return { refs, constraints: undefined, enumValues };
    }
    for (const test of tests) {
      if (typeof test === 'string') {
        const parsed = this.parseTestNameString(test);
        refs.push(parsed);
        if (parsed.package === 'dbt' && parsed.name === 'not_null') {
          dbt.not_null = true;
        }
        if (parsed.package === 'dbt' && parsed.name === 'unique') {
          dbt.unique = true;
        }
        continue;
      }
      // Null (from a bare `-` item) or other scalars carry no test information.
      if (!test || typeof test !== 'object') {
        continue;
      }
      for (const [key, value] of Object.entries(test)) {
        const kwargs = this.asKwargs(value);
        if (key === 'relationships') {
          refs.push({ name: 'relationships', package: 'dbt', ...(kwargs ? { kwargs } : {}) });
          continue;
        }
        if (key === 'not_null') {
          refs.push({ name: 'not_null', package: 'dbt' });
          dbt.not_null = true;
          continue;
        }
        if (key === 'unique') {
          refs.push({ name: 'unique', package: 'dbt' });
          dbt.unique = true;
          continue;
        }
        if (key === 'accepted_values') {
          if (Array.isArray(value)) {
            enumValues.push(...value.map((item) => String(item)));
            refs.push({ name: 'accepted_values', package: 'dbt', kwargs: { values: value } });
            continue;
          }
          if (kwargs) {
            enumValues.push(...this.extractAcceptedValues(kwargs).map((item) => String(item)));
            refs.push({ name: 'accepted_values', package: 'dbt', kwargs });
            continue;
          }
          // Any other value shape falls through and is recorded as a plain test reference.
        }
        refs.push({
          ...this.parseTestNameString(key),
          ...(kwargs ? { kwargs } : {}),
        });
      }
    }
    const constraints = dbt.not_null || dbt.unique ? { dbt } : undefined;
    return { refs, constraints, enumValues };
  }

  /** Returns `value` as a kwargs mapping when it is a non-array object, otherwise undefined. */
  private asKwargs(value: unknown): Record<string, unknown> | undefined {
    return value && typeof value === 'object' && !Array.isArray(value)
      ? (value as Record<string, unknown>)
      : undefined;
  }

  /**
   * Finds the value list of an `accepted_values` test. A direct `values` key
   * wins; otherwise `arguments.values` is used, matching the nested form
   * that relationship tests already accept.
   */
  private extractAcceptedValues(kwargs: Record<string, unknown>): unknown[] {
    if (Array.isArray(kwargs.values)) {
      return kwargs.values;
    }
    const nested = this.asKwargs(kwargs.arguments)?.values;
    return Array.isArray(nested) ? nested : [];
  }

  /**
   * Splits a test name at its first dot into package and name. A name
   * without a dot belongs to the built-in `dbt` package.
   */
  private parseTestNameString(value: string): { name: string; package: string } {
    const parts = value.split('.');
    if (parts.length >= 2) {
      return { package: parts[0]!, name: parts.slice(1).join('.') };
    }
    return { package: 'dbt', name: value };
  }

  /** Collects relationships declared by `relationships` tests on source table columns. */
  private parseSourceRelationships(sources: DbtSchemaSource[] | undefined): DbtParsedRelationship[] {
    if (!sources || !Array.isArray(sources)) {
      return [];
    }
    const relationships: DbtParsedRelationship[] = [];
    for (const source of sources) {
      if (!source || !Array.isArray(source.tables)) {
        continue;
      }
      const sourceSchema = source.schema ?? source.name;
      for (const table of source.tables) {
        const tableName = table?.identifier ?? table?.name;
        if (!table || !tableName) {
          continue;
        }
        relationships.push(...this.collectColumnRelationships(table.columns, tableName, sourceSchema));
      }
    }
    return relationships;
  }

  /** Collects relationships declared by `relationships` tests on model columns. */
  private parseModelRelationships(models: DbtSchemaModel[] | undefined): DbtParsedRelationship[] {
    if (!models || !Array.isArray(models)) {
      return [];
    }
    const relationships: DbtParsedRelationship[] = [];
    for (const model of models) {
      if (!model || !model.name) {
        continue;
      }
      relationships.push(...this.collectColumnRelationships(model.columns, model.name, model.schema ?? undefined));
    }
    return relationships;
  }

  /** Extracts every parseable relationship test from one table's column list. */
  private collectColumnRelationships(
    columns: DbtSchemaColumn[] | undefined,
    fromTable: string,
    fromSchema: string | undefined,
  ): DbtParsedRelationship[] {
    if (!Array.isArray(columns)) {
      return [];
    }
    const relationships: DbtParsedRelationship[] = [];
    for (const column of columns) {
      if (!column || !column.name) {
        continue;
      }
      const tests = column.data_tests ?? column.tests;
      if (!Array.isArray(tests)) {
        continue;
      }
      for (const test of tests) {
        const relationship = this.parseRelationshipTest(test, fromTable, column.name, fromSchema);
        if (relationship) {
          relationships.push(relationship);
        }
      }
    }
    return relationships;
  }

  /**
   * Converts one data-test entry into a relationship.
   *
   * @returns null when the entry is not a `relationships` test, lacks a
   *   string `to` / `field` (directly or under `arguments`), or its target
   *   is neither a `ref(...)` nor a `source(...)` expression.
   */
  private parseRelationshipTest(
    test: DbtSchemaDataTest,
    fromTable: string,
    fromColumn: string,
    fromSchema?: string,
  ): DbtParsedRelationship | null {
    if (!test || typeof test !== 'object' || !test.relationships) {
      return null;
    }
    const relationship = test.relationships;
    const toRef: unknown = relationship.to ?? relationship.arguments?.to;
    const toColumn: unknown = relationship.field ?? relationship.arguments?.field;
    // YAML scalars can be numbers or booleans; only non-empty strings are usable here.
    if (typeof toRef !== 'string' || typeof toColumn !== 'string' || !toRef || !toColumn) {
      this.logger.debug(`Skipping incomplete relationship test for ${fromTable}.${fromColumn}`);
      return null;
    }
    const toTable = this.parseRef(toRef);
    if (!toTable) {
      this.logger.debug(`Could not parse ref: ${toRef}`);
      return null;
    }
    return {
      fromTable,
      fromColumn,
      toTable,
      toColumn,
      ...(fromSchema ? { fromSchema } : {}),
    };
  }

  /**
   * Extracts the table name from `ref('model')` or `source('src', 'table')`.
   * For `source(...)` the second argument (the table) is returned.
   */
  private parseRef(refString: string): string | null {
    const refName = refString.match(/ref\s*\(\s*['"]([^'"]+)['"]\s*\)/)?.[1];
    if (refName) {
      return refName;
    }
    const sourceTable = refString.match(/source\s*\(\s*['"][^'"]+['"]\s*,\s*['"]([^'"]+)['"]\s*\)/)?.[1];
    return sourceTable ?? null;
  }

  /** Trims a description; missing, non-string or blank text becomes null. */
  private normalizeDescription(description: string | undefined): string | null {
    if (typeof description !== 'string') {
      return null;
    }
    const trimmed = description.trim();
    return trimmed.length > 0 ? trimmed : null;
  }

  /**
   * Normalizes a `tags` value to a list of strings. dbt accepts either a
   * single string or a list, so a lone string becomes a one-element list.
   */
  private normalizeTagList(tags: string[] | string | undefined): string[] {
    if (typeof tags === 'string') {
      return tags.length > 0 ? [tags] : [];
    }
    if (!tags || !Array.isArray(tags)) {
      return [];
    }
    return tags.map((tag) => String(tag));
  }

  /** Concatenates lists in order, dropping repeats; returns undefined when nothing remains. */
  private mergeTagsDbt(...lists: Array<string[] | undefined>): string[] | undefined {
    const merged: string[] = [];
    const seen = new Set<string>();
    for (const list of lists) {
      for (const item of list ?? []) {
        if (!seen.has(item)) {
          seen.add(item);
          merged.push(item);
        }
      }
    }
    return merged.length > 0 ? merged : undefined;
  }

  /**
   * Builds the freshness descriptor for a table.
   *
   * @returns undefined when neither a freshness block nor a non-blank
   *   `loaded_at_field` is present. Otherwise `raw` carries the freshness
   *   block (when given) and `loadedAtField` the trimmed field name, or
   *   null when it is missing or blank.
   */
  private buildFreshnessDbt(freshness: unknown, loadedAtField: string | undefined): DbtParsedTable['freshnessDbt'] {
    const loadedTrim = typeof loadedAtField === 'string' ? loadedAtField.trim() : undefined;
    const hasFreshness = freshness !== undefined && freshness !== null;
    if (!hasFreshness && !loadedTrim) {
      return undefined;
    }
    return {
      ...(hasFreshness ? { raw: freshness } : {}),
      // `||` (not `??`) so a whitespace-only field is reported as null, never as ''.
      loadedAtField: loadedTrim || null,
    };
  }

  /**
   * Merges tables that share `database.schema.name` (case-insensitive).
   * The first occurrence wins for scalar fields; columns, tags and
   * freshness are merged.
   */
  private deduplicateTables(tables: DbtParsedTable[]): DbtParsedTable[] {
    const seen = new Map<string, DbtParsedTable>();
    for (const table of tables) {
      const key = `${table.database ?? ''}.${table.schema ?? ''}.${table.name}`.toLowerCase();
      const existing = seen.get(key);
      if (!existing) {
        seen.set(key, table);
        continue;
      }
      seen.set(key, {
        ...existing,
        description: existing.description ?? table.description,
        columns: this.mergeColumns(existing.columns, table.columns),
        tagsDbt: this.mergeTagsDbt(existing.tagsDbt, table.tagsDbt),
        freshnessDbt: this.mergeFreshnessDbt(existing.freshnessDbt, table.freshnessDbt),
      });
    }
    return Array.from(seen.values());
  }

  /**
   * Merges two column lists by case-insensitive name. Existing values win
   * for description and data type; tests, constraints and enum values are
   * combined. First-seen order is kept.
   */
  private mergeColumns(existing: DbtParsedColumn[], incoming: DbtParsedColumn[]): DbtParsedColumn[] {
    const seen = new Map<string, DbtParsedColumn>();
    for (const column of existing) {
      seen.set(column.name.toLowerCase(), column);
    }
    for (const column of incoming) {
      const key = column.name.toLowerCase();
      const existingColumn = seen.get(key);
      if (!existingColumn) {
        seen.set(key, column);
        continue;
      }
      seen.set(key, {
        ...existingColumn,
        description: existingColumn.description ?? column.description,
        dataType: existingColumn.dataType ?? column.dataType,
        dataTests: this.mergeDbtDataTests(existingColumn.dataTests, column.dataTests),
        constraints: this.mergeDbtConstraints(existingColumn.constraints, column.constraints),
        enumValuesDbt: this.mergeStringList(existingColumn.enumValuesDbt, column.enumValuesDbt),
      });
    }
    return Array.from(seen.values());
  }

  /**
   * Keeps the first relationship for each `from.table.column -> to.table.column`
   * key (case-insensitive). Schemas are not part of the key.
   */
  private deduplicateRelationships(relationships: DbtParsedRelationship[]): DbtParsedRelationship[] {
    const seen = new Set<string>();
    const result: DbtParsedRelationship[] = [];
    for (const relationship of relationships) {
      const key =
        `${relationship.fromTable}.${relationship.fromColumn}->${relationship.toTable}.${relationship.toColumn}`.toLowerCase();
      if (!seen.has(key)) {
        seen.add(key);
        result.push(relationship);
      }
    }
    return result;
  }

  /** Merges freshness descriptors field by field, preferring the existing values. */
  private mergeFreshnessDbt(
    existing?: DbtParsedTable['freshnessDbt'],
    incoming?: DbtParsedTable['freshnessDbt'],
  ): DbtParsedTable['freshnessDbt'] {
    if (!existing && !incoming) {
      return undefined;
    }
    const raw = existing?.raw !== undefined ? existing.raw : incoming?.raw;
    // `??` lets an incoming field name replace an existing null.
    const loadedAtField = existing?.loadedAtField ?? incoming?.loadedAtField;
    return {
      ...(raw !== undefined ? { raw } : {}),
      ...(loadedAtField !== undefined ? { loadedAtField } : {}),
    };
  }

  /** ORs the `not_null` / `unique` flags of both sides; undefined when neither flag is set. */
  private mergeDbtConstraints(
    existing?: DbtColumnConstraints,
    incoming?: DbtColumnConstraints,
  ): DbtColumnConstraints | undefined {
    const notNull = !!(existing?.dbt.not_null || incoming?.dbt.not_null);
    const unique = !!(existing?.dbt.unique || incoming?.dbt.unique);
    if (!notNull && !unique) {
      return undefined;
    }
    return { dbt: { ...(notNull ? { not_null: true } : {}), ...(unique ? { unique: true } : {}) } };
  }

  /** Order-preserving union of two string lists; undefined when the union is empty. */
  private mergeStringList(existing?: string[], incoming?: string[]): string[] | undefined {
    return this.mergeTagsDbt(existing, incoming);
  }

  /**
   * Unions two test lists. Tests are considered equal when package, name
   * and (a hash of) their JSON-serialized kwargs match; for equal tests the
   * later entry replaces the earlier one in place.
   */
  private mergeDbtDataTests(existing?: DbtDataTestRef[], incoming?: DbtDataTestRef[]): DbtDataTestRef[] | undefined {
    if (!existing?.length) {
      return incoming?.length ? [...incoming] : undefined;
    }
    if (!incoming?.length) {
      return [...existing];
    }
    const tests = new Map<string, DbtDataTestRef>();
    for (const test of [...existing, ...incoming]) {
      const kwargsKey =
        test.kwargs && Object.keys(test.kwargs).length > 0
          ? `:${createHash('sha256').update(JSON.stringify(test.kwargs)).digest('hex').slice(0, 16)}`
          : '';
      tests.set(`${test.package}:${test.name}${kwargsKey}`, test);
    }
    return [...tests.values()];
  }

  /** Result returned when a file yields nothing usable. */
  private emptyResult(projectName: string | null): DbtSchemaParseResult {
    return {
      projectName,
      dbtVersion: null,
      tables: [],
      relationships: [],
    };
  }
}

View file

@ -0,0 +1,102 @@
import { describe, expect, it } from 'vitest';
import type { DbtSchemaParseResult } from './parse-schema.js';
import { toDescriptionUpdates } from './to-description-updates.js';
import type { DbtHostTableLite } from './match-tables.js';
// Host-side fixture: one `orders` table with columns `id` and `amount`.
// The dbt fixture built by `parseResult` also declares a `missing` column,
// which has no counterpart here.
const hostTables: DbtHostTableLite[] = [
{
id: '1',
name: 'orders',
catalog: 'warehouse',
db: 'analytics',
columns: [
{ id: 'c1', name: 'id' },
{ id: 'c2', name: 'amount' },
],
},
];
/**
 * Builds a parse result holding a single `orders` model in
 * `warehouse.analytics`.
 *
 * @param description - Table-level description to place on the model.
 * @param columnDescription - Description to place on the `id` column.
 * @returns A result whose table has two columns: `id`, and `missing`,
 *   which always carries the description `'not imported'`.
 */
function parseResult(description: string | null, columnDescription: string | null): DbtSchemaParseResult {
  const idColumn = { name: 'id', description: columnDescription, dataType: null };
  const unmatchedColumn = { name: 'missing', description: 'not imported', dataType: null };
  return {
    projectName: null,
    dbtVersion: null,
    relationships: [],
    tables: [
      {
        name: 'orders',
        description,
        database: 'warehouse',
        schema: 'analytics',
        resourceType: 'model',
        columns: [idColumn, unmatchedColumn],
      },
    ],
  };
}
// Specifies which payloads `toDescriptionUpdates` emits for a matched table:
// `dbt` entries carry description text, `aiInvalidations` entries name the
// table with source 'ai'.
describe('dbt descriptions update payloads', () => {
it('emits dbt writes and matching ai invalidations when descriptions exist', () => {
expect(
toDescriptionUpdates({
connectionId: 'conn-1',
parseResult: parseResult('Orders table', 'Primary key'),
hostTables,
targetSchema: null,
}),
).toEqual({
dbt: [
{
connectionId: 'conn-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'dbt',
tableDescription: 'Orders table',
// Only `id` appears: the fixture's `missing` column has no host counterpart.
columnDescriptions: { id: 'Primary key' },
},
],
aiInvalidations: [
{
connectionId: 'conn-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'ai',
},
],
});
});
it('does not emit spurious dbt writes or ai invalidations when no descriptions exist', () => {
expect(
toDescriptionUpdates({
connectionId: 'conn-1',
parseResult: parseResult(null, null),
hostTables,
targetSchema: null,
}),
).toEqual({ dbt: [], aiInvalidations: [] });
});
it('emits ai invalidation without a dbt description write when only structural metadata exists', () => {
const result = parseResult(null, null);
// Tags count as a metadata change even though no description text exists.
result.tables[0]!.tagsDbt = ['finance'];
expect(
toDescriptionUpdates({
connectionId: 'conn-1',
parseResult: result,
hostTables,
targetSchema: null,
}),
).toEqual({
dbt: [],
aiInvalidations: [
{
connectionId: 'conn-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'ai',
},
],
});
});
});

View file

@ -0,0 +1,70 @@
import type { KloDescriptionUpdate } from '../../../scan/enrichment-types.js';
import { findMatchingKloTable, type DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/** Description payloads derived from a dbt parse result. */
export interface DbtDescriptionUpdates {
/** Updates with source 'dbt', carrying table and/or column description text. */
dbt: KloDescriptionUpdate[];
/** Updates with source 'ai' and no description fields, one per changed table. */
aiInvalidations: KloDescriptionUpdate[];
}
/**
 * Converts parsed dbt tables into description update payloads.
 *
 * A dbt table contributes only when `findMatchingKloTable` resolves it to a
 * host table. For such a table:
 * - a `dbt` update is emitted when the table has a description or at least
 *   one described column that exists on the host table (matched by
 *   case-insensitive name, keyed by the host column's name);
 * - an `ai` entry is emitted when there is a description change or any dbt
 *   metadata (tags, freshness, column constraints, enum values or tests).
 *
 * @param input - Connection id, parse result, host tables and target schema.
 * @returns The `dbt` writes and the `ai` invalidations.
 */
export function toDescriptionUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
  targetSchema: string | null;
}): DbtDescriptionUpdates {
  const { connectionId, parseResult, hostTables, targetSchema } = input;
  const dbtWrites: KloDescriptionUpdate[] = [];
  const invalidations: KloDescriptionUpdate[] = [];

  for (const dbtTable of parseResult.tables) {
    const matched = findMatchingKloTable(dbtTable, hostTables, targetSchema);
    if (!matched) {
      continue;
    }

    // Gather descriptions for columns that also exist on the host table.
    const columnDescriptions: Record<string, string | null> = {};
    for (const { name, description } of dbtTable.columns) {
      if (!description) {
        continue;
      }
      const wanted = name.toLowerCase();
      const hostColumn = matched.columns.find((candidate) => candidate.name.toLowerCase() === wanted);
      if (hostColumn) {
        columnDescriptions[hostColumn.name] = description;
      }
    }

    const tableDescription = dbtTable.description ?? undefined;
    const hasColumnDescriptions = Object.keys(columnDescriptions).length > 0;
    const describesSomething = tableDescription !== undefined || hasColumnDescriptions;

    const columnCarriesMetadata = dbtTable.columns.some(
      (column) => column.constraints !== undefined || !!column.enumValuesDbt?.length || !!column.dataTests?.length,
    );
    const carriesMetadata =
      !!dbtTable.tagsDbt?.length || dbtTable.freshnessDbt !== undefined || columnCarriesMetadata;

    if (!(describesSomething || carriesMetadata)) {
      continue;
    }

    const tableRef = { catalog: matched.catalog, db: matched.db, name: matched.name };
    if (describesSomething) {
      const write: KloDescriptionUpdate = {
        connectionId,
        table: tableRef,
        source: 'dbt',
        ...(tableDescription !== undefined ? { tableDescription } : {}),
        ...(hasColumnDescriptions ? { columnDescriptions } : {}),
      };
      dbtWrites.push(write);
    }
    invalidations.push({ connectionId, table: tableRef, source: 'ai' });
  }

  return { dbt: dbtWrites, aiInvalidations: invalidations };
}

View file

@ -0,0 +1,70 @@
import { describe, expect, it } from 'vitest';
import { toMetadataUpdates } from './to-metadata-updates.js';
// Specifies the payload `toMetadataUpdates` produces for a matched table:
// table-level tags/freshness plus per-column constraints, enum values and tests.
describe('toMetadataUpdates', () => {
it('emits source-keyed dbt metadata updates for matched tables and columns', () => {
const updates = toMetadataUpdates({
connectionId: 'conn_1',
targetSchema: 'analytics',
hostTables: [
{
id: 'orders-id',
name: 'orders',
catalog: 'warehouse',
db: 'analytics',
columns: [
{ id: 'status-id', name: 'status' },
{ id: 'created-id', name: 'created_at' },
],
},
],
parseResult: {
projectName: null,
dbtVersion: null,
relationships: [],
tables: [
{
name: 'orders',
description: null,
database: 'warehouse',
schema: 'analytics',
resourceType: 'model',
tagsDbt: ['finance'],
freshnessDbt: { loadedAtField: 'created_at' },
columns: [
{
name: 'status',
description: null,
dataType: null,
enumValuesDbt: ['placed', 'shipped'],
constraints: { dbt: { not_null: true } },
dataTests: [{ name: 'accepted_values', package: 'dbt', kwargs: { values: ['placed', 'shipped'] } }],
},
],
},
],
},
});
expect(updates).toEqual([
{
connectionId: 'conn_1',
table: { catalog: 'warehouse', db: 'analytics', name: 'orders' },
source: 'dbt',
tableFields: {
tags: ['finance'],
// Output uses snake_case `loaded_at_field`; the parsed input uses `loadedAtField`.
freshness: { loaded_at_field: 'created_at' },
},
columnFields: {
status: {
// The `dbt` wrapper of the parsed constraints is unwrapped in the output.
constraints: { not_null: true },
enum_values: ['placed', 'shipped'],
tests: [
{ name: 'accepted_values', package: 'dbt', kwargs: { values: ['placed', 'shipped'] } },
],
},
},
},
]);
});
});

View file

@ -0,0 +1,74 @@
import type { KloMetadataUpdate } from '../../../scan/enrichment-types.js';
import { findMatchingKloTable, type DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/**
 * Converts parsed dbt tables into metadata update payloads with source 'dbt'.
 *
 * A dbt table contributes only when `findMatchingKloTable` resolves it to a
 * host table and it carries at least one table-level or column-level field.
 * Table fields are `tags` and `freshness` (`raw`, `loaded_at_field`).
 * Column fields are `constraints`, `enum_values` and `tests`, emitted only
 * for columns that exist on the host table (case-insensitive name match,
 * keyed by the host column's name).
 *
 * @param input - Connection id, parse result, host tables and target schema.
 * @returns One update per contributing table, in parse order.
 */
export function toMetadataUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
  targetSchema: string | null;
}): KloMetadataUpdate[] {
  const { connectionId, parseResult, hostTables, targetSchema } = input;

  return parseResult.tables.flatMap((dbtTable): KloMetadataUpdate[] => {
    const matched = findMatchingKloTable(dbtTable, hostTables, targetSchema);
    if (!matched) {
      return [];
    }

    // Table-level metadata.
    const tableFields: Record<string, unknown> = {};
    if (dbtTable.tagsDbt?.length) {
      tableFields.tags = dbtTable.tagsDbt;
    }
    const freshness = dbtTable.freshnessDbt;
    if (freshness) {
      const freshnessFields: Record<string, unknown> = {};
      if (freshness.raw !== undefined) {
        freshnessFields.raw = freshness.raw;
      }
      if (freshness.loadedAtField !== undefined) {
        freshnessFields.loaded_at_field = freshness.loadedAtField;
      }
      tableFields.freshness = freshnessFields;
    }

    // Column-level metadata, restricted to columns known to the host table.
    const columnFields: Record<string, Record<string, unknown>> = {};
    for (const dbtColumn of dbtTable.columns) {
      const wanted = dbtColumn.name.toLowerCase();
      const hostColumn = matched.columns.find((candidate) => candidate.name.toLowerCase() === wanted);
      if (!hostColumn) {
        continue;
      }
      const fields: Record<string, unknown> = {};
      if (dbtColumn.constraints) {
        fields.constraints = dbtColumn.constraints.dbt;
      }
      if (dbtColumn.enumValuesDbt?.length) {
        fields.enum_values = dbtColumn.enumValuesDbt;
      }
      if (dbtColumn.dataTests?.length) {
        fields.tests = dbtColumn.dataTests.map(({ name, package: pkg, kwargs }) => ({
          name,
          package: pkg,
          ...(kwargs ? { kwargs } : {}),
        }));
      }
      if (Object.keys(fields).length > 0) {
        columnFields[hostColumn.name] = fields;
      }
    }

    const hasTableFields = Object.keys(tableFields).length > 0;
    const hasColumnFields = Object.keys(columnFields).length > 0;
    if (!hasTableFields && !hasColumnFields) {
      return [];
    }

    return [
      {
        connectionId,
        table: { catalog: matched.catalog, db: matched.db, name: matched.name },
        source: 'dbt',
        ...(hasTableFields ? { tableFields } : {}),
        ...(hasColumnFields ? { columnFields } : {}),
      },
    ];
  });
}

View file

@ -0,0 +1,62 @@
import { describe, expect, it } from 'vitest';
import type { DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
import { toRelationshipUpdates } from './to-relationship-updates.js';
// Author email expected on dbt-derived joins, assembled from two fragments.
// NOTE(review): presumably split so the full address never appears as one
// literal in the source — confirm the reason before simplifying.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');
// Host-side fixture: `orders` lives in db `analytics`, `customers` in db `staging`.
const hostTables: DbtHostTableLite[] = [
{
id: '1',
name: 'orders',
catalog: 'warehouse',
db: 'analytics',
columns: [{ id: 'c1', name: 'customer_id' }],
},
{
id: '2',
name: 'customers',
catalog: 'warehouse',
db: 'staging',
columns: [{ id: 'c2', name: 'id' }],
},
];
// Three relationships: one that resolves by table and column name, one whose
// source column does not exist on the host table, and one whose target table
// does not exist at all.
const parseResult: DbtSchemaParseResult = {
projectName: null,
dbtVersion: null,
tables: [],
relationships: [
{
fromTable: 'orders',
fromColumn: 'customer_id',
toTable: 'customers',
toColumn: 'id',
fromSchema: 'analytics',
toSchema: 'analytics',
description: 'schema intentionally differs from the host customers table',
},
{ fromTable: 'orders', fromColumn: 'missing', toTable: 'customers', toColumn: 'id' },
{ fromTable: 'orders', fromColumn: 'customer_id', toTable: 'missing_table', toColumn: 'id' },
],
};
// Pins the current contract of `toRelationshipUpdates`: tables are matched by
// name only, so the first fixture relationship still resolves even though its
// `toSchema` ('analytics') differs from the host table's db ('staging').
describe('dbt relationship update payloads', () => {
it('validates relationships using the current name-only matching behavior and dbt provenance', () => {
expect(toRelationshipUpdates({ connectionId: 'conn-1', parseResult, hostTables })).toEqual({
joins: [
{
connectionId: 'conn-1',
fromTable: 'orders',
fromColumns: ['customer_id'],
toTable: 'customers',
toColumns: ['id'],
relationship: 'many_to_one',
author: 'dbt',
authorEmail: DBT_SYSTEM_EMAIL,
},
],
// One relationship has an unknown column, one an unknown table.
skippedNoMatch: 2,
});
});
});

View file

@ -0,0 +1,57 @@
import type { KloJoinUpdate } from '../../../scan/enrichment-types.js';
import type { DbtHostTableLite } from './match-tables.js';
import type { DbtSchemaParseResult } from './parse-schema.js';
/** Join payloads derived from dbt relationships, plus a count of those that could not be resolved. */
export interface DbtRelationshipUpdates {
/** One join per relationship whose tables and columns were all found among the host tables. */
joins: KloJoinUpdate[];
/** Number of relationships dropped because a table or a column was not found. */
skippedNoMatch: number;
}
// Author email stamped on every dbt-derived join, assembled from two fragments.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');

/**
 * Converts dbt relationships into join payloads, keeping only those whose
 * tables and columns all exist among the host tables.
 *
 * Matching is by case-insensitive name only; the relationship's schemas are
 * not consulted. When several host tables share a name, the last one in
 * `hostTables` is used. Emitted names are the host's spellings.
 *
 * @param input - Connection id, parse result and host tables.
 * @returns The joins (each `many_to_one`, authored by 'dbt') and the number
 *   of relationships skipped because a table or column was not found.
 */
export function toRelationshipUpdates(input: {
  connectionId: string;
  parseResult: DbtSchemaParseResult;
  hostTables: DbtHostTableLite[];
}): DbtRelationshipUpdates {
  // Later entries overwrite earlier ones that share a lower-cased name.
  const hostByLowerName = new Map<string, DbtHostTableLite>(
    input.hostTables.map((table) => [table.name.toLowerCase(), table] as [string, DbtHostTableLite]),
  );
  const findColumn = (table: DbtHostTableLite, columnName: string) => {
    const wanted = columnName.toLowerCase();
    return table.columns.find((candidate) => candidate.name.toLowerCase() === wanted);
  };

  const result: DbtRelationshipUpdates = { joins: [], skippedNoMatch: 0 };
  for (const { fromTable, fromColumn, toTable, toColumn } of input.parseResult.relationships) {
    const source = hostByLowerName.get(fromTable.toLowerCase());
    const target = hostByLowerName.get(toTable.toLowerCase());
    const sourceColumn = source && target ? findColumn(source, fromColumn) : undefined;
    const targetColumn = source && target ? findColumn(target, toColumn) : undefined;

    if (!source || !target || !sourceColumn || !targetColumn) {
      result.skippedNoMatch += 1;
      continue;
    }

    result.joins.push({
      connectionId: input.connectionId,
      fromTable: source.name,
      fromColumns: [sourceColumn.name],
      toTable: target.name,
      toColumns: [targetColumn.name],
      relationship: 'many_to_one',
      author: 'dbt',
      authorEmail: DBT_SYSTEM_EMAIL,
    });
  }
  return result;
}

View file

@ -0,0 +1,410 @@
import { describe, expect, it } from 'vitest';
import { type DbtHostTableLite, matchDbtTables } from './dbt-descriptions/match-tables.js';
import { mergeSemanticModelTables } from './dbt-descriptions/merge-semantic-model-tables.js';
import { parseDbtSchemaFiles } from './dbt-descriptions/parse-schema.js';
import { toDescriptionUpdates } from './dbt-descriptions/to-description-updates.js';
import { toRelationshipUpdates } from './dbt-descriptions/to-relationship-updates.js';
import { parseMetricflowFiles } from './metricflow/deep-parse.js';
import { mapCrossModelMetricToSource, mapSemanticModelToSource } from './metricflow/semantic-models.js';
// Author email expected on dbt-derived joins, assembled from two fragments.
// NOTE(review): presumably split so the full address never appears as one
// literal in the source — confirm the reason before simplifying.
const DBT_SYSTEM_EMAIL = ['system@kae', 'lio.dev'].join('');
// MetricFlow fixture: two semantic models (`orders_semantic` over `fct_orders`,
// `customers_semantic` over `dim_customers`) and three metrics, the last of
// which (`revenue_per_customer`) is derived from the other two.
const metricflowYaml = `
semantic_models:
- name: orders_semantic
description: MetricFlow order facts
model: ref('fct_orders')
defaults:
agg_time_dimension: ordered_at
entities:
- name: customer
type: foreign
expr: customer_id
description: Customer relationship
dimensions:
- name: status
type: categorical
expr: status
description: Order status
- name: ordered_at
type: time
expr: ordered_at
measures:
- name: total_revenue
agg: sum
expr: amount
description: Revenue
- name: customers_semantic
description: Customer dimension
model: ref('dim_customers')
entities:
- name: customer
type: primary
expr: id
dimensions:
- name: country
type: categorical
expr: country
description: Customer country
measures:
- name: customer_count
agg: count
expr: id
description: Customer count
metrics:
- name: total_revenue
type: simple
type_params:
measure: total_revenue
- name: customer_count
type: simple
type_params:
measure: customer_count
- name: revenue_per_customer
description: Revenue per customer
type: derived
type_params:
expr: total_revenue / NULLIF(customer_count, 0)
metrics:
- name: total_revenue
alias: total_revenue
- name: customer_count
alias: customer_count
`;
// dbt schema fixture: a `raw` source whose `customers` table is given the
// identifier `dim_customers`, and one model whose name and schema are Jinja
// `var(...)` expressions with defaults (`fct_orders`, `analytics`). The
// model's `customer_id` column declares a relationship to `dim_customers.id`.
const schemaYaml = `
version: 2
sources:
- name: raw
database: warehouse
schema: landing
tables:
- name: customers
identifier: dim_customers
description: Raw customer dimension
columns:
- name: id
description: Customer primary key
- name: country
description: Country name
models:
- name: "{{ var('orders_model', 'fct_orders') }}"
schema: "{{ var('mart_schema', 'analytics') }}"
description: Modeled orders
columns:
- name: customer_id
description: Linked customer id
tests:
- relationships:
to: ref('dim_customers')
field: id
- name: status
description: Order status
- name: amount
description: Gross amount
`;
// Host-side tables the dbt tables are matched against: `fct_orders` in `warehouse.analytics`
// and `dim_customers` in `warehouse.landing`, each with the columns the fixtures refer to.
const hostTables: DbtHostTableLite[] = [
{
id: 'orders-table',
name: 'fct_orders',
catalog: 'warehouse',
db: 'analytics',
columns: [
{ id: 'orders-customer-id', name: 'customer_id' },
{ id: 'orders-status', name: 'status' },
{ id: 'orders-amount', name: 'amount' },
{ id: 'orders-ordered-at', name: 'ordered_at' },
],
},
{
id: 'customers-table',
name: 'dim_customers',
catalog: 'warehouse',
db: 'landing',
columns: [
{ id: 'customers-id', name: 'id' },
{ id: 'customers-country', name: 'country' },
],
},
];
// Golden test: every stage's full output is compared against a literal, so any change in the
// shape of the MetricFlow / dbt-description pipeline output shows up as a failure here.
describe('dbt extraction golden parity fixture', () => {
it('freezes the relocated MetricFlow and dbt-description contract together', () => {
// Stage 1: parse the MetricFlow YAML fixture and pin the complete parse result.
const metricflow = parseMetricflowFiles([{ path: 'semantic_models/orders.yml', content: metricflowYaml }]);
expect(metricflow).toEqual({
semanticModels: [
{
name: 'orders_semantic',
description: 'MetricFlow order facts',
modelRef: 'fct_orders',
dimensions: [
{
name: 'status',
column: 'status',
type: 'string',
label: 'Status',
description: 'Order status',
},
{
name: 'ordered_at',
column: 'ordered_at',
type: 'time',
label: 'Ordered At',
description: undefined,
},
],
measures: [
{
type: 'simple',
name: 'total_revenue',
column: 'amount',
aggregation: 'sum',
label: 'Total Revenue',
description: 'Revenue',
},
],
entities: [{ name: 'customer', type: 'foreign', expr: 'customer_id', description: 'Customer relationship' }],
defaultTimeDimension: 'ordered_at',
},
{
name: 'customers_semantic',
description: 'Customer dimension',
modelRef: 'dim_customers',
dimensions: [
{
name: 'country',
column: 'country',
type: 'string',
label: 'Country',
description: 'Customer country',
},
],
measures: [
{
type: 'simple',
name: 'customer_count',
column: 'id',
aggregation: 'count',
label: 'Customer Count',
description: 'Customer count',
},
],
entities: [{ name: 'customer', type: 'primary', expr: 'id' }],
defaultTimeDimension: null,
},
],
crossModelMetrics: [
{
name: 'revenue_per_customer',
label: null,
description: 'Revenue per customer',
type: 'derived',
expr: 'total_revenue / NULLIF(customer_count, 0)',
dependsOn: [
{ metricName: 'orders_semantic', alias: 'total_revenue' },
{ metricName: 'customers_semantic', alias: 'customer_count' },
],
filter: null,
},
],
relationships: [
{
fromTable: 'fct_orders',
fromColumn: 'customer_id',
toTable: 'dim_customers',
toColumn: 'id',
description: 'Customer relationship',
},
],
warnings: [],
});
// Stage 2: map the first semantic model and the cross-model metric to their source shapes.
expect(mapSemanticModelToSource(metricflow.semanticModels[0], 'analytics.fct_orders')).toEqual({
name: 'fct-orders',
table: 'analytics.fct_orders',
grain: ['status', 'ordered_at'],
columns: [
{ name: 'status', type: 'string', description: 'Order status' },
{ name: 'ordered_at', type: 'time' },
],
measures: [
{
name: 'total_revenue',
expr: 'sum(amount)',
description: 'Revenue',
},
],
joins: [],
descriptions: { dbt: 'MetricFlow order facts' },
});
expect(mapCrossModelMetricToSource(metricflow.crossModelMetrics[0])).toEqual({
name: 'revenue-per-customer',
sql: 'total_revenue / NULLIF(customer_count, 0)',
descriptions: { dbt: 'Revenue per customer' },
grain: [],
columns: [],
measures: [
{
name: 'revenue_per_customer',
expr: 'total_revenue / NULLIF(customer_count, 0)',
description: 'Revenue per customer',
},
],
joins: [],
});
// Stage 3: parse the dbt schema YAML with explicit values for both `var(...)` names,
// then merge the semantic models into the parsed schema.
const schema = parseDbtSchemaFiles(
[{ path: 'models/schema.yml', content: schemaYaml }],
new Map([
['orders_model', 'fct_orders'],
['mart_schema', 'analytics'],
]),
);
const merged = mergeSemanticModelTables(schema, metricflow.semanticModels);
expect(merged).toEqual({
projectName: null,
dbtVersion: null,
tables: [
{
name: 'dim_customers',
description: 'Raw customer dimension',
database: 'warehouse',
schema: 'landing',
columns: [
{ name: 'id', description: 'Customer primary key', dataType: null },
{ name: 'country', description: 'Country name', dataType: null },
],
resourceType: 'source',
},
{
name: 'fct_orders',
description: 'Modeled orders',
database: null,
schema: 'analytics',
columns: [
{
name: 'customer_id',
description: 'Linked customer id',
dataType: null,
dataTests: [
{
name: 'relationships',
package: 'dbt',
kwargs: { to: "ref('dim_customers')", field: 'id' },
},
],
},
{ name: 'status', description: 'Order status', dataType: null },
{ name: 'amount', description: 'Gross amount', dataType: null },
],
resourceType: 'model',
},
],
relationships: [
{
fromTable: 'fct_orders',
fromColumn: 'customer_id',
toTable: 'dim_customers',
toColumn: 'id',
fromSchema: 'analytics',
},
],
});
// Stage 4: match the merged dbt tables against the host tables.
expect(matchDbtTables(merged.tables, hostTables, 'analytics')).toEqual([
{
dbtTable: 'dim_customers',
dbtSchema: 'landing',
dbtDatabase: 'warehouse',
hostTableId: 'customers-table',
hostTableName: 'dim_customers',
matched: true,
tableDescriptionAction: 'import',
tableDescriptionFound: true,
columnsToImport: 2,
columnsMatched: 2,
columnsTotal: 2,
columnDescriptionsFound: 2,
},
{
dbtTable: 'fct_orders',
dbtSchema: 'analytics',
dbtDatabase: null,
hostTableId: 'orders-table',
hostTableName: 'fct_orders',
matched: true,
tableDescriptionAction: 'import',
tableDescriptionFound: true,
columnsToImport: 3,
columnsMatched: 3,
columnsTotal: 3,
columnDescriptionsFound: 3,
},
]);
// Stage 5: description updates. Each matched table yields one `dbt` update and one
// `ai` invalidation entry for the same table.
expect(
toDescriptionUpdates({
connectionId: 'warehouse-1',
parseResult: merged,
hostTables,
targetSchema: 'analytics',
}),
).toEqual({
dbt: [
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'landing', name: 'dim_customers' },
source: 'dbt',
tableDescription: 'Raw customer dimension',
columnDescriptions: {
id: 'Customer primary key',
country: 'Country name',
},
},
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'fct_orders' },
source: 'dbt',
tableDescription: 'Modeled orders',
columnDescriptions: {
customer_id: 'Linked customer id',
status: 'Order status',
amount: 'Gross amount',
},
},
],
aiInvalidations: [
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'landing', name: 'dim_customers' },
source: 'ai',
},
{
connectionId: 'warehouse-1',
table: { catalog: 'warehouse', db: 'analytics', name: 'fct_orders' },
source: 'ai',
},
],
});
// Stage 6: relationship updates. The single merged relationship becomes one join.
expect(toRelationshipUpdates({ connectionId: 'warehouse-1', parseResult: merged, hostTables })).toEqual({
joins: [
{
connectionId: 'warehouse-1',
fromTable: 'fct_orders',
fromColumns: ['customer_id'],
toTable: 'dim_customers',
toColumns: ['id'],
relationship: 'many_to_one',
author: 'dbt',
authorEmail: DBT_SYSTEM_EMAIL,
},
],
skippedNoMatch: 0,
});
});
});

View file

@ -0,0 +1,36 @@
import { describe, expect, it } from 'vitest';
import { chunkDbtProject } from './chunk.js';
describe('chunkDbtProject', () => {
  /** Diff set in which only `modified` is populated. */
  function modifiedOnly(modified: string[]) {
    return { added: [], modified, deleted: [], unchanged: [] };
  }

  /** Sorted listing of `dbt_project.yml`, any extra paths, and `count` files named `models/m<i>.yml`. */
  function projectPaths(count: number, extras: string[] = []): string[] {
    const models = Array.from({ length: count }, (_, index) => `models/m${index}.yml`);
    return ['dbt_project.yml', ...extras, ...models].sort();
  }

  it('caps peerFileIndex when the project has very many yaml files', () => {
    const result = chunkDbtProject({ allPaths: projectPaths(201) });
    const first = result.workUnits[0];

    expect(first).toBeDefined();
    expect(first?.peerFileIndex).toHaveLength(200);
    expect(first?.notes).toMatch(/capped at 200/);
  });

  it('keeps large-project model work units when dbt_project.yml changes', () => {
    const result = chunkDbtProject(
      { allPaths: projectPaths(30) },
      { diffSet: modifiedOnly(['dbt_project.yml']) },
    );

    expect(result.workUnits).toHaveLength(30);
    expect(result.workUnits[0]?.rawFiles).toEqual(['models/m0.yml']);
    expect(result.workUnits[0]?.dependencyPaths).toContain('dbt_project.yml');
  });

  it('keeps large-project model work units when non-model yaml peers change', () => {
    const result = chunkDbtProject(
      { allPaths: projectPaths(30, ['seeds/seed_properties.yml']) },
      { diffSet: modifiedOnly(['seeds/seed_properties.yml']) },
    );

    expect(result.workUnits).toHaveLength(30);
    expect(result.workUnits[0]?.rawFiles).toEqual(['models/m0.yml']);
    expect(result.workUnits[0]?.dependencyPaths).toContain('seeds/seed_properties.yml');
  });
});

View file

@ -0,0 +1,130 @@
import type { ChunkResult, DiffSet, WorkUnit } from '../../types.js';
import type { ParsedDbtProject } from './parse.js';
/** Optional inputs accepted by `chunkDbtProject`. */
interface ChunkOptions {
/** When present, the first-run work units are passed through `applyDiffSet` before being returned. */
diffSet?: DiffSet;
}
/**
 * Per-model work units (emitted when the project has more than 25 YAML files) only name
 * `rawFiles` under `models/**`. Other `.yml` files (e.g. some `seeds/` or custom layouts) still
 * appear in `peerFileIndex`, or in the small-project / no-models fallbacks; v1 does not emit one
 * WU per non-models file.
 */
const MODELS_PREFIX = 'models/';
/** `peerFileIndex` is a hint only (agents may not read those paths). Cap to limit prompt size. */
const MAX_PEER_FILE_INDEX = 200;
/**
 * Picks the dbt project file listed at the root of `allPaths`.
 *
 * @returns `dbt_project.yml` when listed, otherwise `dbt_project.yaml` when listed,
 *          otherwise `undefined`.
 */
function projectYamlPath(allPaths: string[]): string | undefined {
  const candidates = ['dbt_project.yml', 'dbt_project.yaml'];
  return candidates.find((candidate) => allPaths.includes(candidate));
}
/**
 * Returns, sorted, the entries of `allPaths` that sit under `models/`. Backslashes are treated as
 * separators for the prefix check only; the returned strings are the original entries.
 */
function modelRelativePaths(allPaths: string[]): string[] {
  const underModels: string[] = [];
  for (const candidate of allPaths) {
    const posixLike = candidate.replace(/\\/g, '/');
    if (posixLike.startsWith(MODELS_PREFIX)) {
      underModels.push(candidate);
    }
  }
  return underModels.sort();
}
/**
 * Derives a work-unit key from a model file path: the `.yml` / `.yaml` extension is dropped,
 * every run of non-alphanumeric characters becomes a single `-`, leading and trailing dashes are
 * removed, and the lower-cased result is prefixed with `dbt-`.
 */
function unitKeyForModelFile(mf: string): string {
  const withoutExtension = mf.replace(/\.(ya?ml)$/i, '');
  const posixLike = withoutExtension.replace(/\\/g, '/');
  const dashed = posixLike.replace(/[^a-zA-Z0-9]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return `dbt-${trimmed.toLowerCase()}`;
}
/**
 * Builds the work units for a run that has no diff information.
 *
 * - No paths: no work units.
 * - At most 25 paths: a single `dbt-all` unit whose `rawFiles` is every path.
 * - More than 25 paths but none under `models/`: a single `dbt-all` unit whose `rawFiles` is
 *   every path, with `dbtDep` (when given) as its only dependency.
 * - Otherwise: one unit per file under `models/`. Each unit lists every other path as a peer
 *   (sorted, capped at `MAX_PEER_FILE_INDEX`) and depends on `dbtDep` when that file is listed.
 *
 * @param allPaths Every YAML path of the project.
 * @param dbtDep   The project file path, or `undefined` when the project has none.
 */
function emitFirstRunWorkUnits(allPaths: string[], dbtDep: string | undefined): WorkUnit[] {
  if (allPaths.length === 0) {
    return [];
  }
  if (allPaths.length <= 25) {
    return [
      {
        unitKey: 'dbt-all',
        displayLabel: 'dbt project (all yaml)',
        rawFiles: [...allPaths],
        peerFileIndex: [],
        dependencyPaths: [],
        notes: 'dbt project — all YAML in one WorkUnit (≤25 files)',
      },
    ];
  }
  const modelFiles = modelRelativePaths(allPaths);
  if (modelFiles.length === 0) {
    return [
      {
        unitKey: 'dbt-all',
        displayLabel: 'dbt project (all yaml, no models/**)',
        rawFiles: [...allPaths],
        peerFileIndex: [],
        dependencyPaths: dbtDep ? [dbtDep] : [],
        notes: 'dbt: no models/**/*.yml — single slice with dbt_project as dependency if present',
      },
    ];
  }
  // Sort once up front. Filtering a sorted list keeps it sorted, so every unit's peer list comes
  // out in the same order as sorting per unit would give, without re-sorting per model file.
  const sortedPaths = [...allPaths].sort();
  // The dependency is only attached when the project file is actually part of the listing.
  const listedProjectFile = dbtDep && allPaths.includes(dbtDep) ? dbtDep : undefined;
  return modelFiles.map((mf) => {
    const allPeers = sortedPaths.filter((p) => p !== mf);
    const truncated = allPeers.length > MAX_PEER_FILE_INDEX;
    return {
      unitKey: unitKeyForModelFile(mf),
      displayLabel: `dbt ${mf}`,
      rawFiles: [mf],
      peerFileIndex: truncated ? allPeers.slice(0, MAX_PEER_FILE_INDEX) : allPeers,
      dependencyPaths: listedProjectFile !== undefined && mf !== listedProjectFile ? [listedProjectFile] : [],
      notes: truncated
        ? `dbt model schema slice (peer index capped at ${MAX_PEER_FILE_INDEX} of ${allPeers.length} paths)`
        : 'dbt model schema slice',
    };
  });
}
/**
 * Narrows first-run work units to the ones affected by a diff.
 *
 * A unit is kept when any of its raw files, dependencies, or peer-index entries was added or
 * modified. For a kept unit:
 * - `rawFiles` becomes the touched raw files, or all of its raw files when none was touched
 *   (i.e. only a dependency or peer changed);
 * - `dependencyPaths` becomes the union of the original dependencies, the unit's untouched raw
 *   files (only when some raw file was touched), and the touched peers.
 * Both lists are returned sorted. The input units and their arrays are never modified.
 *
 * @returns The kept units, plus `eviction.deletedRawPaths` (sorted) when the diff has deletions.
 */
function applyDiffSet(firstRunUnits: WorkUnit[], diffSet: DiffSet): ChunkResult {
  const touched = new Set([...diffSet.added, ...diffSet.modified]);
  const isTouched = (path: string): boolean => touched.has(path);
  const kept: WorkUnit[] = [];
  for (const wu of firstRunUnits) {
    const touchedRawFiles = wu.rawFiles.filter(isTouched);
    const touchedPeerFiles = wu.peerFileIndex.filter(isTouched);
    const dependencyTouched = wu.dependencyPaths.some(isTouched);
    if (touchedRawFiles.length === 0 && touchedPeerFiles.length === 0 && !dependencyTouched) {
      continue;
    }
    const onlyContextChanged = touchedRawFiles.length === 0;
    const rawFiles = onlyContextChanged ? wu.rawFiles : touchedRawFiles;
    const unchangedRaw = onlyContextChanged ? [] : wu.rawFiles.filter((p) => !touched.has(p));
    const combinedDeps = new Set<string>([...wu.dependencyPaths, ...unchangedRaw, ...touchedPeerFiles]);
    kept.push({
      ...wu,
      // Copy before sorting: `rawFiles` may be the input unit's own array.
      rawFiles: [...rawFiles].sort(),
      dependencyPaths: [...combinedDeps].sort(),
    });
  }
  const eviction = diffSet.deleted.length > 0 ? { deletedRawPaths: [...diffSet.deleted].sort() } : undefined;
  return { workUnits: kept, eviction };
}
/**
 * Splits a parsed dbt project into work units.
 *
 * @param project Parsed project; only `allPaths` is read.
 * @param opts    With `diffSet`, the first-run units are narrowed through `applyDiffSet`;
 *                without it they are returned as-is.
 */
export function chunkDbtProject(project: ParsedDbtProject, opts: ChunkOptions = {}): ChunkResult {
  const { allPaths } = project;
  const workUnits = emitFirstRunWorkUnits(allPaths, projectYamlPath(allPaths));
  const { diffSet } = opts;
  return diffSet ? applyDiffSet(workUnits, diffSet) : { workUnits };
}

View file

@ -0,0 +1,51 @@
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
import type { SourceAdapter } from '../../types.js';
import { DbtSourceAdapter } from './dbt.adapter.js';
describe('DbtSourceAdapter', () => {
  let stagedDir: string;
  let adapter: SourceAdapter;

  /** Writes a UTF-8 file at `relPath` inside the current staged directory. */
  const writeStaged = (relPath: string, content: string): Promise<void> =>
    writeFile(join(stagedDir, relPath), content, 'utf-8');

  beforeEach(async () => {
    stagedDir = await mkdtemp(join(tmpdir(), 'dbt-adapter-'));
    adapter = new DbtSourceAdapter();
  });

  afterEach(async () => {
    await rm(stagedDir, { recursive: true, force: true });
  });

  it('declares the expected source key and skill list', () => {
    expect(adapter.source).toBe('dbt');
    expect(adapter.skillNames).toEqual(['dbt_ingest']);
  });

  it('detects a staged dbt project root (dbt_project.yml)', async () => {
    await writeStaged('dbt_project.yml', "name: 'jaffle'\nversion: '1.0.0'\n");

    const detected = await adapter.detect(stagedDir);

    expect(detected).toBe(true);
  });

  it('chunk: dbt_project.yml + models/a.yml yields one WU (≤25 files)', async () => {
    await writeStaged('dbt_project.yml', "name: 'jaffle'\n");
    await mkdir(join(stagedDir, 'models'), { recursive: true });
    await writeStaged('models/a.yml', 'version: 2\nmodels:\n - name: orders\n description: Orders\n');

    const { workUnits, parseArtifacts } = await adapter.chunk(stagedDir);

    expect(workUnits).toHaveLength(1);
    expect(workUnits[0].unitKey).toBe('dbt-all');
    expect(parseArtifacts).toMatchObject({
      projectName: 'jaffle',
      tables: [{ name: 'orders', description: 'Orders' }],
    });
  });

  it('implements fetch() for git-backed dbt source setup', () => {
    expect(adapter.fetch).toBeTypeOf('function');
  });
});

View file

@ -0,0 +1,48 @@
import { join } from 'node:path';
import type { ChunkResult, DiffSet, SourceAdapter } from '../../types.js';
import type { FetchContext } from '../../types.js';
import { loadProjectInfo } from '../../dbt-shared/project-vars.js';
import { loadDbtSchemaFiles } from '../../dbt-shared/schema-files.js';
import { parseDbtSchemaFiles } from '../dbt-descriptions/parse-schema.js';
import { chunkDbtProject } from './chunk.js';
import { detectDbtStagedDir } from './detect.js';
import { fetchDbtRepo, type DbtPullConfig } from './fetch.js';
import { parseDbtStagedDir } from './parse.js';
/** Construction options for `DbtSourceAdapter`. */
interface DbtSourceAdapterOptions {
/** Base directory for the repo cache used by `fetch`; `'.klo/cache'` is used when omitted. */
homeDir?: string;
}
/** Source adapter for dbt projects: detection, git-backed fetch, and chunking of staged YAML. */
export class DbtSourceAdapter implements SourceAdapter {
  readonly source = 'dbt' as const;
  /** Runner merges: ingest_triage, sl_capture, knowledge_capture (see ingest-bundle.runner.ts) */
  readonly skillNames: string[] = ['dbt_ingest'];

  constructor(private readonly options: DbtSourceAdapterOptions = {}) {}

  /** Resolves to whatever `detectDbtStagedDir` reports for `stagedDir`. */
  detect(stagedDir: string): Promise<boolean> {
    return detectDbtStagedDir(stagedDir);
  }

  /**
   * Fetches the configured repository into a per-connection cache directory
   * (`<homeDir or '.klo/cache'>/dbt/<connectionId>`) and stages it into `stagedDir`.
   *
   * @throws Error when `pullConfig` has no `repoUrl`.
   */
  async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
    const config = pullConfig as DbtPullConfig | undefined;
    if (!config?.repoUrl) {
      throw new Error('dbt fetch requires repoUrl');
    }
    await fetchDbtRepo({
      config,
      cacheDir: join(this.options.homeDir ?? '.klo/cache', 'dbt', ctx.connectionId),
      stagedDir,
    });
  }

  /**
   * Chunks the staged project into work units and attaches the parsed schema files as
   * `parseArtifacts`.
   */
  async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    // The three reads are independent of each other, so they run concurrently.
    const [project, projectInfo, schemaFiles] = await Promise.all([
      parseDbtStagedDir(stagedDir),
      loadProjectInfo(stagedDir),
      loadDbtSchemaFiles(stagedDir),
    ]);
    const parseArtifacts = parseDbtSchemaFiles(schemaFiles, projectInfo.variables, {
      projectName: projectInfo.projectName,
    });
    return { ...chunkDbtProject(project, { diffSet }), parseArtifacts };
  }
}

View file

@ -0,0 +1,12 @@
import { access } from 'node:fs/promises';
import { join } from 'node:path';
/**
 * Reports whether `stagedDir` directly contains a dbt project file.
 *
 * `dbt_project.yml` is probed first, then `dbt_project.yaml`. A probe that fails for any reason
 * counts as "not present".
 */
export async function detectDbtStagedDir(stagedDir: string): Promise<boolean> {
  const isReachable = async (filePath: string): Promise<boolean> => {
    try {
      await access(filePath);
    } catch {
      return false;
    }
    return true;
  };

  if (await isReachable(join(stagedDir, 'dbt_project.yml'))) {
    return true;
  }
  return isReachable(join(stagedDir, 'dbt_project.yaml'));
}

View file

@ -0,0 +1,38 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { fetchDbtRepo } from './fetch.js';
describe('fetchDbtRepo', () => {
  let tempDir: string;

  beforeEach(async () => {
    tempDir = await mkdtemp(join(tmpdir(), 'klo-dbt-fetch-'));
  });

  afterEach(async () => {
    await rm(tempDir, { recursive: true, force: true });
  });

  it('copies dbt yaml files from a fetched repo subpath into staged dir', async () => {
    const cacheDir = join(tempDir, 'cache');
    const stagedDir = join(tempDir, 'staged');

    // Pre-populate the cache as if the repo had already been cloned; the clone step is stubbed.
    const projectDir = join(cacheDir, 'analytics');
    await mkdir(join(projectDir, 'models'), { recursive: true });
    await writeFile(join(projectDir, 'dbt_project.yml'), 'name: analytics\n', 'utf-8');
    await writeFile(join(projectDir, 'models', 'orders.yml'), 'models: []\n', 'utf-8');
    const cloneOrPull = vi.fn(async () => ({ commitHash: 'abc123' }));

    const outcome = await fetchDbtRepo({
      config: { repoUrl: 'https://github.com/acme/dbt.git', path: 'analytics' },
      cacheDir,
      stagedDir,
      deps: { cloneOrPull },
    });

    expect(outcome).toEqual({ commitHash: 'abc123', filesCopied: 2 });
    const stagedProject = await readFile(join(stagedDir, 'dbt_project.yml'), 'utf-8');
    expect(stagedProject).toContain('analytics');
    const stagedModel = await readFile(join(stagedDir, 'models', 'orders.yml'), 'utf-8');
    expect(stagedModel).toContain('models');
  });
});

View file

@ -0,0 +1,60 @@
import { access, copyFile, mkdir, readdir } from 'node:fs/promises';
import { dirname, join, relative } from 'node:path';
import { cloneOrPull, sanitizeRepoError } from '../../repo-fetch.js';
/** Pull settings for a git-hosted dbt project, as consumed by `fetchDbtRepo`. */
export interface DbtPullConfig {
/** Repository URL handed to `cloneOrPull`. */
repoUrl: string;
/** Branch to fetch; `fetchDbtRepo` falls back to `'main'` when omitted. */
branch?: string;
/** Sub-path joined onto the cache directory to locate the project; the cache directory itself is used when omitted or empty. */
path?: string;
/** Passed to `cloneOrPull`, and to `sanitizeRepoError` when a failure is rethrown. */
authToken?: string | null;
}
/** Arguments of `fetchDbtRepo`. */
export interface FetchDbtRepoParams {
/** Repository, branch, sub-path and token to fetch with. */
config: DbtPullConfig;
/** Directory handed to `cloneOrPull` as the cache location; YAML files are copied out of it. */
cacheDir: string;
/** Destination directory for the copied YAML files. */
stagedDir: string;
/** Optional overrides for collaborators (the test suite passes a stub `cloneOrPull`). */
deps?: {
cloneOrPull?: typeof cloneOrPull;
};
}
/**
 * Clones or updates the configured repository into `cacheDir`, then copies its YAML files
 * (from `config.path` inside the cache when set, otherwise from the cache root) into `stagedDir`.
 *
 * @returns The commit hash reported by the clone step and the number of files copied.
 * @throws Error whose message is the output of `sanitizeRepoError` for any failure along the way.
 */
export async function fetchDbtRepo(params: FetchDbtRepoParams): Promise<{ commitHash: string; filesCopied: number }> {
  const { config, cacheDir, stagedDir, deps } = params;
  try {
    const pull = deps?.cloneOrPull ?? cloneOrPull;
    const { commitHash } = await pull({
      repoUrl: config.repoUrl,
      authToken: config.authToken,
      cacheDir,
      branch: config.branch ?? 'main',
    });
    const sourceRoot = config.path ? join(cacheDir, config.path) : cacheDir;
    return { commitHash, filesCopied: await copyYamlFilesRecursive(sourceRoot, stagedDir) };
  } catch (error) {
    throw new Error(sanitizeRepoError(error, config.authToken));
  }
}
/**
 * Copies every `.yml` / `.yaml` file (extension matched case-insensitively) found anywhere under
 * `sourceRoot` to the same relative location under `destRoot`, creating directories as needed.
 *
 * @returns The number of files copied; `0`, without creating `destRoot`, when `sourceRoot` is not
 *          accessible.
 */
async function copyYamlFilesRecursive(sourceRoot: string, destRoot: string): Promise<number> {
  try {
    await access(sourceRoot);
  } catch {
    return 0;
  }

  await mkdir(destRoot, { recursive: true });
  const yamlName = /\.ya?ml$/i;
  const listing = await readdir(sourceRoot, { withFileTypes: true, recursive: true });
  const yamlFiles = listing.filter((item) => item.isFile() && yamlName.test(item.name));

  for (const file of yamlFiles) {
    const from = join(file.parentPath, file.name);
    const to = join(destRoot, relative(sourceRoot, from));
    await mkdir(dirname(to), { recursive: true });
    await copyFile(from, to);
  }
  return yamlFiles.length;
}

View file

@ -0,0 +1,8 @@
import { describe, expect, it } from 'vitest';
import { normalizeDbtPath } from './parse.js';
describe('normalizeDbtPath', () => {
  it('normalizes Windows separators to POSIX separators', () => {
    const windowsPath = 'models\\marts\\orders.yml';
    const posixPath = 'models/marts/orders.yml';

    expect(normalizeDbtPath(windowsPath)).toBe(posixPath);
  });
});

View file

@ -0,0 +1,32 @@
import { readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
/** Matches a trailing `.yml` or `.yaml` extension, case-insensitively. */
const YAML_EXT_RE = /\.(ya?ml)$/i;
/** Returns `path` with every backslash replaced by a forward slash. */
export function normalizeDbtPath(path: string): string {
  return path.split('\\').join('/');
}
/**
 * Lists every `.yml` / `.yaml` file anywhere under `stagedDir` as a path relative to `stagedDir`,
 * with forward slashes, sorted.
 */
async function collectYamlFiles(stagedDir: string): Promise<string[]> {
  const listing = await readdir(stagedDir, { withFileTypes: true, recursive: true });
  return listing
    .filter((item) => item.isFile() && YAML_EXT_RE.test(item.name))
    .map((item) => normalizeDbtPath(relative(stagedDir, join(item.parentPath, item.name))))
    .sort();
}
/** Result of `parseDbtStagedDir`: the YAML file listing of a staged dbt project. */
export interface ParsedDbtProject {
/** All `.yml` / `.yaml` paths under stagedDir, relative + sorted. */
allPaths: string[];
}
/** Scans `stagedDir` and returns its YAML file listing as a `ParsedDbtProject`. */
export async function parseDbtStagedDir(stagedDir: string): Promise<ParsedDbtProject> {
  return { allPaths: await collectYamlFiles(stagedDir) };
}

View file

@ -0,0 +1,48 @@
import { readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, SourceAdapter, WorkUnit } from '../../types.js';
/**
 * Minimal adapter: always detects, and emits one work unit per immediate sub-directory of the
 * staged directory that contains at least one file.
 */
export class FakeSourceAdapter implements SourceAdapter {
  readonly source = 'fake';
  readonly skillNames: string[] = [];

  /** Always resolves to `true`. */
  detect(): Promise<boolean> {
    return Promise.resolve(true);
  }

  /**
   * Builds `fake-<subDir>` work units, in sub-directory name order, whose `rawFiles` are all
   * files under that sub-directory (relative to `stagedDir`, sorted). With a `diffSet`, only
   * sub-directories holding at least one added or modified file are kept, and deleted paths are
   * reported through `eviction`.
   */
  async chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
    const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : undefined;

    const topLevel = await readdir(stagedDir, { withFileTypes: true });
    const subDirs: string[] = [];
    for (const item of topLevel) {
      if (item.isDirectory()) {
        subDirs.push(item.name);
      }
    }
    subDirs.sort();

    const workUnits: WorkUnit[] = [];
    for (const subDir of subDirs) {
      const listing = await readdir(join(stagedDir, subDir), { withFileTypes: true, recursive: true });
      const rawFiles = listing
        .filter((item) => item.isFile())
        .map((item) => relative(stagedDir, join(item.parentPath, item.name)))
        .sort();

      const hasFiles = rawFiles.length > 0;
      const relevant = touched === undefined || rawFiles.some((file) => touched.has(file));
      if (hasFiles && relevant) {
        workUnits.push({
          unitKey: `fake-${subDir}`,
          displayLabel: subDir,
          rawFiles,
          peerFileIndex: [],
          dependencyPaths: [],
        });
      }
    }

    const hasDeletions = diffSet !== undefined && diffSet.deleted.length > 0;
    const eviction = hasDeletions ? { deletedRawPaths: [...diffSet.deleted] } : undefined;
    return { workUnits, eviction };
  }
}

View file

@ -0,0 +1,146 @@
{
"name": "eviction-churn",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": [
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn"
]
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 3,
"rows": [
{
"queryid": "501",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 20,
"totalExecTime": 500,
"meanExecTime": 25,
"totalRows": 40
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": null,
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q501": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 20,
"totalExecTime": 500,
"totalRows": 40
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T08:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn",
"pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn",
"baseline_first_run:no_previous_pgss_baseline"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 3,
"templates": [
{
"id": "db5_q501",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q501/page.md"
}
]
}
},
"templates/db5_q501/metadata.json": {
"json": {
"id": "db5_q501",
"title": "postgres · analytics.orders [db5_q501]",
"path": "templates/db5_q501/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q501/page.md": {
"text": "# db5_q501\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q501/usage.json": {
"json": {
"stats": {
"executions": 20,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 25,
"error_rate": 0,
"rows_produced": 40
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,144 @@
{
"name": "first-run",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "101",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 10,
"totalExecTime": 250,
"meanExecTime": 25,
"totalRows": 20
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [
"^svc_"
],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": null,
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q101": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 10,
"totalExecTime": 250,
"totalRows": 20
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T08:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_first_run:no_previous_pgss_baseline"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q101",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q101/page.md"
}
]
}
},
"templates/db5_q101/metadata.json": {
"json": {
"id": "db5_q101",
"title": "postgres · analytics.orders [db5_q101]",
"path": "templates/db5_q101/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q101/page.md": {
"text": "# db5_q101\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q101/usage.json": {
"json": {
"stats": {
"executions": 10,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 25,
"error_rate": 0,
"rows_produced": 20
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,181 @@
{
"name": "normal-delta",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "201",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 12,
"totalExecTime": 160,
"meanExecTime": 13.333333333333334,
"totalRows": 58
},
{
"queryid": "201",
"userid": "12",
"username": "svc_loader",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 5,
"totalExecTime": 50,
"meanExecTime": 10,
"totalRows": 25
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [
"^svc_"
],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q201": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 10,
"totalExecTime": 100,
"totalRows": 50
},
"12": {
"calls": 5,
"totalExecTime": 50,
"totalRows": 25
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q201": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 12,
"totalExecTime": 160,
"totalRows": 58
},
"12": {
"calls": 5,
"totalExecTime": 50,
"totalRows": 25
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": false,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q201",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q201/page.md"
}
]
}
},
"templates/db5_q201/metadata.json": {
"json": {
"id": "db5_q201",
"title": "postgres · analytics.orders [db5_q201]",
"path": "templates/db5_q201/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "low",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q201/page.md": {
"text": "# db5_q201\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q201/usage.json": {
"json": {
"stats": {
"executions": 2,
"distinct_users": 1,
"first_seen": "2026-05-08T09:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 30,
"error_rate": 0,
"rows_produced": 8
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,159 @@
{
"name": "reset-detected",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T11:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "301",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 3,
"totalExecTime": 90,
"meanExecTime": 30,
"totalRows": 9
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q301": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 100,
"totalExecTime": 1000,
"totalRows": 500
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T11:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q301": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 3,
"totalExecTime": 90,
"totalRows": 9
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z"
],
"degraded": true,
"statsResetAt": "2026-05-08T11:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q301",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q301/page.md"
}
]
}
},
"templates/db5_q301/metadata.json": {
"json": {
"id": "db5_q301",
"title": "postgres · analytics.orders [db5_q301]",
"path": "templates/db5_q301/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q301/page.md": {
"text": "# db5_q301\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q301/usage.json": {
"json": {
"stats": {
"executions": 3,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 30,
"error_rate": 0,
"rows_produced": 9
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,159 @@
{
"name": "version-change",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "401",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 4,
"totalExecTime": 80,
"meanExecTime": 20,
"totalRows": 8
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 15.7",
"templates": {
"db5_q401": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 100,
"totalExecTime": 1000,
"totalRows": 500
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q401": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 4,
"totalExecTime": 80,
"totalRows": 8
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_reset:pg_server_major changed from 15 to 16"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q401",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q401/page.md"
}
]
}
},
"templates/db5_q401/metadata.json": {
"json": {
"id": "db5_q401",
"title": "postgres · analytics.orders [db5_q401]",
"path": "templates/db5_q401/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q401/page.md": {
"text": "# db5_q401\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q401/usage.json": {
"json": {
"stats": {
"executions": 4,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 20,
"error_rate": 0,
"rows_produced": 8
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -0,0 +1,200 @@
import { describe, expect, it, vi } from 'vitest';
import { BigQueryHistoricSqlQueryHistoryReader } from './bigquery-query-history-reader.js';
import { HistoricSqlGrantsMissingError } from './errors.js';
/** Canned query result handed back by the fake client; same fields the reader consumes from a real one. */
interface FakeQueryResult {
// Column names, positionally aligned with each entry of `rows`.
headers: string[];
// One array of cell values per result row.
rows: unknown[][];
totalRows: number;
// When set, the reader under test treats the result as failed.
error?: string;
}
/**
 * Builds a fake query client whose mocked `executeQuery` replays `results` in order.
 *
 * Each call consumes one queued result (the passed array is drained in place);
 * a call made after the queue is empty throws `unexpected query`.
 */
function queryClient(results: FakeQueryResult[]) {
  return {
    executeQuery: vi.fn(async (_query: string) => {
      const queued = results.shift();
      if (queued) {
        return queued;
      }
      throw new Error('unexpected query');
    }),
  };
}
/**
 * Returns the SQL text passed to the first `executeQuery` call on the fake client.
 *
 * @throws Error when the client was never called.
 */
function firstQuery(client: ReturnType<typeof queryClient>): string {
  const [initialCall] = client.executeQuery.mock.calls;
  if (initialCall) {
    return initialCall[0];
  }
  throw new Error('expected query client to be called');
}
// Behavioural suite for the BigQuery query-history reader: probe error mapping,
// SQL construction for fetch, row mapping, and identifier validation.
describe('BigQueryHistoricSqlQueryHistoryReader', () => {
// The region option 'US' is expected to appear lower-cased ("region-us") in the probed view path.
it('probes region-qualified INFORMATION_SCHEMA.JOBS_BY_PROJECT', async () => {
const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(reader.probe(client)).resolves.toBeUndefined();
expect(client.executeQuery).toHaveBeenCalledWith(
'SELECT 1 FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` LIMIT 1',
);
});
// A result that resolves but carries `error` must be surfaced as a grants error with remediation text.
it('turns probe result errors into HistoricSqlGrantsMissingError', async () => {
const client = queryClient([{ headers: [], rows: [], totalRows: 0, error: 'Access Denied: jobs.listAll' }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'us-central1' });
await expect(reader.probe(client)).rejects.toMatchObject({
name: 'HistoricSqlGrantsMissingError',
dialect: 'bigquery',
remediation:
'Grant roles/bigquery.resourceViewer on the BigQuery project, or grant a custom role containing bigquery.jobs.listAll.',
});
});
// A client that rejects outright is mapped to the same error class as an error-bearing result.
it('turns thrown probe failures into HistoricSqlGrantsMissingError', async () => {
const client = {
executeQuery: vi.fn(async () => {
throw new Error('permission denied');
}),
};
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
// Two canned rows: one successful job with string timestamps, one failed job with a Date
// creation_time and null end_time/runtime. The cursor (not window.start) is the expected lower bound.
it('fetches BigQuery jobs with cursor and maps them into RawQueryRow shape without rowsProduced', async () => {
const client = queryClient([
{
headers: [
'job_id',
'query',
'user_email',
'creation_time',
'end_time',
'runtime_ms',
'total_slot_ms',
'total_bytes_processed',
'state',
'error_reason',
'error_message',
'statement_type',
],
rows: [
[
'bquxjob_1',
"SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
'analyst-a@example.test',
'2026-05-04T10:00:00.000Z',
'2026-05-04T10:00:01.250Z',
1250,
3106,
161164718,
'DONE',
null,
null,
'SELECT',
],
[
'bquxjob_2',
'SELECT * FROM `project-1.analytics.missing_table`',
'analyst-b@example.test',
new Date('2026-05-04T10:05:00.000Z'),
null,
null,
0,
0,
'DONE',
'notFound',
'Not found: Table project-1.analytics.missing_table',
'SELECT',
],
],
totalRows: 2,
},
]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
const rows = [];
for await (const row of reader.fetch(
client,
{
start: new Date('2026-05-01T00:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
},
'2026-05-03T00:00:00.000Z',
)) {
rows.push(row);
}
expect(client.executeQuery).toHaveBeenCalledTimes(1);
const sql = firstQuery(client);
expect(sql).toContain('FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`');
expect(sql).toContain("creation_time >= TIMESTAMP('2026-05-03T00:00:00.000Z')");
expect(sql).toContain("creation_time < TIMESTAMP('2026-05-04T12:00:00.000Z')");
expect(sql).toContain("job_type = 'QUERY'");
expect(sql).toContain("(statement_type IS NULL OR statement_type != 'SCRIPT')");
expect(sql).toContain('ORDER BY creation_time ASC, job_id ASC');
expect(sql).toContain('total_slot_ms');
expect(sql).toContain('total_bytes_processed');
// The query must not select any total_rows column.
expect(sql).not.toMatch(/total_rows/i);
expect(rows).toEqual([
{
id: 'bquxjob_1',
sql: "SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
user: 'analyst-a@example.test',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: '2026-05-04T10:00:01.250Z',
runtimeMs: 1250,
success: true,
errorMessage: null,
},
{
id: 'bquxjob_2',
sql: 'SELECT * FROM `project-1.analytics.missing_table`',
user: 'analyst-b@example.test',
startedAt: '2026-05-04T10:05:00.000Z',
endedAt: null,
runtimeMs: null,
success: false,
errorMessage: 'notFound: Not found: Table project-1.analytics.missing_table',
},
]);
});
// Without a cursor argument the lower bound falls back to window.start.
it('uses the window start when no cursor is available', async () => {
const client = queryClient([{ headers: ['job_id'], rows: [], totalRows: 0 }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'EU' });
for await (const _row of reader.fetch(client, {
start: new Date('2026-02-03T12:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
})) {
throw new Error('empty result should not yield rows');
}
const sql = firstQuery(client);
expect(sql).toContain('FROM `project-1.region-eu.INFORMATION_SCHEMA.JOBS_BY_PROJECT`');
expect(sql).toContain("creation_time >= TIMESTAMP('2026-02-03T12:00:00.000Z')");
});
// fetch is an async generator, so the client check only fires once iteration starts.
it('throws a clear error when the query client cannot execute SQL', async () => {
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(async () => {
for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) {
throw new Error('unreachable');
}
}).rejects.toThrow('Historic SQL BigQuery reader requires a query client with executeQuery(query)');
});
// Identifier validation happens in the constructor, before any SQL is built.
it('rejects unsafe project and region identifiers before building SQL', () => {
expect(() => new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project`1', region: 'US' })).toThrow(
'Invalid BigQuery project id for historic-SQL ingest: project`1',
);
expect(() => new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US;DROP' })).toThrow(
'Invalid BigQuery region for historic-SQL ingest: US;DROP',
);
});
});

View file

@ -0,0 +1,219 @@
import { HistoricSqlGrantsMissingError } from './errors.js';
import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js';
/** Minimal structural shape of a query result that this reader consumes. */
interface QueryResultLike {
// Column names; matched case-insensitively against the selected column aliases.
headers: string[];
// Row cells, positionally aligned with `headers`.
rows: unknown[][];
// Not read anywhere in this module.
totalRows: number;
// A truthy value marks the query as failed even though the promise resolved.
error?: string;
}
/** The only client capability this reader needs: run a SQL string and resolve with a result. */
interface QueryClientLike {
executeQuery(query: string): Promise<QueryResultLike>;
}
/** Constructor options identifying which project/region jobs view to read. */
export interface BigQueryHistoricSqlQueryHistoryReaderOptions {
// Embedded in the view path; only letters, digits, `_` and `-` are accepted.
projectId: string;
// Case-insensitive; surrounding whitespace and an optional leading `region-` are stripped before use.
region: string;
}
// Remediation text attached to every HistoricSqlGrantsMissingError raised by this module.
const BIGQUERY_GRANTS_REMEDIATION =
'Grant roles/bigquery.resourceViewer on the BigQuery project, or grant a custom role containing bigquery.jobs.listAll.';
/**
 * Narrows an untyped client to something that can execute SQL.
 *
 * @param client Any value; accepted only when it is a non-null object exposing a callable `executeQuery`.
 * @returns The same object, typed as a query client.
 * @throws Error when the value does not expose `executeQuery`.
 */
function queryClient(client: unknown): QueryClientLike {
  const usable =
    client &&
    typeof client === 'object' &&
    'executeQuery' in client &&
    typeof (client as { executeQuery?: unknown }).executeQuery === 'function';
  if (!usable) {
    throw new Error('Historic SQL BigQuery reader requires a query client with executeQuery(query)');
  }
  return client as QueryClientLike;
}
/**
 * Wraps a probe/fetch failure in a `HistoricSqlGrantsMissingError` for the BigQuery dialect.
 *
 * The detail text is the `Error` message or the string itself; any other cause
 * falls back to a generic description. The original cause is passed through.
 */
function grantsError(cause: unknown): HistoricSqlGrantsMissingError {
  let detail = 'BigQuery principal cannot query INFORMATION_SCHEMA.JOBS_BY_PROJECT.';
  if (cause instanceof Error) {
    detail = cause.message;
  } else if (typeof cause === 'string') {
    detail = cause;
  }
  return new HistoricSqlGrantsMissingError({
    dialect: 'bigquery',
    message: `Missing BigQuery audit grants for historic-SQL ingest: ${detail}`,
    remediation: BIGQUERY_GRANTS_REMEDIATION,
    cause,
  });
}
/**
 * Validates a project id before it is interpolated into a backtick-quoted path.
 *
 * @returns The id unchanged when it contains only letters, digits, `_` or `-`.
 * @throws Error for an empty id or one containing any other character.
 */
function normalizeProjectId(value: string): string {
  const isSafe = /^[A-Za-z0-9_-]+$/.test(value);
  if (isSafe) {
    return value;
  }
  throw new Error(`Invalid BigQuery project id for historic-SQL ingest: ${value}`);
}
/**
 * Canonicalizes a region name for use after the `region-` qualifier.
 *
 * Trims, lower-cases, and drops one leading `region-` if present, then requires
 * the remainder to be non-empty and made of lowercase letters, digits or `-`.
 *
 * @throws Error quoting the original (un-normalized) input when validation fails.
 */
function normalizeRegion(value: string): string {
  const prefix = 'region-';
  const lowered = value.trim().toLowerCase();
  const region = lowered.startsWith(prefix) ? lowered.slice(prefix.length) : lowered;
  if (/^[a-z0-9-]+$/.test(region)) {
    return region;
  }
  throw new Error(`Invalid BigQuery region for historic-SQL ingest: ${value}`);
}
/**
 * Renders a `TIMESTAMP('<iso>')` SQL expression from a Date or a date string.
 *
 * The value is always re-serialized through `toISOString()`, so arbitrary input
 * text never reaches the SQL; the quote escaping is a belt-and-braces step.
 *
 * @throws Error when the value does not parse to a valid date.
 */
function timestampExpression(value: Date | string): string {
  let date: Date;
  if (value instanceof Date) {
    date = value;
  } else {
    date = new Date(value);
  }
  if (Number.isNaN(date.getTime())) {
    throw new Error(`Invalid BigQuery query-history timestamp: ${String(value)}`);
  }
  const iso = date.toISOString().replace(/'/g, "\\'");
  return `TIMESTAMP('${iso}')`;
}
/**
 * Maps each upper-cased header name to its column position.
 * When a name repeats (ignoring case), the last position wins.
 */
function indexByHeader(headers: string[]): Map<string, number> {
  return new Map(headers.map((header, position): [string, number] => [header.toUpperCase(), position]));
}
/**
 * Looks up a cell by column name (case-insensitive).
 *
 * @returns `null` when the column is not in `indexes`; otherwise whatever sits at
 * that position in `row` (which is `undefined` if the row is shorter than the header list).
 */
function value(row: unknown[], indexes: Map<string, number>, name: string): unknown {
  const position = indexes.get(name.toUpperCase());
  if (position === undefined) {
    return null;
  }
  return row[position];
}
/** Stringifies a cell, collapsing `null`, `undefined` and the empty string to `null`. */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const text = String(raw);
  return text === '' ? null : text;
}
/**
 * Stringifies a cell that must be present.
 *
 * @param field Column name used in the error message.
 * @throws Error when the cell is `null`, `undefined`, or stringifies to the empty string.
 */
function requiredString(raw: unknown, field: string): string {
  const text = raw === null || raw === undefined ? '' : String(raw);
  if (text.length === 0) {
    throw new Error(`BigQuery JOBS_BY_PROJECT row is missing ${field}`);
  }
  return text;
}
/**
 * Coerces a cell to a non-negative finite number.
 *
 * @returns `null` for `null`, `undefined`, `''`, or anything that is not a finite
 * number after `Number()` coercion; negative values are clamped to `0`.
 */
function nullableNumber(raw: unknown): number | null {
  if (raw == null || raw === '') {
    return null;
  }
  const parsed = typeof raw === 'number' ? raw : Number(raw);
  return Number.isFinite(parsed) ? Math.max(0, parsed) : null;
}
/**
 * Normalizes a required timestamp cell to an ISO-8601 UTC string.
 *
 * Accepts either a `Date` instance or any value whose string form `Date` can parse.
 *
 * @param raw Cell value from a JOBS_BY_PROJECT row.
 * @param field Column name used in error messages.
 * @returns The timestamp as produced by `Date.prototype.toISOString()`.
 * @throws Error when the cell is missing/empty or does not represent a valid date.
 */
function isoTimestamp(raw: unknown, field: string): string {
  if (raw instanceof Date) {
    // An invalid Date would make toISOString() throw a bare RangeError ("Invalid time value")
    // with no hint of which column was bad; report it the same way as an unparseable string.
    if (Number.isNaN(raw.getTime())) {
      throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${String(raw)}`);
    }
    return raw.toISOString();
  }
  const text = requiredString(raw, field);
  const date = new Date(text);
  if (Number.isNaN(date.getTime())) {
    throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${text}`);
  }
  return date.toISOString();
}
/**
 * Optional variant of the timestamp normalizer, used for `end_time`.
 *
 * @returns `null` for `null`, `undefined` or `''`; otherwise the ISO string.
 * Invalid non-empty values raise an error that names `end_time`.
 */
function nullableIsoTimestamp(raw: unknown): string | null {
  const absent = raw === null || raw === undefined || raw === '';
  return absent ? null : isoTimestamp(raw, 'end_time');
}
/**
 * Decides whether a job row counts as a successful execution.
 *
 * Any non-empty error reason or message means failure. Otherwise the job
 * succeeded when its state is absent or equals `DONE` (case-insensitive).
 */
function executionSucceeded(state: string | null, errorReason: string | null, errorMessage: string | null): boolean {
  return !(errorReason || errorMessage) && (state === null || state.toUpperCase() === 'DONE');
}
/**
 * Merges the error reason and message into one string.
 *
 * @returns `"<reason>: <message>"` when both are non-empty; otherwise the message
 * if it is not `null`, else the reason (which may itself be `null`).
 */
function combinedErrorMessage(errorReason: string | null, errorMessage: string | null): string | null {
  const hasBoth = Boolean(errorReason) && Boolean(errorMessage);
  if (!hasBoth) {
    return errorMessage ?? errorReason;
  }
  return `${errorReason}: ${errorMessage}`;
}
/**
 * Converts one positional JOBS_BY_PROJECT row into the raw query-row shape.
 *
 * `job_id`, `query` and `creation_time` are required and raise when missing;
 * every other field degrades to `null`.
 *
 * @param row Cell values for one job.
 * @param indexes Upper-cased column name to position lookup for `row`.
 */
function mapRow(row: unknown[], indexes: Map<string, number>): HistoricSqlRawQueryRow {
  const cell = (column: string): unknown => value(row, indexes, column);
  const errorReason = nullableString(cell('error_reason'));
  const errorMessage = nullableString(cell('error_message'));
  // Fields are resolved in this order so the first required-field failure reported stays the same.
  const id = requiredString(cell('job_id'), 'job_id');
  const sql = requiredString(cell('query'), 'query');
  const user = nullableString(cell('user_email'));
  const startedAt = isoTimestamp(cell('creation_time'), 'creation_time');
  const endedAt = nullableIsoTimestamp(cell('end_time'));
  const runtimeMs = nullableNumber(cell('runtime_ms'));
  const success = executionSucceeded(nullableString(cell('state')), errorReason, errorMessage);
  return {
    id,
    sql,
    user,
    startedAt,
    endedAt,
    runtimeMs,
    success,
    errorMessage: combinedErrorMessage(errorReason, errorMessage),
  };
}
/**
 * Query-history reader backed by the region-qualified
 * `INFORMATION_SCHEMA.JOBS_BY_PROJECT` view of one BigQuery project.
 */
export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader {
  /** Backtick-quoted, fully qualified view path, built once from validated identifiers. */
  private readonly viewPath: string;

  /**
   * @param options Project id and region; both are validated before being embedded in SQL.
   * @throws Error when either identifier contains disallowed characters.
   */
  constructor(options: BigQueryHistoricSqlQueryHistoryReaderOptions) {
    const project = normalizeProjectId(options.projectId);
    const location = normalizeRegion(options.region);
    const qualified = `${project}.region-${location}.INFORMATION_SCHEMA.JOBS_BY_PROJECT`;
    this.viewPath = `\`${qualified}\``;
  }

  /**
   * Checks that the principal can read the jobs view by selecting a single row.
   *
   * @throws HistoricSqlGrantsMissingError when the client is unusable, the query rejects,
   * or the result carries an `error`.
   */
  async probe(client: unknown): Promise<void> {
    const probeSql = `SELECT 1 FROM ${this.viewPath} LIMIT 1`;
    let outcome: QueryResultLike;
    try {
      // The client check stays inside the try so an unusable client is reported as a grants error too.
      outcome = await queryClient(client).executeQuery(probeSql);
    } catch (cause) {
      throw grantsError(cause);
    }
    if (outcome.error) {
      throw grantsError(outcome.error);
    }
  }

  /**
   * Streams query jobs created in `[cursor ?? window.start, window.end)`, oldest first.
   *
   * Only `QUERY` jobs with non-null text are selected, and `SCRIPT` statements are excluded.
   * Being an async generator, nothing runs until the first row is requested.
   *
   * @param client Object exposing `executeQuery(query)`.
   * @param window Time window; `start` is used only when no cursor is given.
   * @param cursor Optional inclusive lower bound that overrides `window.start`.
   * @throws Error when a bound is not a valid date or the client lacks `executeQuery`.
   * @throws HistoricSqlGrantsMissingError when the result carries an `error`.
   */
  async *fetch(
    client: unknown,
    window: HistoricSqlTimeWindow,
    cursor?: string | null,
  ): AsyncIterable<HistoricSqlRawQueryRow> {
    const lowerBound = timestampExpression(cursor ?? window.start);
    const upperBound = timestampExpression(window.end);
    const sql = [
      'SELECT',
      'job_id,',
      'query,',
      'user_email,',
      'creation_time,',
      'end_time,',
      'TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND) AS runtime_ms,',
      'total_slot_ms,',
      'total_bytes_processed,',
      'state,',
      'error_result.reason AS error_reason,',
      'error_result.message AS error_message,',
      'statement_type',
      `FROM ${this.viewPath}`,
      `WHERE creation_time >= ${lowerBound}`,
      `AND creation_time < ${upperBound}`,
      "AND job_type = 'QUERY'",
      'AND query IS NOT NULL',
      "AND (statement_type IS NULL OR statement_type != 'SCRIPT')",
      'ORDER BY creation_time ASC, job_id ASC',
    ].join('\n');
    const result = await queryClient(client).executeQuery(sql);
    // NOTE(review): every result error is reported as a grants problem, whatever its actual
    // cause — confirm callers are fine with that classification.
    if (result.error) {
      throw grantsError(result.error);
    }
    const columnIndexes = indexByHeader(result.headers);
    for (const row of result.rows) {
      yield mapRow(row, columnIndexes);
    }
  }
}

View file

@ -0,0 +1,251 @@
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
/** Creates a fresh, uniquely named directory under the OS temp dir for one test. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-chunk-');
  return mkdtemp(prefix);
}
/**
 * Writes `value` as 2-space-indented JSON (with a trailing newline) to `relPath`
 * under `root`, creating any missing parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  const parentDir = join(target, '..');
  await mkdir(parentDir, { recursive: true });
  const body = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(target, body, 'utf-8');
}
/**
 * Stages a minimal historic-SQL directory under `root` containing a manifest and a
 * single template `fp_1` (metadata.json, page.md, usage.json).
 */
async function writeTemplate(root: string): Promise<void> {
  const templateDir = 'templates/fp_1';
  const pagePath = `${templateDir}/page.md`;

  const manifest = {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 1,
    capped: false,
    warnings: ['source warning'],
    templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: pagePath }],
  };

  const metadata = {
    id: 'fp_1',
    title: 'snowflake · analytics.orders [fp_1]',
    path: pagePath,
    objectType: 'historic_sql_template',
    lastEditedAt: null,
    properties: {
      fingerprint: 'fp_1',
      sub_cluster_id: null,
      dialect: 'snowflake',
      tables_touched: ['analytics.orders'],
      literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
      triage_signals: {
        executions_bucket: 'high',
        distinct_users_bucket: 'team',
        error_rate_bucket: 'ok',
        recency_bucket: 'active',
        service_account_only: 'false',
        slot_summary: '1 constant, 0 runtime',
      },
    },
  };

  const usage = {
    stats: {
      executions: 20,
      distinct_users: 3,
      first_seen: '2026-05-01T00:00:00.000Z',
      last_seen: '2026-05-04T11:55:00.000Z',
      p50_runtime_ms: 100,
      p95_runtime_ms: 200,
      error_rate: 0,
      rows_produced: 20,
    },
    literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
    samples: [],
  };

  await writeJson(root, 'manifest.json', manifest);
  // metadata.json goes first: writeJson creates the template directory that page.md relies on.
  await writeJson(root, `${templateDir}/metadata.json`, metadata);
  await writeFile(join(root, pagePath), '# fp_1\n', 'utf-8');
  await writeJson(root, `${templateDir}/usage.json`, usage);
}
/**
 * Stages a historic-SQL directory under `root` whose manifest lists two categorical
 * sub-clusters of the same fingerprint (`fp_order_status`), each with its own
 * metadata.json, page.md and usage.json.
 */
async function writeSubclusterTemplates(root: string): Promise<void> {
  const fingerprint = 'fp_order_status';
  const templates = ['cat_2b2ff2318877', 'cat_34f037ddcbfa'].map((subClusterId) => ({
    id: `${fingerprint}__${subClusterId}`,
    subClusterId,
  }));

  await writeJson(root, 'manifest.json', {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 2,
    capped: false,
    warnings: [],
    templates: templates.map(({ id, subClusterId }) => ({
      id,
      fingerprint,
      subClusterId,
      path: `templates/${id}/page.md`,
    })),
  });

  for (const { id, subClusterId } of templates) {
    const templateDir = `templates/${id}`;
    await writeJson(root, `${templateDir}/metadata.json`, {
      id,
      // The bracketed suffix carries the last six characters of the sub-cluster id.
      title: `snowflake · analytics.orders [fp_ord:${subClusterId.slice(-6)}]`,
      path: `${templateDir}/page.md`,
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint,
        sub_cluster_id: subClusterId,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
        triage_signals: {
          executions_bucket: 'mid',
          distinct_users_bucket: 'team',
          error_rate_bucket: 'ok',
          recency_bucket: 'active',
          service_account_only: 'false',
          slot_summary: '0 constant, 0 runtime',
        },
      },
    });
    await writeFile(join(root, `${templateDir}/page.md`), `# ${id}\n`, 'utf-8');
    await writeJson(root, `${templateDir}/usage.json`, {
      stats: {
        executions: 3,
        distinct_users: 3,
        first_seen: '2026-05-04T10:00:00.000Z',
        last_seen: '2026-05-04T10:05:00.000Z',
        p50_runtime_ms: 120,
        p95_runtime_ms: 150,
        error_rate: 0,
        rows_produced: 36,
      },
      literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }],
      samples: [],
    });
  }
}
// Suite for the historic-SQL chunker: which diffs yield WorkUnits, how files are split
// between rawFiles / dependencyPaths / peerFileIndex, eviction, and scope description.
describe('chunkHistoricSqlStagedDir', () => {
// Only metadata.json is "added": it becomes the raw file, usage.json and the manifest are
// dependencies, and the untouched page.md is listed as a peer.
it('emits one WorkUnit per changed template and keeps usage as dependency', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: ['templates/fp_1/metadata.json'],
modified: [],
deleted: [],
unchanged: ['templates/fp_1/page.md', 'templates/fp_1/usage.json', 'manifest.json'],
});
expect(result.workUnits).toEqual([
{
unitKey: 'historic-sql-fp-1',
displayLabel: 'snowflake · analytics.orders [fp_1]',
rawFiles: ['templates/fp_1/metadata.json'],
dependencyPaths: ['manifest.json', 'templates/fp_1/usage.json'],
peerFileIndex: ['templates/fp_1/page.md'],
notes:
'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
},
]);
// Manifest-level capped/warnings are passed through unchanged.
expect(result.contextReport).toEqual({ capped: false, warnings: ['source warning'] });
});
// Two sub-clusters of the same fingerprint must stay separate WorkUnits.
it('emits one WorkUnit per changed categorical sub-cluster', async () => {
const stagedDir = await tempDir();
await writeSubclusterTemplates(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: [
'templates/fp_order_status__cat_2b2ff2318877/metadata.json',
'templates/fp_order_status__cat_34f037ddcbfa/metadata.json',
],
modified: [],
deleted: [],
unchanged: [
'manifest.json',
'templates/fp_order_status__cat_2b2ff2318877/page.md',
'templates/fp_order_status__cat_2b2ff2318877/usage.json',
'templates/fp_order_status__cat_34f037ddcbfa/page.md',
'templates/fp_order_status__cat_34f037ddcbfa/usage.json',
],
});
// peerFileIndex and notes are deliberately left out of this comparison.
expect(
result.workUnits.map((unit) => ({
unitKey: unit.unitKey,
displayLabel: unit.displayLabel,
rawFiles: unit.rawFiles,
dependencyPaths: unit.dependencyPaths,
})),
).toEqual([
{
unitKey: 'historic-sql-fp-order-status-cat-2b2ff2318877',
displayLabel: 'snowflake · analytics.orders [fp_ord:318877]',
rawFiles: ['templates/fp_order_status__cat_2b2ff2318877/metadata.json'],
dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_2b2ff2318877/usage.json'],
},
{
unitKey: 'historic-sql-fp-order-status-cat-34f037ddcbfa',
displayLabel: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
rawFiles: ['templates/fp_order_status__cat_34f037ddcbfa/metadata.json'],
dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'],
},
]);
});
// A change to usage.json alone must not schedule any work.
it('emits zero WorkUnits for usage-only diffs', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: [],
modified: ['templates/fp_1/usage.json'],
deleted: [],
unchanged: ['templates/fp_1/metadata.json', 'templates/fp_1/page.md', 'manifest.json'],
});
expect(result.workUnits).toEqual([]);
expect(result.eviction).toBeUndefined();
});
// Deleted usage.json is ignored; only the deleted page.md shows up in the eviction list.
it('emits eviction only for deleted metadata or page files', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: [],
modified: [],
deleted: ['templates/fp_1/usage.json', 'templates/fp_2/page.md'],
unchanged: [],
});
expect(result.eviction).toEqual({ deletedRawPaths: ['templates/fp_2/page.md'] });
});
// The fingerprint is a 64-character string; scope covers the manifest and anything under templates/.
it('describes historic-sql scope without including unrelated paths', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const scope = await describeHistoricSqlScope(stagedDir);
expect(scope.fingerprint).toHaveLength(64);
expect(scope.isPathInScope('manifest.json')).toBe(true);
expect(scope.isPathInScope('templates/fp_1/usage.json')).toBe(true);
expect(scope.isPathInScope('pages/notion/page.md')).toBe(false);
});
});

View file

@ -0,0 +1,86 @@
import { createHash } from 'node:crypto';
import { readFile, readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
import { historicSqlManifestSchema, historicSqlMetadataSchema } from './types.js';
/**
 * Recursively lists every regular file under `root`.
 *
 * @returns Paths relative to `root`, using `/` separators, in default sort order.
 */
async function walk(root: string): Promise<string[]> {
  const entries = await readdir(root, { withFileTypes: true, recursive: true });
  const relPaths: string[] = [];
  for (const entry of entries) {
    if (!entry.isFile()) {
      continue;
    }
    const absolute = join(entry.parentPath, entry.name);
    // Normalize Windows separators so callers can match on forward-slash paths.
    relPaths.push(relative(root, absolute).replace(/\\/g, '/'));
  }
  return relPaths.sort();
}
/**
 * Derives a WorkUnit key from a template id: every run of non-alphanumeric characters
 * becomes a single `-`, leading/trailing separators are dropped, and the result is
 * prefixed with `historic-sql-`.
 */
function safeUnitKey(id: string): string {
  const slug = id
    .split(/[^a-zA-Z0-9]+/)
    .filter((part) => part.length > 0)
    .join('-');
  return `historic-sql-${slug}`;
}
/**
 * Loads `manifest.json` from the staged directory and validates it against the manifest schema.
 *
 * @throws Error prefixed with `Invalid historic-SQL manifest:` for any read, JSON, or schema failure.
 */
async function readManifest(stagedDir: string) {
  try {
    const manifestPath = join(stagedDir, 'manifest.json');
    const rawText = await readFile(manifestPath, 'utf-8');
    return historicSqlManifestSchema.parse(JSON.parse(rawText));
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid historic-SQL manifest: ${reason}`);
  }
}
/**
 * Reads and validates one template's `metadata.json`, attaching the staged-relative
 * path to any read, JSON, or schema failure so the offending template can be identified.
 */
async function readTemplateMetadata(stagedDir: string, metadataPath: string) {
  try {
    return historicSqlMetadataSchema.parse(JSON.parse(await readFile(join(stagedDir, metadataPath), 'utf-8')));
  } catch (error) {
    throw new Error(
      `Invalid historic-SQL template metadata at ${metadataPath}: ${error instanceof Error ? error.message : String(error)}`,
    );
  }
}

/**
 * Splits a staged historic-SQL directory into one WorkUnit per template.
 *
 * A template is any `templates/<id>/page.md`. Its primary files are `metadata.json` and
 * `page.md`; `usage.json` and the manifest are dependencies only, so usage-only changes
 * never produce work.
 *
 * @param stagedDir Directory containing `manifest.json` and `templates/<id>/…`.
 * @param diffSet Optional diff. When given, only templates whose primary files were added
 * or modified are emitted, and `rawFiles` is limited to those touched files. When omitted,
 * every template is emitted with all of its primary files.
 * @returns WorkUnits, an eviction list for deleted primary files (if any), and the
 * manifest's `capped` / `warnings` as the context report.
 * @throws Error when the manifest or a selected template's metadata is missing or invalid.
 */
export async function chunkHistoricSqlStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const files = await walk(stagedDir);
  // Set lookup keeps the per-template existence checks from rescanning the whole file list.
  const fileSet = new Set(files);
  const manifest = await readManifest(stagedDir);
  const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;
  const workUnits: WorkUnit[] = [];
  for (const pagePath of files.filter((path) => /^templates\/[^/]+\/page\.md$/.test(path))) {
    const metadataPath = pagePath.replace(/\/page\.md$/, '/metadata.json');
    const usagePath = pagePath.replace(/\/page\.md$/, '/usage.json');
    const primary = [metadataPath, pagePath].filter((path) => fileSet.has(path));
    if (touched && !primary.some((path) => touched.has(path))) {
      continue;
    }
    const metadata = await readTemplateMetadata(stagedDir, metadataPath);
    const rawFiles = touched ? primary.filter((path) => touched.has(path)).sort() : primary.sort();
    const dependencyPaths = ['manifest.json', fileSet.has(usagePath) ? usagePath : null]
      .filter((path): path is string => typeof path === 'string' && !rawFiles.includes(path))
      .sort();
    // Everything that is neither a raw file nor a dependency is exposed as a peer.
    const excluded = new Set([...rawFiles, ...dependencyPaths]);
    const peerFileIndex = files.filter((path) => !excluded.has(path)).sort();
    workUnits.push({
      unitKey: safeUnitKey(metadata.id),
      displayLabel: metadata.title,
      rawFiles,
      dependencyPaths,
      peerFileIndex,
      notes:
        'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
    });
  }
  // Only deleted primary files trigger eviction; a deleted usage.json is ignored.
  const deletedPrimary = diffSet?.deleted.filter((path) => /^templates\/[^/]+\/(metadata\.json|page\.md)$/.test(path));
  return {
    workUnits,
    eviction: deletedPrimary && deletedPrimary.length > 0 ? { deletedRawPaths: deletedPrimary.sort() } : undefined,
    reconcileNotes: [`Historic-SQL staged templates=${manifest.templateCount}`],
    contextReport: {
      capped: manifest.capped,
      warnings: manifest.warnings,
    },
  };
}
/**
 * Describes the ingest scope of a staged historic-SQL directory.
 *
 * The fingerprint is the hex SHA-256 of the manifest's connection id, dialect and
 * window bounds. A path is in scope when it is `manifest.json` or lies under `templates/`.
 *
 * @throws Error when the manifest is missing or invalid.
 */
export async function describeHistoricSqlScope(stagedDir: string): Promise<ScopeDescriptor> {
  const { connectionId, dialect, windowStart, windowEnd } = await readManifest(stagedDir);
  const scopeKey = JSON.stringify({ connectionId, dialect, windowStart, windowEnd });
  const hasher = createHash('sha256');
  hasher.update(scopeKey);
  return {
    fingerprint: hasher.digest('hex'),
    isPathInScope: (rawPath) => rawPath === 'manifest.json' || rawPath.startsWith('templates/'),
  };
}

View file

@ -0,0 +1,197 @@
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { detectHistoricSqlStagedDir } from './detect.js';
import {
HISTORIC_SQL_SOURCE_KEY,
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlPullConfigSchema,
historicSqlUsageSchema,
} from './types.js';
/** Creates a fresh, uniquely named directory under the OS temp dir for one detection test. */
async function tempDir(): Promise<string> {
  const base = tmpdir();
  return mkdtemp(join(base, 'historic-sql-detect-'));
}
/**
 * Serializes `value` as 2-space-indented JSON plus a trailing newline and writes it to
 * `relPath` under `root`, creating parent directories as needed.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const destination = join(root, relPath);
  await mkdir(join(destination, '..'), { recursive: true });
  const serialized = JSON.stringify(value, null, 2);
  await writeFile(destination, `${serialized}\n`, 'utf-8');
}
// Suite for staged-directory detection: by manifest source, by template file layout,
// and the negative case of a foreign manifest.
describe('historic-sql staged dir detection', () => {
// A manifest whose `source` is the historic-SQL key is enough, even with zero templates.
it('detects manifest source', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', {
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 0,
capped: false,
warnings: [],
templates: [],
});
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
// No manifest at all: a templates/<id>/ folder holding metadata.json and page.md is expected
// to be detected. The metadata content is just `{}` here.
it('detects document-shaped template structure without manifest', async () => {
const stagedDir = await tempDir();
await writeFile(join(stagedDir, 'not-a-match.txt'), 'x', 'utf-8');
await mkdir(join(stagedDir, 'templates', 'fp_1'), { recursive: true });
await writeFile(join(stagedDir, 'templates', 'fp_1', 'metadata.json'), '{}', 'utf-8');
await writeFile(join(stagedDir, 'templates', 'fp_1', 'page.md'), '# fp_1\n', 'utf-8');
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
// A manifest from another source and no templates directory must not be detected.
it('does not detect unrelated directories', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', { source: 'notion' });
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(false);
});
});
// Pins the parse results of the historic-SQL schemas: pull-config defaults,
// manifest defaults for the postgres-specific fields, usage stats, and the
// metadata envelope.
describe('historic-sql schemas', () => {
  // Values the pull-config parser is expected to fill in when they are omitted.
  const pullConfigDefaults = {
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns: [],
    redactionPatterns: [],
    maxTemplatesPerRun: 5000,
  };

  it('defaults disabled optional pull-config fields through the parser', () => {
    const parsed = historicSqlPullConfigSchema.parse({ dialect: 'bigquery' });

    expect(parsed).toEqual({
      dialect: 'bigquery',
      ...pullConfigDefaults,
      minCalls: 5,
    });
  });

  it('accepts postgres pull config with a minCalls floor', () => {
    const parsed = historicSqlPullConfigSchema.parse({ dialect: 'postgres', minCalls: 12 });

    expect(parsed).toEqual({
      dialect: 'postgres',
      ...pullConfigDefaults,
      minCalls: 12,
    });
  });

  it('accepts postgres manifest fields with defaults for older dialects', () => {
    // Fields supplied only by the postgres manifest; they must round-trip unchanged.
    const postgresExtras = {
      degraded: true,
      statsResetAt: '2026-05-01T00:00:00.000Z',
      baselineFirstRun: true,
      pgServerVersion: 'PostgreSQL 16.4',
      deallocCount: 3,
    };
    const postgresManifest = {
      source: HISTORIC_SQL_SOURCE_KEY,
      connectionId: 'conn_pg',
      dialect: 'postgres',
      fetchedAt: '2026-05-08T12:00:00.000Z',
      windowStart: '2026-05-08T11:00:00.000Z',
      windowEnd: '2026-05-08T12:00:00.000Z',
      nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
      templateCount: 0,
      capped: false,
      warnings: [],
      templates: [],
      ...postgresExtras,
    };
    // Same shape without any of the postgres-specific fields.
    const snowflakeManifest = {
      source: HISTORIC_SQL_SOURCE_KEY,
      connectionId: 'conn_sf',
      dialect: 'snowflake',
      fetchedAt: '2026-05-08T12:00:00.000Z',
      windowStart: '2026-05-01T12:00:00.000Z',
      windowEnd: '2026-05-08T12:00:00.000Z',
      nextSuccessfulCursor: null,
      templateCount: 0,
      capped: false,
      warnings: [],
      templates: [],
    };

    const parsedPostgres = historicSqlManifestSchema.parse(postgresManifest);
    expect(parsedPostgres).toMatchObject({
      dialect: 'postgres',
      ...postgresExtras,
    });

    const parsedSnowflake = historicSqlManifestSchema.parse(snowflakeManifest);
    expect(parsedSnowflake).toMatchObject({
      degraded: false,
      statsResetAt: null,
      baselineFirstRun: false,
      pgServerVersion: null,
      deallocCount: null,
    });
  });

  it('accepts postgres usage stats with mean_runtime_ms and empty samples', () => {
    // Percentile runtimes are null here; only the mean is populated.
    const usage = {
      stats: {
        executions: 25,
        distinct_users: 2,
        first_seen: '2026-05-08T10:00:00.000Z',
        last_seen: '2026-05-08T12:00:00.000Z',
        p50_runtime_ms: null,
        p95_runtime_ms: null,
        mean_runtime_ms: 32.5,
        error_rate: 0,
        rows_produced: 1042,
      },
      literal_slots: [],
      samples: [],
    };

    const parsed = historicSqlUsageSchema.parse(usage);

    expect(parsed.stats.mean_runtime_ms).toBe(32.5);
    expect(parsed.samples).toEqual([]);
  });

  it('pins the Notion-compatible metadata envelope', () => {
    const triageSignals = {
      executions_bucket: 'high',
      distinct_users_bucket: 'team',
      error_rate_bucket: 'ok',
      recency_bucket: 'active',
      // Deliberately the string 'false', not a boolean; asserted below.
      service_account_only: 'false',
      slot_summary: '1 constant, 0 runtime',
    };
    const envelope = {
      id: 'fp_1',
      title: 'snowflake · analytics.orders [fp_1]',
      path: 'templates/fp_1/page.md',
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint: 'fp_1',
        sub_cluster_id: null,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
        triage_signals: triageSignals,
      },
    };

    const parsed = historicSqlMetadataSchema.parse(envelope);

    expect(parsed.objectType).toBe('historic_sql_template');
    expect(parsed.lastEditedAt).toBeNull();
    expect(parsed.properties.triage_signals.service_account_only).toBe('false');
  });
});

View file

@ -0,0 +1,37 @@
import { readFile, readdir } from 'node:fs/promises';
import { join } from 'node:path';
import { HISTORIC_SQL_SOURCE_KEY } from './types.js';
/**
 * Decides whether `stagedDir` looks like a historic-SQL staging directory.
 *
 * Two checks run in order:
 * 1. If `manifest.json` can be read and parsed and declares a `source`, the
 *    answer is whether that source equals `HISTORIC_SQL_SOURCE_KEY`.
 * 2. Otherwise (manifest missing, unreadable, unparsable, or without `source`)
 *    the `templates/` tree is walked recursively; the directory matches when at
 *    least one folder holds both a `metadata.json` and a `page.md` file.
 *
 * @param stagedDir - Directory to inspect.
 * @returns `true` when either check matches; `false` otherwise, including when
 *   `templates/` cannot be listed.
 */
export async function detectHistoricSqlStagedDir(stagedDir: string): Promise<boolean> {
  try {
    const manifestText = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    const { source } = JSON.parse(manifestText) as { source?: unknown };
    if (source !== undefined) {
      // A declared source is authoritative: it either is ours or it is not.
      return source === HISTORIC_SQL_SOURCE_KEY;
    }
  } catch {
    // Fall through to structural detection for stage-only fixtures.
  }

  try {
    const templateEntries = await readdir(join(stagedDir, 'templates'), { withFileTypes: true, recursive: true });
    const files = templateEntries.filter((entry) => entry.isFile());

    const pageDirs = new Set(files.filter((file) => file.name === 'page.md').map((file) => file.parentPath));

    // Match as soon as a metadata.json sits in a folder that also has a page.md.
    return files.some((file) => file.name === 'metadata.json' && pageDirs.has(file.parentPath));
  } catch {
    return false;
  }
}

View file

@ -0,0 +1,61 @@
import type { HistoricSqlDialect } from './types.js';
/** Constructor input for {@link HistoricSqlGrantsMissingError}. */
interface HistoricSqlGrantsMissingErrorOptions {
  /** Dialect the error relates to. */
  dialect: HistoricSqlDialect;
  /** Text used as the error's `message`. */
  message: string;
  /** Remediation text exposed on the error instance. */
  remediation: string;
  /** Optional underlying cause, forwarded to `Error` when provided. */
  cause?: unknown;
}

/**
 * Error named `HistoricSqlGrantsMissingError` that carries the affected dialect
 * and a remediation string alongside the message.
 */
export class HistoricSqlGrantsMissingError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly remediation: string;

  constructor(options: HistoricSqlGrantsMissingErrorOptions) {
    const { dialect, message, remediation, cause } = options;
    // Only pass an options bag when a cause exists, so no `cause` property is
    // created on errors built without one.
    const errorOptions = cause === undefined ? undefined : { cause };
    super(message, errorOptions);

    this.name = 'HistoricSqlGrantsMissingError';
    this.dialect = dialect;
    this.remediation = remediation;
  }
}
/** Constructor input for {@link HistoricSqlExtensionMissingError}. */
interface HistoricSqlExtensionMissingErrorOptions {
  /** Dialect the error relates to. */
  dialect: HistoricSqlDialect;
  /** Text used as the error's `message`. */
  message: string;
  /** Remediation text exposed on the error instance. */
  remediation: string;
  /** Optional underlying cause, forwarded to `Error` when provided. */
  cause?: unknown;
}

/**
 * Error named `HistoricSqlExtensionMissingError` that carries the affected
 * dialect and a remediation string alongside the message.
 */
export class HistoricSqlExtensionMissingError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly remediation: string;

  constructor(options: HistoricSqlExtensionMissingErrorOptions) {
    const { dialect, message, remediation, cause } = options;
    // Only pass an options bag when a cause exists, so no `cause` property is
    // created on errors built without one.
    const errorOptions = cause === undefined ? undefined : { cause };
    super(message, errorOptions);

    this.name = 'HistoricSqlExtensionMissingError';
    this.dialect = dialect;
    this.remediation = remediation;
  }
}
/** Constructor input for {@link HistoricSqlVersionUnsupportedError}. */
interface HistoricSqlVersionUnsupportedErrorOptions {
  /** Dialect whose version was rejected. */
  dialect: HistoricSqlDialect;
  /** Version string that was detected. */
  detectedVersion: string;
  /** Lowest version string that is accepted. */
  minimumVersion: string;
}

/**
 * Error named `HistoricSqlVersionUnsupportedError`. Its message is generated
 * from the dialect, the detected version and the minimum version, all of which
 * are also exposed as read-only fields.
 */
export class HistoricSqlVersionUnsupportedError extends Error {
  readonly dialect: HistoricSqlDialect;
  readonly detectedVersion: string;
  readonly minimumVersion: string;

  constructor(options: HistoricSqlVersionUnsupportedErrorOptions) {
    const { dialect, detectedVersion, minimumVersion } = options;
    const description = `Unsupported ${dialect} version for historic-SQL ingest: detected ${detectedVersion}; requires ${minimumVersion} or newer.`;
    super(description);

    this.name = 'HistoricSqlVersionUnsupportedError';
    this.dialect = dialect;
    this.detectedVersion = detectedVersion;
    this.minimumVersion = minimumVersion;
  }
}

Some files were not shown because too many files have changed in this diff Show more