From 620d6adbe6c10d62da689b29cfc24b5561c9a0b6 Mon Sep 17 00:00:00 2001 From: Andrey Avtomonov Date: Tue, 19 May 2026 23:41:29 +0200 Subject: [PATCH] docs: rewrite Semantic Querying concept with imperative-vs-declarative diagram (#156) * docs: rewrite Semantic Querying concept with imperative-vs-declarative diagram Reframe semantic-layer-internals.mdx around the contract the semantic layer offers an agent: declare what you want (a Semantic Query), KTX figures out how to compute it. Replaces the old "Context-Aware SQL" framing with a clear imperative-vs-declarative narrative. Adds a React Flow component (semantic-layer-flow.tsx) that contrasts a buggy 4-table agent-authored SQL (chasm trap, LEFT-JOIN-in-WHERE, hardcoded DATE_TRUNC) against the chasm-safe per-fact CTE SQL the planner actually emits, including the outer GROUP BY over the requested dimensions. Both lanes converge into a shared warehouse node and each SQL card now has parallel bullet notes (failures on the left, KTX behavior on the right). Side fixes bundled in: - include the /ktx basePath in the favicon metadata so the icon resolves under the production prefix - migrate docs-site/middleware.ts to docs-site/proxy.ts (Next 16 rename) - redirect / to /ktx/docs/getting-started/introduction so the apex docs URL works - add tests covering the apex redirect, the favicon basePath, and the middleware-to-proxy rename - propagate the Semantic Query terminology across the ktx-sl CLI reference, the context-layer concept page, and the agent-clients / primary-sources integration pages * Fix CI dead-code failures * docs-site: polish semantic-layer-internals code blocks and flow diagram - Make CodeBlock a server component so children traverse synchronously under React 19 RSC streaming; previously extractText returned "" in dev SSR, leaving code blocks empty. - Add custom JSON/YAML/SQL/code-like tokenizers with theme-aware token classes; drop the colored file-glyph dot and gradient tab-head. 
- Tighten tab-head: subtle grey background, smaller monospace filename in muted grey, smaller rectangular language pill placed to the left of the filename. - Polish the React Flow semantic-layer diagram (controls, fit-view padding, edge types). * docs-site: annotate imperative SQL, add section anchor, drop ClickHouse - Wire numbered red badges to each problematic span in the "Without KTX" SQL with hover sync between SQL gutter, lines, and the notes list. - Add #imperative-vs-declarative anchor on the flow section header so the eyebrow link is shareable; reveals a # glyph on hover/focus. - Align the compiled-SQL note dots to the first-line midpoint (mt-[6px] instead of mt-1) so 4px dots sit at y=8 in a 16px line. - Remove all ClickHouse references from docs-site (primary-sources, quickstart, ktx-setup, contributing, agents-setup, mechanics test, warehouse drivers in the flow diagram). * test: drop ClickHouse contributing-docs assertion Align the workspace-package mirror test with the ClickHouse removal from docs-site (75907eb). The connector-clickhouse package still exists in packages/, but contributing.mdx no longer lists it, so the test that mirrored docs against the workspace was failing. 
--- docs-site/app/global.css | 113 +- docs-site/app/layout.tsx | 4 +- docs-site/components/code-block.tsx | 334 ++++- docs-site/components/semantic-layer-flow.tsx | 1258 +++++++++++++++++ docs-site/content/agents-setup.md | 2 +- .../content/docs/cli-reference/ktx-setup.mdx | 2 +- .../content/docs/cli-reference/ktx-sl.mdx | 6 +- .../content/docs/community/contributing.mdx | 1 - .../concepts/semantic-layer-internals.mdx | 385 +++-- .../docs/concepts/the-context-layer.mdx | 2 +- .../docs/getting-started/quickstart.mdx | 2 +- .../docs/integrations/agent-clients.mdx | 2 +- .../docs/integrations/primary-sources.mdx | 65 +- docs-site/next.config.mjs | 6 + docs-site/{middleware.ts => proxy.ts} | 2 +- docs-site/tests/docs-index-route.test.mjs | 12 + docs-site/tests/docs-search-behavior.test.mjs | 19 +- .../tests/product-mechanics-content.test.mjs | 3 +- scripts/examples-docs.test.mjs | 1 - 19 files changed, 1872 insertions(+), 347 deletions(-) create mode 100644 docs-site/components/semantic-layer-flow.tsx rename docs-site/{middleware.ts => proxy.ts} (96%) diff --git a/docs-site/app/global.css b/docs-site/app/global.css index e7e2c5b2..d0f7ac21 100644 --- a/docs-site/app/global.css +++ b/docs-site/app/global.css @@ -221,6 +221,72 @@ pre code, padding-inline: 0 !important; } +.ktx-code .ktx-token-key { + color: #0f766e; +} + +.ktx-code .ktx-token-keyword { + color: #0e7490; + font-weight: 650; +} + +.ktx-code .ktx-token-function { + color: #7c3aed; + font-weight: 650; +} + +.ktx-code .ktx-token-flag { + color: #0369a1; +} + +.ktx-code .ktx-token-string { + color: #b45309; +} + +.ktx-code .ktx-token-number, +.ktx-code .ktx-token-constant { + color: #be123c; +} + +.ktx-code .ktx-token-comment { + color: #64748b; + font-style: italic; +} + +.ktx-code .ktx-token-punctuation { + color: #64748b; +} + +.dark .ktx-code .ktx-token-key { + color: #5eead4; +} + +.dark .ktx-code .ktx-token-keyword { + color: #67e8f9; +} + +.dark .ktx-code .ktx-token-function { + color: #c4b5fd; +} + 
+.dark .ktx-code .ktx-token-flag { + color: #7dd3fc; +} + +.dark .ktx-code .ktx-token-string { + color: #fbbf24; +} + +.dark .ktx-code .ktx-token-number, +.dark .ktx-code .ktx-token-constant { + color: #fb7185; +} + +.dark .ktx-code .ktx-token-comment, +.dark .ktx-code .ktx-token-punctuation { + color: #94a3b8; +} + /* Neutralize the outer figure styling that our wrapper now owns */ figure:has(> .ktx-code), figure[data-rehype-pretty-code-figure]:has(.ktx-code) { @@ -327,55 +393,32 @@ figure[data-rehype-pretty-code-figure]:has(.ktx-code) { display: flex; align-items: center; gap: 8px; - padding: 8px 10px 8px 14px; + padding: 5px 8px 5px 12px; border-bottom: 1px solid var(--color-fd-border); - background: linear-gradient(180deg, var(--color-fd-muted), transparent); + background: rgba(0, 0, 0, 0.025); } .dark .ktx-code-tab-head { border-bottom-color: rgba(255, 255, 255, 0.05); - background: linear-gradient(180deg, rgba(255, 255, 255, 0.02), transparent); + background: rgba(255, 255, 255, 0.02); } -.ktx-file-glyph { - display: inline-block; - width: 8px; - height: 8px; - border-radius: 999px; - background: var(--color-fd-muted-foreground); - flex-shrink: 0; -} -.ktx-file-glyph[data-lang="yaml"], -.ktx-file-glyph[data-lang="yml"] { background: #fbbf24; } -.ktx-file-glyph[data-lang="ts"], -.ktx-file-glyph[data-lang="tsx"], -.ktx-file-glyph[data-lang="typescript"] { background: #3b82f6; } -.ktx-file-glyph[data-lang="js"], -.ktx-file-glyph[data-lang="jsx"], -.ktx-file-glyph[data-lang="javascript"] { background: #facc15; } -.ktx-file-glyph[data-lang="json"] { background: #84cc16; } -.ktx-file-glyph[data-lang="md"], -.ktx-file-glyph[data-lang="mdx"] { background: #a3a3a3; } -.ktx-file-glyph[data-lang="sql"] { background: #f97316; } -.ktx-file-glyph[data-lang="py"], -.ktx-file-glyph[data-lang="python"] { background: #22d3ee; } - .ktx-code-tab-filename { font-family: var(--font-mono), ui-monospace, monospace; - font-size: 12.5px; - color: var(--color-fd-foreground); + 
font-size: 11.5px; + color: #6b7280; } .ktx-lang-pill { - margin-left: 4px; - padding: 1px 6px; - font-size: 10px; - font-weight: 600; + margin-right: 4px; + padding: 0 7px; + font-size: 9px; + font-weight: 500; text-transform: uppercase; - letter-spacing: 0.04em; - color: var(--color-fd-muted-foreground); + letter-spacing: 0.06em; + color: #9ca3af; border: 1px solid var(--color-fd-border); - border-radius: 4px; + border-radius: 3px; background: var(--color-fd-card); font-family: var(--font-display), var(--font-sans), sans-serif; } diff --git a/docs-site/app/layout.tsx b/docs-site/app/layout.tsx index 7c808130..230fd232 100644 --- a/docs-site/app/layout.tsx +++ b/docs-site/app/layout.tsx @@ -28,8 +28,8 @@ export const metadata: Metadata = { description: "Open-source context infrastructure that makes agentic analytics reliable.", icons: { - icon: "/brand/ktx-mascot.svg", - shortcut: "/brand/ktx-mascot.svg", + icon: "/ktx/brand/ktx-mascot.svg", + shortcut: "/ktx/brand/ktx-mascot.svg", }, }; diff --git a/docs-site/components/code-block.tsx b/docs-site/components/code-block.tsx index 9c9d71ec..80f29bdc 100644 --- a/docs-site/components/code-block.tsx +++ b/docs-site/components/code-block.tsx @@ -1,5 +1,3 @@ -"use client"; - import { type ComponentPropsWithoutRef, type ReactNode, @@ -15,6 +13,55 @@ type Props = ComponentPropsWithoutRef<"pre"> & { const OUTPUT_LANGS = new Set(["text", "plain", "plaintext", "console", "output"]); const WIZARD_GLYPHS = /^\s*[◆◇◯◐○●]/; +const JSON_TOKEN_PATTERN = + /"(?:\\.|[^"\\])*"|-?\b\d+(?:\.\d+)?\b|\b(?:true|false|null)\b|[{}[\],:]/g; +const SQL_TOKEN_PATTERN = + /--[^\n]*|'(?:''|[^'])*'|\b\d+(?:\.\d+)?\b|\b(?:select|from|join|left|right|inner|outer|on|where|group|by|order|limit|as|sum|avg|min|max|count|coalesce|date_trunc|case|when|then|else|end|and|or|is|not|null|false|true|with|having|over|partition|insert|update|delete|create|alter|drop|table|view)\b|[(),.;=*<>+-]/gi; +const CODE_LIKE_TOKEN_PATTERN = + 
/\/\/[^\n]*|\/\*[\s\S]*?\*\/|#(?![{\w-]+:)[^\n]*|`(?:\\.|[^`\\])*`|"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|-?\b\d+(?:\.\d+)?\b|\b(?:const|let|var|function|return|import|export|from|type|interface|extends|async|await|if|else|for|while|switch|case|break|continue|try|catch|throw|new|class|public|private|protected|readonly|true|false|null|undefined|pnpm|uv|ktx|node|npx|curl|git)\b|--?[\w-]+|[{}[\](),.;:=*<>|&+-]/g; +const SQL_FUNCTIONS = new Set([ + "sum", + "avg", + "min", + "max", + "count", + "coalesce", + "date_trunc", +]); +const CODE_KEYWORDS = new Set([ + "const", + "let", + "var", + "function", + "return", + "import", + "export", + "from", + "type", + "interface", + "extends", + "async", + "await", + "if", + "else", + "for", + "while", + "switch", + "case", + "break", + "continue", + "try", + "catch", + "throw", + "new", + "class", + "public", + "private", + "protected", + "readonly", +]); +const COMMAND_KEYWORDS = new Set(["pnpm", "uv", "ktx", "node", "npx", "curl", "git"]); +const CODE_CONSTANTS = new Set(["true", "false", "null", "undefined"]); function extractText(node: ReactNode): string { if (typeof node === "string") return node; @@ -65,15 +112,277 @@ function detectLanguage(props: Props, children: ReactNode): string | null { return findLanguageInNode(children); } +function stripOneLeadingBlankLine(text: string) { + return text.startsWith("\n") ? text.slice(1) : text; +} + +function extractCodeHeader(language: string | null, code: string) { + const normalized = normalizeLanguage(language); + const firstLineEnd = code.indexOf("\n"); + const firstLine = firstLineEnd === -1 ? code : code.slice(0, firstLineEnd); + const rest = firstLineEnd === -1 ? "" : code.slice(firstLineEnd + 1); + const commentPrefix = + normalized === "sql" + ? "--" + : normalized === "javascript" || + normalized === "js" || + normalized === "jsx" || + normalized === "typescript" || + normalized === "ts" || + normalized === "tsx" + ? 
"//" + : "#"; + + if (!firstLine.trimStart().startsWith(commentPrefix)) { + return { header: null, code }; + } + + const candidate = firstLine + .trim() + .slice(commentPrefix.length) + .trim(); + const looksLikePath = + candidate.includes("/") && + /\.[A-Za-z0-9]+(?:["'`)]*)?$/.test(candidate); + + if (!looksLikePath) return { header: null, code }; + + return { + header: candidate, + code: stripOneLeadingBlankLine(rest), + }; +} + +function normalizeLanguage(language: string | null) { + return language?.toLowerCase() ?? ""; +} + +function pushMatchedToken( + parts: ReactNode[], + token: string, + className: string, + key: string, +) { + parts.push( + + {token} + , + ); +} + +function highlightJson(code: string) { + const parts: ReactNode[] = []; + let lastIndex = 0; + let tokenIndex = 0; + + for (const match of code.matchAll(JSON_TOKEN_PATTERN)) { + const token = match[0]; + const index = match.index ?? 0; + if (index > lastIndex) parts.push(code.slice(lastIndex, index)); + + const nextText = code.slice(index + token.length); + const className = token.startsWith('"') + ? /^\s*:/.test(nextText) + ? "ktx-token-key" + : "ktx-token-string" + : /^-?\d/.test(token) + ? "ktx-token-number" + : /^(true|false|null)$/.test(token) + ? "ktx-token-constant" + : "ktx-token-punctuation"; + + pushMatchedToken(parts, token, className, `json-${tokenIndex}`); + lastIndex = index + token.length; + tokenIndex += 1; + } + + if (lastIndex < code.length) parts.push(code.slice(lastIndex)); + return parts; +} + +function highlightYaml(code: string) { + const parts: ReactNode[] = []; + const lines = code.split(/(\n)/); + let tokenIndex = 0; + + for (const line of lines) { + if (line === "\n") { + parts.push(line); + continue; + } + + const commentIndex = line.search(/\s#/); + const fullLineComment = line.trimStart().startsWith("#"); + const contentEnd = + fullLineComment || commentIndex === -1 ? line.length : commentIndex + 1; + const content = fullLineComment ? 
"" : line.slice(0, contentEnd); + const comment = fullLineComment ? line : line.slice(contentEnd); + const keyMatch = content.match(/^(\s*(?:-\s*)?)([A-Za-z_][\w.-]*)(\s*:)/); + + if (keyMatch) { + parts.push(keyMatch[1]); + pushMatchedToken(parts, keyMatch[2], "ktx-token-key", `yaml-key-${tokenIndex}`); + pushMatchedToken( + parts, + keyMatch[3], + "ktx-token-punctuation", + `yaml-colon-${tokenIndex}`, + ); + const rest = content.slice(keyMatch[0].length); + if (rest) parts.push(...highlightInlineValue(rest, `yaml-${tokenIndex}`)); + } else if (content) { + parts.push(...highlightInlineValue(content, `yaml-${tokenIndex}`)); + } + + if (comment) { + pushMatchedToken(parts, comment, "ktx-token-comment", `yaml-comment-${tokenIndex}`); + } + tokenIndex += 1; + } + + return parts; +} + +function highlightInlineValue(value: string, keyPrefix: string) { + const parts: ReactNode[] = []; + let lastIndex = 0; + let tokenIndex = 0; + const pattern = /'(?:''|[^'])*'|"(?:\\.|[^"\\])*"|-?\b\d+(?:\.\d+)?\b|\b(?:true|false|null)\b|[()[\]{},:=!<>+-]/g; + + for (const match of value.matchAll(pattern)) { + const token = match[0]; + const index = match.index ?? 0; + if (index > lastIndex) parts.push(value.slice(lastIndex, index)); + + const className = + token.startsWith("'") || token.startsWith('"') + ? "ktx-token-string" + : /^-?\d/.test(token) + ? "ktx-token-number" + : /^(true|false|null)$/.test(token) + ? "ktx-token-constant" + : "ktx-token-punctuation"; + + pushMatchedToken(parts, token, className, `${keyPrefix}-value-${tokenIndex}`); + lastIndex = index + token.length; + tokenIndex += 1; + } + + if (lastIndex < value.length) parts.push(value.slice(lastIndex)); + return parts; +} + +function highlightSql(code: string) { + const parts: ReactNode[] = []; + let lastIndex = 0; + let tokenIndex = 0; + + for (const match of code.matchAll(SQL_TOKEN_PATTERN)) { + const token = match[0]; + const index = match.index ?? 
0; + if (index > lastIndex) parts.push(code.slice(lastIndex, index)); + + const lowerToken = token.toLowerCase(); + const className = token.startsWith("--") + ? "ktx-token-comment" + : token.startsWith("'") + ? "ktx-token-string" + : /^\d/.test(token) + ? "ktx-token-number" + : SQL_FUNCTIONS.has(lowerToken) + ? "ktx-token-function" + : /^[a-z_]+$/i.test(token) + ? "ktx-token-keyword" + : "ktx-token-punctuation"; + + pushMatchedToken(parts, token, className, `sql-${tokenIndex}`); + lastIndex = index + token.length; + tokenIndex += 1; + } + + if (lastIndex < code.length) parts.push(code.slice(lastIndex)); + return parts; +} + +function highlightCodeLike(code: string) { + const parts: ReactNode[] = []; + let lastIndex = 0; + let tokenIndex = 0; + + for (const match of code.matchAll(CODE_LIKE_TOKEN_PATTERN)) { + const token = match[0]; + const index = match.index ?? 0; + if (index > lastIndex) parts.push(code.slice(lastIndex, index)); + + const lowerToken = token.toLowerCase(); + const className = + token.startsWith("//") || token.startsWith("/*") || token.startsWith("#") + ? "ktx-token-comment" + : token.startsWith("'") || token.startsWith('"') || token.startsWith("`") + ? "ktx-token-string" + : /^-?\d/.test(token) + ? "ktx-token-number" + : CODE_CONSTANTS.has(lowerToken) + ? "ktx-token-constant" + : CODE_KEYWORDS.has(lowerToken) + ? "ktx-token-keyword" + : COMMAND_KEYWORDS.has(lowerToken) + ? "ktx-token-function" + : token.startsWith("-") + ? 
"ktx-token-flag" + : "ktx-token-punctuation"; + + pushMatchedToken(parts, token, className, `code-${tokenIndex}`); + lastIndex = index + token.length; + tokenIndex += 1; + } + + if (lastIndex < code.length) parts.push(code.slice(lastIndex)); + return parts; +} + +function highlightCode(language: string | null, code: string) { + const normalized = normalizeLanguage(language); + if (normalized === "json" || normalized === "jsonc") return highlightJson(code); + if (normalized === "yaml" || normalized === "yml") return highlightYaml(code); + if (normalized === "sql") return highlightSql(code); + if ( + [ + "bash", + "sh", + "shell", + "zsh", + "javascript", + "js", + "jsx", + "typescript", + "ts", + "tsx", + "python", + "py", + ].includes(normalized) + ) { + return highlightCodeLike(code); + } + return code; +} + export function CodeBlock(props: Props) { const { children, title, className: _ignored, ...rest } = props; const language = detectLanguage(props, children); - const codeText = extractText(children); + const rawCodeText = extractText(children); + const extractedHeader = extractCodeHeader(language, rawCodeText); + const codeText = extractedHeader.code; + const headerTitle = + typeof title === "string" && title.length > 0 + ? title + : extractedHeader.header; + const highlightedCode = highlightCode(language, codeText); - const hasTitle = typeof title === "string" && title.length > 0; + const hasHeader = typeof headerTitle === "string" && headerTitle.length > 0; const isOutput = - !hasTitle && - (WIZARD_GLYPHS.test(codeText) || + !hasHeader && + (WIZARD_GLYPHS.test(rawCodeText) || (language !== null && OUTPUT_LANGS.has(language))); // Mode D - Output preview (wizard prompts, terminal output) @@ -81,7 +390,7 @@ export function CodeBlock(props: Props) { return (
output - +
           {children}
         
@@ -89,18 +398,17 @@ export function CodeBlock(props: Props) { ); } - // Mode B - VS Code tab (filename present) - if (hasTitle) { + // Mode B - Header (filename present) + if (hasHeader) { return (
- - {title} {language && {language}} + {headerTitle}
-          {children}
+          {highlightedCode}
         
); @@ -111,7 +419,7 @@ export function CodeBlock(props: Props) {
-        {children}
+        {highlightedCode}
       
); diff --git a/docs-site/components/semantic-layer-flow.tsx b/docs-site/components/semantic-layer-flow.tsx new file mode 100644 index 00000000..770516ad --- /dev/null +++ b/docs-site/components/semantic-layer-flow.tsx @@ -0,0 +1,1258 @@ +"use client"; + +import { useCallback, useState } from "react"; +import { + Background, + BackgroundVariant, + Controls, + Handle, + MarkerType, + type Node, + type NodeProps, + type OnInit, + Position, + ReactFlow, +} from "@xyflow/react"; +import "@xyflow/react/dist/style.css"; + +type LaneVariant = "manual" | "ktx"; + +type AgentNodeData = { + variant: "single"; + title: string; + subtitle: string; +}; + +type IssueNote = { + id: number; + label: string; +}; + +type ManualSqlNodeData = { + variant: "manual"; + badge: string; + title: string; + caption: string; + code: string; + notes: IssueNote[]; + lineIssues: Record; +}; + +type SlQueryNodeData = { + variant: "slQuery"; + badge: string; + title: string; + caption: string; + code: string; +}; + +type EngineNodeData = { + variant: "engine"; + badge: string; + title: string; + stages: Array<{ index: number; title: string; detail: string }>; +}; + +type CompiledSqlNodeData = { + variant: "compiled"; + badge: string; + title: string; + caption: string; + code: string; + notes: string[]; +}; + +type WarehouseNodeData = { + variant: "warehouse"; + title: string; + drivers: string[]; +}; + +type AgentNode = Node; +type ManualSqlNode = Node; +type SlQueryNode = Node; +type EngineNode = Node; +type CompiledSqlNode = Node; +type WarehouseNode = Node; + +type FlowNode = + | AgentNode + | ManualSqlNode + | SlQueryNode + | EngineNode + | CompiledSqlNode + | WarehouseNode; + +const CANVAS_W = 1120; + +const AGENT_W = 380; +const AGENT_H = 104; +const AGENT_X = (CANVAS_W - AGENT_W) / 2; +const AGENT_Y = 16; + +const LANE_W = 488; +const LEFT_LANE_X = 32; +const RIGHT_LANE_X = CANVAS_W - LEFT_LANE_X - LANE_W; + +const LANE_TOP_Y = 248; + +const SL_QUERY_H = 510; +const ENGINE_H = 380; +const 
COMPILED_H = 1380; +const RIGHT_GAP = 24; + +const RIGHT_LANE_TOTAL = SL_QUERY_H + RIGHT_GAP + ENGINE_H + RIGHT_GAP + COMPILED_H; +const MANUAL_SQL_H = 840; +const LANES_BOTTOM_Y = + LANE_TOP_Y + Math.max(MANUAL_SQL_H, RIGHT_LANE_TOTAL); + +const SL_QUERY_Y = LANE_TOP_Y; +const ENGINE_Y = SL_QUERY_Y + SL_QUERY_H + RIGHT_GAP; +const COMPILED_Y = ENGINE_Y + ENGINE_H + RIGHT_GAP; + +const WAREHOUSE_W = 304; +const WAREHOUSE_H = 92; +const WAREHOUSE_X = (CANVAS_W - WAREHOUSE_W) / 2; +const WAREHOUSE_Y = LANES_BOTTOM_Y + 56; + +const MANUAL_STROKE = "#94a3b8"; +const KTX_STROKE = "#0891b2"; +const FIT_VIEW_OPTIONS = { padding: 0.05 }; + +const agent: AgentNode = { + id: "agent", + type: "agent", + position: { x: AGENT_X, y: AGENT_Y }, + data: { + variant: "single", + title: "Analytics agent", + subtitle: + "Asks: monthly net revenue and open tickets per segment, high-value orders only, no test customers", + }, + draggable: false, + selectable: false, +}; + +const manualSql: ManualSqlNode = { + id: "manual-sql", + type: "manualSql", + position: { x: LEFT_LANE_X, y: LANE_TOP_Y }, + data: { + variant: "manual", + badge: "Without KTX", + title: "Agent writes the SQL", + caption: + "Stitches four tables, mixes grains, and ships numbers that won't match the dashboard.", + code: `-- agent stitches four tables, mixes facts, +-- and ships numbers that won't match the dashboard + +SELECT + c.segment, + DATE_TRUNC('month', o.created_at) AS month, + SUM(o.amount) - SUM(r.amount) AS net_revenue, + COUNT(t.id) AS open_tickets +FROM customers c +LEFT JOIN orders o + ON o.customer_id = c.id +LEFT JOIN refunds r + ON r.order_id = o.id +LEFT JOIN tickets t + ON t.customer_id = c.id +WHERE + c.is_test = false + AND o.amount >= 100 + AND t.status = 'open' -- turns LEFT JOIN into INNER +GROUP BY + c.segment, + DATE_TRUNC('month', o.created_at) +ORDER BY + month, + c.segment +LIMIT 1000; + +-- chasm trap: orders rows multiply by tickets and refunds +-- net_revenue and open_tickets are both 
inflated +-- DATE_TRUNC syntax breaks on BigQuery`, + notes: [ + { id: 1, label: "Re-stitches a 4-way join on every question" }, + { id: 2, label: "Reinvents net_revenue and the high-value rule" }, + { id: 3, label: "Hides a chasm trap across three facts" }, + { id: 4, label: "Filters a LEFT JOIN target in WHERE" }, + { id: 5, label: "Hardcodes one warehouse's date functions" }, + ], + lineIssues: { + 5: [5], + 6: [2, 3], + 7: [3], + 8: [1], + 9: [1], + 10: [1], + 11: [1], + 12: [1], + 13: [1], + 14: [1], + 17: [2], + 18: [4], + 21: [5], + 27: [3], + 28: [3], + 29: [5], + }, + }, + draggable: false, + selectable: false, +}; + +const slQuery: SlQueryNode = { + id: "sl-query", + type: "slQuery", + position: { x: RIGHT_LANE_X, y: SL_QUERY_Y }, + data: { + variant: "slQuery", + badge: "With KTX", + title: "Agent sends a Semantic Query", + caption: + "Names the measures, dimensions, segments, and filters it wants. No SQL, no joins.", + code: `{ + "measures": [ + "orders.revenue", + "refunds.amount", + "tickets.open_count", + { + "name": "net_revenue", + "expr": "orders.revenue - refunds.amount" + } + ], + "dimensions": [ + "customers.segment", + { "field": "orders.created_at", "granularity": "month" } + ], + "segments": ["orders.high_value"], + "filters": ["customers.is_test = false"], + "limit": 1000 +}`, + }, + draggable: false, + selectable: false, +}; + +const engine: EngineNode = { + id: "engine", + type: "engine", + position: { x: RIGHT_LANE_X, y: ENGINE_Y }, + data: { + variant: "engine", + badge: "Semantic-layer engine", + title: "Plans the query against the reviewed graph", + stages: [ + { + index: 1, + title: "Resolve refs", + detail: "qualify columns, look up measure formulas", + }, + { + index: 2, + title: "Build join tree", + detail: "Dijkstra over typed edges from an anchor source", + }, + { + index: 3, + title: "Detect fan-out", + detail: "group measures by source, flag chasm traps", + }, + { + index: 4, + title: "Localize aggregation", + detail: 
"pre-aggregate each fact as its own CTE", + }, + { + index: 5, + title: "Transpile dialect", + detail: "emit Postgres-shaped SQL, then target dialect", + }, + ], + }, + draggable: false, + selectable: false, +}; + +const compiledSql: CompiledSqlNode = { + id: "compiled-sql", + type: "compiledSql", + position: { x: RIGHT_LANE_X, y: COMPILED_Y }, + data: { + variant: "compiled", + badge: "Generated SQL", + title: "KTX returns dialect-correct SQL", + caption: + "Pre-aggregates each fact at its own grain, then joins back on the shared dimension.", + code: `WITH orders_agg AS ( + SELECT + customer_id, + DATE_TRUNC('month', created_at) AS month, + SUM(amount) AS revenue + FROM public.orders + WHERE amount >= 100 + GROUP BY + customer_id, + DATE_TRUNC('month', created_at) +), +refunds_agg AS ( + SELECT + o.customer_id, + DATE_TRUNC('month', o.created_at) AS month, + SUM(r.amount) AS refund_amount + FROM public.refunds r + JOIN public.orders o + ON o.id = r.order_id + WHERE o.amount >= 100 + GROUP BY + o.customer_id, + DATE_TRUNC('month', o.created_at) +), +tickets_agg AS ( + SELECT + customer_id, + DATE_TRUNC('month', opened_at) AS month, + COUNT(*) AS open_count + FROM public.tickets + WHERE status = 'open' + GROUP BY + customer_id, + DATE_TRUNC('month', opened_at) +) +SELECT + c.segment, + o.month, + SUM(o.revenue - COALESCE(r.refund_amount, 0)) AS net_revenue, + SUM(o.revenue) AS revenue, + SUM(r.refund_amount) AS refund_amount, + SUM(COALESCE(t.open_count, 0)) AS open_tickets +FROM public.customers c +JOIN orders_agg o + ON o.customer_id = c.id +LEFT JOIN refunds_agg r + ON r.customer_id = c.id + AND r.month = o.month +LEFT JOIN tickets_agg t + ON t.customer_id = c.id + AND t.month = o.month +WHERE c.is_test = false +GROUP BY + c.segment, + o.month +ORDER BY + o.month, + c.segment +LIMIT 1000;`, + notes: [ + "Walks the reviewed join graph automatically", + "Uses the canonical net_revenue formula", + "Pre-aggregates each fact to avoid the chasm trap", + "Keeps LEFT 
JOIN filters on the dimension source", + "Transpiles DATE_TRUNC to the target dialect", + ], + }, + draggable: false, + selectable: false, +}; + +const warehouse: WarehouseNode = { + id: "warehouse", + type: "warehouse", + position: { x: WAREHOUSE_X, y: WAREHOUSE_Y }, + data: { + variant: "warehouse", + title: "Warehouse", + drivers: ["PostgreSQL", "Snowflake", "BigQuery"], + }, + draggable: false, + selectable: false, +}; + +const nodes: FlowNode[] = [ + agent, + manualSql, + slQuery, + engine, + compiledSql, + warehouse, +]; + +const arrowMarker = (color: string) => ({ + type: MarkerType.ArrowClosed, + color, + width: 16, + height: 16, +}); + +const edges = [ + { + id: "agent-manual", + source: "agent", + target: "manual-sql", + type: "default" as const, + label: "writes raw SQL", + labelBgPadding: [6, 3] as [number, number], + labelBgBorderRadius: 4, + labelStyle: { + fontSize: 12, + fontWeight: 500, + fill: "var(--color-fd-muted-foreground)", + }, + labelBgStyle: { + fill: "var(--color-fd-background)", + stroke: "var(--color-fd-border)", + strokeWidth: 1, + }, + style: { + stroke: MANUAL_STROKE, + strokeWidth: 1.5, + strokeDasharray: "5 4", + }, + markerEnd: arrowMarker(MANUAL_STROKE), + }, + { + id: "manual-warehouse", + source: "manual-sql", + target: "warehouse", + targetHandle: "warehouse-manual", + type: "default" as const, + style: { + stroke: MANUAL_STROKE, + strokeWidth: 1.5, + strokeDasharray: "5 4", + }, + markerEnd: arrowMarker(MANUAL_STROKE), + }, + { + id: "agent-slquery", + source: "agent", + target: "sl-query", + type: "default" as const, + label: "sends Semantic Query", + labelBgPadding: [6, 3] as [number, number], + labelBgBorderRadius: 4, + labelStyle: { + fontSize: 12, + fontWeight: 600, + fill: KTX_STROKE, + }, + labelBgStyle: { + fill: "var(--color-fd-background)", + stroke: "var(--color-fd-border)", + strokeWidth: 1, + }, + style: { stroke: KTX_STROKE, strokeWidth: 1.75 }, + markerEnd: arrowMarker(KTX_STROKE), + }, + { + id: 
"slquery-engine", + source: "sl-query", + target: "engine", + type: "straight" as const, + style: { stroke: KTX_STROKE, strokeWidth: 1.75 }, + markerEnd: arrowMarker(KTX_STROKE), + }, + { + id: "engine-compiled", + source: "engine", + target: "compiled-sql", + type: "straight" as const, + style: { stroke: KTX_STROKE, strokeWidth: 1.75 }, + markerEnd: arrowMarker(KTX_STROKE), + }, + { + id: "compiled-warehouse", + source: "compiled-sql", + target: "warehouse", + targetHandle: "warehouse-compiled", + type: "straight" as const, + style: { stroke: KTX_STROKE, strokeWidth: 1.75 }, + markerEnd: arrowMarker(KTX_STROKE), + }, +]; + +type FlowEdge = (typeof edges)[number]; + +function AgentNodeView({ data }: NodeProps) { + return ( +
+ +
+ +
+
+

+ {data.title} +

+

+ {data.subtitle} +

+
+
+ ); +} + +function LaneBadge({ + variant, + children, +}: { + variant: LaneVariant; + children: React.ReactNode; +}) { + const cls = + variant === "manual" + ? "border-slate-300 bg-slate-100 text-slate-700 dark:border-slate-600/60 dark:bg-slate-700/40 dark:text-slate-200" + : "border-cyan-300/70 bg-cyan-50 text-cyan-800 dark:border-cyan-400/40 dark:bg-cyan-400/15 dark:text-cyan-100"; + return ( + + + {children} + + ); +} + +const JSON_TOKEN_PATTERN = + /"(?:\\.|[^"\\])*"|-?\b\d+(?:\.\d+)?\b|\b(?:true|false|null)\b|[{}[\],:]/g; +const SQL_TOKEN_PATTERN = + /--[^\n]*|'(?:''|[^'])*'|\b\d+(?:\.\d+)?\b|\b(?:select|from|join|left|right|inner|outer|on|where|group|by|order|limit|as|sum|count|coalesce|date_trunc|case|when|then|else|end|and|or|is|not|null|false|true|with|having|over|partition)\b|[(),.;=*<>+-]/gi; +const SQL_FUNCTIONS = new Set(["sum", "count", "coalesce", "date_trunc"]); + +function highlightJson(code: string) { + const parts = []; + let lastIndex = 0; + let tokenIndex = 0; + + for (const match of code.matchAll(JSON_TOKEN_PATTERN)) { + const token = match[0]; + const index = match.index ?? 0; + if (index > lastIndex) parts.push(code.slice(lastIndex, index)); + + const nextText = code.slice(index + token.length); + const className = token.startsWith('"') + ? /^\s*:/.test(nextText) + ? "syntax-json-key" + : "syntax-string" + : /^-?\d/.test(token) + ? "syntax-number" + : /^(true|false|null)$/.test(token) + ? "syntax-constant" + : "syntax-punctuation"; + + parts.push( + + {token} + , + ); + lastIndex = index + token.length; + tokenIndex += 1; + } + + if (lastIndex < code.length) parts.push(code.slice(lastIndex)); + return parts; +} + +function highlightSql(code: string) { + const parts = []; + let lastIndex = 0; + let tokenIndex = 0; + + for (const match of code.matchAll(SQL_TOKEN_PATTERN)) { + const token = match[0]; + const index = match.index ?? 
0; + if (index > lastIndex) parts.push(code.slice(lastIndex, index)); + + const lowerToken = token.toLowerCase(); + const className = token.startsWith("--") + ? "syntax-comment" + : token.startsWith("'") + ? "syntax-string" + : /^\d/.test(token) + ? "syntax-number" + : SQL_FUNCTIONS.has(lowerToken) + ? "syntax-function" + : /^[a-z_]+$/i.test(token) + ? "syntax-keyword" + : "syntax-punctuation"; + + parts.push( + + {token} + , + ); + lastIndex = index + token.length; + tokenIndex += 1; + } + + if (lastIndex < code.length) parts.push(code.slice(lastIndex)); + return parts; +} + +function highlightCode(language: string, code: string) { + if (language === "json") return highlightJson(code); + if (language === "sql") return highlightSql(code); + return code; +} + +function CodeBlock({ + language, + code, + tone, +}: { + language: string; + code: string; + tone: "manual" | "slQuery" | "compiled"; +}) { + const toneClass = + tone === "manual" + ? "text-slate-600 dark:text-slate-300" + : tone === "slQuery" + ? "text-fd-primary" + : "text-fd-primary/90"; + const highlightedCode = highlightCode(language, code); + + return ( +
+
+ + {language} + + + {tone === "compiled" ? "ktx-compiled" : "agent-authored"} + +
+
+        {highlightedCode}
+      
+
+ ); +} + +function AnnotatedSqlBlock({ + code, + lineIssues, + activeIssue, + onIssueEnter, + onIssueLeave, +}: { + code: string; + lineIssues: Record; + activeIssue: number | null; + onIssueEnter: (n: number) => void; + onIssueLeave: () => void; +}) { + const lines = code.split("\n"); + + return ( +
+
+ + sql + + + agent-authored + +
+
+        {lines.map((line, idx) => {
+          const issues = lineIssues[idx] ?? [];
+          const hasIssue = issues.length > 0;
+          const dim =
+            activeIssue !== null && !issues.includes(activeIssue);
+          const active =
+            activeIssue !== null && issues.includes(activeIssue);
+          const classes = [
+            "sl-sql-line",
+            hasIssue ? "is-issue" : "",
+            active ? "is-active" : "",
+            dim ? "is-dim" : "",
+          ]
+            .filter(Boolean)
+            .join(" ");
+          return (
+            
+ + {issues.map((n) => ( + + ))} + + + {line.length ? highlightSql(line) : " "} + +
+ ); + })} +
+
+ ); +} + +function ManualSqlNodeView({ data }: NodeProps) { + const [activeIssue, setActiveIssue] = useState(null); + const clearActive = useCallback(() => setActiveIssue(null), []); + + return ( +
+ +
+
+ {data.badge} +

+ {data.title} +

+

+ {data.caption} +

+
+
+
+ +
+
    + {data.notes.map((note) => { + const dim = activeIssue !== null && activeIssue !== note.id; + const active = activeIssue === note.id; + return ( +
  • setActiveIssue(note.id)} + onMouseLeave={clearActive} + onFocus={() => setActiveIssue(note.id)} + onBlur={clearActive} + tabIndex={0} + > + + {note.label} +
  • + ); + })} +
+ +
+ ); +} + +function SlQueryNodeView({ data }: NodeProps) { + return ( +
+ +
+
+ {data.badge} +

+ {data.title} +

+

+ {data.caption} +

+
+
+
+ +
+ +
+ ); +} + +function EngineNodeView({ data }: NodeProps) { + return ( +
+
+ ); +} + +function CompiledSqlNodeView({ data }: NodeProps) { + return ( +
+ +
+
+ {data.badge} +

+ {data.title} +

+

+ {data.caption} +

+
+
+
+ +
+
    + {data.notes.map((note) => ( +
  • +
  • + ))} +
+ +
+ ); +} + +function WarehouseNodeView({ data }: NodeProps) { + return ( +
+ + +
+ +
+
+

+ {data.title} +

+

+ {data.drivers.join(" • ")} +

+
+
+ ); +} + +const nodeTypes = { + agent: AgentNodeView, + manualSql: ManualSqlNodeView, + slQuery: SlQueryNodeView, + engine: EngineNodeView, + compiledSql: CompiledSqlNodeView, + warehouse: WarehouseNodeView, +}; + +export function SemanticLayerFlow() { + const [minZoom, setMinZoom] = useState(0.2); + const handleFlowInit = useCallback>((instance) => { + requestAnimationFrame(() => { + void instance.fitView(FIT_VIEW_OPTIONS).then(() => { + setMinZoom(instance.getZoom()); + }); + }); + }, []); + + return ( +
+
+
+ + Imperative vs declarative + + +

+ Same answer, two contracts +

+

+ On the left, the agent works imperatively: chooses tables, writes + joins, picks the grain, and remembers each warehouse's dialect. On + the right, the agent only declares what it wants. KTX handles + every how. +

+
+ +
+
+ Pan / zoom +
+ + + + +
+
+ +
+ ); +} diff --git a/docs-site/content/agents-setup.md b/docs-site/content/agents-setup.md index 84d4cfe9..c7709639 100644 --- a/docs-site/content/agents-setup.md +++ b/docs-site/content/agents-setup.md @@ -53,7 +53,7 @@ Ask the user (grouped if your harness supports it; otherwise sequentially): 3. **Embeddings backend.** Default: `sentence-transformers` (local, no API key, managed Python runtime). Offer `openai` only if the user has a key. 4. **Database connections.** Ask how many to add, then loop. For each, collect: - Connection name (e.g. `warehouse`, `analytics`). - - Driver: one of `sqlite`, `postgres`, `mysql`, `clickhouse`, `sqlserver`, `bigquery`, `snowflake`. + - Driver: one of `sqlite`, `postgres`, `mysql`, `sqlserver`, `bigquery`, `snowflake`. - Connection URL/DSN (or service-account file for BigQuery). Accept `env:VAR_NAME` or `file:/abs/path` to avoid pasting raw secrets. - **Heads-up for the user**: even if they paste a literal URL, KTX will silently relocate it into `/.ktx/secrets/-url` and rewrite `ktx.yaml` to `url: file:…` — this is correct, secure behavior and not a bug. - Schemas / datasets to include (postgres / sqlserver / snowflake / bigquery only). diff --git a/docs-site/content/docs/cli-reference/ktx-setup.mdx b/docs-site/content/docs/cli-reference/ktx-setup.mdx index b65de179..6dbe54c2 100644 --- a/docs-site/content/docs/cli-reference/ktx-setup.mdx +++ b/docs-site/content/docs/cli-reference/ktx-setup.mdx @@ -103,7 +103,7 @@ runtime features are missing. | Flag | Description | |------|-------------| -| `--database ` | Database driver to configure; repeatable. Choices: `sqlite`, `postgres`, `mysql`, `clickhouse`, `sqlserver`, `bigquery`, `snowflake` | +| `--database ` | Database driver to configure; repeatable. Choices: `sqlite`, `postgres`, `mysql`, `sqlserver`, `bigquery`, `snowflake` | | `--database-connection-id ` | Existing selected connection id; repeatable. With `--database` or `--database-url`, connection id for the new connection. 
| | `--database-url ` | URL, `env:NAME`, or `file:/path` for one new URL-style database connection; also used as the SQLite path | | `--database-schema ` | Database schema or dataset to include; repeatable | diff --git a/docs-site/content/docs/cli-reference/ktx-sl.mdx b/docs-site/content/docs/cli-reference/ktx-sl.mdx index b0282a38..f395a170 100644 --- a/docs-site/content/docs/cli-reference/ktx-sl.mdx +++ b/docs-site/content/docs/cli-reference/ktx-sl.mdx @@ -20,7 +20,7 @@ ktx sl [options] | `list` | List semantic-layer sources | | `search ` | Search semantic-layer sources | | `validate ` | Validate a semantic-layer source against the database schema | -| `query` | Compile or execute a semantic-layer query | +| `query` | Compile or execute a Semantic Query | ## Options @@ -52,7 +52,7 @@ ktx sl [options] | Flag | Description | Default | |------|-------------|---------| | `--connection-id ` | KTX connection id | - | -| `--query-file ` | JSON semantic-layer query file | - | +| `--query-file ` | JSON Semantic Query file | - | | `--measure ` | Measure to query; repeatable (at least one required) | - | | `--dimension ` | Dimension to include; repeatable | - | | `--filter ` | Filter expression; repeatable | - | @@ -67,7 +67,7 @@ ktx sl [options] | `--max-rows ` | Maximum rows to return when executing | - | `sl query` requires at least one `--measure` unless `--query-file` is set. -`--query-file` should point to a JSON semantic-layer query object. +`--query-file` should point to a JSON Semantic Query object. 
## Examples diff --git a/docs-site/content/docs/community/contributing.mdx b/docs-site/content/docs/community/contributing.mdx index 791d865c..63b08cd6 100644 --- a/docs-site/content/docs/community/contributing.mdx +++ b/docs-site/content/docs/community/contributing.mdx @@ -91,7 +91,6 @@ packages/ connector-postgres/ # PostgreSQL connector connector-snowflake/ # Snowflake connector connector-bigquery/ # BigQuery connector - connector-clickhouse/ # ClickHouse connector connector-mysql/ # MySQL connector connector-sqlserver/ # SQL Server connector connector-sqlite/ # SQLite connector diff --git a/docs-site/content/docs/concepts/semantic-layer-internals.mdx b/docs-site/content/docs/concepts/semantic-layer-internals.mdx index aec0ccfa..a011d1cb 100644 --- a/docs-site/content/docs/concepts/semantic-layer-internals.mdx +++ b/docs-site/content/docs/concepts/semantic-layer-internals.mdx @@ -1,141 +1,115 @@ --- -title: Context-Aware SQL -description: How KTX turns reviewed context, grain, and relationship evidence into safe SQL for agents. +title: Semantic Querying +description: How KTX compiles a short Semantic Query into safe, dialect-correct SQL using a reviewed join graph. --- -## Why query planning needs context +import { SemanticLayerFlow } from "@/components/semantic-layer-flow"; -Agents can generate SQL from schema alone, but safe analytics SQL needs more -than table names. KTX uses reviewed context to understand grain, joins, measures, -filters, and where aggregation must happen. +KTX's semantic layer is a compiler that turns intent into SQL. The agent +declares _what_ it wants — measures, dimensions, filters — in a small +Semantic Query. KTX figures out the _how_: which tables to join, what +grain to aggregate at, how to keep fan-out from inflating measures, and +what dialect the warehouse speaks. 
-Read this page as four mechanics: +This page covers four mechanics: -- context files feed the semantic engine; -- evidence becomes a join graph with grain and relationship metadata; -- review keeps the graph current; -- query planning avoids fan-out and ambiguous joins. +- The Semantic Query contract agents send to the compiler. +- The planner steps that turn a Semantic Query into SQL. +- The join graph that backs those steps, and how it's built. +- The fan-out failure mode the compiler is designed to prevent. -## Where the semantic layer fits +## Imperative SQL vs declarative Semantic Querying -This planner is one subsystem inside KTX's broader context layer. It uses source -YAML, wiki context, scan evidence, and provenance to make context actionable for -SQL generation. +Writing analytics SQL is imperative work. Every question forces the +agent to hold two things in mind at once: _what_ it wants — a measure, a +slice, a filter — and _how_ to compute it: which tables to join, which +key links them, what grain to aggregate at, how to keep one fact from +inflating another, and what dialect the warehouse speaks. Plumbing on +top of intent, every query. -
-
-
-

- {"Context inputs"} -

-
-
-

semantic-layer/

-

- {"source YAML, measures, joins, grain"} -

-
-
-

wiki/

-

- {"business rules, definitions, caveats"} -

-
-
-

raw-sources/

-

- {"schema scans, keys, imported metadata"} -

-
-
-

provenance

-

- {"ingest decisions and review history"} -

-
-
-
+KTX's semantic layer separates those concerns: - +- **You and KTX maintain the how.** Sources, joins, grain, measures, and + segments live in reviewable YAML — the analytical contract the team + agrees on, version-controlled. +- **The agent declares the what.** It sends a Semantic Query and trusts + the compiler to produce safe SQL. -
-
-

- {"Semantic layer engine"} -

-
-
-

Join graph

-

- {"sources as nodes, joins as typed edges"} -

-
-
-

Grain

-

- {"row identity before aggregation"} -

-
-
-

Measures

-

- {"verified formulas and filters"} -

-
-
-

Relationships

-

- {"many_to_one, one_to_many, one_to_one"} -

-
-
-
- {"Safe query planning before SQL is generated."} -
-
+The agent stops reasoning about plumbing. It states intent. KTX turns +that into SQL the warehouse can run. - + -
-

- {"Agent workflows"} -

-
-
- {"Search sources and wiki pages"} -
-
- {"Compile trusted SQL"} -
-
- {"Explain metrics and provenance"} -
-
- {"Patch files and validate review"} -
-
-
-
-
+## The Semantic Query contract -## Join graph +A Semantic Query is the JSON payload the agent sends. Every field is optional +except `measures`, and column references are fully qualified +(`source.column`) so the compiler never has to guess where a name came +from. -A semantic source is a node. A join is a typed edge. KTX uses the graph to -choose valid paths and detect row-multiplying joins before SQL is generated. +Notice what's _not_ in the payload: no `FROM`, no `JOIN`, no `GROUP BY`, +no `WITH`. The agent states what it wants. KTX picks the join path, the +grain, the SQL shape, and the dialect. + +| Field | Purpose | +|-------|---------| +| `measures` | Names of pre-defined measures, or inline expressions like `sum(orders.amount)` | +| `dimensions` | Columns to group by, optionally with a `granularity` for time fields | +| `filters` | Predicates, classified at planning time as row-level (`WHERE`) or aggregate-level (`HAVING`) | +| `segments` | Named filter sets defined on a source, applied as additional predicates | +| `order_by` | Sort fields with optional direction | +| `limit` | Row cap on the result | + +A typical agent call looks like this: + +```json +{ + "measures": ["orders.revenue", "tickets.ticket_count"], + "dimensions": ["customers.segment"], + "filters": ["orders.created_at >= '2025-01-01'"], + "limit": 1000 +} +``` + +That payload is enough for KTX to plan and compile. The agent never +authors a join, a CTE, or a dialect-specific cast. + +## What the planner does + +The planner is a deterministic pipeline. Each Semantic Query runs through the +same ordered steps before any SQL is emitted. + +1. **Resolve refs.** Qualify bare column names, look up pre-defined + measure expressions, and classify each measure as raw or derived. +2. **Pick an anchor and build the join tree.** Choose the largest measure + source as the root, then run a shortest-path search across the typed + join graph to reach every required source. +3. 
**Detect fan-out.** Group measures by their owning source. If more + than one group exists, the planner marks the query as a chasm trap + and switches to aggregate-locality compilation. +4. **Classify filters.** Split predicates into row-level (`WHERE`) and + aggregate-level (`HAVING`) based on whether they reference a measure. +5. **Generate SQL.** Emit Postgres-shaped SQL with the right shape: + single-source aggregation when the query is safe, per-source CTEs + when fan-out is present. +6. **Transpile to the target dialect.** Run the result through `sqlglot` + so the warehouse receives syntax it understands. + +The output is the SQL string, the resolved plan, and any warnings +surfaced during planning. + +## The join graph + +A semantic source is a node. A declared join is a typed edge. The graph +is bidirectional: every forward edge has a reverse with the relationship +inverted, so the planner can traverse from any anchor. | Relationship | Planning impact | |--------------|-----------------| -| `many_to_one` | Usually safe for adding dimensions | -| `one_to_many` | Can multiply measures and trigger fan-out handling | -| `one_to_one` | Usually safe when keys are correct | -| Equal-cost paths | Ambiguous unless aliases or explicit joins disambiguate | +| `many_to_one` | Safe direction for adding dimensions | +| `one_to_many` | Multiplies measures and triggers fan-out handling | +| `one_to_one` | Safe in either direction when keys match | +| Equal-cost paths | Treated as ambiguous; aliases or explicit joins resolve them |
-

customers

-

grain: customer_id

+

{"customers"}

+

{"grain: customer_id"}

-

orders

-

grain: order_id

+

{"orders"}

+

{"grain: order_id"}

-

order_items

-

grain: order_id, line_id

+

{"order_items"}

+

{"grain: order_id, line_id"}

-
orders -> customers: many_to_one
-
orders -> order_items: one_to_many
+
{"orders -> customers: many_to_one"}
+
{"orders -> order_items: one_to_many"}
{"Example: "} - {"refunds joins to orders. Used carefully, it explains net revenue. Joined naively, it can duplicate order-level measures."} + {"refunds joins to orders. Used carefully, it explains net revenue. Joined naively, it duplicates order-level measures."}
-The graph is bidirectional for planning. If `orders -> customers` is -`many_to_one`, the reverse path is `one_to_many`. +Edges and grain come from your YAML. The compiler treats them as fact, +not a guess. + +```yaml +# semantic-layer/warehouse/orders.yaml +name: orders +table: public.orders +grain: [order_id] +joins: + - to: customers + on: customer_id = customers.id + relationship: many_to_one + - to: order_items + on: id = order_items.order_id + relationship: one_to_many +measures: + - name: revenue + expr: sum(case when status != 'refunded' then amount end) +``` ## Building and maintaining the graph -KTX starts from evidence, writes reviewable source YAML, and treats the merged -diff as the accepted graph. +KTX builds the graph from evidence and accepted edits, not from runtime +inference. Each input contributes a different kind of authority. | Evidence | What it contributes | |----------|---------------------| | Declared primary keys | Initial row grain | | Declared foreign keys | Formal join candidates | -| Inferred relationships | Edges when warehouses lack constraints | +| Inferred relationships | Edges when the warehouse lacks constraints | | dbt, MetricFlow, and LookML imports | Existing metrics, dimensions, explores, and joins | -| Query history | Real join and filter patterns | +| Query history | Real join and filter patterns from analyst SQL | | Analyst review | Final authority before context is merged |
-## Modeling problems +## Fan-out and aggregate locality -Fan-out is the classic failure mode: an order-level measure joins to line-item -rows before aggregation, so one order becomes many rows. +Fan-out is the classic analytics failure mode. Two fact tables join to a +shared dimension. A naive query joins them all together first, so each +row from one fact is multiplied by the matching rows from the other. +Measures duplicate, numbers go wrong, and the agent doesn't notice. -| Problem | What happens | How KTX handles it | -|---------|--------------|--------------------| -| Order measure joins to `order_items` | `orders.revenue` repeats once per item | Detect `one_to_many` and pre-aggregate | -| Two fact sources share `customers` | Measures multiply across the shared dimension | Treat as a chasm trap and plan each fact locally | -| Filter crosses `one_to_many` | Filtering changes measure grain | Reject or localize the filter | -| Equal-cost paths connect sources | Join choice is ambiguous | Prefer safer paths or require aliases | - -## Execution planning - -The planner resolves sources, chooses a join tree, checks relationship paths, -and picks a simple or aggregate-locality SQL shape. +KTX's planner detects the shape by grouping measures by their owning +source. If more than one source contributes raw measures, the generator +switches to aggregate locality: each fact is pre-aggregated at its own +grain inside a CTE, and the CTEs are joined back to the dimension at the +end. 
| Naive SQL shape | Semantic-layer SQL shape | |-----------------|--------------------------| -| Join facts and dimensions first, then aggregate | Aggregate each fact source at its own grain, then join results | -| Put every filter in one outer `WHERE` clause | Keep measure filters with the measure source when locality is needed | -| Trust the shortest textual join path | Prefer safe relationship paths and reject disconnected sources | -| Let dimension grain differ across facts | Raise when asymmetric dimensions would fan out another measure | +| Join facts and dimensions first, then aggregate | Aggregate each fact at its own grain, then join | +| Put every filter in one outer `WHERE` clause | Keep measure filters with the measure source | +| Trust the shortest textual join path | Prefer typed safe paths, reject disconnected sources | +| Let dimension grain differ across facts | Raise when an asymmetric dimension would fan out another measure | -
-
-

- {"Fan-out handling"} -

-

- {"The same question planned before and after KTX preserves the measure grain."} -

-
-
-
-
-

- {"Unsafe shape"} -

-

- {"Join first, aggregate later"} -

-
-
-{`orders
-  -> join order_items
-  -> join customers
+The result is the answer an analyst would expect, computed with the join
+shape an analyst would have written by hand.
 
-group by
-  customer_segment
+## Where the context comes from
 
-measure
-  sum(orders.amount)`}
-      
-
- {"Order-level revenue is exposed to line-item fan-out before aggregation."} -
-
-
-
-

- {"KTX shape"} -

-

- {"Aggregate locally, then join"} -

-
-
-{`orders_agg as (
-  select customer_id, sum(amount) revenue
-  from orders
-  group by customer_id
-)
-select customers.segment, sum(revenue)
-from orders_agg
-join customers`}
-      
-
- {"The measure is pre-aggregated at order grain before dimensions are joined."} -
-
-
-
+The planner is only as good as the YAML it reads. KTX builds and +maintains that YAML for you. -The result is structured planning: validated sources, typed relationships, -graph search, fan-out detection, aggregate locality, and dialect transpilation. +- `raw-sources/<connection-id>/` holds scan evidence from your warehouse: + schemas, columns, keys, samples, and observed usage patterns. +- `wiki/` holds business language, definitions, and caveats. The + planner doesn't read wiki at compile time, but the agent does, so + measure names and dimensions stay anchored to terms the team uses. +- `semantic-layer/<connection-id>/` holds the structured sources, joins, + grain, measures, and segments the planner actually compiles against. + +Every accepted edit flows back into the next ingest, so the graph stays +current as the warehouse changes. ## Agent usage notes -Use this page when an agent needs to explain how KTX turns reviewed semantic -context into SQL, why relationship metadata matters, or why a query was rejected -as unsafe. +Point an agent at this page when it needs to explain why KTX asks for +grain, why a query was rejected as unsafe, or why the compiled SQL looks +different from what the agent first proposed. 
| Agent task | Relevant section | Next page | |------------|------------------|-----------| -| Explain why KTX asks for `grain` and relationship types | Join graph | [Writing Context](/docs/guides/writing-context) | -| Diagnose duplicated measures after a join | Modeling problems | [ktx sl](/docs/cli-reference/ktx-sl) | -| Explain safe SQL generation | Execution planning | [ktx sl](/docs/cli-reference/ktx-sl) | -| Describe how semantic context stays current | Building and maintaining the graph | [Context as Code](/docs/concepts/context-as-code) | +| Explain the Semantic Query shape | The Semantic Query contract | [ktx sl](/docs/cli-reference/ktx-sl) | +| Describe what the planner does between query and SQL | What the planner does | [ktx sl](/docs/cli-reference/ktx-sl) | +| Explain why KTX asks for grain and relationship types | The join graph | [Writing context](/docs/guides/writing-context) | +| Diagnose duplicated measures after a join | Fan-out and aggregate locality | [ktx sl](/docs/cli-reference/ktx-sl) | +| Describe how semantic context stays current | Building and maintaining the graph | [Context as code](/docs/concepts/context-as-code) | diff --git a/docs-site/content/docs/concepts/the-context-layer.mdx b/docs-site/content/docs/concepts/the-context-layer.mdx index 9a8130d0..c56327c5 100644 --- a/docs-site/content/docs/concepts/the-context-layer.mdx +++ b/docs-site/content/docs/concepts/the-context-layer.mdx @@ -74,7 +74,7 @@ measures: ``` For join graphs, fan-out handling, and execution mechanics, read -[Context-Aware SQL](/docs/concepts/semantic-layer-internals). +[Semantic Querying](/docs/concepts/semantic-layer-internals). 
## Wiki pages diff --git a/docs-site/content/docs/getting-started/quickstart.mdx b/docs-site/content/docs/getting-started/quickstart.mdx index 241c4703..d0566ec9 100644 --- a/docs-site/content/docs/getting-started/quickstart.mdx +++ b/docs-site/content/docs/getting-started/quickstart.mdx @@ -114,7 +114,7 @@ The wizard walks you through everything KTX needs in one pass: 3. **Embeddings** - picks an embeddings backend. Choose OpenAI for hosted embeddings or `sentence-transformers` to run locally without an API key. 4. **Database** - adds at least one primary connection. Supported drivers: - SQLite, PostgreSQL, MySQL, ClickHouse, SQL Server, BigQuery, and Snowflake. + SQLite, PostgreSQL, MySQL, SQL Server, BigQuery, and Snowflake. 5. **Context sources** - optionally adds dbt, MetricFlow, LookML, Looker, Metabase, or Notion. You can skip and add them later. 6. **Build** - runs the first ingest so semantic-layer sources and wiki pages diff --git a/docs-site/content/docs/integrations/agent-clients.mdx b/docs-site/content/docs/integrations/agent-clients.mdx index ffb67b59..4a670315 100644 --- a/docs-site/content/docs/integrations/agent-clients.mdx +++ b/docs-site/content/docs/integrations/agent-clients.mdx @@ -285,7 +285,7 @@ Admin CLI skills call the same KTX CLI commands: | `ktx sl list --json` | List semantic-layer sources | | `ktx sl search --json` | Search semantic-layer sources | | `ktx sl validate --connection-id ` | Validate semantic source definitions | -| `ktx sl query --format json` | Execute a semantic-layer query when semantic compute is configured | +| `ktx sl query --format json` | Execute a Semantic Query when semantic compute is configured | ### Security constraints diff --git a/docs-site/content/docs/integrations/primary-sources.mdx b/docs-site/content/docs/integrations/primary-sources.mdx index 4b09c3a6..28ff4559 100644 --- a/docs-site/content/docs/integrations/primary-sources.mdx +++ b/docs-site/content/docs/integrations/primary-sources.mdx @@ -1,6 
+1,6 @@ --- title: Primary Sources -description: Connect KTX to PostgreSQL, Snowflake, BigQuery, ClickHouse, MySQL, SQL Server, or SQLite. +description: Connect KTX to PostgreSQL, Snowflake, BigQuery, MySQL, SQL Server, or SQLite. --- KTX connects to your data warehouse or database to build schema context, @@ -26,9 +26,9 @@ Agents should prefer environment or file references over literal secrets. | Field | Required | Applies to | Description | |-------|----------|------------|-------------| -| `driver` | Yes | all connections | Connector driver such as `postgres`, `snowflake`, `bigquery`, `clickhouse`, `mysql`, `sqlserver`, or `sqlite` | +| `driver` | Yes | all connections | Connector driver such as `postgres`, `snowflake`, `bigquery`, `mysql`, `sqlserver`, or `sqlite` | | `url` | One of the connection methods | URL-style connectors | Database URL, `env:NAME`, or `file:/path/to/secret` | -| `host`, `port`, `database`, `username`, `password` | One of the connection methods | PostgreSQL, MySQL, ClickHouse, SQL Server | Field-by-field connection values | +| `host`, `port`, `database`, `username`, `password` | One of the connection methods | PostgreSQL, MySQL, SQL Server | Field-by-field connection values | | `schema` or `schemas` | No | schema-aware warehouses | Single schema or list of schemas to scan | | `context.queryHistory` | No | PostgreSQL, Snowflake, BigQuery | Enables query-history ingestion when the warehouse supports it | | `path` | Yes for path-style SQLite | SQLite | Local SQLite database path or `env:NAME` reference | @@ -269,63 +269,6 @@ staged artifact shape as Postgres and Snowflake. --- -## ClickHouse - -Connects over HTTP (port 8123) or HTTPS (port 8443). Supports the ClickHouse native type system including `Nullable`, `LowCardinality`, and `Array` wrappers. 
- -### Connection config - -```yaml title="ktx.yaml" -connections: - my-clickhouse: - driver: clickhouse - url: http://localhost:8123/analytics -``` - -Or with individual fields: - -```yaml title="ktx.yaml" -connections: - my-clickhouse: - driver: clickhouse - host: clickhouse.internal - port: 8123 - database: analytics - username: default - password: env:CH_PASSWORD - ssl: false -``` - -### Authentication - -| Method | Config | -|--------|--------| -| Basic auth | `username` + `password` (HTTP basic auth) | -| No auth | Default user `default` with no password | -| HTTPS | Set `ssl: true` (uses port 8443 by default) | - -### Features - -| Feature | Supported | Notes | -|---------|-----------|-------| -| Tables & views | Yes | Via `system.tables`, engine-based detection | -| Primary keys | Yes | Via `system.columns` | -| Foreign keys | No | Not a ClickHouse concept | -| Row count estimates | Yes | Via `system.parts` aggregation | -| Column statistics | No | - | -| Query history | No | - | -| Table sampling | Yes | - | - -### Dialect notes - -- Parameter binding uses `{param:Type}` syntax (e.g., `{database:String}`) -- Detects views vs. tables by engine name (`View`, `MaterializedView`) -- Handles `Nullable(T)` and `LowCardinality(Nullable(T))` type wrappers -- Dictionary tables are excluded from scanning -- Results returned in JSONCompact or JSONEachRow format - ---- - ## MySQL Standard MySQL/MariaDB connector with full foreign key support and schema introspection. @@ -515,4 +458,4 @@ No authentication required - SQLite is file-based. 
The file must be readable by | Database ingest returns no tables | Schema, database, or project filter is wrong, or the user lacks metadata permissions | Verify the schema list and grant metadata read permissions | | Query history is empty | Query history extension or warehouse history view is unavailable | Enable the warehouse-specific history feature, then rerun `ktx ingest --query-history` or `ktx setup` | | Column statistics are missing | Connector cannot access stats tables or the warehouse does not expose them | Grant stats permissions where supported; otherwise rely on fast schema context | -| Semantic query execution fails | Connection is missing, unreachable, or query execution is disabled | Run `ktx connection test ` and check the `ktx sl query` flags | +| Semantic Query execution fails | Connection is missing, unreachable, or query execution is disabled | Run `ktx connection test ` and check the `ktx sl query` flags | diff --git a/docs-site/next.config.mjs b/docs-site/next.config.mjs index 3beb3073..30a96741 100644 --- a/docs-site/next.config.mjs +++ b/docs-site/next.config.mjs @@ -15,6 +15,12 @@ const config = { }, async redirects() { return [ + { + source: "/", + destination: "/ktx/docs/getting-started/introduction", + permanent: false, + basePath: false, + }, { source: "/docs", destination: "/docs/getting-started/introduction", diff --git a/docs-site/middleware.ts b/docs-site/proxy.ts similarity index 96% rename from docs-site/middleware.ts rename to docs-site/proxy.ts index 1b892076..49d1c324 100644 --- a/docs-site/middleware.ts +++ b/docs-site/proxy.ts @@ -6,7 +6,7 @@ const markdownMimeTypes = new Set([ "application/markdown", ]); -export function middleware(request: NextRequest) { +export function proxy(request: NextRequest) { if (!isMarkdownPreferred(request.headers.get("accept"))) { return NextResponse.next(); } diff --git a/docs-site/tests/docs-index-route.test.mjs b/docs-site/tests/docs-index-route.test.mjs index 721813ec..fdd8ec81 100644 --- 
a/docs-site/tests/docs-index-route.test.mjs +++ b/docs-site/tests/docs-index-route.test.mjs @@ -112,6 +112,18 @@ test("/ktx/docs redirects to the docs introduction", async () => { ); }); +test("/ redirects into the /ktx docs site", async () => { + const response = await fetch(`${docsSiteUrl}/`, { + redirect: "manual", + }); + + assert.equal(response.status, 307); + assert.equal( + response.headers.get("location"), + `${docsBasePath}/docs/getting-started/introduction`, + ); +}); + test("/ktx/api/search returns docs search results", async () => { const response = await fetch( `${docsSiteUrl}${docsBasePath}/api/search?query=setup`, diff --git a/docs-site/tests/docs-search-behavior.test.mjs b/docs-site/tests/docs-search-behavior.test.mjs index 0a96482b..ece51477 100644 --- a/docs-site/tests/docs-search-behavior.test.mjs +++ b/docs-site/tests/docs-search-behavior.test.mjs @@ -1,5 +1,5 @@ import assert from "node:assert/strict"; -import { readFile } from "node:fs/promises"; +import { access, readFile } from "node:fs/promises"; import { dirname, join } from "node:path"; import { test } from "node:test"; import { fileURLToPath } from "node:url"; @@ -17,6 +17,23 @@ test("root provider uses the base-path-aware search API", async () => { assert.match(layout, /api:\s*"\/ktx\/api\/search"/); }); +test("metadata icons include the docs base path", async () => { + const layout = await readDocsFile("app/layout.tsx"); + + assert.match(layout, /icon:\s*"\/ktx\/brand\/ktx-mascot\.svg"/); + assert.match(layout, /shortcut:\s*"\/ktx\/brand\/ktx-mascot\.svg"/); + assert.doesNotMatch(layout, /:\s*"\/brand\/ktx-mascot\.svg"/); +}); + +test("markdown negotiation uses the Next proxy convention", async () => { + await assert.doesNotReject(access(join(docsSiteDir, "proxy.ts"))); + await assert.rejects(access(join(docsSiteDir, "middleware.ts"))); + + const proxy = await readDocsFile("proxy.ts"); + assert.match(proxy, /export function proxy/); + assert.doesNotMatch(proxy, /export function 
middleware/); +}); + test("site background stacking does not target every body child", async () => { const css = await readDocsFile("app/global.css"); diff --git a/docs-site/tests/product-mechanics-content.test.mjs b/docs-site/tests/product-mechanics-content.test.mjs index 81d716d3..d0f1cb0c 100644 --- a/docs-site/tests/product-mechanics-content.test.mjs +++ b/docs-site/tests/product-mechanics-content.test.mjs @@ -127,12 +127,11 @@ test("product mechanics component explains ingestion outputs", async () => { assert.doesNotMatch(component, /KTX works in two moments/); assert.doesNotMatch(component, /name: "Metabase and query history"/); assert.doesNotMatch(component, /name: "dbt, MetricFlow, LookML"/); - assert.doesNotMatch(component, /ClickHouse/); assert.doesNotMatch(component, /MySQL/); assert.doesNotMatch(component, /SQL Server/); assert.doesNotMatch( component, - /\/ktx\/brand\/(?:postgresql|snowflake|bigquery|clickhouse|mysql|sqlserver|sqlite|metabase|dbt|looker|notion)\.svg/, + /\/ktx\/brand\/(?:postgresql|snowflake|bigquery|mysql|sqlserver|sqlite|metabase|dbt|looker|notion)\.svg/, ); assert.doesNotMatch(component, / { assert.match(contributing, /context\/\s+# Core context engine/); assert.match(contributing, /llm\/\s+# LLM client abstraction/); assert.match(contributing, /connector-bigquery\/\s+# BigQuery connector/); - assert.match(contributing, /connector-clickhouse\/\s+# ClickHouse connector/); assert.match(contributing, /connector-mysql\/\s+# MySQL connector/); assert.match(contributing, /connector-postgres\/\s+# PostgreSQL connector/); assert.match(contributing, /connector-snowflake\/\s+# Snowflake connector/);