diff --git a/docs-site/app/layout.tsx b/docs-site/app/layout.tsx index 7c808130..230fd232 100644 --- a/docs-site/app/layout.tsx +++ b/docs-site/app/layout.tsx @@ -28,8 +28,8 @@ export const metadata: Metadata = { description: "Open-source context infrastructure that makes agentic analytics reliable.", icons: { - icon: "/brand/ktx-mascot.svg", - shortcut: "/brand/ktx-mascot.svg", + icon: "/ktx/brand/ktx-mascot.svg", + shortcut: "/ktx/brand/ktx-mascot.svg", }, }; diff --git a/docs-site/components/semantic-layer-flow.tsx b/docs-site/components/semantic-layer-flow.tsx new file mode 100644 index 00000000..f4f77506 --- /dev/null +++ b/docs-site/components/semantic-layer-flow.tsx @@ -0,0 +1,873 @@ +"use client"; + +import { + Background, + BackgroundVariant, + Handle, + MarkerType, + type Node, + type NodeProps, + Position, + ReactFlow, +} from "@xyflow/react"; +import "@xyflow/react/dist/style.css"; + +type LaneVariant = "manual" | "ktx"; + +type AgentNodeData = { + variant: "single"; + title: string; + subtitle: string; +}; + +type ManualSqlNodeData = { + variant: "manual"; + badge: string; + title: string; + caption: string; + code: string; + notes: string[]; +}; + +type SlQueryNodeData = { + variant: "slQuery"; + badge: string; + title: string; + caption: string; + code: string; +}; + +type EngineNodeData = { + variant: "engine"; + badge: string; + title: string; + stages: Array<{ index: number; title: string; detail: string }>; +}; + +type CompiledSqlNodeData = { + variant: "compiled"; + badge: string; + title: string; + caption: string; + code: string; + notes: string[]; +}; + +type WarehouseNodeData = { + variant: "warehouse"; + title: string; + drivers: string[]; +}; + +type AgentNode = Node; +type ManualSqlNode = Node; +type SlQueryNode = Node; +type EngineNode = Node; +type CompiledSqlNode = Node; +type WarehouseNode = Node; + +type FlowNode = + | AgentNode + | ManualSqlNode + | SlQueryNode + | EngineNode + | CompiledSqlNode + | WarehouseNode; + +const CANVAS_W 
= 1120; + +const AGENT_W = 380; +const AGENT_H = 104; +const AGENT_X = (CANVAS_W - AGENT_W) / 2; +const AGENT_Y = 16; + +const LANE_W = 488; +const LEFT_LANE_X = 32; +const RIGHT_LANE_X = CANVAS_W - LEFT_LANE_X - LANE_W; + +const LANE_TOP_Y = 248; + +const SL_QUERY_H = 510; +const ENGINE_H = 380; +const COMPILED_H = 1380; +const RIGHT_GAP = 24; + +const RIGHT_LANE_TOTAL = SL_QUERY_H + RIGHT_GAP + ENGINE_H + RIGHT_GAP + COMPILED_H; +const MANUAL_SQL_H = 840; +const LANES_BOTTOM_Y = + LANE_TOP_Y + Math.max(MANUAL_SQL_H, RIGHT_LANE_TOTAL); + +const SL_QUERY_Y = LANE_TOP_Y; +const ENGINE_Y = SL_QUERY_Y + SL_QUERY_H + RIGHT_GAP; +const COMPILED_Y = ENGINE_Y + ENGINE_H + RIGHT_GAP; + +const WAREHOUSE_W = 304; +const WAREHOUSE_H = 92; +const WAREHOUSE_X = (CANVAS_W - WAREHOUSE_W) / 2; +const WAREHOUSE_Y = LANES_BOTTOM_Y + 56; +const CANVAS_H = WAREHOUSE_Y + WAREHOUSE_H + 32; + +const MANUAL_STROKE = "#94a3b8"; +const KTX_STROKE = "#0891b2"; + +const agent: AgentNode = { + id: "agent", + type: "agent", + position: { x: AGENT_X, y: AGENT_Y }, + data: { + variant: "single", + title: "Analytics agent", + subtitle: + "Asks: monthly net revenue and open tickets per segment, high-value orders only, no test customers", + }, + draggable: false, + selectable: false, +}; + +const manualSql: ManualSqlNode = { + id: "manual-sql", + type: "manualSql", + position: { x: LEFT_LANE_X, y: LANE_TOP_Y }, + data: { + variant: "manual", + badge: "Without KTX", + title: "Agent writes the SQL", + caption: + "Stitches four tables, mixes grains, and ships numbers that won't match the dashboard.", + code: `-- agent stitches four tables, mixes facts, +-- and ships numbers that won't match the dashboard + +SELECT + c.segment, + DATE_TRUNC('month', o.created_at) AS month, + SUM(o.amount) - SUM(r.amount) AS net_revenue, + COUNT(t.id) AS open_tickets +FROM customers c +LEFT JOIN orders o + ON o.customer_id = c.id +LEFT JOIN refunds r + ON r.order_id = o.id +LEFT JOIN tickets t + ON t.customer_id = c.id 
+WHERE + c.is_test = false + AND o.amount >= 100 + AND t.status = 'open' -- turns LEFT JOIN into INNER +GROUP BY + c.segment, + DATE_TRUNC('month', o.created_at) +ORDER BY + month, + c.segment +LIMIT 1000; + +-- chasm trap: orders rows multiply by tickets and refunds +-- net_revenue and open_tickets are both inflated +-- DATE_TRUNC syntax breaks on BigQuery`, + notes: [ + "Re-stitches a 4-way join on every question", + "Reinvents net_revenue and the high-value rule", + "Hides a chasm trap across three facts", + "Filters a LEFT JOIN target in WHERE", + "Hardcodes one warehouse's date functions", + ], + }, + draggable: false, + selectable: false, +}; + +const slQuery: SlQueryNode = { + id: "sl-query", + type: "slQuery", + position: { x: RIGHT_LANE_X, y: SL_QUERY_Y }, + data: { + variant: "slQuery", + badge: "With KTX", + title: "Agent sends a Semantic Query", + caption: + "Names the measures, dimensions, segments, and filters it wants. No SQL, no joins.", + code: `{ + "measures": [ + "orders.revenue", + "refunds.amount", + "tickets.open_count", + { + "name": "net_revenue", + "expr": "orders.revenue - refunds.amount" + } + ], + "dimensions": [ + "customers.segment", + { "field": "orders.created_at", "granularity": "month" } + ], + "segments": ["orders.high_value"], + "filters": ["customers.is_test = false"], + "limit": 1000 +}`, + }, + draggable: false, + selectable: false, +}; + +const engine: EngineNode = { + id: "engine", + type: "engine", + position: { x: RIGHT_LANE_X, y: ENGINE_Y }, + data: { + variant: "engine", + badge: "Semantic-layer engine", + title: "Plans the query against the reviewed graph", + stages: [ + { + index: 1, + title: "Resolve refs", + detail: "qualify columns, look up measure formulas", + }, + { + index: 2, + title: "Build join tree", + detail: "Dijkstra over typed edges from an anchor source", + }, + { + index: 3, + title: "Detect fan-out", + detail: "group measures by source, flag chasm traps", + }, + { + index: 4, + title: "Localize 
aggregation", + detail: "pre-aggregate each fact as its own CTE", + }, + { + index: 5, + title: "Transpile dialect", + detail: "emit Postgres-shaped SQL, then target dialect", + }, + ], + }, + draggable: false, + selectable: false, +}; + +const compiledSql: CompiledSqlNode = { + id: "compiled-sql", + type: "compiledSql", + position: { x: RIGHT_LANE_X, y: COMPILED_Y }, + data: { + variant: "compiled", + badge: "Generated SQL", + title: "KTX returns dialect-correct SQL", + caption: + "Pre-aggregates each fact at its own grain, then joins back on the shared dimension.", + code: `WITH orders_agg AS ( + SELECT + customer_id, + DATE_TRUNC('month', created_at) AS month, + SUM(amount) AS revenue + FROM public.orders + WHERE amount >= 100 + GROUP BY + customer_id, + DATE_TRUNC('month', created_at) +), +refunds_agg AS ( + SELECT + o.customer_id, + DATE_TRUNC('month', o.created_at) AS month, + SUM(r.amount) AS refund_amount + FROM public.refunds r + JOIN public.orders o + ON o.id = r.order_id + WHERE o.amount >= 100 + GROUP BY + o.customer_id, + DATE_TRUNC('month', o.created_at) +), +tickets_agg AS ( + SELECT + customer_id, + DATE_TRUNC('month', opened_at) AS month, + COUNT(*) AS open_count + FROM public.tickets + WHERE status = 'open' + GROUP BY + customer_id, + DATE_TRUNC('month', opened_at) +) +SELECT + c.segment, + o.month, + SUM(o.revenue - COALESCE(r.refund_amount, 0)) AS net_revenue, + SUM(o.revenue) AS revenue, + SUM(r.refund_amount) AS refund_amount, + SUM(COALESCE(t.open_count, 0)) AS open_tickets +FROM public.customers c +JOIN orders_agg o + ON o.customer_id = c.id +LEFT JOIN refunds_agg r + ON r.customer_id = c.id + AND r.month = o.month +LEFT JOIN tickets_agg t + ON t.customer_id = c.id + AND t.month = o.month +WHERE c.is_test = false +GROUP BY + c.segment, + o.month +ORDER BY + o.month, + c.segment +LIMIT 1000;`, + notes: [ + "Walks the reviewed join graph automatically", + "Uses the canonical net_revenue formula", + "Pre-aggregates each fact to avoid the chasm 
trap", + "Keeps LEFT JOIN filters on the dimension source", + "Transpiles DATE_TRUNC to the target dialect", + ], + }, + draggable: false, + selectable: false, +}; + +const warehouse: WarehouseNode = { + id: "warehouse", + type: "warehouse", + position: { x: WAREHOUSE_X, y: WAREHOUSE_Y }, + data: { + variant: "warehouse", + title: "Warehouse", + drivers: ["PostgreSQL", "Snowflake", "BigQuery", "ClickHouse"], + }, + draggable: false, + selectable: false, +}; + +const nodes: FlowNode[] = [ + agent, + manualSql, + slQuery, + engine, + compiledSql, + warehouse, +]; + +const arrowMarker = (color: string) => ({ + type: MarkerType.ArrowClosed, + color, + width: 16, + height: 16, +}); + +const edges = [ + { + id: "agent-manual", + source: "agent", + target: "manual-sql", + type: "smoothstep" as const, + label: "writes raw SQL", + labelBgPadding: [6, 3] as [number, number], + labelBgBorderRadius: 4, + labelStyle: { + fontSize: 12, + fontWeight: 500, + fill: "var(--color-fd-muted-foreground)", + }, + labelBgStyle: { + fill: "var(--color-fd-background)", + stroke: "var(--color-fd-border)", + strokeWidth: 1, + }, + style: { + stroke: MANUAL_STROKE, + strokeWidth: 1.5, + strokeDasharray: "5 4", + }, + markerEnd: arrowMarker(MANUAL_STROKE), + }, + { + id: "manual-warehouse", + source: "manual-sql", + target: "warehouse", + type: "smoothstep" as const, + style: { + stroke: MANUAL_STROKE, + strokeWidth: 1.5, + strokeDasharray: "5 4", + }, + markerEnd: arrowMarker(MANUAL_STROKE), + }, + { + id: "agent-slquery", + source: "agent", + target: "sl-query", + type: "smoothstep" as const, + label: "sends Semantic Query", + labelBgPadding: [6, 3] as [number, number], + labelBgBorderRadius: 4, + labelStyle: { + fontSize: 12, + fontWeight: 600, + fill: KTX_STROKE, + }, + labelBgStyle: { + fill: "var(--color-fd-background)", + stroke: "var(--color-fd-border)", + strokeWidth: 1, + }, + style: { stroke: KTX_STROKE, strokeWidth: 1.75 }, + markerEnd: arrowMarker(KTX_STROKE), + }, + { + id: 
"slquery-engine", + source: "sl-query", + target: "engine", + type: "straight" as const, + style: { stroke: KTX_STROKE, strokeWidth: 1.75 }, + markerEnd: arrowMarker(KTX_STROKE), + }, + { + id: "engine-compiled", + source: "engine", + target: "compiled-sql", + type: "straight" as const, + style: { stroke: KTX_STROKE, strokeWidth: 1.75 }, + markerEnd: arrowMarker(KTX_STROKE), + }, + { + id: "compiled-warehouse", + source: "compiled-sql", + target: "warehouse", + type: "smoothstep" as const, + style: { stroke: KTX_STROKE, strokeWidth: 1.75 }, + markerEnd: arrowMarker(KTX_STROKE), + }, +]; + +function AgentNodeView({ data }: NodeProps) { + return ( +
+ +
+ +
+
+

+ {data.title} +

+

+ {data.subtitle} +

+
+
+ ); +} + +function LaneBadge({ + variant, + children, +}: { + variant: LaneVariant; + children: React.ReactNode; +}) { + const cls = + variant === "manual" + ? "border-slate-300 bg-slate-100 text-slate-700 dark:border-slate-600/60 dark:bg-slate-700/40 dark:text-slate-200" + : "border-cyan-300/70 bg-cyan-50 text-cyan-800 dark:border-cyan-400/40 dark:bg-cyan-400/15 dark:text-cyan-100"; + return ( + + + {children} + + ); +} + +function CodeBlock({ + language, + code, + tone, +}: { + language: string; + code: string; + tone: "manual" | "slQuery" | "compiled"; +}) { + const toneClass = + tone === "manual" + ? "text-slate-600 dark:text-slate-300" + : tone === "slQuery" + ? "text-fd-primary" + : "text-fd-primary/90"; + return ( +
+
+ + {language} + + + {tone === "compiled" ? "ktx-compiled" : "agent-authored"} + +
+
+        {code}
+      
+
+ ); +} + +function ManualSqlNodeView({ data }: NodeProps) { + return ( +
+ +
+
+ {data.badge} +

+ {data.title} +

+

+ {data.caption} +

+
+
+
+ +
+
    + {data.notes.map((note) => ( +
  • +
  • + ))} +
+ +
+ ); +} + +function SlQueryNodeView({ data }: NodeProps) { + return ( +
+ +
+
+ {data.badge} +

+ {data.title} +

+

+ {data.caption} +

+
+
+
+ +
+ +
+ ); +} + +function EngineNodeView({ data }: NodeProps) { + return ( +
+
+ ); +} + +function CompiledSqlNodeView({ data }: NodeProps) { + return ( +
+ +
+
+ {data.badge} +

+ {data.title} +

+

+ {data.caption} +

+
+
+
+ +
+
    + {data.notes.map((note) => ( +
  • +
  • + ))} +
+ +
+ ); +} + +function WarehouseNodeView({ data }: NodeProps) { + return ( +
+ +
+ +
+
+

+ {data.title} +

+

+ {data.drivers.join(" • ")} +

+
+
+ ); +} + +const nodeTypes = { + agent: AgentNodeView, + manualSql: ManualSqlNodeView, + slQuery: SlQueryNodeView, + engine: EngineNodeView, + compiledSql: CompiledSqlNodeView, + warehouse: WarehouseNodeView, +}; + +export function SemanticLayerFlow() { + return ( +
+
+
+

+ Imperative vs declarative +

+

+ Same answer, two contracts +

+

+ On the left, the agent works imperatively: chooses tables, writes + joins, picks the grain, and remembers each warehouse's dialect. On + the right, the agent only declares what it wants. KTX handles + every how. +

+
+ +
+ + + +
+
+ +
+ ); +} + +export default SemanticLayerFlow; diff --git a/docs-site/content/docs/cli-reference/ktx-sl.mdx b/docs-site/content/docs/cli-reference/ktx-sl.mdx index b0282a38..f395a170 100644 --- a/docs-site/content/docs/cli-reference/ktx-sl.mdx +++ b/docs-site/content/docs/cli-reference/ktx-sl.mdx @@ -20,7 +20,7 @@ ktx sl [options] | `list` | List semantic-layer sources | | `search ` | Search semantic-layer sources | | `validate ` | Validate a semantic-layer source against the database schema | -| `query` | Compile or execute a semantic-layer query | +| `query` | Compile or execute a Semantic Query | ## Options @@ -52,7 +52,7 @@ ktx sl [options] | Flag | Description | Default | |------|-------------|---------| | `--connection-id ` | KTX connection id | - | -| `--query-file ` | JSON semantic-layer query file | - | +| `--query-file ` | JSON Semantic Query file | - | | `--measure ` | Measure to query; repeatable (at least one required) | - | | `--dimension ` | Dimension to include; repeatable | - | | `--filter ` | Filter expression; repeatable | - | @@ -67,7 +67,7 @@ ktx sl [options] | `--max-rows ` | Maximum rows to return when executing | - | `sl query` requires at least one `--measure` unless `--query-file` is set. -`--query-file` should point to a JSON semantic-layer query object. +`--query-file` should point to a JSON Semantic Query object. ## Examples diff --git a/docs-site/content/docs/concepts/semantic-layer-internals.mdx b/docs-site/content/docs/concepts/semantic-layer-internals.mdx index aec0ccfa..a011d1cb 100644 --- a/docs-site/content/docs/concepts/semantic-layer-internals.mdx +++ b/docs-site/content/docs/concepts/semantic-layer-internals.mdx @@ -1,141 +1,115 @@ --- -title: Context-Aware SQL -description: How KTX turns reviewed context, grain, and relationship evidence into safe SQL for agents. +title: Semantic Querying +description: How KTX compiles a short Semantic Query into safe, dialect-correct SQL using a reviewed join graph. 
--- -## Why query planning needs context +import { SemanticLayerFlow } from "@/components/semantic-layer-flow"; -Agents can generate SQL from schema alone, but safe analytics SQL needs more -than table names. KTX uses reviewed context to understand grain, joins, measures, -filters, and where aggregation must happen. +KTX's semantic layer is a compiler that turns intent into SQL. The agent +declares _what_ it wants — measures, dimensions, filters — in a small +Semantic Query. KTX figures out the _how_: which tables to join, what +grain to aggregate at, how to keep fan-out from inflating measures, and +what dialect the warehouse speaks. -Read this page as four mechanics: +This page covers four mechanics: -- context files feed the semantic engine; -- evidence becomes a join graph with grain and relationship metadata; -- review keeps the graph current; -- query planning avoids fan-out and ambiguous joins. +- The Semantic Query contract agents send to the compiler. +- The planner steps that turn a Semantic Query into SQL. +- The join graph that backs those steps, and how it's built. +- The fan-out failure mode the compiler is designed to prevent. -## Where the semantic layer fits +## Imperative SQL vs declarative Semantic Querying -This planner is one subsystem inside KTX's broader context layer. It uses source -YAML, wiki context, scan evidence, and provenance to make context actionable for -SQL generation. +Writing analytics SQL is imperative work. Every question forces the +agent to hold two things in mind at once: _what_ it wants — a measure, a +slice, a filter — and _how_ to compute it: which tables to join, which +key links them, what grain to aggregate at, how to keep one fact from +inflating another, and what dialect the warehouse speaks. Plumbing on +top of intent, every query. -
-
-
-

- {"Context inputs"} -

-
-
-

semantic-layer/

-

- {"source YAML, measures, joins, grain"} -

-
-
-

wiki/

-

- {"business rules, definitions, caveats"} -

-
-
-

raw-sources/

-

- {"schema scans, keys, imported metadata"} -

-
-
-

provenance

-

- {"ingest decisions and review history"} -

-
-
-
+KTX's semantic layer separates those concerns: - +- **You and KTX maintain the how.** Sources, joins, grain, measures, and + segments live in reviewable YAML — the analytical contract the team + agrees on, version-controlled. +- **The agent declares the what.** It sends a Semantic Query and trusts + the compiler to produce safe SQL. -
-
-

- {"Semantic layer engine"} -

-
-
-

Join graph

-

- {"sources as nodes, joins as typed edges"} -

-
-
-

Grain

-

- {"row identity before aggregation"} -

-
-
-

Measures

-

- {"verified formulas and filters"} -

-
-
-

Relationships

-

- {"many_to_one, one_to_many, one_to_one"} -

-
-
-
- {"Safe query planning before SQL is generated."} -
-
+The agent stops reasoning about plumbing. It states intent. KTX turns +that into SQL the warehouse can run. - + -
-

- {"Agent workflows"} -

-
-
- {"Search sources and wiki pages"} -
-
- {"Compile trusted SQL"} -
-
- {"Explain metrics and provenance"} -
-
- {"Patch files and validate review"} -
-
-
-
-
+## The Semantic Query contract -## Join graph +A Semantic Query is the JSON payload the agent sends. Every field is optional +except `measures`, and column references are fully qualified +(`source.column`) so the compiler never has to guess where a name came +from. -A semantic source is a node. A join is a typed edge. KTX uses the graph to -choose valid paths and detect row-multiplying joins before SQL is generated. +Notice what's _not_ in the payload: no `FROM`, no `JOIN`, no `GROUP BY`, +no `WITH`. The agent states what it wants. KTX picks the join path, the +grain, the SQL shape, and the dialect. + +| Field | Purpose | +|-------|---------| +| `measures` | Names of pre-defined measures, or inline expressions like `sum(orders.amount)` | +| `dimensions` | Columns to group by, optionally with a `granularity` for time fields | +| `filters` | Row-level predicates, classified into `WHERE` or `HAVING` at planning time | +| `segments` | Named filter sets defined on a source, applied as additional predicates | +| `order_by` | Sort fields with optional direction | +| `limit` | Row cap on the result | + +A typical agent call looks like this: + +```json +{ + "measures": ["orders.revenue", "tickets.ticket_count"], + "dimensions": ["customers.segment"], + "filters": ["orders.created_at >= '2025-01-01'"], + "limit": 1000 +} +``` + +That payload is enough for KTX to plan and compile. The agent never +authors a join, a CTE, or a dialect-specific cast. + +## What the planner does + +The planner is a deterministic pipeline. Each Semantic Query runs through the +same ordered steps before any SQL is emitted. + +1. **Resolve refs.** Qualify bare column names, look up pre-defined + measure expressions, and classify each measure as raw or derived. +2. **Pick an anchor and build the join tree.** Choose the largest measure + source as the root, then run a shortest-path search across the typed + join graph to reach every required source. +3. 
**Detect fan-out.** Group measures by their owning source. If more + than one group exists, the planner marks the query as a chasm trap + and switches to aggregate-locality compilation. +4. **Classify filters.** Split predicates into row-level (`WHERE`) and + aggregate-level (`HAVING`) based on whether they reference a measure. +5. **Generate SQL.** Emit Postgres-shaped SQL with the right shape: + single-source aggregation when the query is safe, per-source CTEs + when fan-out is present. +6. **Transpile to the target dialect.** Run the result through `sqlglot` + so the warehouse receives syntax it understands. + +The output is the SQL string, the resolved plan, and any warnings +surfaced during planning. + +## The join graph + +A semantic source is a node. A declared join is a typed edge. The graph +is bidirectional: every forward edge has a reverse with the relationship +inverted, so the planner can traverse from any anchor. | Relationship | Planning impact | |--------------|-----------------| -| `many_to_one` | Usually safe for adding dimensions | -| `one_to_many` | Can multiply measures and trigger fan-out handling | -| `one_to_one` | Usually safe when keys are correct | -| Equal-cost paths | Ambiguous unless aliases or explicit joins disambiguate | +| `many_to_one` | Safe direction for adding dimensions | +| `one_to_many` | Multiplies measures and triggers fan-out handling | +| `one_to_one` | Safe in either direction when keys match | +| Equal-cost paths | Treated as ambiguous; aliases or explicit joins resolve them |
-

customers

-

grain: customer_id

+

{"customers"}

+

{"grain: customer_id"}

-

orders

-

grain: order_id

+

{"orders"}

+

{"grain: order_id"}

-

order_items

-

grain: order_id, line_id

+

{"order_items"}

+

{"grain: order_id, line_id"}

-
orders -> customers: many_to_one
-
orders -> order_items: one_to_many
+
{"orders -> customers: many_to_one"}
+
{"orders -> order_items: one_to_many"}
{"Example: "} - {"refunds joins to orders. Used carefully, it explains net revenue. Joined naively, it can duplicate order-level measures."} + {"refunds joins to orders. Used carefully, it explains net revenue. Joined naively, it duplicates order-level measures."}
-The graph is bidirectional for planning. If `orders -> customers` is -`many_to_one`, the reverse path is `one_to_many`. +Edges and grain come from your YAML. The compiler treats them as fact, +not a guess. + +```yaml +# semantic-layer/warehouse/orders.yaml +name: orders +table: public.orders +grain: [order_id] +joins: + - to: customers + on: customer_id = customers.id + relationship: many_to_one + - to: order_items + on: id = order_items.order_id + relationship: one_to_many +measures: + - name: revenue + expr: sum(case when status != 'refunded' then amount end) +``` ## Building and maintaining the graph -KTX starts from evidence, writes reviewable source YAML, and treats the merged -diff as the accepted graph. +KTX builds the graph from evidence and accepted edits, not from runtime +inference. Each input contributes a different kind of authority. | Evidence | What it contributes | |----------|---------------------| | Declared primary keys | Initial row grain | | Declared foreign keys | Formal join candidates | -| Inferred relationships | Edges when warehouses lack constraints | +| Inferred relationships | Edges when the warehouse lacks constraints | | dbt, MetricFlow, and LookML imports | Existing metrics, dimensions, explores, and joins | -| Query history | Real join and filter patterns | +| Query history | Real join and filter patterns from analyst SQL | | Analyst review | Final authority before context is merged |
-## Modeling problems +## Fan-out and aggregate locality -Fan-out is the classic failure mode: an order-level measure joins to line-item -rows before aggregation, so one order becomes many rows. +Fan-out is the classic analytics failure mode. Two fact tables join to a +shared dimension. A naive query joins them all together first, so each +row from one fact is multiplied by the matching rows from the other. +Measures duplicate, numbers go wrong, and the agent doesn't notice. -| Problem | What happens | How KTX handles it | -|---------|--------------|--------------------| -| Order measure joins to `order_items` | `orders.revenue` repeats once per item | Detect `one_to_many` and pre-aggregate | -| Two fact sources share `customers` | Measures multiply across the shared dimension | Treat as a chasm trap and plan each fact locally | -| Filter crosses `one_to_many` | Filtering changes measure grain | Reject or localize the filter | -| Equal-cost paths connect sources | Join choice is ambiguous | Prefer safer paths or require aliases | - -## Execution planning - -The planner resolves sources, chooses a join tree, checks relationship paths, -and picks a simple or aggregate-locality SQL shape. +KTX's planner detects the shape by grouping measures by their owning +source. If more than one source contributes raw measures, the generator +switches to aggregate locality: each fact is pre-aggregated at its own +grain inside a CTE, and the CTEs are joined back to the dimension at the +end. 
| Naive SQL shape | Semantic-layer SQL shape | |-----------------|--------------------------| -| Join facts and dimensions first, then aggregate | Aggregate each fact source at its own grain, then join results | -| Put every filter in one outer `WHERE` clause | Keep measure filters with the measure source when locality is needed | -| Trust the shortest textual join path | Prefer safe relationship paths and reject disconnected sources | -| Let dimension grain differ across facts | Raise when asymmetric dimensions would fan out another measure | +| Join facts and dimensions first, then aggregate | Aggregate each fact at its own grain, then join | +| Put every filter in one outer `WHERE` clause | Keep measure filters with the measure source | +| Trust the shortest textual join path | Prefer typed safe paths, reject disconnected sources | +| Let dimension grain differ across facts | Raise when an asymmetric dimension would fan out another measure | -
-
-

- {"Fan-out handling"} -

-

- {"The same question planned before and after KTX preserves the measure grain."} -

-
-
-
-
-

- {"Unsafe shape"} -

-

- {"Join first, aggregate later"} -

-
-
-{`orders
-  -> join order_items
-  -> join customers
+The result is the answer an analyst would expect, computed with the join shape
+they would have written by hand.
 
-group by
-  customer_segment
+## Where the context comes from
 
-measure
-  sum(orders.amount)`}
-      
-
- {"Order-level revenue is exposed to line-item fan-out before aggregation."} -
-
-
-
-

- {"KTX shape"} -

-

- {"Aggregate locally, then join"} -

-
-
-{`orders_agg as (
-  select customer_id, sum(amount) revenue
-  from orders
-  group by customer_id
-)
-select customers.segment, sum(revenue)
-from orders_agg
-join customers`}
-      
-
- {"The measure is pre-aggregated at order grain before dimensions are joined."} -
-
-
-
+The planner is only as good as the YAML it reads. KTX builds and +maintains that YAML for you. -The result is structured planning: validated sources, typed relationships, -graph search, fan-out detection, aggregate locality, and dialect transpilation. +- `raw-sources/<connection-id>/` holds scan evidence from your warehouse: + schemas, columns, keys, samples, and observed usage patterns. +- `wiki/` holds business language, definitions, and caveats. The + planner doesn't read wiki at compile time, but the agent does, so + measure names and dimensions stay anchored to terms the team uses. +- `semantic-layer/<connection-id>/` holds the structured sources, joins, + grain, measures, and segments the planner actually compiles against. + +Every accepted edit flows back into the next ingest, so the graph stays +current as the warehouse changes. ## Agent usage notes -Use this page when an agent needs to explain how KTX turns reviewed semantic -context into SQL, why relationship metadata matters, or why a query was rejected -as unsafe. +Point an agent at this page when it needs to explain why KTX asks for +grain, why a query was rejected as unsafe, or why the compiled SQL looks +different from what the agent first proposed. 
| Agent task | Relevant section | Next page | |------------|------------------|-----------| -| Explain why KTX asks for `grain` and relationship types | Join graph | [Writing Context](/docs/guides/writing-context) | -| Diagnose duplicated measures after a join | Modeling problems | [ktx sl](/docs/cli-reference/ktx-sl) | -| Explain safe SQL generation | Execution planning | [ktx sl](/docs/cli-reference/ktx-sl) | -| Describe how semantic context stays current | Building and maintaining the graph | [Context as Code](/docs/concepts/context-as-code) | +| Explain the Semantic Query shape | The Semantic Query contract | [ktx sl](/docs/cli-reference/ktx-sl) | +| Describe what the planner does between query and SQL | What the planner does | [ktx sl](/docs/cli-reference/ktx-sl) | +| Explain why KTX asks for grain and relationship types | The join graph | [Writing context](/docs/guides/writing-context) | +| Diagnose duplicated measures after a join | Fan-out and aggregate locality | [ktx sl](/docs/cli-reference/ktx-sl) | +| Describe how semantic context stays current | Building and maintaining the graph | [Context as code](/docs/concepts/context-as-code) | diff --git a/docs-site/content/docs/concepts/the-context-layer.mdx b/docs-site/content/docs/concepts/the-context-layer.mdx index 9a8130d0..c56327c5 100644 --- a/docs-site/content/docs/concepts/the-context-layer.mdx +++ b/docs-site/content/docs/concepts/the-context-layer.mdx @@ -74,7 +74,7 @@ measures: ``` For join graphs, fan-out handling, and execution mechanics, read -[Context-Aware SQL](/docs/concepts/semantic-layer-internals). +[Semantic Querying](/docs/concepts/semantic-layer-internals). 
## Wiki pages diff --git a/docs-site/content/docs/integrations/agent-clients.mdx b/docs-site/content/docs/integrations/agent-clients.mdx index ffb67b59..4a670315 100644 --- a/docs-site/content/docs/integrations/agent-clients.mdx +++ b/docs-site/content/docs/integrations/agent-clients.mdx @@ -285,7 +285,7 @@ Admin CLI skills call the same KTX CLI commands: | `ktx sl list --json` | List semantic-layer sources | | `ktx sl search --json` | Search semantic-layer sources | | `ktx sl validate --connection-id ` | Validate semantic source definitions | -| `ktx sl query --format json` | Execute a semantic-layer query when semantic compute is configured | +| `ktx sl query --format json` | Execute a Semantic Query when semantic compute is configured | ### Security constraints diff --git a/docs-site/content/docs/integrations/primary-sources.mdx b/docs-site/content/docs/integrations/primary-sources.mdx index 4b09c3a6..8f9e610c 100644 --- a/docs-site/content/docs/integrations/primary-sources.mdx +++ b/docs-site/content/docs/integrations/primary-sources.mdx @@ -515,4 +515,4 @@ No authentication required - SQLite is file-based. 
The file must be readable by | Database ingest returns no tables | Schema, database, or project filter is wrong, or the user lacks metadata permissions | Verify the schema list and grant metadata read permissions | | Query history is empty | Query history extension or warehouse history view is unavailable | Enable the warehouse-specific history feature, then rerun `ktx ingest --query-history` or `ktx setup` | | Column statistics are missing | Connector cannot access stats tables or the warehouse does not expose them | Grant stats permissions where supported; otherwise rely on fast schema context | -| Semantic query execution fails | Connection is missing, unreachable, or query execution is disabled | Run `ktx connection test ` and check the `ktx sl query` flags | +| Semantic Query execution fails | Connection is missing, unreachable, or query execution is disabled | Run `ktx connection test ` and check the `ktx sl query` flags | diff --git a/docs-site/next.config.mjs b/docs-site/next.config.mjs index 3beb3073..30a96741 100644 --- a/docs-site/next.config.mjs +++ b/docs-site/next.config.mjs @@ -15,6 +15,12 @@ const config = { }, async redirects() { return [ + { + source: "/", + destination: "/ktx/docs/getting-started/introduction", + permanent: false, + basePath: false, + }, { source: "/docs", destination: "/docs/getting-started/introduction", diff --git a/docs-site/middleware.ts b/docs-site/proxy.ts similarity index 96% rename from docs-site/middleware.ts rename to docs-site/proxy.ts index 1b892076..49d1c324 100644 --- a/docs-site/middleware.ts +++ b/docs-site/proxy.ts @@ -6,7 +6,7 @@ const markdownMimeTypes = new Set([ "application/markdown", ]); -export function middleware(request: NextRequest) { +export function proxy(request: NextRequest) { if (!isMarkdownPreferred(request.headers.get("accept"))) { return NextResponse.next(); } diff --git a/docs-site/tests/docs-index-route.test.mjs b/docs-site/tests/docs-index-route.test.mjs index 721813ec..fdd8ec81 100644 --- 
a/docs-site/tests/docs-index-route.test.mjs +++ b/docs-site/tests/docs-index-route.test.mjs @@ -112,6 +112,18 @@ test("/ktx/docs redirects to the docs introduction", async () => { ); }); +test("/ redirects into the /ktx docs site", async () => { + const response = await fetch(`${docsSiteUrl}/`, { + redirect: "manual", + }); + + assert.equal(response.status, 307); + assert.equal( + response.headers.get("location"), + `${docsBasePath}/docs/getting-started/introduction`, + ); +}); + test("/ktx/api/search returns docs search results", async () => { const response = await fetch( `${docsSiteUrl}${docsBasePath}/api/search?query=setup`, diff --git a/docs-site/tests/docs-search-behavior.test.mjs b/docs-site/tests/docs-search-behavior.test.mjs index 0a96482b..ece51477 100644 --- a/docs-site/tests/docs-search-behavior.test.mjs +++ b/docs-site/tests/docs-search-behavior.test.mjs @@ -1,5 +1,5 @@ import assert from "node:assert/strict"; -import { readFile } from "node:fs/promises"; +import { access, readFile } from "node:fs/promises"; import { dirname, join } from "node:path"; import { test } from "node:test"; import { fileURLToPath } from "node:url"; @@ -17,6 +17,23 @@ test("root provider uses the base-path-aware search API", async () => { assert.match(layout, /api:\s*"\/ktx\/api\/search"/); }); +test("metadata icons include the docs base path", async () => { + const layout = await readDocsFile("app/layout.tsx"); + + assert.match(layout, /icon:\s*"\/ktx\/brand\/ktx-mascot\.svg"/); + assert.match(layout, /shortcut:\s*"\/ktx\/brand\/ktx-mascot\.svg"/); + assert.doesNotMatch(layout, /:\s*"\/brand\/ktx-mascot\.svg"/); +}); + +test("markdown negotiation uses the Next proxy convention", async () => { + await assert.doesNotReject(access(join(docsSiteDir, "proxy.ts"))); + await assert.rejects(access(join(docsSiteDir, "middleware.ts"))); + + const proxy = await readDocsFile("proxy.ts"); + assert.match(proxy, /export function proxy/); + assert.doesNotMatch(proxy, /export function 
middleware/); +}); + test("site background stacking does not target every body child", async () => { const css = await readDocsFile("app/global.css");