chore(workspace): gate dead-code with knip production mode (#196)

* refactor(workspace): relocate @ktx/llm source into packages/cli/src/llm * refactor(workspace): rewrite @ktx/llm imports to relative paths * refactor(workspace): fold internal packages into cli * chore(workspace): gate dead-code with knip production mode Turn on production-mode knip plus an autofix run in pre-commit and the `pnpm dead-code` script, document the `/** @internal */` convention for test-only exports in AGENTS.md, annotate test-only exports across the CLI with that JSDoc, and drop dead exports/wrappers the new gate surfaced (e.g. `cli-project.ts`, `lookerRuntimeSourceToFileAdapterSource`, `createLocalScanEnrichmentProvidersFromConfig`, `PGLITE_OWNER_PROCESS_BACKEND_CAPABILITIES`, stale type re-exports). Replace the loose `ignoreIssues` allowlist in `knip.json` with explicit production entries so cross-package barrel leaks are caught. * refactor(cli): delete internal barrel index.ts files The 34 `index.ts` re-export barrels inside `packages/cli/src/` were holdovers from the pre-fold multi-workspace structure. Post-fold-in they served no production purpose: external consumers go through the single package main entry, and in-repo callers mostly imported through them only because the path was short. Internally, knip flagged most barrel re-exports as production-dead (only reached via tests). This change: - Deletes every internal barrel except `packages/cli/src/index.ts` (the published package entry). - Rewrites ~270 source/test files to import each name directly from the file that defines it. - Moves `tools/warehouse-verification/index.ts` to `create-warehouse-verification-tools.ts` (the function it defined locally) and updates its single consumer. - Renames `search/backend-conformance.ts` → `.test-utils.ts` to match the existing test-helper file convention. 
- Deletes 13 dead test-only chains (dbt-descriptions/*, live-database/extracted-schema, live-database/structural-sync, relationship-* feedback/review chain) plus their tests and a cascading orphan integration test. - Updates test mocks that pointed at deleted barrel paths (notion-client, connector barrels in scan/local-scan-connectors tests) to mock the source files instead. - Points the maintainer benchmark script (`scripts/relationship-benchmark-report.mjs`) at source files instead of `dist/context/scan/index.js`. - Drops the barrel `!` entries from `knip.json`; adds explicit production entries only for the benchmark code reached via dist by the maintainer script. Net: 413 files changed, ~1.2k insertions, ~9.4k deletions. `pnpm run dead-code` (Biome + knip default + knip production) and `pnpm run type-check` are clean; 2277 tests pass. * refactor(workspace): rename @ktx/cli to @kaelio/ktx and pack it directly Promote the CLI workspace package to the public name `@kaelio/ktx` and drop the separate `scripts/build-public-npm-package.mjs` wrapper. The CLI package is now publishable in place (`publishConfig.access: public`, `provenance: true`), so artifact packing uses `pnpm pack` against `packages/cli/` instead of assembling a parallel package tree. Updates all workspace filter invocations, docs, tests, and release readiness checks to reference the new package name, and folds the tarball-name helper into `scripts/public-npm-release-metadata.mjs`. * docs: align "agent clients" and "data agents" terminology Replace "client agents" with "agent clients" and "database agents" with "data agents" across AGENTS.md, README.md, the docs-site copy, and the matching setup-agents test description, matching the canonical vocabulary in docs/terminology.md. Also moves packages/cli/tsconfig.json's tsBuildInfoFile from node_modules/.cache/ to dist/.tsbuildinfo so incremental builds survive node_modules reinstalls. 
* refactor(release): single source of truth for package version Make packages/cli/package.json the single source of truth for the @kaelio/ktx version. publicNpmPackageVersion() now reads it directly, so artifact filenames, release-readiness checks, and the Python wheel version all derive from one field. The duplicate release-policy.json.publicNpmPackageVersion is removed. Previously the two fields could drift: tarballs were named kaelio-ktx-0.4.1.tgz while internally containing @kaelio/ktx@0.0.0-private. - update-public-release-version.mjs rewrites both Python pyproject.toml files (ktx-daemon, ktx-sl) alongside the npm package.jsons, normalizing the version for PEP 440 (e.g. 0.1.0-rc.2 -> 0.1.0rc2). - semantic-release-config.cjs adds the two pyproject.toml files to @semantic-release/git assets so the release commit back to main carries every version source in lockstep. - The six "?? '0.0.0-private'" fallback literals across the CLI are replaced with "?? getKtxCliPackageInfo().version", and createDefaultKtxMcpServer makes its version arg required. - docs/release.md describes the actual commit-back model: the dev tree always reflects the most recent release; no sentinel pin to maintain. Verified: pnpm run artifacts:build now produces kaelio-ktx-0.4.1.tgz and kaelio_ktx-0.4.1-py3-none-any.whl with @kaelio/ktx@0.4.1 inside. Full type-check, dead-code, and 2287 vitests + 173 script tests pass. * refactor(cli): inject embedding provider resolution and detect sentence-transformers runtime Make resolveProjectEmbeddingProvider and runtimeIo injectable in ingest and scan command entrypoints so tests can stub them, and teach resolvePublicIngestRuntimeRequirements to flag the local-embeddings runtime feature when ktx.yaml selects sentence-transformers. * chore(cli): mark buildLocalStatsStatus and LocalStatsStatus as @internal Both symbols are consumed only by status-project.test.ts. 
Annotating with /** @internal */ keeps knip's production-mode check clean without changing runtime behavior. * fix(cli): use real package metadata in print-command-tree The stubbed package name embedded a forbidden product identifier that tripped the boundary check in CI. Read the metadata from package.json instead — keeps the rendered tree unchanged and removes a duplicate source of truth. * feat(cli): show embedding coverage in `ktx status`, drop duplicate disk counts Inline `(N embedded)` next to the Wiki scope counts and Semantic-layer source counts, computed with `SUM(embedding_json IS NOT NULL)` over `knowledge_pages` and `local_sl_sources`. Rename the "Knowledge" label to "Wiki" (canonical per `docs/terminology.md`) and rename the matching `localStats.knowledgePages` field to `localStats.wikiPages`. Drop `wiki=N md` and `semantic-layer=N yaml` from the Disk row — those duplicated the per-surface rows above. Disk now reports only actual byte usage (db, cache, raw-sources). The unused `wikiGlobalMarkdownCount` / `semanticLayerYamlCount` fields, the `isMarkdownEntry` / `isYamlEntry` helpers, and the `filter` arg on `summarizeDir` are removed.
2026-06-25 08:48:08 +02:00 · 2026-05-21 15:28:58 +02:00 · 2026-05-21 15:28:58 +02:00 · 2366b00301
commit 2366b00301
parent a1cfb03d73
1002 changed files with 2286 additions and 12051 deletions
--- a/packages/cli/src/context/sl/tools/sl-write-source.tool.ts
+++ b/packages/cli/src/context/sl/tools/sl-write-source.tool.ts
@ -0,0 +1,406 @@
+import YAML from 'yaml';
+import { z } from 'zod';
+import { addTouchedSlSource } from '../../../context/tools/touched-sl-sources.js';
+import type { ToolContext, ToolOutput } from '../../../context/tools/base-tool.js';
+import { validateActionRawPaths } from '../../../context/tools/action-raw-paths.js';
+import { validateActionTargetConnection } from '../../../context/tools/action-target-connection.js';
+import { sourceOverlaySchema } from '../schemas.js';
+import type { SemanticLayerService } from '../semantic-layer.service.js';
+import type { SemanticLayerSource } from '../types.js';
+import {
+  BaseSemanticLayerTool,
+  type BaseSemanticLayerToolDeps,
+  type SemanticLayerStructured,
+  sourceDefinitionSchema,
+} from './base-semantic-layer.tool.js';
+import { normalizeSemanticLayerDescriptions } from '../description-normalization.js';
+import { slToolConnectionIdSchema } from './connection-id-schema.js';
+
+/** Payload accepted in `source`: either a standalone source definition or an overlay. */
+const sourceInputSchema = z.union([sourceDefinitionSchema, sourceOverlaySchema]);
+
+/**
+ * Input schema for the `sl_write_source` tool.
+ *
+ * `source` and `delete` are both optional and are not made mutually exclusive here;
+ * `SlWriteSourceTool.call` checks `delete` first and only then requires `source`.
+ */
+const slWriteSourceInputSchema = z.object({
+  connectionId: slToolConnectionIdSchema.describe('Data source connection ID'),
+  // NOTE(review): this pattern only admits lowercase names, while the join example in the tool
+  // description references upper-case sources (ACCOUNTS) — confirm overlays that must match such
+  // manifest entries by name are not meant to go through this tool.
+  sourceName: z
+    .string()
+    .regex(/^[a-z0-9][a-z0-9_]*$/, 'Source name must be snake_case (lowercase alphanumeric and underscores)')
+    .describe('Name of the source to create, edit, or delete'),
+  source: sourceInputSchema
+    .optional()
+    .describe(
+      'Source definition (standalone with table/sql) or overlay (measures, column_overrides, computed columns, etc.)',
+    ),
+  delete: z.boolean().optional().describe('Set to true to delete this source entirely'),
+  // Each listed path must be a non-empty string; the list itself may be omitted.
+  rawPaths: z
+    .array(z.string().min(1))
+    .optional()
+    .describe('In ingest sessions, raw source file paths that directly support this SL action.'),
+});
+
+/** Parsed input type inferred from `slWriteSourceInputSchema`. */
+type SlWriteSourceInput = z.infer<typeof slWriteSourceInputSchema>;
+
+/**
+ * Picks the connection ID to record on a session action.
+ *
+ * @param runConnectionId - Connection ID of the current run/session, if any.
+ * @param actionConnectionId - Connection ID the action was executed against.
+ * @returns `actionConnectionId` when the run has a (non-empty) connection ID that differs
+ *   from it; otherwise `null`.
+ */
+function actionTargetConnectionId(
+  runConnectionId: string | null | undefined,
+  actionConnectionId: string,
+): string | null {
+  // No run-level connection (null, undefined, or empty string): nothing to contrast against.
+  if (!runConnectionId) {
+    return null;
+  }
+  if (runConnectionId === actionConnectionId) {
+    return null;
+  }
+  return actionConnectionId;
+}
+
+export class SlWriteSourceTool extends BaseSemanticLayerTool<typeof slWriteSourceInputSchema> {
+  /** Tool name exposed by this class. */
+  readonly name = 'sl_write_source';
+
+  /**
+   * Forwards the dependency bundle to `BaseSemanticLayerTool` unchanged.
+   *
+   * NOTE(review): adds no behavior of its own; before removing it, confirm it is not needed
+   * to change the visibility of the base constructor.
+   */
+  constructor(deps: BaseSemanticLayerToolDeps) {
+    super(deps);
+  }
+
+  /**
+   * Tool description text, organized into tagged sections: purpose, when to use, editing
+   * approach, the fields of a source definition, and join requirements.
+   *
+   * The returned string is runtime content; keep wording changes deliberate.
+   */
+  get description(): string {
+    return `<purpose>
+Create a new semantic layer source or fully rewrite an existing one.
+If the source already exists, this tool will overwrite it with the new definition.
+</purpose>
+
+<when_to_use>
+- First time creating a source definition
+- When modeling a new SQL-backed source (e.g., churn risk view, ARR calculation)
+- When the user asks to start over / fully rewrite a source
+- Consolidating multiple sources into one (write merged definition)
+- For targeted edits to existing sources (add/remove measures, update joins), prefer sl_edit_source instead
+</when_to_use>
+
+<editing_approach>
+- New source: provide \`source\` with full definition
+- Full rewrite: provide \`source\` (overwrites existing)
+- Targeted edits on an existing source: use sl_edit_source instead
+- Delete: set \`delete: true\`
+</editing_approach>
+
+<source_definition>
+- name: Unique identifier for the source
+- table: For physical table/view sources (e.g., "public.orders"). Mutually exclusive with sql.
+- sql: For SQL-based sources (the SQL query). Mutually exclusive with table.
+- grain: What one row represents (e.g., ["id"], ["customer_id", "product_id"])
+- columns: All columns with type (string/number/time/boolean) and optional descriptions. On overlays, columns are computed-only and require expr + type.
+- column_overrides: Overlay-only metadata patches for existing manifest columns (descriptions, role, visibility, constraints, enum_values, tests). Do not include type or expr.
+- joins: Relationships to other sources (to, on, relationship: many_to_one/one_to_many/one_to_one)
+- measures: Pre-defined aggregations (name, expr like "sum(amount)", optional filter, optional segments — bare names of segments defined on the same source, optional description)
+- segments: Named, reusable boolean predicates scoped to this source (name, expr — a SQL boolean over this source's columns, optional description). A measure references one with \`segments: [name]\`; a query references one with the dotted form \`source.segment_name\`. Use when the same predicate appears on 3+ measures — e.g. extract \`is_paid = true and is_refunded = '0'\` as \`segments: [{name: paid_non_refunded, expr: "..."}]\` and have each measure use \`segments: [paid_non_refunded]\` instead of re-typing the predicate inside \`sum(case when ... then x end)\`. Segments are predicates only — they cannot be selected as dimensions or grouped by; if you need to group by the predicate, add a \`columns[]\` entry instead.
+</source_definition>
+
+<join_requirements>
+Sources with joins: [] are disconnected from the semantic layer join graph and cannot be composed with other sources in semantic queries.
+Before writing, use discover_data to check existing sources and their grain columns.
+For each grain/key column in your source (e.g., account_id, item_id), find the matching dimension source (e.g., ACCOUNTS, ITEMS) and declare a many_to_one join.
+Example: a source graining on [account_id] should declare:
+  joins:
+    - to: ACCOUNTS
+      on: source_name.account_id = ACCOUNTS.ACCOUNT_ID
+      relationship: many_to_one
+The on condition format: local_column = TARGET_SOURCE.target_column (right side must include target source name).
+Do NOT join back to a table that the SQL already aggregates from if the grain column is not in the output (the relationship is already baked into the SQL).
+</join_requirements>`;
+  }
+
+  /** Zod schema describing the input this tool accepts (`slWriteSourceInputSchema`). */
+  get inputSchema() {
+    return slWriteSourceInputSchema;
+  }
+
+  /**
+   * Runs the tool: deletes `sourceName` when `input.delete` is set, otherwise creates or
+   * fully rewrites it from `input.source`.
+   *
+   * Target-connection and raw-path validation failures, a missing `source`, and errors from
+   * the delete itself are returned as a failed output instead of being thrown.
+   *
+   * @param input - Parsed tool input (see `slWriteSourceInputSchema`).
+   * @param context - Tool context; when a session is present, its semantic layer service is
+   *   preferred and the performed action is recorded on the session.
+   * @returns Structured tool output produced by `buildOutput`.
+   */
+  async call(input: SlWriteSourceInput, context: ToolContext): Promise<ToolOutput<SemanticLayerStructured>> {
+    const { connectionId, sourceName } = input;
+    const { name: author, email: authorEmail } = await this.authorResolver.resolve(context.userId);
+
+    // A session may carry its own service instance; fall back to the tool-level one.
+    const semanticLayerService = context.session?.semanticLayerService ?? this.semanticLayerService;
+    // Search re-indexing is skipped for worktree-scoped sessions.
+    const skipIndex = context.session?.isWorktreeScoped === true;
+    const targetConnectionValidation = validateActionTargetConnection(context.session, connectionId);
+    if (!targetConnectionValidation.ok) {
+      return this.buildOutput(false, [targetConnectionValidation.error], sourceName);
+    }
+    const rawPathValidation = validateActionRawPaths(context.session, input.rawPaths);
+    if (!rawPathValidation.ok) {
+      return this.buildOutput(false, [rawPathValidation.error], sourceName);
+    }
+
+    // Handle delete (takes precedence over `source` when both are supplied)
+    if (input.delete) {
+      try {
+        await semanticLayerService.deleteSource(connectionId, sourceName, author, authorEmail);
+        if (!skipIndex) {
+          // Re-indexing is best-effort as a whole: the delete has already happened, so a failure
+          // while reloading sources or indexing them must not turn the result into an error or
+          // prevent the session action from being recorded.
+          try {
+            const { sources: allSources } = await semanticLayerService.loadAllSources(connectionId);
+            await this.slSearchService.indexSources(connectionId, allSources);
+          } catch {
+            // Intentionally ignored: the search index is refreshed on a later successful write.
+          }
+        }
+        if (context.session) {
+          addTouchedSlSource(context.session.touchedSlSources, connectionId, sourceName);
+          context.session.actions.push({
+            target: 'sl',
+            type: 'removed',
+            key: sourceName,
+            detail: 'Deleted source',
+            targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId),
+            ...(rawPathValidation.rawPaths ? { rawPaths: rawPathValidation.rawPaths } : {}),
+          });
+        }
+        return this.buildOutput(true, [], sourceName, { yaml: undefined, commitHash: undefined });
+      } catch (error) {
+        return this.buildOutput(false, [error instanceof Error ? error.message : String(error)], sourceName);
+      }
+    }
+
+    // Require source for create/rewrite
+    if (!input.source) {
+      return this.buildOutput(
+        false,
+        ['Provide `source` to create or rewrite. For targeted edits, use sl_edit_source.'],
+        sourceName,
+      );
+    }
+
+    return this.writeFullSource(
+      connectionId,
+      input.source,
+      sourceName,
+      author,
+      authorEmail,
+      context,
+      semanticLayerService,
+      skipIndex,
+      rawPathValidation.rawPaths,
+    );
+  }
+
+  /**
+   * Creates or fully rewrites a source from a complete definition or overlay.
+   *
+   * Steps, in order: normalize descriptions, serialize to YAML, reject orphan overlays and
+   * standalone sources that shadow a manifest entry, validate against the rest of the
+   * semantic layer, write, refresh the search index (best-effort), and record the action on
+   * the session when one is present.
+   *
+   * NOTE(review): `sourceName` drives the existence check, guards, commit message and session
+   * action, while the written payload is `source` itself; nothing here compares `sourceName`
+   * with the payload's own name — confirm that is enforced elsewhere.
+   *
+   * @returns A failed output (nothing saved) when a guard or validation rejects the source or
+   *   the write throws; otherwise a successful output carrying the YAML, the commit hash and
+   *   any validation warnings.
+   */
+  private async writeFullSource(
+    connectionId: string,
+    source: z.infer<typeof sourceInputSchema>,
+    sourceName: string,
+    author: string,
+    authorEmail: string,
+    context: ToolContext,
+    semanticLayerService: SemanticLayerService,
+    skipIndex: boolean,
+    rawPaths: string[] | undefined,
+  ): Promise<ToolOutput<SemanticLayerStructured>> {
+    // Missing descriptions are only filled in when the session is an ingest session.
+    const normalizedSource = normalizeSemanticLayerDescriptions(source, { fillMissing: !!context.session?.ingest });
+    // An overlay is any payload with neither a truthy `table` nor a truthy `sql`.
+    const isOverlay =
+      !('table' in normalizedSource && normalizedSource.table) && !('sql' in normalizedSource && normalizedSource.sql);
+
+    // `existing` is only used for wording (commit message, action type/detail).
+    const existing = await this.readSourceYamlFromService(semanticLayerService, connectionId, sourceName);
+    const commitMessage = existing
+      ? `${isOverlay ? 'Update overlay' : 'Rewrite source'}: ${sourceName}`
+      : `${isOverlay ? 'Create overlay' : 'Create source'}: ${sourceName}`;
+
+    const yamlContent = YAML.stringify(normalizedSource, { indent: 2, lineWidth: 0, version: '1.1' });
+
+    const orphanError = await this.rejectOrphanOverlay(semanticLayerService, connectionId, sourceName, yamlContent);
+    if (orphanError) {
+      return this.buildOutput(false, [orphanError], sourceName, { yaml: yamlContent });
+    }
+    const shadowError = await this.rejectStandaloneShadow(semanticLayerService, connectionId, sourceName, yamlContent);
+    if (shadowError) {
+      return this.buildOutput(false, [shadowError], sourceName, { yaml: yamlContent });
+    }
+
+    const validatedSource = normalizedSource as SemanticLayerSource;
+    const validationResult = await semanticLayerService.validateWithProposedSource(connectionId, validatedSource);
+    const validationErrors = validationResult.errors;
+    const validationWarnings = [...validationResult.warnings];
+    const actionRequiredWarnings = validationResult.perSourceWarnings?.[sourceName] ?? [];
+    if (validationErrors.length > 0) {
+      return this.buildOutput(false, ['Validation failed — source was NOT saved:', ...validationErrors], sourceName, {
+        yaml: yamlContent,
+        validationErrors,
+        validationWarnings,
+        actionRequiredWarnings,
+      });
+    }
+
+    try {
+      const result = await semanticLayerService.writeSource(
+        connectionId,
+        validatedSource,
+        author,
+        authorEmail,
+        commitMessage,
+      );
+
+      if (!skipIndex) {
+        // Re-indexing is best-effort as a whole: the source is already saved, so a failure
+        // while reloading sources or indexing them must not be reported as a failed write or
+        // prevent the session action from being recorded.
+        try {
+          const { sources: allSources } = await semanticLayerService.loadAllSources(connectionId);
+          await this.slSearchService.indexSources(connectionId, allSources);
+        } catch {
+          // Intentionally ignored: the search index is refreshed on a later successful write.
+        }
+      }
+
+      if (context.session) {
+        addTouchedSlSource(context.session.touchedSlSources, connectionId, sourceName);
+        context.session.actions.push({
+          target: 'sl',
+          type: existing ? 'updated' : 'created',
+          key: sourceName,
+          detail: existing ? `Rewrote source` : `Created source`,
+          targetConnectionId: actionTargetConnectionId(context.session.connectionId, connectionId),
+          ...(rawPaths ? { rawPaths } : {}),
+        });
+      }
+
+      return this.buildOutput(true, [], sourceName, {
+        yaml: yamlContent,
+        commitHash: result.commitHash ?? undefined,
+        validationErrors,
+        validationWarnings,
+        actionRequiredWarnings,
+      });
+    } catch (error) {
+      return this.buildOutput(false, [error instanceof Error ? error.message : String(error)], sourceName);
+    }
+  }
+
+  /**
+   * Reads the stored file content for a source.
+   *
+   * @returns The file content, or `null` when the read fails for any reason (the error is
+   *   not inspected, so a missing source and any other read failure look the same).
+   */
+  private async readSourceYamlFromService(
+    service: SemanticLayerService,
+    connectionId: string,
+    sourceName: string,
+  ): Promise<string | null> {
+    try {
+      const file = await service.readSourceFile(connectionId, sourceName);
+      return file.content;
+    } catch {
+      return null;
+    }
+  }
+
+  /**
+   * Guard against overlays that have no manifest entry to attach to.
+   *
+   * @param content - YAML text of the source about to be written.
+   * @returns `null` when the write may proceed (content is unparsable or not an object, the
+   *   source is standalone, or a manifest entry named `sourceName` exists); otherwise a
+   *   multi-line error message listing up to three similar manifest names.
+   */
+  private async rejectOrphanOverlay(
+    semanticLayerService: SemanticLayerService,
+    connectionId: string,
+    sourceName: string,
+    content: string,
+  ): Promise<string | null> {
+    let doc: Record<string, unknown>;
+    try {
+      doc = YAML.parse(content) as Record<string, unknown>;
+    } catch {
+      return null;
+    }
+    if (!doc || typeof doc !== 'object') {
+      return null;
+    }
+
+    // A truthy `table` or `sql` makes this a standalone source, which this guard ignores.
+    const hasTable = 'table' in doc && Boolean(doc.table);
+    const hasSql = 'sql' in doc && Boolean(doc.sql);
+    if (hasTable || hasSql) {
+      return null;
+    }
+
+    const manifestNames = await semanticLayerService.listManifestSourceNames(connectionId);
+    if (manifestNames.includes(sourceName)) {
+      return null;
+    }
+
+    const suggestions = this.nearestMatches(sourceName, manifestNames, 3);
+    let suggestionLine: string;
+    if (suggestions.length > 0) {
+      suggestionLine = `  Nearest manifest matches: ${suggestions.join(', ')}.`;
+    } else {
+      suggestionLine = `  No manifest entries resemble "${sourceName}".`;
+    }
+
+    const messageLines = [
+      `Error: cannot write "${sourceName}" as an overlay — no manifest entry with that name exists.`,
+      suggestionLine,
+      `To customize an existing base table, retarget the overlay at one of the nearest matches.`,
+      `For a LookML derived_table or any source backed by inline SQL, rewrite as a standalone`,
+      `curated source with a top-level "sql:" block plus explicit "grain:" and "columns:".`,
+    ];
+    return messageLines.join('\n');
+  }
+
+  /**
+   * Guard against standalone sources that would replace a manifest-backed source of the
+   * same name.
+   *
+   * @param content - YAML text of the source about to be written.
+   * @returns `null` when the write may proceed (content is unparsable or not an object, the
+   *   source is an overlay, or `sourceName` is not manifest-backed); otherwise a multi-line
+   *   error message explaining how to rewrite the YAML as an overlay.
+   */
+  private async rejectStandaloneShadow(
+    semanticLayerService: SemanticLayerService,
+    connectionId: string,
+    sourceName: string,
+    content: string,
+  ): Promise<string | null> {
+    let doc: Record<string, unknown>;
+    try {
+      doc = YAML.parse(content) as Record<string, unknown>;
+    } catch {
+      return null;
+    }
+    if (!doc || typeof doc !== 'object') {
+      return null;
+    }
+
+    // Only standalone sources (truthy `table` or `sql`) can shadow a manifest entry.
+    const hasTable = 'table' in doc && Boolean(doc.table);
+    const hasSql = 'sql' in doc && Boolean(doc.sql);
+    const isStandalone = hasTable || hasSql;
+    if (!isStandalone) {
+      return null;
+    }
+
+    const manifestBacked = await semanticLayerService.isManifestBacked(connectionId, sourceName);
+    if (!manifestBacked) {
+      return null;
+    }
+
+    const messageLines = [
+      `Error: cannot write "${sourceName}" as a standalone source — a manifest entry with that name already exists.`,
+      `  Writing standalone would drop the manifest's columns and joins, leaving only what you list here.`,
+      `To add measures/segments on top of the manifest, rewrite this YAML as an overlay:`,
+      `  - Remove "sql:", "table:", "grain:", and base-table "columns:".`,
+      `  - Keep "name:" plus "measures:", "segments:", "descriptions:", "joins:", "disable_joins:",`,
+      `    "exclude_columns:", "column_overrides:", and/or computed-only "columns:" entries with expr + type.`,
+      `  - The manifest's schema is inherited automatically.`,
+      `If you really need a different base table, use a different source name.`,
+    ];
+    return messageLines.join('\n');
+  }
+
+  /**
+   * Ranks `haystack` entries by similarity to `needle` (case-insensitive) and returns the best ones.
+   *
+   * Score = Jaro-Winkler similarity, plus 0.2 when either lower-cased string is a prefix of the
+   * other, plus 0.1 when either contains the other. Only scores above 0.4 are kept.
+   *
+   * @param limit - Upper bound passed to `slice` on the ranked, filtered list.
+   * @returns Candidates in their original casing, best score first.
+   */
+  private nearestMatches(needle: string, haystack: string[], limit: number): string[] {
+    if (haystack.length === 0) {
+      return [];
+    }
+
+    const target = needle.toLowerCase();
+    const ranked: Array<{ candidate: string; score: number }> = [];
+    for (const candidate of haystack) {
+      const name = candidate.toLowerCase();
+      let score = jaroWinkler(target, name);
+      if (name.startsWith(target) || target.startsWith(name)) {
+        score += 0.2;
+      }
+      if (name.includes(target) || target.includes(name)) {
+        score += 0.1;
+      }
+      ranked.push({ candidate, score });
+    }
+
+    ranked.sort((left, right) => right.score - left.score);
+    const aboveThreshold = ranked.filter((entry) => entry.score > 0.4);
+    return aboveThreshold.slice(0, limit).map((entry) => entry.candidate);
+  }
+}
+
+/**
+ * Jaro-Winkler similarity of two strings, compared by UTF-16 code unit.
+ *
+ * @returns 1 for identical strings, 0 when no characters match within the search window,
+ *   otherwise the Jaro score raised by 0.1 per shared leading character (at most 4).
+ */
+function jaroWinkler(a: string, b: string): number {
+  if (a === b) {
+    return 1;
+  }
+
+  // Characters only count as matching when their positions are at most `reach` apart.
+  const longest = Math.max(a.length, b.length);
+  const reach = Math.max(0, Math.floor(longest / 2) - 1);
+  const usedA: boolean[] = Array.from({ length: a.length }, () => false);
+  const usedB: boolean[] = Array.from({ length: b.length }, () => false);
+
+  let matches = 0;
+  for (let i = 0; i < a.length; i += 1) {
+    const upper = Math.min(i + reach + 1, b.length);
+    for (let j = Math.max(0, i - reach); j < upper; j += 1) {
+      if (!usedB[j] && a[i] === b[j]) {
+        usedA[i] = true;
+        usedB[j] = true;
+        matches += 1;
+        break;
+      }
+    }
+  }
+  if (matches === 0) {
+    return 0;
+  }
+
+  // Collect the matched characters of each string in order; every position where the two
+  // sequences disagree counts as one (half-)transposition.
+  const matchedA: string[] = [];
+  for (let i = 0; i < a.length; i += 1) {
+    if (usedA[i]) {
+      matchedA.push(a.charAt(i));
+    }
+  }
+  const matchedB: string[] = [];
+  for (let j = 0; j < b.length; j += 1) {
+    if (usedB[j]) {
+      matchedB.push(b.charAt(j));
+    }
+  }
+  let transpositions = 0;
+  for (let m = 0; m < matches; m += 1) {
+    if (matchedA[m] !== matchedB[m]) {
+      transpositions += 1;
+    }
+  }
+
+  const jaro = (matches / a.length + matches / b.length + (matches - transpositions / 2) / matches) / 3;
+
+  // Winkler adjustment: reward a common prefix of up to four characters.
+  const maxPrefix = Math.min(4, a.length, b.length);
+  let prefix = 0;
+  for (; prefix < maxPrefix; prefix += 1) {
+    if (a[prefix] !== b[prefix]) {
+      break;
+    }
+  }
+  return jaro + prefix * 0.1 * (1 - jaro);
+}