diff --git a/AGENTS.md b/AGENTS.md index 52ddd87d..d5f41bcd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -288,6 +288,16 @@ use `PascalCase` without the suffix. source-code identifier, package/API name, or other literal value that must match the implementation. +### Terminology + +For canonical vocabulary used across docs, code, comments, CLI strings, and +error messages — including the disambiguation rule for the overloaded word +`source` (semantic / primary / context / source of truth) — see +[`docs/terminology.md`](docs/terminology.md). Follow that file when choosing +between near-synonyms (e.g. `connector` vs `adapter`, `data agent` vs +`database agent`, `fast ingest` vs `schema ingest`). Product-name rules in +this section take precedence over anything in that file when they conflict. + ### Updating `docs-site/` After Code Changes Before finishing a task, decide whether `docs-site/content/docs/` needs an diff --git a/docs/terminology.md b/docs/terminology.md new file mode 100644 index 00000000..fab5d290 --- /dev/null +++ b/docs/terminology.md @@ -0,0 +1,95 @@ +# ktx Terminology Rules + +Canonical vocabulary for coding agents working on this repository. Applies to +docs prose, code comments, identifiers, CLI strings, error messages, log lines, +and example output. + +For product-name capitalization rules (`ktx` vs `**ktx**` vs code font), see the +`Product Naming` section of `AGENTS.md` — those rules take precedence over +anything below when they conflict. + +## The "source" rule + +`source` does four different jobs in this codebase. Never write bare `source` +in prose when ambiguity is possible. Always qualify: + +- **semantic source** — the YAML file that describes a table +- **primary source** — the connected database +- **context source** — the analytics-tooling integration (dbt, Metabase, etc.) 
+- **source of truth** — the canonical place a fact lives + +Bare `source` is allowed only inside a section that has already established its +referent (e.g., body of a `Semantic sources` page, or `sourceName` as a CLI arg). + +## Canonical vocabulary + +| Concept | Use | Do not use | +|---|---|---| +| AI consumer (general prose) | **data agent** | analytics agent, database agent, client agent | +| AI consumer (Integrations nav) | **agent client** | client agent | +| Coding-tool framing (user-facing) | **coding agent** | — | +| The connected database | **primary source** / **database connection** | data source | +| Analytics-tooling integration | **context source** / **context-source connection** | BI source, BI model, metadata source, source tool | +| YAML file describing a table | **semantic source** | semantic-layer source, model file, bare "source file" | +| The whole **ktx** surface | **context layer** (lowercase in prose) | "Context Layer" in prose | +| The compiler pillar | **semantic layer** (lowercase in prose) | "Semantic Layer" in prose | +| The query payload | **semantic query** (lowercase in prose) | "Semantic Query" | +| The MCP layer | **MCP server** (the server), **MCP tools** (the functions) | "ktx MCP" as a standalone noun | +| The plugin/implementation | **connector** (prefix with **primary** or **context** when contrasting) | adapter, driver-as-noun | +| Config field value | `driver` (code font only) | `driver` as a generic noun | +| Merge step | **reconcile** / **reconciliation** / **reconciliation agent** | "merge intelligently", bare "LLM agent" | +| Connection ref in prose | **connection id** (lowercase, two words) | "connection ID" | +| CLI arg/flag literal | `connectionId` (code font) | — | +| File path placeholder | `` (code font) | — | +| Fast schema mode | **fast ingest** | schema ingest, schema-only ingest | +| AI-enriched mode | **deep ingest** | AI-enriched ingest | +| Ingest of a primary connection | **database ingest** | — | +| 
Ingest of a context-source connection | **context-source ingest** | bare "source ingest" | +| Wiki capture | **text ingest** | — | +| Query-history sub-mode | **query-history ingest** | — | +| SQL compilation | **compile** / **the compiler** / **SQL compilation** | "SQL generation" | +| Internal stage inside compilation | **planner** / **planning** (only in semantic-layer-internals) | — | +| Setup flow noun | **setup wizard** | "the wizard" (bare) | +| Setup flow contrast | **interactive setup** (vs non-interactive / flag-driven) | "interactive command" | +| The whole project | **ktx project** | "KTX project" (all caps) | +| The filesystem path | **project directory** | "project dir" | +| Wiki surface as a whole | **wiki** | "wiki context" | +| A single Markdown file | **wiki page** | — | +| YAML vs Markdown contrast | **wiki Markdown** (only when contrasting with **semantic source YAML**) | — | +| Joins multiplying rows (generic) | **fan-out** | — | +| The two named patterns | **chasm trap** / **fan trap** | — | +| Casual gloss in user prose | **double-count** | (avoid in technical/internals prose) | + +## Prose rules + +- **Article + ktx.** Treat `ktx` as a bare proper noun, no article: `ktx + is...`, `in ktx`. Articles attach to the *following* noun, not to `ktx`: + `the **ktx** MCP server`, `the **ktx** project`. +- **Capitalization.** Default lowercase for *context layer*, *semantic layer*, + *semantic query*. Title case only inside literal page titles or H1 headings. +- **Code font.** Reserve code font for the CLI command, binary, paths, config + field values (e.g. `driver: postgres`), CLI arg/flag literals + (`connectionId`, `--project-dir`), and path placeholders (``). + Do not use code font for prose nouns like *connector* or *reconciliation*. +- **`driver` is never a prose noun.** Always `driver: postgres` (code font, as + a config field value). For the noun, use *connector*.
+ +## Canonical lists + +Use these orderings verbatim when listing supported systems: + +- **Primary sources:** PostgreSQL, Snowflake, BigQuery, ClickHouse, MySQL, SQL + Server, SQLite +- **Context sources:** dbt, MetricFlow, LookML, Looker, Metabase, Notion + +If a doc or string omits or reorders members of either list, treat that as a +bug unless the surrounding text justifies the change. + +## When updating this file + +- Add a new row to the canonical vocabulary table; do not introduce a parallel + glossary elsewhere. +- If you rename a converged term, search the workspace for the previous form + and update call sites in the same change. +- When deprecating a term, add it to the *Do not use* column with a one-line + reason in the surrounding prose, not just in the table. diff --git a/packages/cli/src/context/ingest/local-bundle-runtime.test.ts b/packages/cli/src/context/ingest/local-bundle-runtime.test.ts index db0e5e11..3c87c351 100644 --- a/packages/cli/src/context/ingest/local-bundle-runtime.test.ts +++ b/packages/cli/src/context/ingest/local-bundle-runtime.test.ts @@ -111,6 +111,26 @@ describe('createLocalBundleIngestRuntime', () => { ); }); + it('warns when embeddings are configured but no embedding provider is supplied', () => { + const logger = { log: vi.fn(), warn: vi.fn(), error: vi.fn() }; + project.config.ingest.embeddings = { + backend: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + }; + + createLocalBundleIngestRuntime({ + project, + adapters: [new FakeSourceAdapter()], + agentRunner: testAgentRunner(), + logger: logger as never, + }); + + expect(logger.warn).toHaveBeenCalledWith( + '[local-bundle-runtime] embeddings backend "openai" is configured but no embedding provider was passed; embedding-dependent stages will run against a no-op embedding port.', + ); + }); + it('builds runner deps with local SQLite stores and context tools enabled', async () => { const agentRunner = testAgentRunner(); diff --git 
a/packages/cli/src/context/ingest/local-bundle-runtime.ts b/packages/cli/src/context/ingest/local-bundle-runtime.ts index 73e152cb..8b87d7be 100644 --- a/packages/cli/src/context/ingest/local-bundle-runtime.ts +++ b/packages/cli/src/context/ingest/local-bundle-runtime.ts @@ -653,6 +653,15 @@ export function createLocalBundleIngestRuntime( const store = new SqliteBundleIngestStore({ dbPath }); const contextStore = new SqliteContextEvidenceStore({ dbPath }); const embeddingProvider = options.embeddingProvider ?? null; + if (!embeddingProvider && options.project.config.ingest.embeddings.backend !== 'none') { + // Embedding-dependent stages (CandidateDedup clustering, ContextEvidenceIndex + // chunk indexing) silently produce zero-vector data with NoopEmbeddingPort. + // Surface that fact so the caller knows ingest will not be running its + // configured backend. + logger.warn( + `[local-bundle-runtime] embeddings backend "${options.project.config.ingest.embeddings.backend}" is configured but no embedding provider was passed; embedding-dependent stages will run against a no-op embedding port.`, + ); + } const embedding = embeddingProvider ? 
new KtxIngestEmbeddingPortAdapter(embeddingProvider) : new NoopEmbeddingPort(); const connections = new LocalConnectionCatalog(options.project, options.queryExecutor); const rootFileStore = options.project.fileStore; diff --git a/packages/cli/src/context/memory/local-memory.test.ts b/packages/cli/src/context/memory/local-memory.test.ts index 69d8e1f2..1a7240c9 100644 --- a/packages/cli/src/context/memory/local-memory.test.ts +++ b/packages/cli/src/context/memory/local-memory.test.ts @@ -88,6 +88,25 @@ describe('createLocalProjectMemoryIngest', () => { await rm(tempDir, { recursive: true, force: true }); }); + it('warns when embeddings are configured but memory ingest is created without an embedding provider', async () => { + const project = await initKtxProject({ projectDir: tempDir }); + project.config.ingest.embeddings = { + backend: 'openai', + model: 'text-embedding-3-small', + dimensions: 1536, + }; + const logger = { log: vi.fn(), warn: vi.fn(), error: vi.fn() }; + + createLocalProjectMemoryIngest(project, { + agentRunner: { runLoop: vi.fn() } as never, + logger: logger as never, + }); + + expect(logger.warn).toHaveBeenCalledWith( + '[memory-ingest] embeddings backend "openai" is configured but no embedding provider was passed; semantic search will fall back to a no-op embedding port.', + ); + }); + it('captures a wiki page through the local memory agent and persists pollable status', async () => { const project = await initKtxProject({ projectDir: tempDir }); const agentRunner = { diff --git a/packages/cli/src/context/memory/local-memory.ts b/packages/cli/src/context/memory/local-memory.ts index 121696e6..b72bc9ce 100644 --- a/packages/cli/src/context/memory/local-memory.ts +++ b/packages/cli/src/context/memory/local-memory.ts @@ -7,8 +7,10 @@ import type { KtxFileStorePort, KtxFileWriteResult } from '../../context/core/fi import { type KtxLogger, noopLogger } from '../../context/core/config.js'; import { SessionWorktreeService } from 
'../../context/core/session-worktree.service.js'; import type { KtxSemanticLayerComputePort } from '../../context/daemon/semantic-layer-compute.js'; +import { KtxIngestEmbeddingPortAdapter } from '../../context/llm/embedding-port.js'; import { createLocalKtxLlmRuntimeFromConfig } from '../../context/llm/local-config.js'; import { RuntimeAgentRunner, type AgentRunnerPort, type KtxLlmRuntimePort, type KtxRuntimeToolSet } from '../../context/llm/runtime-port.js'; +import type { KtxEmbeddingProvider } from '../../llm/types.js'; import type { KtxLocalProject } from '../../context/project/project.js'; import { PromptService } from '../../context/prompts/prompt.service.js'; import { SkillsRegistryService } from '../../context/skills/skills-registry.service.js'; @@ -61,6 +63,7 @@ export interface CreateLocalProjectMemoryIngestOptions { queryExecutor?: { execute(input: { connectionId: string; sql: string; maxRows?: number }): Promise }; runIdFactory?: () => string; logger?: KtxLogger; + embeddingProvider?: KtxEmbeddingProvider | null; } export function createLocalProjectMemoryIngest( @@ -69,7 +72,18 @@ export function createLocalProjectMemoryIngest( ): MemoryIngestService { const logger = options.logger ?? noopLogger; const rootFileStore = new LocalMemoryFileStore(project.fileStore); - const embedding = new NoopEmbeddingPort(); + const embedding = options.embeddingProvider + ? new KtxIngestEmbeddingPortAdapter(options.embeddingProvider) + : new NoopEmbeddingPort(); + if (!options.embeddingProvider && project.config.ingest.embeddings.backend !== 'none') { + // Memory-agent search (SlSearch, wiki) embeds against this port. With Noop the + // configured backend is silently inert — the agent will see empty vectors and + // rank results against zeros. Surface that so the caller knows to plumb the + // resolved embedding provider through. 
+ logger.warn( + `[memory-ingest] embeddings backend "${project.config.ingest.embeddings.backend}" is configured but no embedding provider was passed; semantic search will fall back to a no-op embedding port.`, + ); + } const knowledgeIndex = new LocalKnowledgeIndex(project); const knowledgeEvents = new NoopKnowledgeEventPort(); const knowledgeSlRefs = new NoopKnowledgeSlRefsPort(); diff --git a/packages/cli/src/context/project/config.test.ts b/packages/cli/src/context/project/config.test.ts index f8faad58..3b7f2feb 100644 --- a/packages/cli/src/context/project/config.test.ts +++ b/packages/cli/src/context/project/config.test.ts @@ -176,6 +176,28 @@ llm: }); }); + it('requires a non-empty Vertex location when the Vertex provider block is present', () => { + const yaml = ` +llm: + provider: + backend: vertex + vertex: + project: local-gcp-project +`; + + expect(() => parseKtxProjectConfig(yaml)).toThrow(/llm\.provider\.vertex\.location/); + + const validation = validateKtxProjectConfig(yaml); + expect(validation.ok).toBe(false); + expect(validation.issues).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + path: 'llm.provider.vertex.location', + }), + ]), + ); + }); + it('parses Claude Code as a first-class LLM backend', () => { const config = parseKtxProjectConfig(` llm: diff --git a/packages/cli/src/context/project/config.ts b/packages/cli/src/context/project/config.ts index e19ee6af..2824ca59 100644 --- a/packages/cli/src/context/project/config.ts +++ b/packages/cli/src/context/project/config.ts @@ -30,13 +30,13 @@ const apiCredentialsSchema = z const vertexProviderSchema = z .strictObject({ project: z.string().min(1).optional().describe('Google Cloud project ID hosting the Vertex AI endpoint.'), - location: z.string().default('').describe('Vertex AI region (e.g. "us-east5"). Empty string falls back to the SDK default.'), + location: z.string().min(1).describe('Vertex AI region (e.g. "us-east5"). 
Required whenever the vertex provider block is present.'), }) .describe('Google Vertex AI provider configuration.'); const sentenceTransformersSchema = z .strictObject({ - base_url: z.string().default('').describe('Base URL of the sentence-transformers HTTP server. Leave empty (or omit) to use the project-managed local daemon.'), + base_url: z.string().default('').describe('Base URL of the sentence-transformers HTTP server. Leave empty (or omit) when the `ktx` CLI is expected to start and manage a local daemon for this project; programmatic consumers must populate it explicitly.'), pathPrefix: z.string().optional().describe('Optional URL path prefix prepended to embedding requests.'), }) .describe('Sentence-transformers embedding server configuration.'); @@ -83,7 +83,15 @@ const embeddingSchema = z .default('none') .describe('Embedding backend. "openai" and "sentence-transformers" call out to those providers; "none" disables embeddings.'), model: z.string().min(1).optional().describe('Provider-specific embedding model identifier (e.g. "text-embedding-3-small").'), - dimensions: z.int().positive().default(8).describe('Embedding vector dimensionality. Must match the chosen model when using a real provider.'), + dimensions: z + .int() + .positive() + .default(8) + .describe( + 'Embedding vector dimensionality. The default value 8 is a placeholder that is only valid alongside backend: none; ' + + 'before switching backend to openai/sentence-transformers, set this explicitly to match the chosen model ' + + '(e.g. 384 for all-MiniLM-L6-v2, 1536 for text-embedding-3-small).', + ), openai: apiCredentialsSchema.optional().describe('OpenAI credentials, used when backend is "openai".'), sentenceTransformers: sentenceTransformersSchema.optional().describe('Sentence-transformers server config, used when backend is "sentence-transformers".'), batchSize: z.int().positive().optional().describe('Number of texts per embedding API call. 
Omit to use the backend default.'), diff --git a/packages/cli/src/context/sl/semantic-layer.service.test.ts b/packages/cli/src/context/sl/semantic-layer.service.test.ts index 6ac3460a..0844a3c5 100644 --- a/packages/cli/src/context/sl/semantic-layer.service.test.ts +++ b/packages/cli/src/context/sl/semantic-layer.service.test.ts @@ -67,6 +67,23 @@ describe('listConnectionIdsWithNames', () => { }); }); +describe('loadSource', () => { + it('warns and returns null when an existing source file has invalid YAML', async () => { + const logger = { log: vi.fn(), warn: vi.fn(), error: vi.fn() }; + const configService = { + readFile: vi.fn().mockResolvedValue({ content: 'name: [' }), + }; + const service = new SemanticLayerService(configService as never, connectionCatalog(), pythonPort, logger as never); + + await expect(service.loadSource('warehouse', 'orders')).resolves.toBeNull(); + + expect(configService.readFile).toHaveBeenCalledWith('semantic-layer/warehouse/orders.yaml'); + expect(logger.warn).toHaveBeenCalledWith( + expect.stringContaining('[loadSource] warehouse/orders.yaml: YAML parse failed:'), + ); + }); +}); + describe('composeOverlay', () => { it('carries top-level segments from overlay into the composed source', () => { const overlay = { @@ -856,6 +873,22 @@ describe('loadAllSources — standalone enrichment via inherits_columns_from', ( expect(loadErrors.join('\n')).toContain(overlayPath); expect(loadErrors.join('\n')).toContain("move it to 'column_overrides:'"); }); + + it('reports and logs directory listing failures instead of treating them as empty sources', async () => { + const logger = { log: vi.fn(), warn: vi.fn(), error: vi.fn() }; + configService.listFiles.mockRejectedValue(new Error('permission denied')); + service = new SemanticLayerService(configService as never, connectionCatalog(), pythonPort, logger as never); + + const { sources, loadErrors } = await service.loadAllSources('conn-1'); + + expect(sources).toEqual([]); + 
expect(loadErrors).toEqual([ + 'Failed to list semantic-layer files under semantic-layer/conn-1: permission denied', + ]); + expect(logger.warn).toHaveBeenCalledWith( + 'Failed to list semantic-layer files under semantic-layer/conn-1: permission denied', + ); + }); }); describe('validateWithProposedSource', () => { diff --git a/packages/cli/src/context/sl/semantic-layer.service.ts b/packages/cli/src/context/sl/semantic-layer.service.ts index 7a780af2..e6afdeaf 100644 --- a/packages/cli/src/context/sl/semantic-layer.service.ts +++ b/packages/cli/src/context/sl/semantic-layer.service.ts @@ -202,12 +202,25 @@ export class SemanticLayerService { } async loadSource(connectionId: string, sourceName: string): Promise { + let content: string; try { - const { content } = await this.readSourceFile(connectionId, sourceName); - return YAML.parse(content) as SemanticLayerSource; + const result = await this.readSourceFile(connectionId, sourceName); + content = result.content; } catch { return null; } + try { + return YAML.parse(content) as SemanticLayerSource; + } catch (error) { + // Distinguish a YAML parse failure from a missing file. The file exists but + // its contents are unparseable — callers that treat null as "does not exist" + // could otherwise overwrite the broken file. Surface the parse failure via + // the service logger so the broken source is at least visible. + this.logger.warn( + `[loadSource] ${connectionId}/${sourceName}.yaml: YAML parse failed: ${error instanceof Error ? error.message : String(error)}`, + ); + return null; + } } async loadAllSources(connectionId: string): Promise { @@ -219,7 +232,10 @@ export class SemanticLayerService { try { const result = await this.configService.listFiles(dir); allFiles = result.files.filter((f) => f.endsWith('.yaml')); - } catch { + } catch (e) { + const message = `Failed to list semantic-layer files under ${dir}: ${e instanceof Error ? 
e.message : String(e)}`; + loadErrors.push(message); + this.logger.warn(message); return { sources: [], loadErrors }; } diff --git a/packages/cli/src/context/wiki/knowledge-wiki.service.test.ts b/packages/cli/src/context/wiki/knowledge-wiki.service.test.ts index d9242f97..88bd92ab 100644 --- a/packages/cli/src/context/wiki/knowledge-wiki.service.test.ts +++ b/packages/cli/src/context/wiki/knowledge-wiki.service.test.ts @@ -50,6 +50,27 @@ function makeService() { const fm: WikiFrontmatter = { summary: 'sum', usage_mode: 'auto' }; +describe('KnowledgeWikiService file reads', () => { + it('warns and returns null when an existing page cannot be parsed', async () => { + const { service, configService, logger } = makeService(); + configService.readFile.mockResolvedValue({ content: '---\nsummary: [\n---\nBody' }); + + await expect(service.readPage('GLOBAL', null, 'revenue')).resolves.toBeNull(); + + expect(configService.readFile).toHaveBeenCalledWith('wiki/global/revenue.md'); + expect(logger.warn).toHaveBeenCalledWith(expect.stringContaining('[readPage] wiki/global/revenue.md: parse failed:')); + }); + + it('warns and returns an empty page list when directory listing fails', async () => { + const { service, configService, logger } = makeService(); + configService.listFiles.mockRejectedValue(new Error('filesystem unavailable')); + + await expect(service.listPageKeys('GLOBAL', null)).resolves.toEqual([]); + + expect(logger.warn).toHaveBeenCalledWith('[listPageKeys] wiki/global: filesystem unavailable'); + }); +}); + describe('KnowledgeWikiService.syncIndex result stats', () => { it('reports scanned, updated, deleted, and embedding counts', async () => { const { service, pagesRepository, embeddingService, configService } = makeService(); diff --git a/packages/cli/src/context/wiki/knowledge-wiki.service.ts b/packages/cli/src/context/wiki/knowledge-wiki.service.ts index 1383e4d4..e87756c3 100644 --- a/packages/cli/src/context/wiki/knowledge-wiki.service.ts +++ 
b/packages/cli/src/context/wiki/knowledge-wiki.service.ts @@ -98,13 +98,25 @@ export class KnowledgeWikiService { async readPage(scope: string, scopeId: string | null | undefined, pageKey: string): Promise { const path = this.pagePath(scope, scopeId, pageKey); + let raw: string; try { const result = await this.configService.readFile(path); - const { frontmatter, content } = this.parsePage(result.content); - return { pageKey, frontmatter, content }; + raw = result.content; } catch { return null; } + try { + const { frontmatter, content } = this.parsePage(raw); + return { pageKey, frontmatter, content }; + } catch (error) { + // The file exists but parsing failed. Returning null without surfacing the + // parse error would let callers (and the memory agent) treat it as "page + // doesn't exist" and clobber the broken page on the next write. + this.logger.warn( + `[readPage] ${path}: parse failed: ${error instanceof Error ? error.message : String(error)}`, + ); + return null; + } } async deletePage( @@ -133,19 +145,23 @@ export class KnowledgeWikiService { async listPageKeys(scope: string, scopeId?: string | null): Promise { const dir = this.scopeDir(scope, scopeId); + let files: string[]; try { const result = await this.configService.listFiles(dir); - return result.files - .filter((f) => f.endsWith('.md')) - .map((f) => { - // Strip the directory prefix and .md extension - const name = f.replace(`${dir}/`, '').replace(/\.md$/, ''); - return name; - }) - .filter(isFlatWikiKey); - } catch { + files = result.files; + } catch (error) { + // listFiles returns [] for missing directories; reaching this catch means + // an IO-level failure that should at least be surfaced before we report + // "no pages" the same as a freshly-initialised store would. + this.logger.warn( + `[listPageKeys] ${dir}: ${error instanceof Error ? 
error.message : String(error)}`, + ); return []; } + return files + .filter((f) => f.endsWith('.md')) + .map((f) => f.replace(`${dir}/`, '').replace(/\.md$/, '')) + .filter(isFlatWikiKey); } async getPageHistory(scope: string, scopeId: string | null | undefined, pageKey: string) { diff --git a/packages/cli/src/mcp-server-factory.test.ts b/packages/cli/src/mcp-server-factory.test.ts new file mode 100644 index 00000000..64c7275d --- /dev/null +++ b/packages/cli/src/mcp-server-factory.test.ts @@ -0,0 +1,197 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { createDefaultKtxMcpServer } from './context/mcp/server.js'; +import { createLocalProjectMcpContextPorts } from './context/mcp/local-project-ports.js'; +import { createLocalProjectMemoryIngest } from './context/memory/local-memory.js'; +import { resolveProjectEmbeddingProvider } from './embedding-resolution.js'; +import { createKtxCliScanConnector } from './local-scan-connectors.js'; +import { createKtxMcpServerFactory } from './mcp-server-factory.js'; + +type FakeEmbeddingProvider = { + maxBatchSize: number; + embed(text: string): Promise; + embedMany(texts: string[]): Promise; +}; + +const mocks = vi.hoisted(() => ({ + queryExecutor: { execute: vi.fn() }, + semanticLayerCompute: { validateSources: vi.fn(), generateSources: vi.fn(), query: vi.fn() }, + sqlAnalysis: { analyzeForFingerprint: vi.fn(), analyzeBatch: vi.fn(), validateReadOnly: vi.fn() }, + memoryIngest: { ingest: vi.fn(), status: vi.fn(), waitForRun: vi.fn() }, +})); + +vi.mock('./context/llm/embedding-port.js', () => ({ + KtxIngestEmbeddingPortAdapter: class { + readonly maxBatchSize: number; + + constructor(private readonly provider: FakeEmbeddingProvider) { + this.maxBatchSize = provider.maxBatchSize; + } + + computeEmbedding(text: string): Promise { + return this.provider.embed(text); + } + + computeEmbeddingsBulk(texts: string[]): Promise { + return this.provider.embedMany(texts); + } + }, +})); + 
+vi.mock('./context/mcp/server.js', () => ({ + createDefaultKtxMcpServer: vi.fn(() => ({ kind: 'mcp-server' })), +})); + +vi.mock('./context/mcp/local-project-ports.js', () => ({ + createLocalProjectMcpContextPorts: vi.fn(() => ({ context_tool: { name: 'context_tool' } })), +})); + +vi.mock('./context/memory/local-memory.js', () => ({ + createLocalProjectMemoryIngest: vi.fn(() => mocks.memoryIngest), +})); + +vi.mock('./embedding-resolution.js', () => ({ + resolveProjectEmbeddingProvider: vi.fn(), +})); + +vi.mock('./ingest-query-executor.js', () => ({ + createKtxCliIngestQueryExecutor: vi.fn(() => mocks.queryExecutor), +})); + +vi.mock('./local-scan-connectors.js', () => ({ + createKtxCliScanConnector: vi.fn(() => ({ source: 'fake-scan-connector' })), +})); + +vi.mock('./managed-python-command.js', () => ({ + createManagedPythonSemanticLayerComputePort: vi.fn(async () => mocks.semanticLayerCompute), +})); + +vi.mock('./managed-python-http.js', () => ({ + createManagedDaemonSqlAnalysisPort: vi.fn(() => mocks.sqlAnalysis), +})); + +const project = { + projectDir: '/work/project', + configPath: '/work/project/ktx.yaml', + config: {}, + coreConfig: {}, + git: {}, + fileStore: {}, +}; + +const io = { + stdout: { write: vi.fn() }, + stderr: { write: vi.fn() }, +}; + +describe('createKtxMcpServerFactory', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('passes a resolved embedding provider to MCP context ports and memory ingest', async () => { + const provider = { + maxBatchSize: 4, + embed: vi.fn(async () => [0.2, 0.4]), + embedMany: vi.fn(async () => [[0.2, 0.4]]), + }; + vi.mocked(resolveProjectEmbeddingProvider).mockResolvedValue({ kind: 'configured', provider } as never); + + const factory = await createKtxMcpServerFactory({ + project: project as never, + projectDir: project.projectDir, + cliVersion: '0.5.0', + io, + }); + + const contextOptions = vi.mocked(createLocalProjectMcpContextPorts).mock.calls[0][1] as { + embeddingService: { + 
computeEmbedding(text: string): Promise; + computeEmbeddingsBulk(texts: string[]): Promise; + }; + queryExecutor: unknown; + semanticLayerCompute: unknown; + sqlAnalysis: unknown; + localScan: { + createConnector(connectionId: string): Promise; + }; + }; + await expect(contextOptions.embeddingService.computeEmbedding('gross revenue')).resolves.toEqual([0.2, 0.4]); + await expect(contextOptions.embeddingService.computeEmbeddingsBulk(['gross revenue'])).resolves.toEqual([[0.2, 0.4]]); + await expect(contextOptions.localScan.createConnector('warehouse')).resolves.toEqual({ + source: 'fake-scan-connector', + }); + + expect(provider.embed).toHaveBeenCalledWith('gross revenue'); + expect(provider.embedMany).toHaveBeenCalledWith(['gross revenue']); + expect(createKtxCliScanConnector).toHaveBeenCalledWith(project, 'warehouse'); + expect(contextOptions).toMatchObject({ + queryExecutor: mocks.queryExecutor, + semanticLayerCompute: mocks.semanticLayerCompute, + sqlAnalysis: mocks.sqlAnalysis, + }); + expect(createLocalProjectMemoryIngest).toHaveBeenCalledWith( + project, + expect.objectContaining({ + embeddingProvider: provider, + queryExecutor: mocks.queryExecutor, + semanticLayerCompute: mocks.semanticLayerCompute, + }), + ); + + expect(factory()).toEqual({ kind: 'mcp-server' }); + expect(createDefaultKtxMcpServer).toHaveBeenCalledWith( + expect.objectContaining({ + contextTools: expect.objectContaining({ + context_tool: { name: 'context_tool' }, + memoryIngest: mocks.memoryIngest, + }), + }), + ); + }); + + it('uses null embedding ports when no configured provider is available', async () => { + vi.mocked(resolveProjectEmbeddingProvider).mockResolvedValue({ kind: 'managed-unavailable' } as never); + + await createKtxMcpServerFactory({ + project: project as never, + projectDir: project.projectDir, + cliVersion: '0.5.0', + io, + }); + + expect(vi.mocked(createLocalProjectMcpContextPorts).mock.calls[0][1]).toMatchObject({ + embeddingService: null, + }); + 
expect(createLocalProjectMemoryIngest).toHaveBeenCalledWith( + project, + expect.objectContaining({ + embeddingProvider: null, + }), + ); + }); + + it('omits memory ingest and logs when memory ingest construction fails', async () => { + vi.mocked(resolveProjectEmbeddingProvider).mockResolvedValue({ kind: 'disabled' } as never); + vi.mocked(createLocalProjectMemoryIngest).mockImplementationOnce(() => { + throw new Error('missing local memory prerequisites'); + }); + + const factory = await createKtxMcpServerFactory({ + project: project as never, + projectDir: project.projectDir, + cliVersion: '0.5.0', + io, + }); + + factory(); + + expect(io.stderr.write).toHaveBeenCalledWith( + 'KTX MCP memory_ingest disabled: missing local memory prerequisites\n', + ); + expect(createDefaultKtxMcpServer).toHaveBeenCalledWith( + expect.objectContaining({ + contextTools: { context_tool: { name: 'context_tool' } }, + }), + ); + }); +}); diff --git a/packages/cli/src/mcp-server-factory.ts b/packages/cli/src/mcp-server-factory.ts index 2967fb8c..e6d4887f 100644 --- a/packages/cli/src/mcp-server-factory.ts +++ b/packages/cli/src/mcp-server-factory.ts @@ -42,10 +42,11 @@ export async function createKtxMcpServerFactory(input: { cliVersion: input.cliVersion, io, }); - const embeddingService = + const embeddingProvider = resolution.kind === 'configured' || resolution.kind === 'managed-running' || resolution.kind === 'managed-started' - ? new KtxIngestEmbeddingPortAdapter(resolution.provider) + ? resolution.provider : null; + const embeddingService = embeddingProvider ? 
new KtxIngestEmbeddingPortAdapter(embeddingProvider) : null; const contextTools = createLocalProjectMcpContextPorts(input.project, { semanticLayerCompute, queryExecutor, @@ -58,7 +59,11 @@ export async function createKtxMcpServerFactory(input: { let memoryIngest: ReturnType | undefined; try { - memoryIngest = createLocalProjectMemoryIngest(input.project, { semanticLayerCompute, queryExecutor }); + memoryIngest = createLocalProjectMemoryIngest(input.project, { + semanticLayerCompute, + queryExecutor, + embeddingProvider, + }); } catch (error) { io.stderr.write(`KTX MCP memory_ingest disabled: ${error instanceof Error ? error.message : String(error)}\n`); }