diff --git a/docs-site/content/docs/getting-started/meta.json b/docs-site/content/docs/getting-started/meta.json index c40e42c6..a025cb2f 100644 --- a/docs-site/content/docs/getting-started/meta.json +++ b/docs-site/content/docs/getting-started/meta.json @@ -1,5 +1,5 @@ { "title": "Getting Started", "defaultOpen": true, - "pages": ["introduction", "quickstart"] + "pages": ["introduction", "quickstart", "troubleshooting-linux"] } diff --git a/docs-site/content/docs/getting-started/quickstart.mdx b/docs-site/content/docs/getting-started/quickstart.mdx index 0118522c..b3a2b03f 100644 --- a/docs-site/content/docs/getting-started/quickstart.mdx +++ b/docs-site/content/docs/getting-started/quickstart.mdx @@ -296,7 +296,7 @@ surface. | Anthropic health check fails | API key, model id, or access is invalid | Fix `ANTHROPIC_API_KEY` or rerun setup with a different key or model | | Vertex AI health check fails | Vertex API, Claude access, project, location, or IAM permissions are missing | Check the project, location, Application Default Credentials, and Vertex AI permissions | | OpenAI embeddings fail | `OPENAI_API_KEY` is missing or invalid | Export the key or choose local sentence-transformers embeddings | -| Local embeddings fail | Managed Python runtime cannot install or start | Run `ktx dev runtime status`, then install the local embeddings runtime | +| Local embeddings fail | Managed Python runtime cannot install or start | See [Troubleshooting clean Linux install](/docs/getting-started/troubleshooting-linux) — usually missing Python 3.13 or an IPv6 proxy env var | | Database test fails | Credentials, network access, database, warehouse, or schema is wrong | Test the same values with the database's native client, then rerun setup | | Context is not built | Setup saved configuration but skipped or interrupted the build | Run `ktx setup` or `ktx ingest --all` | | Agent integration is incomplete | Setup skipped the agents step or installed a different target | Run `ktx 
setup --agents --target <target>` | diff --git a/docs-site/content/docs/getting-started/troubleshooting-linux.mdx b/docs-site/content/docs/getting-started/troubleshooting-linux.mdx new file mode 100644 index 00000000..55dd6048 --- /dev/null +++ b/docs-site/content/docs/getting-started/troubleshooting-linux.mdx @@ -0,0 +1,163 @@ +--- +title: Troubleshooting clean Linux install +description: Known gotchas when installing KTX from scratch on a clean Linux host (Ubuntu, Debian, container images). Read this before debugging managed-runtime or daemon failures. +--- + +This page documents the friction a coding agent (or human) will hit when running `npm install -g @kaelio/ktx@next` on a clean Linux host with no Python ≥ 3.13 installed, and during the first `ktx setup` on that host. Each item lists the symptom, the cause, and the exact recovery command. + +## Prerequisites that aren't always satisfied + +KTX needs: + +| Tool | Minimum version | Why | +|------|-----------------|-----| +| Node.js | 22 | Runs the CLI | +| `uv` | 0.5+ | Manages the local Python runtime (semantic-layer daemon, local embeddings) | +| Python | 3.13 | KTX's managed Python runtime targets `>=3.13`. The system Python on Ubuntu 24.04 is 3.12. | + +If `uv` is not on `PATH`, install it: + +```bash +curl -LsSf https://astral.sh/uv/install.sh | sh +source $HOME/.local/bin/env # or: export PATH="$HOME/.local/bin:$PATH" +``` + +Install Python 3.13 via `uv` so it sits alongside whatever the system ships: + +```bash +uv python install 3.13 +``` + +You do not need to make 3.13 the system default. KTX's runtime installer will pick it up when you set `UV_PYTHON=3.13` for the install command (see below). + +## Symptom: `ktx dev runtime install` fails on the venv step + +The install log (`~/.ktx/runtime/<version>/install.log`) shows something like: + +``` +$ uv venv /home/runner/.ktx/runtime/<version>/.venv +Using CPython 3.12.3 interpreter at: /usr/bin/python3 +... 
+Package requires Python >=3.13 but the running Python is 3.12.3 +``` + +**Cause:** `uv venv` picked the system Python (3.12) when it built the runtime virtualenv. KTX's wheels declare `requires-python = ">=3.13"`, so the subsequent install fails. + +**Fix:** install Python 3.13 (above), then force the runtime installer to use it: + +```bash +uv python install 3.13 +UV_PYTHON=3.13 ktx dev runtime install --feature local-embeddings --yes --force +``` + +The `--force` flag rebuilds the venv. Without it, the failed venv from the previous attempt is reused. + +## Symptom: managed Python daemon crashes immediately with `URL parse error` + +The daemon stderr (`<project-dir>/.ktx/runtime/daemon.stderr.log`) contains an httpx traceback ending in something like: + +``` +File ".../httpx/_client.py", line 698, in __init__ + URLPattern(key): None +File ".../httpx/_urls.py", line ..., in __init__ + raise InvalidURL(...) +``` + +**Cause:** an environment variable holds a value httpx cannot parse — typically `NO_PROXY` or `no_proxy` containing an **IPv6 CIDR** such as `fd07:b51a:cc66:f0::/64`. OrbStack and some Docker network setups inject this by default. httpx interprets every comma-separated entry as a URL pattern and rejects raw IPv6 CIDRs. + +**Fix:** scrub the bad entries before starting the daemon. The simplest workaround is to unset proxy vars entirely for daemon-related commands: + +```bash +unset HTTP_PROXY HTTPS_PROXY NO_PROXY http_proxy https_proxy no_proxy +ktx dev runtime start --feature local-embeddings +``` + +If you need proxy entries to remain set for outbound HTTP, keep only the IPv4 + hostname entries: + +```bash +export NO_PROXY="localhost,127.0.0.1,*.orb.internal,*.orb.local" +``` + +The KTX CLI now drops IPv6 entries such as the CIDR above from `NO_PROXY`/`no_proxy` before it spawns the managed daemon, so these workarounds are only needed with older CLI versions or when the daemon is launched outside the CLI. A fix in the daemon itself is also tracked upstream: it should sanitize unparseable entries before constructing httpx clients. + +## Symptom: `ktx setup` keeps connecting to an old daemon port + +Running `ktx setup` more than once can leave orphan `ktx-daemon` processes. 
Each `setup` invocation may spawn a fresh daemon on a new port and write a new `daemon.json`, while the old one keeps running. Subsequent setup attempts may pick the stale port and fail with a connection-refused error or a `500` health check. + +**Fix:** stop all daemons and remove the state files before re-running setup: + +```bash +pkill -9 -f ktx-daemon || true +rm -f ~/.ktx/runtime/*/daemon.json +rm -f /path/to/project/.ktx/runtime/daemon.json +``` + +Then start the daemon explicitly **before** re-running setup so `setup` reuses it: + +```bash +unset HTTP_PROXY HTTPS_PROXY NO_PROXY http_proxy https_proxy no_proxy +ktx dev runtime start --feature local-embeddings +``` + +## Symptom: `ktx status --json` reports a connection as failed but `ktx connection test <connection-id>` passes + +`ktx status` may cache a failure record from a prior bad run (for example, when the daemon was crashing). A successful `ktx connection test` does not always invalidate the cache. + +**Fix:** re-run a fast ingest, which writes a fresh status record: + +```bash +ktx ingest --fast +ktx status --json +``` + +## A minimal "clean Linux install" recipe + +If you only want one working sequence, this one works from a fresh Ubuntu 24.04 container with Node 22 and Claude Code installed: + +```bash +# 1. Prerequisites +curl -LsSf https://astral.sh/uv/install.sh | sh +source $HOME/.local/bin/env +uv python install 3.13 +npm install -g @kaelio/ktx@next + +# 2. Pre-warm the managed Python runtime with the right Python +UV_PYTHON=3.13 ktx dev runtime install --feature local-embeddings --yes --force + +# 3. Start the daemon with a clean proxy env +unset HTTP_PROXY HTTPS_PROXY NO_PROXY http_proxy https_proxy no_proxy +ktx dev runtime start --feature local-embeddings + +# 4. Scripted setup (replace DATABASE_URL with your warehouse) +mkdir -p /work/project +cd /work/project +export ANTHROPIC_API_KEY=... # already in env from your Claude Code session +export DATABASE_URL=postgresql://... 
+ +ktx setup \ + --no-input \ + --yes \ + --project-dir /work/project \ + --llm-backend anthropic \ + --anthropic-api-key-env ANTHROPIC_API_KEY \ + --anthropic-model claude-sonnet-4-6 \ + --embedding-backend sentence-transformers \ + --database postgres \ + --new-database-connection-id warehouse \ + --database-url env:DATABASE_URL \ + --skip-sources \ + --skip-agents + +# 5. Build schema context +ktx ingest warehouse --fast + +# 6. Verify +ktx status --json +ktx connection test warehouse +``` + +Success looks like: + +- `ktx status --json` reports `"verdict": "ready"` +- `ktx connection test warehouse` exits 0 with `Status: ok` +- `semantic-layer/warehouse/_schema/` contains generated YAML files diff --git a/docs-site/lib/llm-docs.ts b/docs-site/lib/llm-docs.ts index 561f73e0..9eed23b0 100644 --- a/docs-site/lib/llm-docs.ts +++ b/docs-site/lib/llm-docs.ts @@ -1,7 +1,7 @@ import { source } from "@/lib/source"; import { readDocsPageMarkdown } from "@/lib/docs-markdown"; -const siteOrigin = "https://docs.kaelio.com/ktx"; +const siteOrigin = process.env.KTX_DOCS_ORIGIN ?? 
"https://docs.kaelio.com/ktx"; export type LlmDocsPage = { title: string; @@ -61,6 +61,7 @@ ${link("/docs/ai-resources/agent-instructions", "Agent Instructions", "Suggested ${link("/docs/getting-started/introduction", "Introduction", "What KTX is and who it is for")} ${link("/docs/getting-started/quickstart", "Quickstart", "Set up KTX and build your first context")} +${link("/docs/getting-started/troubleshooting-linux", "Troubleshooting clean Linux install", "READ FIRST if installing from scratch on Linux/container — covers Python 3.13 prerequisite, IPv6 proxy gotcha, and a minimal working recipe")} ${link("/docs/guides/writing-context", "Writing Context", "Write semantic sources and wiki pages")} ## Machine-Readable Documentation diff --git a/docs-site/next-env.d.ts b/docs-site/next-env.d.ts index 9edff1c7..c4b7818f 100644 --- a/docs-site/next-env.d.ts +++ b/docs-site/next-env.d.ts @@ -1,6 +1,6 @@ /// /// -import "./.next/types/routes.d.ts"; +import "./.next/dev/types/routes.d.ts"; // NOTE: This file should not be edited // see https://nextjs.org/docs/app/api-reference/config/typescript for more information. 
diff --git a/packages/cli/src/managed-python-daemon.test.ts b/packages/cli/src/managed-python-daemon.test.ts index 09e45fd3..c7009548 100644 --- a/packages/cli/src/managed-python-daemon.test.ts +++ b/packages/cli/src/managed-python-daemon.test.ts @@ -4,6 +4,7 @@ import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { readManagedPythonDaemonStatus, + sanitizeProxyEnv, startManagedPythonDaemon, stopAllManagedPythonDaemons, stopManagedPythonDaemon, @@ -404,3 +405,38 @@ describe('managed Python daemon lifecycle', () => { expect(await readFile(layout(tempDir).daemonStatePath, 'utf8')).toContain('"pid": 4242'); }); }); + +describe('sanitizeProxyEnv', () => { + it('removes IPv6 CIDR entries from NO_PROXY that crash httpx', () => { + const cleaned = sanitizeProxyEnv({ + NO_PROXY: 'localhost,127.0.0.1,fd07:b51a:cc66:f0::/64,*.orb.internal', + no_proxy: 'localhost,127.0.0.1,fd07:b51a:cc66:f0::/64,*.orb.internal', + }); + expect(cleaned.NO_PROXY).toBe('localhost,127.0.0.1,*.orb.internal'); + expect(cleaned.no_proxy).toBe('localhost,127.0.0.1,*.orb.internal'); + }); + + it('deletes NO_PROXY entirely when every entry is unparseable', () => { + const cleaned = sanitizeProxyEnv({ NO_PROXY: 'fd07::/64,::1' }); + expect(cleaned.NO_PROXY).toBeUndefined(); + }); + + it('preserves IPv4 addresses, IPv4 CIDRs, hostnames, and wildcards', () => { + const cleaned = sanitizeProxyEnv({ + NO_PROXY: '127.0.0.0/8,10.0.0.1,localhost,*.example.com', + }); + expect(cleaned.NO_PROXY).toBe('127.0.0.0/8,10.0.0.1,localhost,*.example.com'); + }); + + it('leaves other env vars untouched', () => { + const cleaned = sanitizeProxyEnv({ PATH: '/usr/bin', NO_PROXY: '::1', FOO: 'bar' }); + expect(cleaned.PATH).toBe('/usr/bin'); + expect(cleaned.FOO).toBe('bar'); + expect(cleaned.NO_PROXY).toBeUndefined(); + }); + + it('does nothing when NO_PROXY is not set', () => { + const cleaned = sanitizeProxyEnv({ PATH: '/usr/bin' }); + 
expect(cleaned).toEqual({ PATH: '/usr/bin' }); + }); +}); diff --git a/packages/cli/src/managed-python-daemon.ts b/packages/cli/src/managed-python-daemon.ts index 76740554..402a1e14 100644 --- a/packages/cli/src/managed-python-daemon.ts +++ b/packages/cli/src/managed-python-daemon.ts @@ -697,7 +697,7 @@ export async function startManagedPythonDaemon( detached: true, stdio: ['ignore', stdout.fd, stderr.fd], env: { - ...process.env, + ...sanitizeProxyEnv(process.env), KTX_DAEMON_VERSION: options.cliVersion, }, }, @@ -807,3 +807,32 @@ export async function stopAllManagedPythonDaemons( scanErrors: discovery.scanErrors, }; } + +/** + * Filter NO_PROXY/no_proxy values to remove entries httpx cannot parse. + * + * httpx (used by the Python daemon via huggingface_hub / sentence-transformers) + * treats each comma-separated NO_PROXY entry as a URL pattern. Raw IPv6 CIDR + * blocks like `fd07:b51a:cc66:f0::/64` raise `InvalidURL` and crash the daemon. + * OrbStack and similar Docker setups inject such entries by default. + * + * We drop any entry containing `::` (the unambiguous IPv6 marker) but keep + * IPv4 addresses, IPv4 CIDRs, hostnames, and wildcard hosts intact. 
/**
 * Return a copy of `env` whose `NO_PROXY` / `no_proxy` values contain no IPv6
 * entries, so the managed Python daemon can start.
 *
 * httpx (used by the Python daemon via huggingface_hub / sentence-transformers)
 * treats each comma-separated NO_PROXY entry as a URL pattern. Raw IPv6 CIDR
 * blocks like `fd07:b51a:cc66:f0::/64` raise `InvalidURL` and crash the daemon.
 * OrbStack and similar Docker setups inject such entries by default.
 *
 * Entries are trimmed and empty ones are discarded. IPv4 addresses, IPv4
 * CIDRs, hostnames (with or without a port), wildcard hosts and
 * scheme-qualified patterns without `::` are kept in their original order.
 * A variable whose entries are all dropped is deleted rather than being set
 * to an empty string.
 *
 * @param env - Environment to sanitize; it is not mutated.
 * @returns A shallow copy of `env` with the two proxy-bypass variables cleaned.
 */
export function sanitizeProxyEnv(env: NodeJS.ProcessEnv): NodeJS.ProcessEnv {
  const result: NodeJS.ProcessEnv = { ...env };
  for (const key of ['NO_PROXY', 'no_proxy'] as const) {
    const value = result[key];
    if (typeof value !== 'string' || value.length === 0) continue;
    const kept = value
      .split(',')
      .map((entry) => entry.trim())
      .filter((entry) => entry.length > 0 && !isIpv6NoProxyEntry(entry));
    if (kept.length === 0) {
      delete result[key];
    } else {
      result[key] = kept.join(',');
    }
  }
  return result;
}

/**
 * Decide whether a single NO_PROXY entry is an IPv6 literal or IPv6 CIDR.
 *
 * `::` is the unambiguous marker of a compressed IPv6 address, but the
 * uncompressed form (`fd07:b51a:cc66:f0:0:0:0:0/64`) has no `::`. A
 * scheme-less entry with two or more colons is therefore also treated as
 * IPv6: `host:port` has exactly one colon, and scheme-qualified patterns
 * (`http://host:8080`) are recognised by `://` and left alone.
 */
function isIpv6NoProxyEntry(entry: string): boolean {
  if (entry.includes('::')) return true;
  if (entry.includes('://')) return false;
  return entry.indexOf(':') !== entry.lastIndexOf(':');
}
Pinning here pushes uv to auto-download the + * right interpreter via its python-management feature. + */ +export const MANAGED_RUNTIME_PYTHON_VERSION = '3.13'; + const runtimeAssetManifestSchema = z.object({ schemaVersion: z.literal(1), distributionName: z.literal('kaelio-ktx'), @@ -334,7 +344,7 @@ export async function installManagedPythonRuntime( exec, logPath: layout.installLogPath, command: 'uv', - args: ['venv', layout.venvDir], + args: ['venv', layout.venvDir, '--python', MANAGED_RUNTIME_PYTHON_VERSION], env: uvEnv, }); const wheelSpec = features.includes('local-embeddings') ? `${asset.wheelPath}[local-embeddings]` : asset.wheelPath; diff --git a/packages/context/src/ingest/adapters/live-database/daemon-introspection.test.ts b/packages/context/src/ingest/adapters/live-database/daemon-introspection.test.ts index 93a9739d..8237d903 100644 --- a/packages/context/src/ingest/adapters/live-database/daemon-introspection.test.ts +++ b/packages/context/src/ingest/adapters/live-database/daemon-introspection.test.ts @@ -216,4 +216,40 @@ describe('createDaemonLiveDatabaseIntrospection', () => { ); expect(runJson).not.toHaveBeenCalled(); }); + + it('filters out tables not on the enabled_tables allowlist', async () => { + const runJson = vi.fn(async () => daemonResponse); + const introspection = createDaemonLiveDatabaseIntrospection({ + connections: { + warehouse: { + driver: 'postgres', + url: 'postgres://localhost:5432/warehouse', + enabled_tables: ['public.orders'], + }, + }, + schemas: ['public'], + runJson, + }); + + const snapshot = await introspection.extractSchema('warehouse'); + expect(snapshot.tables.map((table) => `${table.db}.${table.name}`)).toEqual(['public.orders']); + }); + + it('passes through every table when enabled_tables is omitted or empty', async () => { + const runJson = vi.fn(async () => daemonResponse); + const introspection = createDaemonLiveDatabaseIntrospection({ + connections: { + warehouse: { + driver: 'postgres', + url: 
/**
 * Restrict a schema snapshot to the tables named in the connection's
 * `enabled_tables` allowlist.
 *
 * A table matches when its qualified name — `db.name` when `table.db` is
 * truthy, otherwise just `name` — is exactly equal to one of the string
 * entries in the allowlist. Non-string entries are ignored.
 *
 * @param snapshot - Snapshot to filter; it is never mutated.
 * @param connection - Connection config that may carry `enabled_tables`.
 * @returns The same `snapshot` object when no usable allowlist is configured
 *   (missing, not an array, empty, or containing no strings); otherwise a
 *   shallow copy whose `tables` holds only the allowed tables.
 */
function applyEnabledTablesFilter(
  snapshot: KtxSchemaSnapshot,
  connection: KtxProjectConnectionConfig,
): KtxSchemaSnapshot {
  // `enabled_tables` is read through a structural cast and validated at runtime.
  const { enabled_tables: configured } = connection as { enabled_tables?: unknown };
  if (!Array.isArray(configured) || configured.length === 0) {
    return snapshot;
  }

  const permitted = new Set<string>();
  for (const candidate of configured) {
    if (typeof candidate === 'string') {
      permitted.add(candidate);
    }
  }
  if (permitted.size === 0) {
    return snapshot;
  }

  const tables = snapshot.tables.filter((table) =>
    permitted.has(table.db ? `${table.db}.${table.name}` : table.name),
  );
  return { ...snapshot, tables };
}