mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-13 08:15:14 +02:00
Make telemetry reliable across interrupts and headless installs
Three reliability gaps surfaced while auditing why PostHog numbers were untrustworthy:

1. Interrupted commands lost their events. capture() is fire-and-forget and the only flush guarantee lived in a finally block, which SIGINT/SIGTERM skip — so Ctrl-C'ing a long ingest or an MCP client killing 'ktx mcp stdio' dropped the command event and any queued events. Add SIGINT/SIGTERM handlers (real-process entry only; never under test/programmatic io) that mark the active command span aborted, emit it, drain the emitter, then exit. Idempotent with the normal finally path via the single-consume command span.

2. Headless-first installs were invisible. loadTelemetryIdentity refused to mint an installId unless stdout was a TTY, so a machine whose first run was an IDE-launched MCP server or a script emitted nothing, ever. Mint on first run regardless of surface (still honoring CI/DO_NOT_TRACK/KTX_TELEMETRY_DISABLED), writing the one-time notice to stderr — safe under the MCP stdio protocol, which reserves stdout. Drop the now-unused stdoutIsTTY option.

3. No guard against silent emit regressions (the 0.7.0 scan_completed blackout). Add tests: the shared executePublicIngestTarget chokepoint emits exactly one ingest_completed on success and on the preflight-failure branch, and a database target invokes the scan that emits scan_completed; plus coverage for the aborted-flush helper.

Identity is unchanged otherwise: every event still attributes to the installId in ~/.ktx/telemetry.json. No event/field changes, so Node<->Python schema parity is untouched. Docs updated to reflect first-run-on-any-surface activation.
This commit is contained in:
parent
2334a4b6e3
commit
cb6a67c2d7
7 changed files with 219 additions and 66 deletions
|
|
@ -89,6 +89,46 @@ export async function runInitForCommander(
|
|||
return await runInit(args, io);
|
||||
}
|
||||
|
||||
/**
 * Maps an interrupt signal to the exit status reported for it.
 *
 * Uses the `128 + signal number` convention for the two signals handled by
 * the CLI: SIGTERM (15) yields 143, and anything else is reported with the
 * SIGINT (2) status, 130.
 *
 * @param signal - The signal that triggered the exit.
 * @returns The process exit code to use.
 */
function signalExitCode(signal: NodeJS.Signals): number {
  if (signal === 'SIGTERM') {
    return 143;
  }
  return 130;
}
|
||||
|
||||
/**
 * Registers SIGINT/SIGTERM listeners that flush telemetry before exiting.
 *
 * `capture()` is fire-and-forget and the regular flush sits in a `finally`
 * that a signal bypasses, so interrupting a long-running command (ingest,
 * `mcp stdio`) would otherwise lose its `command` event and anything still
 * queued. Meant for the real CLI process only; programmatic/test callers
 * supply their own `io` and are not expected to call this.
 *
 * @param io - CLI io handed to the telemetry flush helper.
 * @param info - Package info handed to the telemetry flush helper.
 * @returns A disposer that detaches both signal listeners.
 */
function installTelemetrySignalFlush(io: KtxCliIo, info: KtxCliPackageInfo): () => void {
  let flushStarted = false;

  // Emit the aborted command and drain the emitter, then exit with the
  // signal's status whether or not the flush succeeded.
  const flushThenExit = async (exitCode: number): Promise<void> => {
    try {
      const telemetry = await import('./telemetry/index.js');
      await telemetry.emitAbortedCommandAndShutdown({ packageInfo: info, io });
    } catch {
      // Best-effort: never let a telemetry hiccup block the interrupt exit.
    }
    process.exit(exitCode);
  };

  const onSignal = (signal: NodeJS.Signals): void => {
    const exitCode = signalExitCode(signal);
    // A repeated signal while a flush is already in flight exits right away.
    if (flushStarted) {
      process.exit(exitCode);
    }
    flushStarted = true;
    void flushThenExit(exitCode);
  };

  const onSigint = (): void => {
    onSignal('SIGINT');
  };
  const onSigterm = (): void => {
    onSignal('SIGTERM');
  };

  process.on('SIGINT', onSigint);
  process.on('SIGTERM', onSigterm);

  return (): void => {
    process.off('SIGINT', onSigint);
    process.off('SIGTERM', onSigterm);
  };
}
|
||||
|
||||
export async function runKtxCli(
|
||||
argv = process.argv.slice(2),
|
||||
io: KtxCliIo = process,
|
||||
|
|
@ -98,7 +138,14 @@ export async function runKtxCli(
|
|||
profileMark('runtime:runKtxCli');
|
||||
const { runCommanderKtxCli } = await profileSpan('import ./cli-program.js', () => import('./cli-program.js'));
|
||||
|
||||
return await runCommanderKtxCli(argv, io, deps, info, {
|
||||
runInit: runInitForCommander,
|
||||
});
|
||||
// Real-process entry only: flush telemetry if interrupted. Test/programmatic
|
||||
// callers pass their own `io`, so they never install process-level handlers.
|
||||
const removeSignalFlush = (io as unknown) === process ? installTelemetrySignalFlush(io, info) : undefined;
|
||||
try {
|
||||
return await runCommanderKtxCli(argv, io, deps, info, {
|
||||
runInit: runInitForCommander,
|
||||
});
|
||||
} finally {
|
||||
removeSignalFlush?.();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,7 +37,6 @@ function styleNotice(notice: string, env: TelemetryIdentityEnv): string {
|
|||
/** Options accepted by `loadTelemetryIdentity`. */
export interface LoadTelemetryIdentityOptions {
  /** Home directory override — presumably where the identity file is resolved from; confirm against the loader. */
  homeDir?: string;
  /** Environment to consult (callers in this file pass `process.env`). */
  env?: TelemetryIdentityEnv;
  /** Whether stdout is an interactive terminal (callers pass `io.stdout.isTTY === true`). */
  stdoutIsTTY: boolean;
  /** Sink for the one-time telemetry notice. */
  stderr: { write(chunk: string): void };
  /** Clock used to timestamp a newly created identity; defaults to `new Date()`. */
  now?: () => Date;
}
|
||||
|
|
@ -94,13 +93,12 @@ export async function loadTelemetryIdentity(options: LoadTelemetryIdentityOption
|
|||
};
|
||||
}
|
||||
|
||||
// No identity yet. Minting one means showing the one-time opt-out notice, so
|
||||
// first-run creation requires an interactive surface; a headless first run
|
||||
// stays disabled and defers enablement until the next interactive run.
|
||||
if (options.stdoutIsTTY !== true) {
|
||||
return { enabled: false, createdFile: false, noticeShown: false, path };
|
||||
}
|
||||
|
||||
// No identity yet → mint one regardless of surface. Telemetry is opt-out, so
|
||||
// a fresh install is counted even when its first run is headless (an
|
||||
// IDE-launched `ktx mcp stdio`, a scripted invocation); otherwise those
|
||||
// installs would be permanently invisible. Opt-out env vars are honored
|
||||
// above. The one-time notice is written to stderr — safe even under MCP
|
||||
// stdio, which reserves stdout for its JSON-RPC protocol.
|
||||
const timestamp = (options.now ?? (() => new Date()))().toISOString();
|
||||
const next = {
|
||||
installId: randomUUID(),
|
||||
|
|
|
|||
|
|
@ -22,7 +22,6 @@ export type { CommandOutcome, CompletedCommandSpan };
|
|||
|
||||
export async function showTelemetryNoticeIfNeeded(io: KtxCliIo, packageInfo: KtxCliPackageInfo): Promise<void> {
|
||||
const identity = await loadTelemetryIdentity({
|
||||
stdoutIsTTY: io.stdout.isTTY === true,
|
||||
stderr: io.stderr,
|
||||
env: process.env,
|
||||
});
|
||||
|
|
@ -81,7 +80,6 @@ export async function emitTelemetryEvent<Name extends TelemetryEventName>(input:
|
|||
}): Promise<void> {
|
||||
const debug = telemetryDebugEnabled();
|
||||
const identity = await loadTelemetryIdentity({
|
||||
stdoutIsTTY: input.io.stdout.isTTY === true,
|
||||
stderr: input.io.stderr,
|
||||
env: process.env,
|
||||
});
|
||||
|
|
@ -154,3 +152,20 @@ export async function emitCompletedCommand(input: {
|
|||
packageInfo: input.packageInfo,
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Flush telemetry when the process is interrupted (Ctrl-C / kill).
 *
 * The normal `command` emit + flush lives in a `finally` that a signal skips,
 * so without this an interrupted long-running command (ingest, `mcp stdio`)
 * loses its `command` event and any queued events. Completes the active
 * command span with outcome `aborted`, emits it, and drains the emitter.
 *
 * Best-effort and intended to be idempotent: if the span was already
 * completed (normal exit racing a signal) the emit is expected to no-op.
 * The emitter is drained even when completing or emitting the span throws,
 * so events queued earlier are still flushed; the original error then
 * propagates to the caller.
 *
 * @param input.packageInfo - Package info forwarded to the command emit.
 * @param input.io - CLI io forwarded to the command emit.
 * @returns A promise that settles once the emitter has been shut down.
 */
export async function emitAbortedCommandAndShutdown(input: {
  packageInfo: KtxCliPackageInfo;
  io: KtxCliIo;
}): Promise<void> {
  try {
    const completed = completeCommandSpan({ completedAt: performance.now(), outcome: 'aborted' });
    await emitCompletedCommand({ completed, packageInfo: input.packageInfo, io: input.io });
  } finally {
    // Always drain: a failure emitting the aborted span must not strand
    // events that were queued before the interrupt.
    await shutdownTelemetryEmitter();
  }
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue