feat(telemetry): collect PostHog $exception error reports in CLI and daemon (#262)

* feat(telemetry): add node exception reporter

* feat(telemetry): report node cli exceptions

* feat(telemetry): add daemon exception reporter

* feat(telemetry): report daemon exceptions

* docs(telemetry): document error reports

* fix(telemetry): pass redaction snapshots from node call sites

* test(telemetry): verify prepared node exception payload

* fix(telemetry): close daemon exception lifecycle gaps

* test(telemetry): verify prepared daemon exception payload

* test(telemetry): close error collection acceptance gaps

* test(telemetry): close posthog exception acceptance gaps
This commit is contained in:
Andrey Avtomonov 2026-06-05 19:36:21 +02:00 committed by GitHub
parent c3d8cedb0b
commit fb7b94b60e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
36 changed files with 2870 additions and 140 deletions

View file

@ -6,6 +6,8 @@ import argparse
import json
import sys
import time
from collections.abc import Callable
from types import TracebackType
from typing import Any
from pydantic import ValidationError
@ -90,6 +92,41 @@ def _read_stdin_json() -> dict[str, Any]:
return parsed
def install_serve_http_exception_hooks(started_at: float) -> Callable[[], None]:
    """Install a ``sys.excepthook`` that reports uncaught ``serve-http`` crashes.

    The installed hook forwards the uncaught exception to
    ``report_serve_http_crash`` and then always delegates to the hook that was
    active when this function was called.

    Args:
        started_at: ``time.perf_counter()`` reading taken when the server
            started; passed through to ``report_serve_http_crash``.

    Returns:
        A zero-argument callable that restores the previously active hook.
    """
    original_hook = sys.excepthook

    def hook(
        exc_type: type[BaseException],
        exc: BaseException,
        tb: TracebackType | None,
    ) -> None:
        try:
            report_serve_http_crash(exc, started_at=started_at)
        except Exception:
            # Telemetry is best-effort: a failure while reporting must never
            # prevent the original hook from showing the real crash.
            pass
        original_hook(exc_type, exc, tb)

    sys.excepthook = hook

    def dispose() -> None:
        # Only undo our own installation; if another hook was installed after
        # ours, restoring the old one here would silently discard it.
        if sys.excepthook is hook:
            sys.excepthook = original_hook

    return dispose
def report_serve_http_crash(error: BaseException, *, started_at: float) -> None:
    """Report a fatal ``serve-http`` failure and record the daemon stop as a crash.

    Args:
        error: The exception that brought the server down.
        started_at: ``time.perf_counter()`` reading used as the uptime origin.
    """
    from ktx_daemon.telemetry import report_exception
    from ktx_daemon.telemetry.daemon_lifecycle import emit_daemon_stopped_once

    report_exception(error, source="serve-http", handled=False, fatal=True)

    # Uptime is sampled only after the exception report has been attempted.
    elapsed_ms = (time.perf_counter() - started_at) * 1000
    emit_daemon_stopped_once(reason="crash", uptime_ms=max(0, elapsed_ms))
def run_http_server(
*,
host: str,
@ -102,15 +139,23 @@ def run_http_server(
from ktx_daemon.app import create_app
started_at = time.perf_counter()
uvicorn.run(
create_app(
enable_code_execution=enable_code_execution,
telemetry_started_at=started_at,
),
host=host,
port=port,
log_level=log_level,
)
dispose_hooks = install_serve_http_exception_hooks(started_at)
try:
try:
uvicorn.run(
create_app(
enable_code_execution=enable_code_execution,
telemetry_started_at=started_at,
),
host=host,
port=port,
log_level=log_level,
)
except Exception as error:
report_serve_http_crash(error, started_at=started_at)
raise
finally:
dispose_hooks()
def main(argv: list[str] | None = None) -> int:
@ -169,6 +214,14 @@ def main(argv: list[str] | None = None) -> int:
sys.stderr.write(f"{error}\n")
return 1
except Exception as error:
from ktx_daemon.telemetry import report_exception
report_exception(
error,
source=str(args.command),
handled=True,
fatal=False,
)
sys.stderr.write(f"{type(error).__name__}: {error}\n")
return 1

View file

@ -10,8 +10,8 @@ from contextlib import asynccontextmanager
from collections.abc import Callable
from typing import Any
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, Response
from ktx_daemon import VERSION
from ktx_daemon.code_execution import (
@ -65,9 +65,11 @@ from ktx_daemon.table_identifier import (
ParseTableIdentifierBatchResponse,
parse_table_identifier_response,
)
from ktx_daemon.telemetry import track_telemetry_event
from ktx_daemon.telemetry import report_exception, track_telemetry_event
from ktx_daemon.telemetry.daemon_lifecycle import emit_daemon_stopped_once
logger = logging.getLogger(__name__)
CREDENTIAL_KEYS = {"url", "password", "token", "api_key", "apikey", "auth_header"}
class NumpyORJSONResponse(Response):
@ -77,6 +79,36 @@ class NumpyORJSONResponse(Response):
return dumps_numpy_json(content)
def _route_source(request: Request) -> str:
route = request.scope.get("route")
path = getattr(route, "path", None)
if isinstance(path, str) and path:
return f"app:{path}"
return f"app:{request.url.path}"
def _secret_snapshot_from_payload(value: Any) -> list[str]:
    """Collect credential strings found anywhere inside a decoded payload.

    Dicts and lists are walked recursively. A value is collected when it is a
    non-empty string stored under a key whose lowercased name is in
    ``CREDENTIAL_KEYS``. Results keep the traversal order; any other value
    type yields an empty list.
    """
    if isinstance(value, dict):
        found: list[str] = []
        for name, nested in value.items():
            is_credential_key = str(name).lower() in CREDENTIAL_KEYS
            if is_credential_key and isinstance(nested, str) and nested:
                found.append(nested)
            # Keep descending: containers under any key may hold credentials.
            found += _secret_snapshot_from_payload(nested)
        return found
    if isinstance(value, list):
        return [
            secret
            for item in value
            for secret in _secret_snapshot_from_payload(item)
        ]
    return []
async def _request_secret_snapshot(request: Request) -> list[str]:
    """Return the credential strings found in the request's JSON body.

    Any failure while reading or decoding the body results in an empty list.
    """
    try:
        body = await request.json()
    except Exception:
        # No decodable JSON body means there is nothing to snapshot.
        snapshot: list[str] = []
    else:
        snapshot = _secret_snapshot_from_payload(body)
    return snapshot
def create_app(
*,
embedding_provider: EmbeddingProvider | None = None,
@ -104,12 +136,9 @@ def create_app(
try:
yield
finally:
track_telemetry_event(
"daemon_stopped",
{
"reason": "request",
"uptimeMs": max(0, (clock() - started_at) * 1000),
},
emit_daemon_stopped_once(
reason="request",
uptime_ms=max(0, (clock() - started_at) * 1000),
)
app = FastAPI(
@ -119,6 +148,25 @@ def create_app(
lifespan=lifespan,
)
@app.middleware("http")
async def report_unhandled_exceptions(request: Request, call_next):
redaction_secrets = await _request_secret_snapshot(request)
try:
return await call_next(request)
except Exception as error:
logger.exception("Unhandled daemon request failed: %s", error)
report_exception(
error,
source=_route_source(request),
handled=True,
fatal=False,
redaction_secrets=redaction_secrets,
)
return JSONResponse(
status_code=500,
content={"detail": f"Daemon request failed: {error}"},
)
@app.get("/health")
async def health() -> dict[str, str]:
response = {"status": "healthy"}
@ -137,12 +185,6 @@ def create_app(
except ValueError as error:
logger.warning("Database introspection rejected: %s", error)
raise HTTPException(status_code=400, detail=str(error)) from error
except Exception as error:
logger.exception("Database introspection failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"Database introspection failed: {error}",
) from error
@app.post("/embeddings/compute", response_model=ComputeEmbeddingResponse)
async def embedding_compute(
@ -156,12 +198,6 @@ def create_app(
except ValueError as error:
logger.warning("Embedding compute rejected: %s", error)
raise HTTPException(status_code=400, detail=str(error)) from error
except Exception as error:
logger.exception("Embedding compute failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"Embedding compute failed: {error}",
) from error
@app.post(
"/embeddings/compute-bulk",
@ -178,12 +214,6 @@ def create_app(
except ValueError as error:
logger.warning("Bulk embedding compute rejected: %s", error)
raise HTTPException(status_code=400, detail=str(error)) from error
except Exception as error:
logger.exception("Bulk embedding compute failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"Bulk embedding compute failed: {error}",
) from error
if enable_code_execution:
@ -193,29 +223,15 @@ def create_app(
response_class=NumpyORJSONResponse,
)
async def code_execute(request: ExecuteCodeRequest) -> ExecuteCodeResponse:
try:
return execute_code_response(
request,
nest_api_url=None,
auth_header=None,
)
except Exception as error:
logger.exception("Code execution failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"Code execution failed: {error}",
) from error
return execute_code_response(
request,
nest_api_url=None,
auth_header=None,
)
@app.post("/lookml/parse", response_model=ParseLookMLResponse)
async def lookml_parse(request: ParseLookMLRequest) -> ParseLookMLResponse:
try:
return parse_lookml_project(request)
except Exception as error:
logger.exception("LookML parsing failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"LookML parsing failed: {error}",
) from error
return parse_lookml_project(request)
@app.post(
"/sql/parse-table-identifier",
@ -224,40 +240,19 @@ def create_app(
async def sql_parse_table_identifier(
request: ParseTableIdentifierBatchRequest,
) -> ParseTableIdentifierBatchResponse:
try:
return parse_table_identifier_response(request)
except Exception as error:
logger.exception("Table identifier parsing failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"Table identifier parsing failed: {error}",
) from error
return parse_table_identifier_response(request)
@app.post("/sql/validate-read-only", response_model=ValidateReadOnlySqlResponse)
async def sql_validate_read_only(
request: ValidateReadOnlySqlRequest,
) -> ValidateReadOnlySqlResponse:
try:
return validate_read_only_sql_response(request)
except Exception as error:
logger.exception("SQL read-only validation failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"SQL read-only validation failed: {error}",
) from error
return validate_read_only_sql_response(request)
@app.post("/sql/analyze-batch", response_model=AnalyzeSqlBatchResponse)
async def sql_analyze_batch(
request: AnalyzeSqlBatchRequest,
) -> AnalyzeSqlBatchResponse:
try:
return analyze_sql_batch_response(request)
except Exception as error:
logger.exception("SQL batch analysis failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"SQL batch analysis failed: {error}",
) from error
return analyze_sql_batch_response(request)
@app.post(
"/semantic-layer/generate-sources", response_model=GenerateSourcesResponse
@ -265,14 +260,7 @@ def create_app(
async def semantic_generate_sources(
request: GenerateSourcesRequest,
) -> GenerateSourcesResponse:
try:
return generate_sources_response(request)
except Exception as error:
logger.exception("Semantic source generation failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"Semantic source generation failed: {error}",
) from error
return generate_sources_response(request)
@app.post("/semantic-layer/query", response_model=SemanticLayerQueryResponse)
async def semantic_query(
@ -283,12 +271,6 @@ def create_app(
except ValueError as error:
logger.warning("Semantic query rejected: %s", error)
raise HTTPException(status_code=400, detail=str(error)) from error
except Exception as error:
logger.exception("Semantic query failed: %s", error)
raise HTTPException(
status_code=500,
detail=f"Semantic layer query failed: {error}",
) from error
@app.post("/semantic-layer/validate", response_model=ValidateSourcesResponse)
async def semantic_validate(

View file

@ -5,7 +5,7 @@ from __future__ import annotations
import time
from typing import Any
from ktx_daemon.telemetry import error_class, track_telemetry_event
from ktx_daemon.telemetry import error_class, report_exception, track_telemetry_event
from pydantic import BaseModel, ConfigDict, Field
from semantic_layer.duplicate_check import validate_measure_duplicates
from semantic_layer.engine import SemanticEngine
@ -150,6 +150,13 @@ def query_semantic_layer(
track_telemetry_event(
"sql_gen_completed", sql_fields, project_id=request.project_id
)
report_exception(
error,
source="semantic-query",
handled=True,
fatal=False,
project_id=request.project_id,
)
raise

View file

@ -1,5 +1,12 @@
from __future__ import annotations
from ktx_daemon.telemetry.daemon_lifecycle import emit_daemon_stopped_once
from ktx_daemon.telemetry.emitter import error_class, track_telemetry_event
from ktx_daemon.telemetry.exception import report_exception
__all__ = ["error_class", "track_telemetry_event"]
__all__ = [
"emit_daemon_stopped_once",
"error_class",
"report_exception",
"track_telemetry_event",
]

View file

@ -0,0 +1,29 @@
from __future__ import annotations
from typing import Literal
from ktx_daemon.telemetry.emitter import track_telemetry_event
StopReason = Literal["signal", "request", "crash"]
_daemon_stop_emitted = False
def emit_daemon_stopped_once(*, reason: StopReason, uptime_ms: float) -> bool:
    """Emit the ``daemon_stopped`` event unless it was already emitted.

    Args:
        reason: Why the daemon stopped.
        uptime_ms: Uptime in milliseconds; negative values are clamped to 0.

    Returns:
        True when this call emitted the event, False when an earlier call
        already did.
    """
    global _daemon_stop_emitted
    first_call = not _daemon_stop_emitted
    if first_call:
        # The flag is set before emitting, so later calls are always no-ops.
        _daemon_stop_emitted = True
        fields = {"reason": reason, "uptimeMs": max(0, uptime_ms)}
        track_telemetry_event("daemon_stopped", fields)
    return first_call
def reset_daemon_lifecycle_for_tests() -> None:
    """Clear the module-level "already emitted" flag so tests start fresh."""
    globals()["_daemon_stop_emitted"] = False

View file

@ -0,0 +1,156 @@
from __future__ import annotations
import json
import os
import re
import sys
from collections.abc import Mapping, Sequence
from pathlib import Path
from typing import Any
from ktx_daemon import VERSION
from ktx_daemon.telemetry.emitter import POSTHOG_HOST, POSTHOG_PROJECT_API_KEY
from ktx_daemon.telemetry.events import _common_envelope
from ktx_daemon.telemetry.identity import load_telemetry_identity
_KTX_REPORTED_ATTR = "__ktx_posthog_exception_reported"
def _debug_enabled(env: Mapping[str, str]) -> bool:
return env.get("KTX_TELEMETRY_DEBUG") == "1"
def _host(env: Mapping[str, str]) -> str:
return env.get("KTX_TELEMETRY_ENDPOINT") or POSTHOG_HOST
def _redact_static(value: str) -> str:
patterns = [
(
r"([a-z][a-z0-9+.-]*://[^:\s/@]+:)([^@\s/]+)(@)",
r"\1[redacted]\3",
),
(r"\b(password|pwd)=([^;&\s]+)", r"\1=[redacted]"),
(r"\bAuthorization\s*:\s*[^\r\n,;]+", "Authorization: [redacted]"),
(r"\bBearer\s+[A-Za-z0-9._~+/=-]+", "Bearer [redacted]"),
(r"\b(api[_-]?key)\s*[:=]\s*([^\s,;]+)", r"\1=[redacted]"),
(
r"\b(KTX_[A-Z0-9_]*|[A-Z0-9_]*(?:TOKEN|SECRET))\s*[:=]\s*([^\s,;]+)",
r"\1=[redacted]",
),
(r"([?&](?:X-Amz-Signature|X-Goog-Signature|sig)=)[^&\s]+", r"\1[redacted]"),
]
redacted = value
for pattern, replacement in patterns:
redacted = re.sub(pattern, replacement, redacted, flags=re.IGNORECASE)
return redacted
def _redact_text(value: str, secrets: Sequence[str]) -> str:
redacted = value
for secret in secrets:
if secret:
redacted = redacted.replace(secret, "[redacted]")
return _redact_static(redacted)
def _clone_exception(
    exception: BaseException,
    secrets: Sequence[str],
    _clones: dict[int, BaseException] | None = None,
) -> BaseException:
    """Return a redacted copy of ``exception`` and of its cause/context chain.

    Every argument is stringified and redacted, then the original exception
    type is re-instantiated with the redacted arguments. When that constructor
    fails, a ``RuntimeError`` carrying the redacted message is used instead.
    The original traceback object is reused.

    Args:
        exception: The exception to copy.
        secrets: Literal secret strings to scrub from the messages.
        _clones: Internal memo mapping ``id(original)`` to its clone, shared
            across the recursion. Callers should not pass it.

    Returns:
        The redacted clone, chained to redacted clones of the original
        ``__cause__`` and ``__context__``.
    """
    clones: dict[int, BaseException] = {} if _clones is None else _clones
    known = clones.get(id(exception))
    if known is not None:
        # ``raise ... from err`` stores the same object as both cause and
        # context; reusing the clone keeps the work linear in chain length and
        # makes cyclic chains terminate.
        return known

    redacted_args = [_redact_text(str(arg), secrets) for arg in exception.args]
    try:
        cloned = type(exception)(*redacted_args)
    except Exception:
        cloned = RuntimeError(_redact_text(str(exception), secrets))
    # Register before recursing so a cycle resolves to this clone.
    clones[id(exception)] = cloned

    cloned.__traceback__ = exception.__traceback__
    cause = exception.__cause__
    context = exception.__context__
    cloned.__cause__ = (
        _clone_exception(cause, secrets, clones) if cause is not None else None
    )
    cloned.__context__ = (
        _clone_exception(context, secrets, clones) if context is not None else None
    )
    # Assigning ``__cause__`` (even to None) sets ``__suppress_context__`` to
    # True, which would hide implicit context chains; mirror the original.
    cloned.__suppress_context__ = exception.__suppress_context__
    return cloned
def _should_skip_as_reported(exception: BaseException) -> bool:
    """Return True if ``exception`` was already marked as reported.

    On first sight the exception is tagged with the marker attribute so that a
    later call for the same object returns True. If the attribute cannot be
    set, the exception is simply treated as not yet reported.
    """
    already_reported = bool(getattr(exception, _KTX_REPORTED_ATTR, False))
    if not already_reported:
        try:
            setattr(exception, _KTX_REPORTED_ATTR, True)
        except Exception:
            # Some exception objects reject new attributes; without the marker
            # the caller still reports this occurrence.
            pass
    return already_reported
def _properties(*, source: str, handled: bool, fatal: bool) -> dict[str, Any]:
    """Build the event properties: the common envelope plus exception metadata.

    ``daemonVersion`` comes from ``KTX_DAEMON_VERSION`` in the process
    environment, falling back to the package ``VERSION``.
    """
    props: dict[str, Any] = dict(_common_envelope())
    props["daemonVersion"] = os.environ.get("KTX_DAEMON_VERSION", VERSION)
    props["source"] = source
    props["handled"] = handled
    props["fatal"] = fatal
    return props
def report_exception(
    exception: BaseException,
    *,
    source: str,
    handled: bool,
    fatal: bool,
    project_id: str | None = None,
    home_dir: Path | None = None,
    env: Mapping[str, str] | None = None,
    redaction_secrets: Sequence[str] | None = None,
) -> None:
    """Send a redacted exception report to PostHog, never raising.

    Nothing is sent when telemetry is disabled or has no install id, when this
    exception object was already reported, or when the API key or host is
    blank. With ``KTX_TELEMETRY_DEBUG=1`` the report is written to stderr as a
    ``[telemetry-exception]`` JSON line instead of being sent.

    Args:
        exception: The exception to report.
        source: Label for where the exception was observed.
        handled: Whether the caller handled the exception.
        fatal: Whether the exception terminated the process or operation.
        project_id: Optional project id, sent as the ``project`` group.
        home_dir: Optional home directory passed to the identity loader.
        env: Environment mapping to read settings from; defaults to
            ``os.environ``.
        redaction_secrets: Literal secret strings to scrub from the messages.
    """
    source_env = env if env is not None else os.environ
    try:
        identity = load_telemetry_identity(home_dir=home_dir, env=source_env)
        if not identity.enabled or not identity.install_id:
            return
        if _should_skip_as_reported(exception):
            return

        properties = _properties(source=source, handled=handled, fatal=fatal)
        groups = {"project": project_id} if project_id else None
        safe_exception = _clone_exception(exception, redaction_secrets or [])

        if _debug_enabled(source_env):
            sys.stderr.write(
                "[telemetry-exception] "
                + json.dumps(
                    {
                        "distinctId": identity.install_id,
                        "message": str(safe_exception),
                        "properties": properties,
                        "groups": groups,
                    },
                    sort_keys=True,
                )
                + "\n"
            )
            return

        host = _host(source_env)
        if not POSTHOG_PROJECT_API_KEY.strip() or not host.strip():
            return

        from posthog import Posthog

        client = Posthog(
            POSTHOG_PROJECT_API_KEY,
            host=host,
            flush_at=1,
            flush_interval=0,
            sync_mode=True,
            timeout=1,
        )
        try:
            client.capture_exception(
                safe_exception,
                distinct_id=identity.install_id,
                properties=properties,
                groups=groups,
            )
        finally:
            # Release the client even when the capture call fails.
            client.shutdown()
    except Exception:
        # Telemetry is best-effort and must never surface errors to callers.
        return

View file

@ -87,8 +87,10 @@ def test_app_lifespan_emits_daemon_lifecycle_debug_events(
monkeypatch,
capsys,
) -> None:
from ktx_daemon.telemetry.daemon_lifecycle import reset_daemon_lifecycle_for_tests
from ktx_daemon.telemetry.identity import reset_identity_cache
reset_daemon_lifecycle_for_tests()
reset_identity_cache()
identity_path = tmp_path / ".ktx" / "telemetry.json"
identity_path.parent.mkdir(parents=True)

View file

@ -0,0 +1,118 @@
from __future__ import annotations
import gzip
import json
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
from typing import Any
from ktx_daemon.telemetry.identity import reset_identity_cache
class CaptureHandler(BaseHTTPRequestHandler):
    """HTTP handler that records every POSTed JSON body for later inspection."""

    # Class-level on purpose: the test reads and clears it via the class.
    payloads: list[dict[str, Any]] = []

    def do_POST(self) -> None:
        """Decode the (optionally gzip-compressed) JSON body and reply ``{}``."""
        body_size = int(self.headers.get("content-length", "0"))
        body = self.rfile.read(body_size)
        is_gzipped = self.headers.get("content-encoding") == "gzip"
        decoded = (gzip.decompress(body) if is_gzipped else body).decode("utf-8")
        self.payloads.append(json.loads(decoded))

        self.send_response(200)
        self.send_header("content-type", "application/json")
        self.end_headers()
        self.wfile.write(b"{}")

    def log_message(self, _format: str, *_args: object) -> None:
        """Suppress the base class's request logging."""
        return
def write_identity(home: Path) -> None:
    """Write an enabled telemetry identity file at ``<home>/.ktx/telemetry.json``."""
    identity_file = home.joinpath(".ktx", "telemetry.json")
    identity_file.parent.mkdir(parents=True, exist_ok=True)
    record = {
        "installId": "00000000-0000-4000-8000-000000000000",
        "enabled": True,
        "createdAt": "2026-06-05T00:00:00.000Z",
    }
    identity_file.write_text(f"{json.dumps(record)}\n", encoding="utf-8")
def find_exception_event(payloads: list[dict[str, Any]]) -> dict[str, Any]:
    """Return the first ``$exception`` event among the captured payloads.

    A payload is either a single event or carries a list of events under
    ``batch``.

    Raises:
        AssertionError: If no payload contains an ``$exception`` event.
    """
    for payload in payloads:
        batch = payload.get("batch")
        candidates = batch if isinstance(batch, list) else [payload]
        match = next(
            (
                candidate
                for candidate in candidates
                if isinstance(candidate, dict)
                and candidate.get("event") == "$exception"
            ),
            None,
        )
        if match is not None:
            return match
    raise AssertionError(f"No $exception payload found: {payloads}")
def test_prepared_python_exception_payload_groups_and_redacts(tmp_path: Path) -> None:
    """The payload sent over HTTP carries the project group and no secrets."""
    from ktx_daemon.telemetry.exception import report_exception

    reset_identity_cache()
    write_identity(tmp_path)
    CaptureHandler.payloads.clear()

    snapshot_secret = "-".join(["plain", "secret", "value"])
    db_password = "-".join(["db", "url", "secret"])
    auth_token = "".join(["abc", "123"])
    project_id = "a" * 64
    message = (
        f"{snapshot_secret} postgres://svc:{db_password}@db.example.test/analytics "
        f"Authorization: Basic {auth_token}"
    )

    # Port 0 picks a free port; the reporter is pointed at it via the env.
    server = HTTPServer(("127.0.0.1", 0), CaptureHandler)
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    try:
        report_exception(
            RuntimeError(message),
            source="database-introspect",
            handled=True,
            fatal=False,
            project_id=project_id,
            home_dir=tmp_path,
            env={"KTX_TELEMETRY_ENDPOINT": f"http://127.0.0.1:{server.server_port}"},
            redaction_secrets=[snapshot_secret],
        )
    finally:
        server.shutdown()
        server.server_close()
        worker.join(timeout=2)

    event = find_exception_event(CaptureHandler.payloads)
    properties = event["properties"]

    # The group may be attached to the event itself or to its properties.
    expected_groups = {"project": project_id}
    assert (
        event.get("$groups") == expected_groups
        or properties.get("$groups") == expected_groups
    )

    serialized = json.dumps(properties.get("$exception_list", []))
    assert "[redacted]" in serialized
    for leaked in (snapshot_secret, db_password, auth_token):
        assert leaked not in serialized

    forbidden_keys = {
        "argv",
        "args",
        "env",
        "environment",
        "sql",
        "query",
        "prompt",
        "mcpArguments",
        "tableName",
        "schemaName",
        "columnName",
        "databaseUrl",
        "connectionString",
        "url",
        "password",
        "token",
        "apiKey",
        "authorization",
    }
    assert forbidden_keys.isdisjoint(properties.keys())

View file

@ -0,0 +1,601 @@
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
from ktx_daemon.telemetry.identity import reset_identity_cache
class FakePosthog:
    """Stand-in for the PostHog client that records calls instead of sending."""

    # Class-level recorders: tests inspect and clear them via the class.
    captures: list[dict[str, Any]] = []
    shutdowns = 0

    def __init__(self, *_args: Any, **_kwargs: Any) -> None:
        """Accept and ignore any constructor arguments."""

    def capture_exception(
        self,
        exception: BaseException,
        *,
        distinct_id: str,
        properties: dict[str, Any],
        groups: dict[str, str] | None = None,
    ) -> None:
        """Record one capture call."""
        record = dict(
            exception=exception,
            distinct_id=distinct_id,
            properties=properties,
            groups=groups,
        )
        self.captures.append(record)

    def shutdown(self) -> None:
        """Count the shutdown on the class."""
        cls = type(self)
        cls.shutdowns = cls.shutdowns + 1
def write_identity(home: Path, *, enabled: bool = True) -> None:
    """Write a telemetry identity file at ``<home>/.ktx/telemetry.json``.

    Args:
        home: Directory that stands in for the user's home.
        enabled: Value stored in the file's ``enabled`` field.
    """
    identity_file = home.joinpath(".ktx", "telemetry.json")
    identity_file.parent.mkdir(parents=True, exist_ok=True)
    record = {
        "installId": "00000000-0000-4000-8000-000000000000",
        "enabled": enabled,
        "createdAt": "2026-06-05T00:00:00.000Z",
    }
    identity_file.write_text(f"{json.dumps(record)}\n", encoding="utf-8")
def test_report_exception_respects_disabled_gate(tmp_path: Path, monkeypatch) -> None:
    """Nothing is captured when ``KTX_TELEMETRY_DISABLED=1`` is set."""
    from ktx_daemon.telemetry.exception import report_exception

    disabled_env = {"KTX_TELEMETRY_DISABLED": "1"}

    reset_identity_cache()
    write_identity(tmp_path)
    for name, value in disabled_env.items():
        monkeypatch.setenv(name, value)
    FakePosthog.captures.clear()
    monkeypatch.setattr("posthog.Posthog", FakePosthog)

    report_exception(
        RuntimeError("boom"),
        source="semantic-query",
        handled=True,
        fatal=False,
        home_dir=tmp_path,
        env=disabled_env,
    )

    assert not FakePosthog.captures
def test_report_exception_sends_groups_and_properties(
    tmp_path: Path, monkeypatch
) -> None:
    """One capture is sent with the install id, project group and properties."""
    from ktx_daemon.telemetry.exception import report_exception

    reset_identity_cache()
    write_identity(tmp_path)
    FakePosthog.captures.clear()
    monkeypatch.setattr("posthog.Posthog", FakePosthog)

    report_exception(
        RuntimeError("boom"),
        source="semantic-query",
        handled=True,
        fatal=False,
        project_id="a" * 64,
        home_dir=tmp_path,
        env={},
    )

    # Check the count first so a missing capture fails as an assertion instead
    # of an IndexError, and compare each field against an independent value.
    assert len(FakePosthog.captures) == 1
    capture = FakePosthog.captures[0]
    assert isinstance(capture["exception"], RuntimeError)
    assert str(capture["exception"]) == "boom"
    assert capture["distinct_id"] == "00000000-0000-4000-8000-000000000000"
    assert capture["groups"] == {"project": "a" * 64}

    properties = capture["properties"]
    assert properties["source"] == "semantic-query"
    assert properties["handled"] is True
    assert properties["fatal"] is False
    assert properties["runtime"] == "daemon-py"
def test_report_exception_debug_prints_without_sending(tmp_path: Path, capsys) -> None:
    """Debug mode writes the report to stderr and captures nothing."""
    from ktx_daemon.telemetry.exception import report_exception

    reset_identity_cache()
    write_identity(tmp_path)
    FakePosthog.captures.clear()

    debug_env = {"KTX_TELEMETRY_DEBUG": "1"}
    report_exception(
        RuntimeError("debug boom"),
        source="app:/health",
        handled=True,
        fatal=False,
        home_dir=tmp_path,
        env=debug_env,
    )

    stderr_text = capsys.readouterr().err
    assert "[telemetry-exception]" in stderr_text
    assert '"source": "app:/health"' in stderr_text
    assert not FakePosthog.captures
def test_report_exception_redacts_snapshot_and_static_patterns(
    tmp_path: Path, monkeypatch
) -> None:
    """Snapshot secrets and static patterns are scrubbed, including the cause."""
    from ktx_daemon.telemetry.exception import report_exception

    reset_identity_cache()
    write_identity(tmp_path)
    FakePosthog.captures.clear()
    monkeypatch.setattr("posthog.Posthog", FakePosthog)

    cause = ValueError("Authorization: Bearer token-123")
    error = RuntimeError("dsn has plain-secret and password=hunter2")
    error.__cause__ = cause

    report_exception(
        error,
        source="database-introspect",
        handled=True,
        fatal=False,
        home_dir=tmp_path,
        env={},
        redaction_secrets=["plain-secret"],
    )

    sent = FakePosthog.captures[0]["exception"]
    sent_text = str(sent)
    assert "[redacted]" in sent_text
    assert "plain-secret" not in sent_text
    assert "hunter2" not in sent_text
    assert "token-123" not in str(sent.__cause__)
def test_report_exception_does_not_discover_env_values_without_snapshot(
    tmp_path: Path, monkeypatch
) -> None:
    """A value present only in the process env is not redacted by itself."""
    from ktx_daemon.telemetry.exception import report_exception

    unlisted_value = "plain-secret-without-pattern"

    reset_identity_cache()
    write_identity(tmp_path)
    FakePosthog.captures.clear()
    monkeypatch.setenv("KTX_FAKE_SECRET", unlisted_value)
    monkeypatch.setattr("posthog.Posthog", FakePosthog)

    report_exception(
        RuntimeError(unlisted_value),
        source="sys.excepthook",
        handled=False,
        fatal=True,
        home_dir=tmp_path,
        env={},
    )

    sent_text = str(FakePosthog.captures[0]["exception"])
    assert unlisted_value in sent_text
def test_route_derived_boundary_reports_new_throwing_route(monkeypatch) -> None:
    """A route added after app creation is still reported when it raises."""
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    from ktx_daemon.app import create_app

    reports: list[dict[str, object]] = []

    def fake_report(exception: BaseException, **kwargs: object) -> None:
        reports.append({"exception": exception, **kwargs})

    monkeypatch.setattr("ktx_daemon.app.report_exception", fake_report)
    app: FastAPI = create_app()

    @app.get("/new-throwing-route")
    async def new_throwing_route() -> dict[str, str]:
        raise RuntimeError("route boom")

    response = TestClient(app, raise_server_exceptions=False).get(
        "/new-throwing-route"
    )

    assert response.status_code == 500
    assert reports
    first_report = reports[0]
    accepted_sources = {"app:/new-throwing-route", "app:new_throwing_route"}
    assert first_report["source"] in accepted_sources
    assert first_report["handled"] is True
    assert first_report["fatal"] is False
def test_route_derived_boundary_covers_existing_validate_route(monkeypatch) -> None:
    """A failure inside ``/semantic-layer/validate`` yields a 500 and a report."""
    from fastapi.testclient import TestClient

    from ktx_daemon import app as app_module

    reports: list[dict[str, object]] = []

    def fake_report(exception: BaseException, **kwargs: object) -> None:
        reports.append({"exception": exception, **kwargs})

    # The empty generator's ``throw`` raises from inside a lambda expression.
    monkeypatch.setattr(
        app_module,
        "validate_semantic_layer",
        lambda _request: (_ for _ in ()).throw(RuntimeError("validate boom")),
    )
    monkeypatch.setattr(app_module, "report_exception", fake_report)

    client = TestClient(app_module.create_app(), raise_server_exceptions=False)
    response = client.post("/semantic-layer/validate", json={"sources": []})

    assert response.status_code == 500
    assert reports
    accepted_sources = {"app:/semantic-layer/validate", "app:semantic_validate"}
    assert reports[0]["source"] in accepted_sources
def test_daemon_stopped_clean_shutdown_emits_request_once(monkeypatch) -> None:
    """Two ``request`` stops in a row emit only the first event."""
    from ktx_daemon.telemetry.daemon_lifecycle import (
        emit_daemon_stopped_once,
        reset_daemon_lifecycle_for_tests,
    )

    events: list[tuple[str, dict[str, object]]] = []

    def record_event(name: str, fields: dict[str, object]) -> None:
        events.append((name, fields))

    monkeypatch.setattr(
        "ktx_daemon.telemetry.daemon_lifecycle.track_telemetry_event",
        record_event,
    )
    reset_daemon_lifecycle_for_tests()

    for uptime_ms in (1, 2):
        emit_daemon_stopped_once(reason="request", uptime_ms=uptime_ms)

    assert events == [("daemon_stopped", {"reason": "request", "uptimeMs": 1})]
def test_daemon_stopped_crash_wins_over_request(monkeypatch) -> None:
    """A ``crash`` stop emitted first suppresses the later ``request`` stop."""
    from ktx_daemon.telemetry.daemon_lifecycle import (
        emit_daemon_stopped_once,
        reset_daemon_lifecycle_for_tests,
    )

    events: list[tuple[str, dict[str, object]]] = []

    def record_event(name: str, fields: dict[str, object]) -> None:
        events.append((name, fields))

    monkeypatch.setattr(
        "ktx_daemon.telemetry.daemon_lifecycle.track_telemetry_event",
        record_event,
    )
    reset_daemon_lifecycle_for_tests()

    for reason, uptime_ms in (("crash", 3), ("request", 4)):
        emit_daemon_stopped_once(reason=reason, uptime_ms=uptime_ms)

    assert events == [("daemon_stopped", {"reason": "crash", "uptimeMs": 3})]
def test_report_exception_dedupes_same_exception_object(
    tmp_path: Path, monkeypatch
) -> None:
    """Reporting the same exception object from two sources sends it once."""
    from ktx_daemon.telemetry.exception import report_exception

    reset_identity_cache()
    write_identity(tmp_path)
    FakePosthog.captures.clear()
    monkeypatch.setattr("posthog.Posthog", FakePosthog)

    error = RuntimeError("same object")
    for source in ("semantic-query", "app:/semantic-layer/query"):
        report_exception(
            error,
            source=source,
            handled=True,
            fatal=False,
            home_dir=tmp_path,
            env={},
        )

    assert len(FakePosthog.captures) == 1
def test_report_exception_redacts_url_userinfo_and_authorization(
    tmp_path: Path, monkeypatch
) -> None:
    """URL passwords and Authorization values are redacted without a snapshot."""
    from ktx_daemon.telemetry.exception import report_exception

    reset_identity_cache()
    write_identity(tmp_path)
    FakePosthog.captures.clear()
    monkeypatch.setattr("posthog.Posthog", FakePosthog)

    # The secret values are assembled from parts at runtime.
    db_password = "-".join(["db", "url", "secret"])
    auth_token = "".join(["abc", "123"])
    message = (
        "connect postgres://svc:"
        + db_password
        + "@db.example.test/analytics Authorization: Basic "
        + auth_token
    )

    report_exception(
        RuntimeError(message),
        source="database-introspect",
        handled=True,
        fatal=False,
        home_dir=tmp_path,
        env={},
    )

    sent = str(FakePosthog.captures[0]["exception"])
    assert "postgres://svc:[redacted]@db.example.test/analytics" in sent
    assert "Authorization: [redacted]" in sent
    assert db_password not in sent
    assert auth_token not in sent
def test_report_exception_falls_back_when_exception_type_cannot_be_reconstructed(
    tmp_path: Path, monkeypatch
) -> None:
    """An exception type that rejects positional args is still reported redacted."""
    from ktx_daemon.telemetry.exception import report_exception

    class KeywordOnlyException(Exception):
        """Exception whose constructor only accepts a keyword argument."""

        def __init__(self, *, message: str) -> None:
            super().__init__(message)

    reset_identity_cache()
    write_identity(tmp_path)
    FakePosthog.captures.clear()
    monkeypatch.setattr("posthog.Posthog", FakePosthog)

    secret = "secret-value"
    report_exception(
        KeywordOnlyException(message="custom secret-value"),
        source="app:/custom",
        handled=True,
        fatal=False,
        home_dir=tmp_path,
        env={},
        redaction_secrets=[secret],
    )

    assert len(FakePosthog.captures) == 1
    sent_text = str(FakePosthog.captures[0]["exception"])
    assert "[redacted]" in sent_text
    assert secret not in sent_text
def test_report_exception_redacts_every_static_pattern_and_leaves_benign_text(
    tmp_path: Path, monkeypatch
) -> None:
    """Each static pattern redacts its secret; benign text passes unchanged."""
    from ktx_daemon.telemetry.exception import report_exception

    reset_identity_cache()
    write_identity(tmp_path)
    FakePosthog.captures.clear()
    monkeypatch.setattr("posthog.Posthog", FakePosthog)

    def report_and_get_sent_text(message: str) -> str:
        # Start from an empty capture list so a silently dropped report cannot
        # be mistaken for the previous case's capture.
        FakePosthog.captures.clear()
        report_exception(
            RuntimeError(message),
            source="database-introspect",
            handled=True,
            fatal=False,
            home_dir=tmp_path,
            env={},
        )
        assert len(FakePosthog.captures) == 1, message
        return str(FakePosthog.captures[0]["exception"])

    # (message, secret that must not leak, text expected after redaction)
    cases = [
        ("dsn password=hunter2", "hunter2", "password=[redacted]"),
        ("dsn pwd=swordfish", "swordfish", "pwd=[redacted]"),
        ("Authorization: Basic abc123", "abc123", "Authorization: [redacted]"),
        ("Authorization: Bearer token-123", "token-123", "Authorization: [redacted]"),
        ("Bearer standalone-token", "standalone-token", "Bearer [redacted]"),
        ("api_key=sk-live-secret", "sk-live-secret", "api_key=[redacted]"),
        ("api-key: sk-dash-secret", "sk-dash-secret", "api-key=[redacted]"),
        (
            "KTX_PROVIDER_TOKEN=ktx-secret",
            "ktx-secret",
            "KTX_PROVIDER_TOKEN=[redacted]",
        ),
        (
            "REFRESH_SECRET: refresh-secret",
            "refresh-secret",
            "REFRESH_SECRET=[redacted]",
        ),
        (
            "https://s3.example.test/file?X-Amz-Signature=aws-secret&ok=1",
            "aws-secret",
            "X-Amz-Signature=[redacted]",
        ),
        (
            "https://storage.example.test/file?X-Goog-Signature=goog-secret&ok=1",
            "goog-secret",
            "X-Goog-Signature=[redacted]",
        ),
        (
            "https://cdn.example.test/file?sig=signed-secret&ok=1",
            "signed-secret",
            "sig=[redacted]",
        ),
        (
            "postgres://svc:url-password@db.example.test/analytics",  # pragma: allowlist secret
            "url-password",
            "postgres://svc:[redacted]@db.example.test/analytics",
        ),
    ]

    for message, leaked, expected in cases:
        sent = report_and_get_sent_text(message)
        assert expected in sent
        assert leaked not in sent

    benign = "token bucket metrics and passwordless auth are benign"
    assert report_and_get_sent_text(benign) == benign
def test_route_derived_boundary_covers_existing_health_route(monkeypatch) -> None:
    """A failure inside ``/health`` yields a 500 and an ``app:/health`` report."""
    from fastapi.testclient import TestClient

    from ktx_daemon import app as app_module

    reports: list[dict[str, object]] = []

    def fake_report(exception: BaseException, **kwargs: object) -> None:
        reports.append({"exception": exception, **kwargs})

    class BrokenEnviron(dict[str, str]):
        """Environment stand-in that raises when the daemon version is read."""

        def get(self, key: str, default: str | None = None) -> str | None:
            if key != "KTX_DAEMON_VERSION":
                return default
            raise RuntimeError("health boom")

    monkeypatch.setattr(app_module.os, "environ", BrokenEnviron())
    monkeypatch.setattr(app_module, "report_exception", fake_report)

    client = TestClient(app_module.create_app(), raise_server_exceptions=False)
    response = client.get("/health")

    assert response.status_code == 500
    assert reports
    first_report = reports[0]
    assert first_report["source"] == "app:/health"
    assert first_report["handled"] is True
    assert first_report["fatal"] is False
def test_route_boundary_passes_request_scoped_database_secrets(monkeypatch) -> None:
    """Credentials in the request body are handed to the reporter for redaction."""
    from fastapi.testclient import TestClient

    from ktx_daemon import app as app_module

    reports: list[dict[str, object]] = []

    def fake_report(exception: BaseException, **kwargs: object) -> None:
        reports.append({"exception": exception, **kwargs})

    # The empty generator's ``throw`` raises from inside a lambda expression.
    monkeypatch.setattr(
        app_module,
        "introspect_database_response",
        lambda _request: (_ for _ in ()).throw(RuntimeError("db-url-secret")),
    )
    monkeypatch.setattr(app_module, "report_exception", fake_report)

    request_body = {
        "connection_id": "warehouse",
        "url": "postgres://svc:db-url-secret@db.example.test/analytics",  # pragma: allowlist secret
        "password": "db-password-secret",  # pragma: allowlist secret
    }
    client = TestClient(app_module.create_app(), raise_server_exceptions=False)
    response = client.post("/database/introspect", json=request_body)

    assert response.status_code == 500
    assert reports
    expected_secrets = [
        "postgres://svc:db-url-secret@db.example.test/analytics",  # pragma: allowlist secret
        "db-password-secret",  # pragma: allowlist secret
    ]
    assert reports[0]["redaction_secrets"] == expected_secrets
def test_serve_http_run_crash_reports_exception_and_crash_stop(monkeypatch) -> None:
    """A crash inside ``uvicorn.run`` is reported, recorded as a crash stop, and re-raised.

    ``uvicorn`` is swapped in ``sys.modules`` for a stand-in whose ``run``
    raises; the telemetry reporter and the daemon-stopped emitter are replaced
    with recorders.
    """
    import sys

    from ktx_daemon import __main__ as main_module

    captured: list[dict[str, object]] = []
    stop_events: list[dict[str, object]] = []

    def record_report(exception: BaseException, **kwargs: object) -> None:
        entry: dict[str, object] = {"exception": exception}
        entry.update(kwargs)
        captured.append(entry)

    def record_stop(*, reason: str, uptime_ms: float) -> bool:
        stop_events.append({"reason": reason, "uptimeMs": uptime_ms})
        return True

    class FakeUvicorn:
        @staticmethod
        def run(*_args: object, **_kwargs: object) -> None:
            raise RuntimeError("uvicorn crash")

    monkeypatch.setitem(sys.modules, "uvicorn", FakeUvicorn)
    monkeypatch.setattr("ktx_daemon.telemetry.report_exception", record_report)
    monkeypatch.setattr(
        "ktx_daemon.telemetry.daemon_lifecycle.emit_daemon_stopped_once",
        record_stop,
    )

    # Capture the propagated error first, then check it outside the handler.
    crash: RuntimeError | None = None
    try:
        main_module.run_http_server(
            host="127.0.0.1",
            port=9999,
            log_level="info",
            enable_code_execution=False,
        )
    except RuntimeError as error:
        crash = error
    if crash is None:
        raise AssertionError("run_http_server did not re-raise the crash")
    assert str(crash) == "uvicorn crash"

    assert captured
    first = captured[0]
    assert first["source"] == "serve-http"
    assert first["handled"] is False
    assert first["fatal"] is True
    assert stop_events
    assert stop_events[0]["reason"] == "crash"
def test_one_shot_command_reports_without_excepthook_or_daemon_stopped(
    monkeypatch,
) -> None:
    """A failing ``database-introspect`` run reports once and leaves daemon lifecycle alone.

    The command must exit with status 1, leave ``sys.excepthook`` untouched,
    emit no daemon-stopped event, and report the failure as handled and
    non-fatal.
    """
    import sys

    from ktx_daemon import __main__ as daemon_main

    hook_before = sys.excepthook
    captured: list[dict[str, object]] = []
    stop_events: list[dict[str, object]] = []

    def record_report(exception: BaseException, **kwargs: object) -> None:
        entry: dict[str, object] = {"exception": exception}
        entry.update(kwargs)
        captured.append(entry)

    def record_stop(*, reason: str, uptime_ms: float) -> bool:
        stop_events.append({"reason": reason, "uptimeMs": uptime_ms})
        return True

    def stdin_payload() -> dict[str, object]:
        # Stands in for the JSON request normally read from stdin.
        return {
            "connection_id": "warehouse",
            "driver": "postgres",
            "url": "postgresql://readonly@example.test/warehouse",
            "schemas": ["public"],
        }

    def failing_introspect(_request: object) -> None:
        raise RuntimeError("one-shot boom")

    monkeypatch.setattr(daemon_main, "_read_stdin_json", stdin_payload)
    monkeypatch.setattr(
        daemon_main,
        "introspect_database_response",
        failing_introspect,
    )
    monkeypatch.setattr("ktx_daemon.telemetry.report_exception", record_report)
    monkeypatch.setattr(
        "ktx_daemon.telemetry.daemon_lifecycle.emit_daemon_stopped_once",
        record_stop,
    )

    exit_code = daemon_main.main(["database-introspect"])

    assert exit_code == 1
    assert sys.excepthook is hook_before
    assert stop_events == []
    assert captured
    first = captured[0]
    assert first["source"] == "database-introspect"
    assert first["handled"] is True
    assert first["fatal"] is False

View file

@ -97,6 +97,33 @@ def test_query_semantic_layer_emits_plan_and_sql_debug_events(
assert "public.orders" not in captured.err
def test_query_semantic_layer_reports_exception(monkeypatch) -> None:
    """A ``ValueError`` from ``query_semantic_layer`` is reported with the project id.

    The request lists ``ORDERS_SOURCE`` twice; the call is expected to raise
    ``ValueError`` and to produce a handled, non-fatal ``semantic-query``
    report carrying the request's project id.
    """
    from ktx_daemon import semantic_layer as semantic_layer_module

    captured: list[dict[str, object]] = []
    project_id = "a" * 64

    def record_report(exception: BaseException, **kwargs: object) -> None:
        entry: dict[str, object] = {"exception": exception}
        entry.update(kwargs)
        captured.append(entry)

    monkeypatch.setattr(semantic_layer_module, "report_exception", record_report)

    with pytest.raises(ValueError):
        # The request is built inside the context so a ValueError from
        # construction is handled the same way as one from the query.
        request = SemanticLayerQueryRequest(
            sources=[ORDERS_SOURCE, ORDERS_SOURCE],
            dialect="postgres",
            projectId=project_id,
            query={"measures": ["orders.order_count"]},
        )
        query_semantic_layer(request)

    assert captured
    first = captured[0]
    assert first["source"] == "semantic-query"
    assert first["handled"] is True
    assert first["fatal"] is False
    assert first["project_id"] == project_id
def test_semantic_layer_request_rejects_project_id_field_name() -> None:
with pytest.raises(ValueError):
SemanticLayerQueryRequest(