mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-25 08:48:08 +02:00
Merge remote-tracking branch 'origin/main' into luca-martial/schema-select-ux-text
This commit is contained in:
commit
523d6ab68a
130 changed files with 17386 additions and 5942 deletions
|
|
@ -1,146 +0,0 @@
|
|||
{
|
||||
"name": "eviction-churn",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": [
|
||||
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn"
|
||||
]
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"deallocCount": 3,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "501",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 20,
|
||||
"totalExecTime": 500,
|
||||
"meanExecTime": 25,
|
||||
"totalRows": 40
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": null,
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q501": {
|
||||
"firstObservedAt": "2026-05-08T12:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 20,
|
||||
"totalExecTime": 500,
|
||||
"totalRows": 40
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T08:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [
|
||||
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn",
|
||||
"pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn",
|
||||
"baseline_first_run:no_previous_pgss_baseline"
|
||||
],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"baselineFirstRun": true,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 3,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q501",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q501/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q501/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q501",
|
||||
"title": "postgres · analytics.orders [db5_q501]",
|
||||
"path": "templates/db5_q501/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "mid",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q501/page.md": {
|
||||
"text": "# db5_q501\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q501/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 20,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T12:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 25,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 40
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,144 +0,0 @@
|
|||
{
|
||||
"name": "first-run",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": []
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"deallocCount": 0,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "101",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 10,
|
||||
"totalExecTime": 250,
|
||||
"meanExecTime": 25,
|
||||
"totalRows": 20
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [
|
||||
"^svc_"
|
||||
],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": null,
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q101": {
|
||||
"firstObservedAt": "2026-05-08T12:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 10,
|
||||
"totalExecTime": 250,
|
||||
"totalRows": 20
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T08:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [
|
||||
"baseline_first_run:no_previous_pgss_baseline"
|
||||
],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"baselineFirstRun": true,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 0,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q101",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q101/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q101/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q101",
|
||||
"title": "postgres · analytics.orders [db5_q101]",
|
||||
"path": "templates/db5_q101/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "mid",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q101/page.md": {
|
||||
"text": "# db5_q101\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q101/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 10,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T12:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 25,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 20
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,181 +0,0 @@
|
|||
{
|
||||
"name": "normal-delta",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": []
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"deallocCount": 0,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "201",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 12,
|
||||
"totalExecTime": 160,
|
||||
"meanExecTime": 13.333333333333334,
|
||||
"totalRows": 58
|
||||
},
|
||||
{
|
||||
"queryid": "201",
|
||||
"userid": "12",
|
||||
"username": "svc_loader",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 5,
|
||||
"totalExecTime": 50,
|
||||
"meanExecTime": 10,
|
||||
"totalRows": 25
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [
|
||||
"^svc_"
|
||||
],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T10:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q201": {
|
||||
"firstObservedAt": "2026-05-08T09:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 10,
|
||||
"totalExecTime": 100,
|
||||
"totalRows": 50
|
||||
},
|
||||
"12": {
|
||||
"calls": 5,
|
||||
"totalExecTime": 50,
|
||||
"totalRows": 25
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q201": {
|
||||
"firstObservedAt": "2026-05-08T09:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 12,
|
||||
"totalExecTime": 160,
|
||||
"totalRows": 58
|
||||
},
|
||||
"12": {
|
||||
"calls": 5,
|
||||
"totalExecTime": 50,
|
||||
"totalRows": 25
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T10:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"baselineFirstRun": false,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 0,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q201",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q201/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q201/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q201",
|
||||
"title": "postgres · analytics.orders [db5_q201]",
|
||||
"path": "templates/db5_q201/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "low",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q201/page.md": {
|
||||
"text": "# db5_q201\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q201/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 2,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T09:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 30,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 8
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,159 +0,0 @@
|
|||
{
|
||||
"name": "reset-detected",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": []
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T11:00:00.000Z",
|
||||
"deallocCount": 0,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "301",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 3,
|
||||
"totalExecTime": 90,
|
||||
"meanExecTime": 30,
|
||||
"totalRows": 9
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T10:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q301": {
|
||||
"firstObservedAt": "2026-05-08T09:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 100,
|
||||
"totalExecTime": 1000,
|
||||
"totalRows": 500
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T11:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q301": {
|
||||
"firstObservedAt": "2026-05-08T12:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 3,
|
||||
"totalExecTime": 90,
|
||||
"totalRows": 9
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T10:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [
|
||||
"baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z"
|
||||
],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T11:00:00.000Z",
|
||||
"baselineFirstRun": true,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 0,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q301",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q301/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q301/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q301",
|
||||
"title": "postgres · analytics.orders [db5_q301]",
|
||||
"path": "templates/db5_q301/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "mid",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q301/page.md": {
|
||||
"text": "# db5_q301\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q301/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 3,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T12:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 30,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 9
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,159 +0,0 @@
|
|||
{
|
||||
"name": "version-change",
|
||||
"now": "2026-05-08T12:00:00.000Z",
|
||||
"connectionId": "warehouse",
|
||||
"probe": {
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"warnings": []
|
||||
},
|
||||
"snapshot": {
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"deallocCount": 0,
|
||||
"rows": [
|
||||
{
|
||||
"queryid": "401",
|
||||
"userid": "11",
|
||||
"username": "analyst",
|
||||
"dbid": "5",
|
||||
"database": "analytics",
|
||||
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"calls": 4,
|
||||
"totalExecTime": 80,
|
||||
"meanExecTime": 20,
|
||||
"totalRows": 8
|
||||
}
|
||||
]
|
||||
},
|
||||
"pullConfig": {
|
||||
"dialect": "postgres",
|
||||
"windowDays": 90,
|
||||
"lastSuccessfulCursor": null,
|
||||
"serviceAccountUserPatterns": [],
|
||||
"redactionPatterns": [],
|
||||
"maxTemplatesPerRun": 5000,
|
||||
"minCalls": 5
|
||||
},
|
||||
"analysisBySql": {
|
||||
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
|
||||
"tablesTouched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literalSlots": []
|
||||
}
|
||||
},
|
||||
"baseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T10:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 15.7",
|
||||
"templates": {
|
||||
"db5_q401": {
|
||||
"firstObservedAt": "2026-05-08T09:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 100,
|
||||
"totalExecTime": 1000,
|
||||
"totalRows": 500
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedBaseline": {
|
||||
"version": 1,
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"templates": {
|
||||
"db5_q401": {
|
||||
"firstObservedAt": "2026-05-08T12:00:00.000Z",
|
||||
"perUser": {
|
||||
"11": {
|
||||
"calls": 4,
|
||||
"totalExecTime": 80,
|
||||
"totalRows": 8
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"expectedFiles": {
|
||||
"manifest.json": {
|
||||
"json": {
|
||||
"source": "historic-sql",
|
||||
"connectionId": "warehouse",
|
||||
"dialect": "postgres",
|
||||
"fetchedAt": "2026-05-08T12:00:00.000Z",
|
||||
"windowStart": "2026-05-08T10:00:00.000Z",
|
||||
"windowEnd": "2026-05-08T12:00:00.000Z",
|
||||
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
|
||||
"templateCount": 1,
|
||||
"capped": false,
|
||||
"warnings": [
|
||||
"baseline_reset:pg_server_major changed from 15 to 16"
|
||||
],
|
||||
"degraded": true,
|
||||
"statsResetAt": "2026-05-08T08:00:00.000Z",
|
||||
"baselineFirstRun": true,
|
||||
"pgServerVersion": "PostgreSQL 16.4",
|
||||
"deallocCount": 0,
|
||||
"templates": [
|
||||
{
|
||||
"id": "db5_q401",
|
||||
"fingerprint": "fp_orders_status",
|
||||
"subClusterId": null,
|
||||
"path": "templates/db5_q401/page.md"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"templates/db5_q401/metadata.json": {
|
||||
"json": {
|
||||
"id": "db5_q401",
|
||||
"title": "postgres · analytics.orders [db5_q401]",
|
||||
"path": "templates/db5_q401/page.md",
|
||||
"objectType": "historic_sql_template",
|
||||
"lastEditedAt": null,
|
||||
"properties": {
|
||||
"fingerprint": "fp_orders_status",
|
||||
"sub_cluster_id": null,
|
||||
"dialect": "postgres",
|
||||
"tables_touched": [
|
||||
"analytics.orders"
|
||||
],
|
||||
"literal_slots": [],
|
||||
"triage_signals": {
|
||||
"executions_bucket": "mid",
|
||||
"distinct_users_bucket": "solo",
|
||||
"error_rate_bucket": "ok",
|
||||
"recency_bucket": "active",
|
||||
"service_account_only": "false",
|
||||
"runtime_bucket": "fast"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"templates/db5_q401/page.md": {
|
||||
"text": "# db5_q401\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
|
||||
},
|
||||
"templates/db5_q401/usage.json": {
|
||||
"json": {
|
||||
"stats": {
|
||||
"executions": 4,
|
||||
"distinct_users": 1,
|
||||
"first_seen": "2026-05-08T12:00:00.000Z",
|
||||
"last_seen": "2026-05-08T12:00:00.000Z",
|
||||
"p50_runtime_ms": null,
|
||||
"p95_runtime_ms": null,
|
||||
"mean_runtime_ms": 20,
|
||||
"error_rate": 0,
|
||||
"rows_produced": 8
|
||||
},
|
||||
"literal_slots": [],
|
||||
"samples": []
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -33,7 +33,7 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => {
|
|||
const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);
|
||||
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
|
||||
|
||||
await expect(reader.probe(client)).resolves.toBeUndefined();
|
||||
await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] });
|
||||
|
||||
expect(client.executeQuery).toHaveBeenCalledWith(
|
||||
'SELECT 1 FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` LIMIT 1',
|
||||
|
|
@ -63,127 +63,85 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => {
|
|||
await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
|
||||
});
|
||||
|
||||
it('fetches BigQuery jobs with cursor and maps them into RawQueryRow shape without rowsProduced', async () => {
|
||||
it('fetches aggregated BigQuery query templates', async () => {
|
||||
const client = queryClient([
|
||||
{
|
||||
headers: [
|
||||
'job_id',
|
||||
'query',
|
||||
'user_email',
|
||||
'creation_time',
|
||||
'end_time',
|
||||
'runtime_ms',
|
||||
'total_slot_ms',
|
||||
'total_bytes_processed',
|
||||
'state',
|
||||
'error_reason',
|
||||
'error_message',
|
||||
'statement_type',
|
||||
'template_id',
|
||||
'canonical_sql',
|
||||
'executions',
|
||||
'distinct_users',
|
||||
'first_seen',
|
||||
'last_seen',
|
||||
'p50_ms',
|
||||
'p95_ms',
|
||||
'error_rate',
|
||||
'rows_produced',
|
||||
'top_users',
|
||||
],
|
||||
rows: [
|
||||
[
|
||||
'bquxjob_1',
|
||||
"SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
|
||||
'analyst-a@example.test',
|
||||
'2026-05-04T10:00:00.000Z',
|
||||
'2026-05-04T10:00:01.250Z',
|
||||
1250,
|
||||
3106,
|
||||
161164718,
|
||||
'DONE',
|
||||
'hash-1',
|
||||
'select status from orders',
|
||||
42,
|
||||
3,
|
||||
'2026-05-01T00:00:00.000Z',
|
||||
'2026-05-11T00:00:00.000Z',
|
||||
12,
|
||||
40,
|
||||
0.05,
|
||||
null,
|
||||
null,
|
||||
'SELECT',
|
||||
],
|
||||
[
|
||||
'bquxjob_2',
|
||||
'SELECT * FROM `project-1.analytics.missing_table`',
|
||||
'analyst-b@example.test',
|
||||
new Date('2026-05-04T10:05:00.000Z'),
|
||||
null,
|
||||
null,
|
||||
0,
|
||||
0,
|
||||
'DONE',
|
||||
'notFound',
|
||||
'Not found: Table project-1.analytics.missing_table',
|
||||
'SELECT',
|
||||
JSON.stringify([{ user: 'analyst@example.test', executions: 1 }]),
|
||||
],
|
||||
],
|
||||
totalRows: 2,
|
||||
totalRows: 1,
|
||||
},
|
||||
]);
|
||||
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
|
||||
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'demo', region: 'us' });
|
||||
|
||||
const rows = [];
|
||||
for await (const row of reader.fetch(
|
||||
for await (const row of reader.fetchAggregated(
|
||||
client,
|
||||
{
|
||||
start: new Date('2026-05-01T00:00:00.000Z'),
|
||||
end: new Date('2026-05-04T12:00:00.000Z'),
|
||||
},
|
||||
'2026-05-03T00:00:00.000Z',
|
||||
{ start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') },
|
||||
{ dialect: 'bigquery', minExecutions: 5, windowDays: 90, concurrency: 12, filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 },
|
||||
)) {
|
||||
rows.push(row);
|
||||
}
|
||||
|
||||
expect(client.executeQuery).toHaveBeenCalledTimes(1);
|
||||
const sql = firstQuery(client);
|
||||
expect(sql).toContain('FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`');
|
||||
expect(sql).toContain("creation_time >= TIMESTAMP('2026-05-03T00:00:00.000Z')");
|
||||
expect(sql).toContain("creation_time < TIMESTAMP('2026-05-04T12:00:00.000Z')");
|
||||
expect(sql).toContain("job_type = 'QUERY'");
|
||||
expect(sql).toContain("(statement_type IS NULL OR statement_type != 'SCRIPT')");
|
||||
expect(sql).toContain('ORDER BY creation_time ASC, job_id ASC');
|
||||
expect(sql).toContain('total_slot_ms');
|
||||
expect(sql).toContain('total_bytes_processed');
|
||||
expect(sql).not.toMatch(/total_rows/i);
|
||||
|
||||
expect(rows).toEqual([
|
||||
expect(sql).toContain('COUNT(*) AS executions');
|
||||
expect(sql).toContain('COUNT(DISTINCT user_email) AS distinct_users');
|
||||
expect(sql).toContain('GROUP BY query_hash');
|
||||
expect(sql).toContain('HAVING COUNT(*) >= 5');
|
||||
expect(rows).toMatchObject([
|
||||
{
|
||||
id: 'bquxjob_1',
|
||||
sql: "SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
|
||||
user: 'analyst-a@example.test',
|
||||
startedAt: '2026-05-04T10:00:00.000Z',
|
||||
endedAt: '2026-05-04T10:00:01.250Z',
|
||||
runtimeMs: 1250,
|
||||
success: true,
|
||||
errorMessage: null,
|
||||
},
|
||||
{
|
||||
id: 'bquxjob_2',
|
||||
sql: 'SELECT * FROM `project-1.analytics.missing_table`',
|
||||
user: 'analyst-b@example.test',
|
||||
startedAt: '2026-05-04T10:05:00.000Z',
|
||||
endedAt: null,
|
||||
runtimeMs: null,
|
||||
success: false,
|
||||
errorMessage: 'notFound: Not found: Table project-1.analytics.missing_table',
|
||||
templateId: 'hash-1',
|
||||
stats: {
|
||||
executions: 42,
|
||||
errorRate: 0.05,
|
||||
},
|
||||
topUsers: [{ user: 'analyst@example.test', executions: 1 }],
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('uses the window start when no cursor is available', async () => {
|
||||
const client = queryClient([{ headers: ['job_id'], rows: [], totalRows: 0 }]);
|
||||
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'EU' });
|
||||
|
||||
for await (const _row of reader.fetch(client, {
|
||||
start: new Date('2026-02-03T12:00:00.000Z'),
|
||||
end: new Date('2026-05-04T12:00:00.000Z'),
|
||||
})) {
|
||||
throw new Error('empty result should not yield rows');
|
||||
}
|
||||
|
||||
const sql = firstQuery(client);
|
||||
expect(sql).toContain('FROM `project-1.region-eu.INFORMATION_SCHEMA.JOBS_BY_PROJECT`');
|
||||
expect(sql).toContain("creation_time >= TIMESTAMP('2026-02-03T12:00:00.000Z')");
|
||||
});
|
||||
|
||||
it('throws a clear error when the query client cannot execute SQL', async () => {
|
||||
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
|
||||
|
||||
await expect(async () => {
|
||||
for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) {
|
||||
for await (const _row of reader.fetchAggregated(
|
||||
{},
|
||||
{ start: new Date(), end: new Date() },
|
||||
{
|
||||
dialect: 'bigquery',
|
||||
minExecutions: 5,
|
||||
windowDays: 90,
|
||||
concurrency: 12,
|
||||
filters: { dropTrivialProbes: true },
|
||||
redactionPatterns: [],
|
||||
staleArchiveAfterDays: 90,
|
||||
},
|
||||
)) {
|
||||
throw new Error('unreachable');
|
||||
}
|
||||
}).rejects.toThrow('Historic SQL BigQuery reader requires a query client with executeQuery(query)');
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
import { HistoricSqlGrantsMissingError } from './errors.js';
|
||||
import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js';
|
||||
import {
|
||||
aggregatedTemplateSchema,
|
||||
type AggregatedTemplate,
|
||||
type HistoricSqlTimeWindow,
|
||||
type HistoricSqlUnifiedPullConfig,
|
||||
} from './types.js';
|
||||
|
||||
interface QueryResultLike {
|
||||
headers: string[];
|
||||
|
|
@ -110,6 +115,23 @@ function nullableNumber(raw: unknown): number | null {
|
|||
return Math.max(0, number);
|
||||
}
|
||||
|
||||
/**
 * Reads a numeric cell that must be present.
 *
 * Parsing is delegated to `nullableNumber`; a `null` result from that helper is
 * treated as a hard failure instead of being passed on to the caller.
 *
 * @param raw - Untyped cell value from a query result row.
 * @param field - Column name; used only to build the error message.
 * @returns The number produced by `nullableNumber(raw)`.
 * @throws Error when `nullableNumber(raw)` returns `null`.
 */
function requiredNumber(raw: unknown, field: string): number {
|
||||
const number = nullableNumber(raw);
|
||||
if (number === null) {
|
||||
// The raw value is stringified into the message so the offending cell is visible to the reader.
throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${String(raw)}`);
|
||||
}
|
||||
return number;
|
||||
}
|
||||
|
||||
/**
 * Reads a required numeric cell and drops its fractional part.
 *
 * @param raw - Untyped cell value from a query result row.
 * @param field - Column name; forwarded to `requiredNumber` for its error message.
 * @returns `Math.trunc` of the value returned by `requiredNumber` (truncation, not rounding).
 * @throws Error propagated from `requiredNumber` when the cell is not a usable number.
 */
function requiredInteger(raw: unknown, field: string): number {
|
||||
return Math.trunc(requiredNumber(raw, field));
|
||||
}
|
||||
|
||||
/**
 * Reads an optional numeric cell and drops its fractional part.
 *
 * @param raw - Untyped cell value from a query result row.
 * @returns `null` when `nullableNumber(raw)` returns `null`; otherwise that number
 *   passed through `Math.trunc`.
 */
function nullableInteger(raw: unknown): number | null {
|
||||
const number = nullableNumber(raw);
|
||||
// `null` is passed through unchanged; only real numbers are truncated.
return number === null ? null : Math.trunc(number);
|
||||
}
|
||||
|
||||
function isoTimestamp(raw: unknown, field: string): string {
|
||||
if (raw instanceof Date) {
|
||||
return raw.toISOString();
|
||||
|
|
@ -122,43 +144,49 @@ function isoTimestamp(raw: unknown, field: string): string {
|
|||
return date.toISOString();
|
||||
}
|
||||
|
||||
function nullableIsoTimestamp(raw: unknown): string | null {
|
||||
if (raw === null || raw === undefined || raw === '') {
|
||||
return null;
|
||||
/**
 * Best-effort decoder for the `top_users` cell: a JSON string expected to hold an
 * array of `{ user, executions }` objects.
 *
 * Every failure mode yields an empty list rather than an error: a missing/empty
 * cell, text that is not valid JSON, and JSON that is not an array. Inside the
 * array, entries that are not objects are skipped, and entries whose `executions`
 * cannot be read by `nullableInteger` are dropped. `user` is read with
 * `nullableString` and may be `null`.
 *
 * NOTE(review): this span is diff-viewer output. The `return isoTimestamp(...)`
 * line, the `executionSucceeded` header with its first two body lines, and the
 * `return state === null ...` line appear to be leftovers of removed helpers
 * interleaved with this function — confirm against the real file before relying
 * on the exact text below.
 *
 * @param raw - Untyped cell value from a query result row.
 * @returns The decoded entries, or `[]` when nothing usable could be read.
 */
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
|
||||
const text = nullableString(raw);
|
||||
if (!text) {
|
||||
return [];
|
||||
}
|
||||
return isoTimestamp(raw, 'end_time');
|
||||
}
|
||||
|
||||
function executionSucceeded(state: string | null, errorReason: string | null, errorMessage: string | null): boolean {
|
||||
if (errorReason || errorMessage) {
|
||||
return false;
|
||||
try {
|
||||
const parsed = JSON.parse(text) as unknown;
|
||||
if (!Array.isArray(parsed)) {
|
||||
return [];
|
||||
}
|
||||
return parsed.flatMap((entry) => {
|
||||
if (!entry || typeof entry !== 'object') {
|
||||
return [];
|
||||
}
|
||||
const user = nullableString((entry as { user?: unknown }).user);
|
||||
const executions = nullableInteger((entry as { executions?: unknown }).executions);
|
||||
return executions === null ? [] : [{ user, executions }];
|
||||
});
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
return state === null || state.toUpperCase() === 'DONE';
|
||||
}
|
||||
|
||||
function combinedErrorMessage(errorReason: string | null, errorMessage: string | null): string | null {
|
||||
if (errorReason && errorMessage) {
|
||||
return `${errorReason}: ${errorMessage}`;
|
||||
}
|
||||
return errorMessage ?? errorReason;
|
||||
/**
 * Builds an `AggregatedTemplate` from one aggregated result row.
 *
 * Each cell is looked up by header name through `indexes`, coerced with the
 * required/nullable helpers of this file, and the assembled object is handed
 * to `aggregatedTemplateSchema.parse`, whose result is returned.
 *
 * @param row - Positional cell values of a single result row.
 * @param indexes - Lookup from header name to column position.
 * @returns The parsed template; `dialect` is always set to `'bigquery'`.
 */
function mapAggregatedRow(row: unknown[], indexes: Map<string, number>): AggregatedTemplate {
  // Local shorthand so every field reads as "cell by header name".
  const cell = (header: string): unknown => value(row, indexes, header);

  return aggregatedTemplateSchema.parse({
    templateId: requiredString(cell('template_id'), 'template_id'),
    canonicalSql: requiredString(cell('canonical_sql'), 'canonical_sql'),
    dialect: 'bigquery',
    stats: {
      executions: requiredInteger(cell('executions'), 'executions'),
      distinctUsers: requiredInteger(cell('distinct_users'), 'distinct_users'),
      firstSeen: isoTimestamp(cell('first_seen'), 'first_seen'),
      lastSeen: isoTimestamp(cell('last_seen'), 'last_seen'),
      p50RuntimeMs: nullableNumber(cell('p50_ms')),
      p95RuntimeMs: nullableNumber(cell('p95_ms')),
      errorRate: requiredNumber(cell('error_rate'), 'error_rate'),
      rowsProduced: nullableInteger(cell('rows_produced')),
    },
    topUsers: parseTopUsers(cell('top_users')),
  });
}
|
||||
|
||||
/**
 * Builds a `HistoricSqlRawQueryRow` from one per-job result row.
 *
 * The error reason and message cells are read once up front because both the
 * `success` flag and the combined `errorMessage` are derived from them.
 *
 * @param row - Positional cell values of a single result row.
 * @param indexes - Lookup from header name to column position.
 * @returns The mapped row with ISO timestamps and a combined error message.
 */
function mapRow(row: unknown[], indexes: Map<string, number>): HistoricSqlRawQueryRow {
  // Local shorthand so every field reads as "cell by header name".
  const cell = (header: string): unknown => value(row, indexes, header);

  const errorReason = nullableString(cell('error_reason'));
  const errorMessage = nullableString(cell('error_message'));

  return {
    id: requiredString(cell('job_id'), 'job_id'),
    sql: requiredString(cell('query'), 'query'),
    user: nullableString(cell('user_email')),
    startedAt: isoTimestamp(cell('creation_time'), 'creation_time'),
    endedAt: nullableIsoTimestamp(cell('end_time')),
    runtimeMs: nullableNumber(cell('runtime_ms')),
    success: executionSucceeded(nullableString(cell('state')), errorReason, errorMessage),
    errorMessage: combinedErrorMessage(errorReason, errorMessage),
  };
}
|
||||
|
||||
export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader {
|
||||
export class BigQueryHistoricSqlQueryHistoryReader {
|
||||
private readonly viewPath: string;
|
||||
|
||||
constructor(options: BigQueryHistoricSqlQueryHistoryReaderOptions) {
|
||||
|
|
@ -167,7 +195,7 @@ export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHi
|
|||
this.viewPath = `\`${projectId}.region-${region}.INFORMATION_SCHEMA.JOBS_BY_PROJECT\``;
|
||||
}
|
||||
|
||||
async probe(client: unknown): Promise<void> {
|
||||
async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> {
|
||||
let result: QueryResultLike;
|
||||
try {
|
||||
result = await queryClient(client).executeQuery(`SELECT 1 FROM ${this.viewPath} LIMIT 1`);
|
||||
|
|
@ -177,43 +205,43 @@ export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHi
|
|||
if (result.error) {
|
||||
throw grantsError(result.error);
|
||||
}
|
||||
return { warnings: [], info: [] };
|
||||
}
|
||||
|
||||
async *fetch(
|
||||
async *fetchAggregated(
|
||||
client: unknown,
|
||||
window: HistoricSqlTimeWindow,
|
||||
cursor?: string | null,
|
||||
): AsyncIterable<HistoricSqlRawQueryRow> {
|
||||
const start = timestampExpression(cursor ?? window.start);
|
||||
const end = timestampExpression(window.end);
|
||||
config: HistoricSqlUnifiedPullConfig,
|
||||
): AsyncIterable<AggregatedTemplate> {
|
||||
const sql = `
|
||||
SELECT
|
||||
job_id,
|
||||
query,
|
||||
user_email,
|
||||
creation_time,
|
||||
end_time,
|
||||
TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND) AS runtime_ms,
|
||||
total_slot_ms,
|
||||
total_bytes_processed,
|
||||
state,
|
||||
error_result.reason AS error_reason,
|
||||
error_result.message AS error_message,
|
||||
statement_type
|
||||
query_hash AS template_id,
|
||||
MIN(query) AS canonical_sql,
|
||||
COUNT(*) AS executions,
|
||||
COUNT(DISTINCT user_email) AS distinct_users,
|
||||
MIN(creation_time) AS first_seen,
|
||||
MAX(creation_time) AS last_seen,
|
||||
APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(50)] AS p50_ms,
|
||||
APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(95)] AS p95_ms,
|
||||
SAFE_DIVIDE(COUNTIF(error_result IS NOT NULL), COUNT(*)) AS error_rate,
|
||||
CAST(NULL AS INT64) AS rows_produced,
|
||||
TO_JSON_STRING(ARRAY_AGG(STRUCT(user_email AS user, 1 AS executions) ORDER BY creation_time DESC LIMIT 5)) AS top_users
|
||||
FROM ${this.viewPath}
|
||||
WHERE creation_time >= ${start}
|
||||
AND creation_time < ${end}
|
||||
AND job_type = 'QUERY'
|
||||
WHERE job_type = 'QUERY'
|
||||
AND statement_type IN ('SELECT', 'MERGE')
|
||||
AND creation_time >= ${timestampExpression(window.start)}
|
||||
AND creation_time < ${timestampExpression(window.end)}
|
||||
AND query IS NOT NULL
|
||||
AND (statement_type IS NULL OR statement_type != 'SCRIPT')
|
||||
ORDER BY creation_time ASC, job_id ASC`.trim();
|
||||
GROUP BY query_hash
|
||||
HAVING COUNT(*) >= ${config.minExecutions}
|
||||
ORDER BY executions DESC`.trim();
|
||||
const result = await queryClient(client).executeQuery(sql);
|
||||
if (result.error) {
|
||||
throw grantsError(result.error);
|
||||
}
|
||||
const indexes = indexByHeader(result.headers);
|
||||
for (const row of result.rows) {
|
||||
yield mapRow(row, indexes);
|
||||
yield mapAggregatedRow(row, indexes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,59 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
bucketDistinctUsers,
|
||||
bucketErrorRate,
|
||||
bucketExecutions,
|
||||
bucketFrequency,
|
||||
bucketP95Runtime,
|
||||
bucketRecency,
|
||||
} from './buckets.js';
|
||||
|
||||
// Pins the label each bucket helper returns on both sides of every boundary.
describe('historic-sql bucket helpers', () => {
  it('uses stable execution buckets', () => {
    expect([0, 9, 10, 99, 100, 999, 1000, 4999, 5000, 49999, 50000].map(bucketExecutions)).toEqual([
      '<10',
      '<10',
      '10-100',
      '10-100',
      '100-1k',
      '100-1k',
      '1k-5k',
      '1k-5k',
      '5k-50k',
      '5k-50k',
      '>50k',
    ]);
  });

  it('uses stable distinct-user, error-rate, runtime, and recency buckets', () => {
    expect([0, 1, 2, 5, 6, 10, 11].map(bucketDistinctUsers)).toEqual([
      '0',
      '1',
      '2-5',
      '2-5',
      '5-10',
      '5-10',
      '>10',
    ]);
    expect([0, 0.01, 0.05, 0.2].map(bucketErrorRate)).toEqual(['none', 'low', 'low', 'high']);
    expect([null, 99, 100, 999, 1000, 9999, 10000].map(bucketP95Runtime)).toEqual([
      'unknown',
      '<100ms',
      '100ms-1s',
      '100ms-1s',
      '1s-10s',
      '1s-10s',
      '>10s',
    ]);
    // Recency is measured against an explicit "now" so the test is time-independent.
    expect(bucketRecency('2026-05-11T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('current');
    expect(bucketRecency('2026-04-20T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('recent');
    expect(bucketRecency('2026-01-01T00:00:00.000Z', new Date('2026-05-11T12:00:00.000Z'))).toBe('stale');
  });

  it('maps frequency counts to high, mid, and low labels', () => {
    expect(bucketFrequency(80, 100)).toBe('high');
    expect(bucketFrequency(20, 100)).toBe('mid');
    expect(bucketFrequency(1, 100)).toBe('low');
    // A zero total must not divide by zero; it is reported as 'low'.
    expect(bucketFrequency(0, 0)).toBe('low');
  });
});
|
||||
49
packages/context/src/ingest/adapters/historic-sql/buckets.ts
Normal file
49
packages/context/src/ingest/adapters/historic-sql/buckets.ts
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
/**
 * Maps an execution count to a coarse, human-readable bucket label.
 *
 * Lower bounds are inclusive and upper bounds exclusive: 10 is `'10-100'`,
 * 50000 is `'>50k'`. Any value that is not below 50000 (including `NaN`,
 * for which every `<` comparison is false) lands in `'>50k'`.
 *
 * @param value - Number of executions.
 * @returns One of `'<10'`, `'10-100'`, `'100-1k'`, `'1k-5k'`, `'5k-50k'`, `'>50k'`.
 */
export function bucketExecutions(value: number): string {
  if (value < 10) return '<10';
  if (value < 100) return '10-100';
  if (value < 1000) return '100-1k';
  if (value < 5000) return '1k-5k';
  if (value < 50000) return '5k-50k';
  return '>50k';
}
|
||||
|
||||
/**
 * Maps a distinct-user count to a bucket label.
 *
 * Upper bounds are inclusive here: 5 is `'2-5'` and 10 is `'5-10'`, so the
 * `'5-10'` label effectively covers 6 through 10. Zero and negative values
 * are reported as `'0'`.
 *
 * @param value - Number of distinct users.
 * @returns One of `'0'`, `'1'`, `'2-5'`, `'5-10'`, `'>10'`.
 */
export function bucketDistinctUsers(value: number): string {
  if (value <= 0) return '0';
  if (value === 1) return '1';
  if (value <= 5) return '2-5';
  if (value <= 10) return '5-10';
  return '>10';
}
|
||||
|
||||
/**
 * Maps an error rate (a ratio, where 0.1 means 10%) to a bucket label.
 *
 * @param value - Fraction of executions that failed.
 * @returns `'none'` for zero or negative rates, `'low'` for rates below 0.1,
 *   and `'high'` for everything else.
 */
export function bucketErrorRate(value: number): string {
  if (value <= 0) return 'none';
  if (value < 0.1) return 'low';
  return 'high';
}
|
||||
|
||||
/**
 * Maps a p95 runtime in milliseconds to a bucket label.
 *
 * Lower bounds are inclusive and upper bounds exclusive: 100 is `'100ms-1s'`
 * and 10000 is `'>10s'`.
 *
 * @param value - p95 runtime in milliseconds, or `null` when not available.
 * @returns `'unknown'` for `null`; otherwise one of `'<100ms'`, `'100ms-1s'`,
 *   `'1s-10s'`, `'>10s'`.
 */
export function bucketP95Runtime(value: number | null): string {
  if (value === null) return 'unknown';
  if (value < 100) return '<100ms';
  if (value < 1000) return '100ms-1s';
  if (value < 10000) return '1s-10s';
  return '>10s';
}
|
||||
|
||||
/**
 * Classifies how recently something was last seen, relative to `now`.
 *
 * Age is measured in fractional days. Up to and including 7 days is
 * `'current'`, up to and including 45 days is `'recent'`, anything older is
 * `'stale'`. A `lastSeen` in the future yields a negative age and is therefore
 * `'current'`.
 *
 * @param lastSeen - Timestamp string accepted by the `Date` constructor.
 * @param now - Reference instant to measure the age against.
 * @returns `'unknown'` when `lastSeen` cannot be parsed; otherwise
 *   `'current'`, `'recent'`, or `'stale'`.
 */
export function bucketRecency(lastSeen: string, now: Date): string {
  const millisPerDay = 24 * 60 * 60 * 1000;
  const seenAt = new Date(lastSeen).getTime();
  if (Number.isNaN(seenAt)) {
    return 'unknown';
  }
  const ageDays = (now.getTime() - seenAt) / millisPerDay;
  if (ageDays <= 7) return 'current';
  if (ageDays <= 45) return 'recent';
  return 'stale';
}
|
||||
|
||||
/**
 * Labels how large `count` is as a share of `total`.
 *
 * @param count - Number of occurrences of the item.
 * @param total - Number of occurrences overall.
 * @returns `'high'` when the share is at least 0.5, `'mid'` when it is at
 *   least 0.1, and `'low'` otherwise. A non-positive `total` or `count`
 *   short-circuits to `'low'`, which also avoids dividing by zero.
 */
export function bucketFrequency(count: number, total: number): 'high' | 'mid' | 'low' {
  if (total <= 0 || count <= 0) return 'low';
  const share = count / total;
  if (share >= 0.5) return 'high';
  if (share >= 0.1) return 'mid';
  return 'low';
}
|
||||
|
|
@ -0,0 +1,182 @@
|
|||
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js';
|
||||
|
||||
/**
 * Creates a fresh, uniquely named directory under the OS temp dir.
 *
 * @returns Absolute path of the new directory (prefix `historic-sql-unified-chunk-`).
 */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-unified-chunk-');
  return mkdtemp(prefix);
}
|
||||
|
||||
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline) to
 * `relPath` under `root`, creating missing parent directories first.
 *
 * @param root - Directory the relative path is resolved against.
 * @param relPath - Target file path relative to `root`.
 * @param value - Any JSON-serializable value.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  // join(target, '..') normalizes to the directory that will contain the file.
  await mkdir(join(target, '..'), { recursive: true });
  await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8');
}
|
||||
|
||||
/**
 * Populates `root` with a minimal unified staged directory fixture:
 * `manifest.json`, one table file, the legacy single `patterns-input.json`,
 * and one pattern shard (`patterns-input/part-0001.json`) with the same content.
 *
 * @param root - Existing directory to write the fixture files into.
 */
async function writeUnifiedStagedDir(root: string): Promise<void> {
  await writeJson(root, 'manifest.json', {
    source: 'historic-sql',
    connectionId: 'warehouse',
    dialect: 'postgres',
    fetchedAt: '2026-05-11T00:00:00.000Z',
    windowStart: '2026-02-10T00:00:00.000Z',
    windowEnd: '2026-05-11T00:00:00.000Z',
    snapshotRowCount: 1,
    touchedTableCount: 1,
    parseFailures: 0,
    warnings: [],
    probeWarnings: [],
  });
  await writeJson(root, 'tables/public.orders.json', {
    table: 'public.orders',
    stats: {
      executionsBucket: '10-100',
      distinctUsersBucket: '2-5',
      errorRateBucket: 'none',
      p95RuntimeBucket: '<100ms',
      recencyBucket: 'current',
    },
    columnsByClause: { select: [['status', 'high']] },
    observedJoins: [],
    topTemplates: [{ id: 'orders', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }],
  });
  await writeJson(root, 'patterns-input.json', {
    templates: [
      {
        id: 'orders',
        canonicalSql: 'select * from public.orders join public.customers on true',
        tablesTouched: ['public.orders', 'public.customers'],
        executionsBucket: '10-100',
        distinctUsersBucket: '2-5',
        dialect: 'postgres',
      },
    ],
  });
  await writeJson(root, 'patterns-input/part-0001.json', {
    templates: [
      {
        id: 'orders',
        canonicalSql: 'select * from public.orders join public.customers on true',
        tablesTouched: ['public.orders', 'public.customers'],
        executionsBucket: '10-100',
        distinctUsersBucket: '2-5',
        dialect: 'postgres',
      },
    ],
  });
}
|
||||
|
||||
// Covers WorkUnit emission, diff-set filtering, scope membership, and per-shard
// pattern units for the unified staged layout.
describe('chunkHistoricSqlUnifiedStagedDir', () => {
  it('emits one table WorkUnit plus one patterns WorkUnit', async () => {
    const stagedDir = await tempDir();
    await writeUnifiedStagedDir(stagedDir);

    const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir);

    expect(result.workUnits).toEqual([
      expect.objectContaining({
        unitKey: 'historic-sql-table-public-orders',
        displayLabel: 'Historic SQL usage: public.orders',
        rawFiles: ['tables/public.orders.json'],
        dependencyPaths: ['manifest.json'],
        notes: expect.stringContaining('historic_sql_table_digest'),
      }),
      expect.objectContaining({
        unitKey: 'historic-sql-patterns-part-0001',
        displayLabel: 'Historic SQL cross-table patterns: part-0001',
        rawFiles: ['patterns-input/part-0001.json'],
        dependencyPaths: ['manifest.json'],
        notes: expect.stringContaining('patterns-input/part-0001.json'),
      }),
    ]);
    expect(result.workUnits[0]?.notes).toContain('emit_historic_sql_evidence');
    expect(result.workUnits[1]?.notes).toContain('emit_historic_sql_evidence');
    expect(result.reconcileNotes).toEqual(['Historic-SQL touched tables=1 parseFailures=0']);
  });

  it('respects diff sets for unchanged table and patterns files', async () => {
    const stagedDir = await tempDir();
    await writeUnifiedStagedDir(stagedDir);

    // Only the table file changed: only the table unit is emitted.
    await expect(
      chunkHistoricSqlUnifiedStagedDir(stagedDir, {
        added: [],
        modified: ['tables/public.orders.json'],
        deleted: [],
        unchanged: ['manifest.json', 'patterns-input.json', 'patterns-input/part-0001.json'],
      }),
    ).resolves.toMatchObject({
      workUnits: [expect.objectContaining({ unitKey: 'historic-sql-table-public-orders' })],
    });

    // Only the shard changed: only the patterns unit is emitted.
    await expect(
      chunkHistoricSqlUnifiedStagedDir(stagedDir, {
        added: [],
        modified: ['patterns-input/part-0001.json'],
        deleted: [],
        unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'],
      }),
    ).resolves.toMatchObject({
      workUnits: [expect.objectContaining({ unitKey: 'historic-sql-patterns-part-0001' })],
    });

    // The top-level patterns-input.json alone does not produce any unit.
    await expect(
      chunkHistoricSqlUnifiedStagedDir(stagedDir, {
        added: [],
        modified: ['patterns-input.json'],
        deleted: [],
        unchanged: ['manifest.json', 'patterns-input/part-0001.json', 'tables/public.orders.json'],
      }),
    ).resolves.toMatchObject({
      workUnits: [],
    });
  });

  it('describes unified staged scope', async () => {
    const stagedDir = await tempDir();
    await writeUnifiedStagedDir(stagedDir);

    const scope = await describeHistoricSqlUnifiedScope(stagedDir);

    expect(scope.isPathInScope('manifest.json')).toBe(true);
    expect(scope.isPathInScope('patterns-input.json')).toBe(true);
    expect(scope.isPathInScope('patterns-input/part-0001.json')).toBe(true);
    expect(scope.isPathInScope('patterns-input/part-1.json')).toBe(false);
    expect(scope.isPathInScope('tables/public.orders.json')).toBe(true);
    expect(scope.isPathInScope('templates/old/page.md')).toBe(false);
  });

  it('emits one patterns WorkUnit per changed shard', async () => {
    const stagedDir = await tempDir();
    await writeUnifiedStagedDir(stagedDir);
    await writeJson(stagedDir, 'patterns-input/part-0002.json', {
      templates: [
        {
          id: 'line-items',
          canonicalSql: 'select * from public.orders join public.line_items on true',
          tablesTouched: ['public.orders', 'public.line_items'],
          executionsBucket: '10-100',
          distinctUsersBucket: '2-5',
          dialect: 'postgres',
        },
      ],
    });

    const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir, {
      added: ['patterns-input/part-0002.json'],
      modified: ['patterns-input/part-0001.json'],
      deleted: [],
      unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'],
    });

    expect(result.workUnits.map((unit) => unit.unitKey)).toEqual([
      'historic-sql-patterns-part-0001',
      'historic-sql-patterns-part-0002',
    ]);
    expect(result.workUnits.map((unit) => unit.rawFiles)).toEqual([
      ['patterns-input/part-0001.json'],
      ['patterns-input/part-0002.json'],
    ]);
  });
});
|
||||
|
|
@ -0,0 +1,99 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import { readFile, readdir } from 'node:fs/promises';
|
||||
import { join, relative } from 'node:path';
|
||||
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
|
||||
import { isHistoricSqlPatternInputShardPath } from './pattern-inputs.js';
|
||||
import { stagedManifestSchema, stagedPatternsInputSchema, stagedTableInputSchema } from './types.js';
|
||||
|
||||
/**
 * Lists every regular file below `root`, recursively.
 *
 * @param root - Directory to scan.
 * @returns Paths relative to `root`, with backslashes replaced by `/`, sorted
 *   with the default string comparison.
 */
async function walk(root: string): Promise<string[]> {
  const entries = await readdir(root, { withFileTypes: true, recursive: true });
  const relativePaths: string[] = [];
  for (const entry of entries) {
    if (!entry.isFile()) {
      continue;
    }
    const absolute = join(entry.parentPath, entry.name);
    // Normalize Windows separators so callers can match on forward slashes.
    relativePaths.push(relative(root, absolute).replace(/\\/g, '/'));
  }
  return relativePaths.sort();
}
|
||||
|
||||
/**
 * Reads a UTF-8 file below `stagedDir` and parses it as JSON.
 *
 * The cast to `T` is unchecked: nothing here verifies the parsed shape, so the
 * result should be validated before it is trusted.
 *
 * @param stagedDir - Directory the relative path is resolved against.
 * @param relPath - File path relative to `stagedDir`.
 * @returns The parsed JSON value, typed as `T`.
 */
async function readJson<T>(stagedDir: string, relPath: string): Promise<T> {
  const text = await readFile(join(stagedDir, relPath), 'utf-8');
  return JSON.parse(text) as T;
}
|
||||
|
||||
/**
 * Turns an arbitrary string into a key made only of ASCII letters, digits and
 * single hyphens.
 *
 * Every run of other characters becomes one `-`, and leading/trailing hyphens
 * are stripped. Distinct inputs can map to the same key (for example
 * `a.b` and `a_b` both become `a-b`).
 *
 * @param value - Raw identifier, e.g. a qualified table name.
 * @returns The sanitized key; may be empty.
 */
function safeUnitKey(value: string): string {
  const hyphenated = value.replace(/[^a-zA-Z0-9]+/g, '-');
  return hyphenated.replace(/^-+|-+$/g, '');
}
|
||||
|
||||
/**
 * Decides whether a file should be processed given an optional set of touched paths.
 *
 * @param path - Relative file path to check.
 * @param touched - Paths considered changed, or `null` when no filtering applies.
 * @returns `true` when `touched` is `null` or contains `path`.
 */
function touchedPath(path: string, touched: Set<string> | null): boolean {
  if (!touched) {
    return true;
  }
  return touched.has(path);
}
|
||||
|
||||
/**
 * Splits a unified historic-SQL staged directory into WorkUnits.
 *
 * One unit is emitted per `tables/*.json` file and one per pattern-input shard
 * (as recognized by `isHistoricSqlPatternInputShardPath`). Every emitted file
 * is parsed against its staged schema first. When `diffSet` is given, only
 * files listed in `added` or `modified` produce units, and deleted table or
 * shard paths are reported under `eviction`.
 *
 * @param stagedDir - Root of the staged directory; must contain `manifest.json`.
 * @param diffSet - Optional change set; omit to process every file.
 * @returns Work units (tables first, then shards, each in sorted path order),
 *   the optional eviction list, a reconcile note built from the manifest
 *   counters, and the manifest's probe warnings followed by its warnings.
 */
export async function chunkHistoricSqlUnifiedStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const tableFilePattern = /^tables\/.+\.json$/;
  const files = await walk(stagedDir);
  const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
  const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;
  // Computed once: every unit's peer index is this list minus the unit's own file.
  const peerCandidates = files.filter((file) => file !== 'manifest.json').sort();
  const workUnits: WorkUnit[] = [];

  for (const path of files.filter((file) => tableFilePattern.test(file))) {
    if (!touchedPath(path, touched)) {
      continue;
    }
    const table = stagedTableInputSchema.parse(await readJson(stagedDir, path));
    workUnits.push({
      unitKey: `historic-sql-table-${safeUnitKey(table.table)}`,
      displayLabel: `Historic SQL usage: ${table.table}`,
      rawFiles: [path],
      dependencyPaths: ['manifest.json'],
      peerFileIndex: peerCandidates.filter((file) => file !== path),
      notes:
        'Use historic_sql_table_digest. Read this table usage JSON and emit exactly one table_usage object with emit_historic_sql_evidence. Do not call wiki_write or sl_write_source.',
    });
  }

  for (const path of files.filter(isHistoricSqlPatternInputShardPath)) {
    if (!touchedPath(path, touched)) {
      continue;
    }
    // Parsed only for validation; the unit carries the path, not the content.
    stagedPatternsInputSchema.parse(await readJson(stagedDir, path));
    const shardLabel = path.replace(/^patterns-input\//, '').replace(/\.json$/, '');
    workUnits.push({
      unitKey: `historic-sql-patterns-${safeUnitKey(shardLabel)}`,
      displayLabel: `Historic SQL cross-table patterns: ${shardLabel}`,
      rawFiles: [path],
      dependencyPaths: ['manifest.json'],
      peerFileIndex: peerCandidates.filter((file) => file !== path),
      notes:
        `Use historic_sql_patterns. Read ${path} and emit pattern objects with emit_historic_sql_evidence using rawPath "${path}". Do not call wiki_write or sl_write_source.`,
    });
  }

  // Only deleted table files and pattern shards are eviction candidates.
  const deleted = diffSet?.deleted
    .filter((path) => isHistoricSqlPatternInputShardPath(path) || tableFilePattern.test(path))
    .sort();

  return {
    workUnits,
    eviction: deleted && deleted.length > 0 ? { deletedRawPaths: deleted } : undefined,
    reconcileNotes: [`Historic-SQL touched tables=${manifest.touchedTableCount} parseFailures=${manifest.parseFailures}`],
    contextReport: {
      capped: false,
      warnings: [...manifest.probeWarnings, ...manifest.warnings],
    },
  };
}
|
||||
|
||||
/**
 * Describes which raw paths belong to the unified historic-SQL staged layout.
 *
 * @param stagedDir - Root of the staged directory; must contain `manifest.json`.
 * @returns A descriptor whose `fingerprint` is the SHA-256 hex digest of the
 *   manifest's `connectionId`, `dialect`, `windowStart` and `windowEnd`
 *   (serialized in that key order), and whose `isPathInScope` accepts
 *   `manifest.json`, `patterns-input.json`, pattern-input shards, and
 *   `tables/*.json` files.
 */
export async function describeHistoricSqlUnifiedScope(stagedDir: string): Promise<ScopeDescriptor> {
  const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));

  // Key order matters: it determines the serialized string that gets hashed.
  const identity = JSON.stringify({
    connectionId: manifest.connectionId,
    dialect: manifest.dialect,
    windowStart: manifest.windowStart,
    windowEnd: manifest.windowEnd,
  });
  const fingerprint = createHash('sha256').update(identity).digest('hex');

  return {
    fingerprint,
    isPathInScope: (rawPath) =>
      rawPath === 'manifest.json' ||
      rawPath === 'patterns-input.json' ||
      isHistoricSqlPatternInputShardPath(rawPath) ||
      /^tables\/.+\.json$/.test(rawPath),
  };
}
|
||||
|
|
@ -1,251 +0,0 @@
|
|||
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
|
||||
|
||||
/**
 * Creates a fresh, uniquely named directory under the OS temp dir.
 *
 * @returns Absolute path of the new directory (prefix `historic-sql-chunk-`).
 */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-chunk-');
  return mkdtemp(prefix);
}
|
||||
|
||||
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline) to
 * `relPath` under `root`, creating missing parent directories first.
 *
 * @param root - Directory the relative path is resolved against.
 * @param relPath - Target file path relative to `root`.
 * @param value - Any JSON-serializable value.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  // join(target, '..') normalizes to the directory that will contain the file.
  await mkdir(join(target, '..'), { recursive: true });
  await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8');
}
|
||||
|
||||
/**
 * Populates `root` with a single-template staged fixture: `manifest.json`
 * plus `metadata.json`, `page.md` and `usage.json` under `templates/fp_1/`.
 *
 * @param root - Existing directory to write the fixture files into.
 */
async function writeTemplate(root: string): Promise<void> {
  await writeJson(root, 'manifest.json', {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 1,
    capped: false,
    warnings: ['source warning'],
    templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }],
  });
  await writeJson(root, 'templates/fp_1/metadata.json', {
    id: 'fp_1',
    title: 'snowflake · analytics.orders [fp_1]',
    path: 'templates/fp_1/page.md',
    objectType: 'historic_sql_template',
    lastEditedAt: null,
    properties: {
      fingerprint: 'fp_1',
      sub_cluster_id: null,
      dialect: 'snowflake',
      tables_touched: ['analytics.orders'],
      literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
      triage_signals: {
        executions_bucket: 'high',
        distinct_users_bucket: 'team',
        error_rate_bucket: 'ok',
        recency_bucket: 'active',
        service_account_only: 'false',
        slot_summary: '1 constant, 0 runtime',
      },
    },
  });
  await writeFile(join(root, 'templates/fp_1/page.md'), '# fp_1\n', 'utf-8');
  await writeJson(root, 'templates/fp_1/usage.json', {
    stats: {
      executions: 20,
      distinct_users: 3,
      first_seen: '2026-05-01T00:00:00.000Z',
      last_seen: '2026-05-04T11:55:00.000Z',
      p50_runtime_ms: 100,
      p95_runtime_ms: 200,
      error_rate: 0,
      rows_produced: 20,
    },
    literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
    samples: [],
  });
}
|
||||
|
||||
/**
 * Populates `root` with a fixture of two templates that share the fingerprint
 * `fp_order_status` but differ in categorical sub-cluster id. Each template
 * gets its own `metadata.json`, `page.md` and `usage.json`.
 *
 * @param root - Existing directory to write the fixture files into.
 */
async function writeSubclusterTemplates(root: string): Promise<void> {
  await writeJson(root, 'manifest.json', {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 2,
    capped: false,
    warnings: [],
    templates: [
      {
        id: 'fp_order_status__cat_2b2ff2318877',
        fingerprint: 'fp_order_status',
        subClusterId: 'cat_2b2ff2318877',
        path: 'templates/fp_order_status__cat_2b2ff2318877/page.md',
      },
      {
        id: 'fp_order_status__cat_34f037ddcbfa',
        fingerprint: 'fp_order_status',
        subClusterId: 'cat_34f037ddcbfa',
        path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
      },
    ],
  });

  for (const template of [
    { id: 'fp_order_status__cat_2b2ff2318877', subClusterId: 'cat_2b2ff2318877' },
    { id: 'fp_order_status__cat_34f037ddcbfa', subClusterId: 'cat_34f037ddcbfa' },
  ]) {
    await writeJson(root, `templates/${template.id}/metadata.json`, {
      id: template.id,
      // The title carries the last six characters of the sub-cluster id.
      title: `snowflake · analytics.orders [fp_ord:${template.subClusterId.slice(-6)}]`,
      path: `templates/${template.id}/page.md`,
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint: 'fp_order_status',
        sub_cluster_id: template.subClusterId,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
        triage_signals: {
          executions_bucket: 'mid',
          distinct_users_bucket: 'team',
          error_rate_bucket: 'ok',
          recency_bucket: 'active',
          service_account_only: 'false',
          slot_summary: '0 constant, 0 runtime',
        },
      },
    });
    await writeFile(join(root, `templates/${template.id}/page.md`), `# ${template.id}\n`, 'utf-8');
    await writeJson(root, `templates/${template.id}/usage.json`, {
      stats: {
        executions: 3,
        distinct_users: 3,
        first_seen: '2026-05-04T10:00:00.000Z',
        last_seen: '2026-05-04T10:05:00.000Z',
        p50_runtime_ms: 120,
        p95_runtime_ms: 150,
        error_rate: 0,
        rows_produced: 36,
      },
      literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }],
      samples: [],
    });
  }
}
|
||||
|
||||
// Covers per-template WorkUnit emission, usage-only diffs, eviction rules, and
// scope membership for the per-template staged layout.
describe('chunkHistoricSqlStagedDir', () => {
  it('emits one WorkUnit per changed template and keeps usage as dependency', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);

    const result = await chunkHistoricSqlStagedDir(stagedDir, {
      added: ['templates/fp_1/metadata.json'],
      modified: [],
      deleted: [],
      unchanged: ['templates/fp_1/page.md', 'templates/fp_1/usage.json', 'manifest.json'],
    });

    expect(result.workUnits).toEqual([
      {
        unitKey: 'historic-sql-fp-1',
        displayLabel: 'snowflake · analytics.orders [fp_1]',
        rawFiles: ['templates/fp_1/metadata.json'],
        dependencyPaths: ['manifest.json', 'templates/fp_1/usage.json'],
        peerFileIndex: ['templates/fp_1/page.md'],
        notes:
          'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
      },
    ]);
    expect(result.contextReport).toEqual({ capped: false, warnings: ['source warning'] });
  });

  it('emits one WorkUnit per changed categorical sub-cluster', async () => {
    const stagedDir = await tempDir();
    await writeSubclusterTemplates(stagedDir);

    const result = await chunkHistoricSqlStagedDir(stagedDir, {
      added: [
        'templates/fp_order_status__cat_2b2ff2318877/metadata.json',
        'templates/fp_order_status__cat_34f037ddcbfa/metadata.json',
      ],
      modified: [],
      deleted: [],
      unchanged: [
        'manifest.json',
        'templates/fp_order_status__cat_2b2ff2318877/page.md',
        'templates/fp_order_status__cat_2b2ff2318877/usage.json',
        'templates/fp_order_status__cat_34f037ddcbfa/page.md',
        'templates/fp_order_status__cat_34f037ddcbfa/usage.json',
      ],
    });

    // Compare only the identifying fields; notes and peer index are not under test here.
    expect(
      result.workUnits.map((unit) => ({
        unitKey: unit.unitKey,
        displayLabel: unit.displayLabel,
        rawFiles: unit.rawFiles,
        dependencyPaths: unit.dependencyPaths,
      })),
    ).toEqual([
      {
        unitKey: 'historic-sql-fp-order-status-cat-2b2ff2318877',
        displayLabel: 'snowflake · analytics.orders [fp_ord:318877]',
        rawFiles: ['templates/fp_order_status__cat_2b2ff2318877/metadata.json'],
        dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_2b2ff2318877/usage.json'],
      },
      {
        unitKey: 'historic-sql-fp-order-status-cat-34f037ddcbfa',
        displayLabel: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
        rawFiles: ['templates/fp_order_status__cat_34f037ddcbfa/metadata.json'],
        dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'],
      },
    ]);
  });

  it('emits zero WorkUnits for usage-only diffs', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);

    const result = await chunkHistoricSqlStagedDir(stagedDir, {
      added: [],
      modified: ['templates/fp_1/usage.json'],
      deleted: [],
      unchanged: ['templates/fp_1/metadata.json', 'templates/fp_1/page.md', 'manifest.json'],
    });

    expect(result.workUnits).toEqual([]);
    expect(result.eviction).toBeUndefined();
  });

  it('emits eviction only for deleted metadata or page files', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);

    const result = await chunkHistoricSqlStagedDir(stagedDir, {
      added: [],
      modified: [],
      deleted: ['templates/fp_1/usage.json', 'templates/fp_2/page.md'],
      unchanged: [],
    });

    // The deleted usage.json is ignored; only the page file is evicted.
    expect(result.eviction).toEqual({ deletedRawPaths: ['templates/fp_2/page.md'] });
  });

  it('describes historic-sql scope without including unrelated paths', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);

    const scope = await describeHistoricSqlScope(stagedDir);

    // 64 hex characters, i.e. the length of a SHA-256 hex digest.
    expect(scope.fingerprint).toHaveLength(64);
    expect(scope.isPathInScope('manifest.json')).toBe(true);
    expect(scope.isPathInScope('templates/fp_1/usage.json')).toBe(true);
    expect(scope.isPathInScope('pages/notion/page.md')).toBe(false);
  });
});
|
||||
|
|
@ -1,86 +0,0 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import { readFile, readdir } from 'node:fs/promises';
|
||||
import { join, relative } from 'node:path';
|
||||
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
|
||||
import { historicSqlManifestSchema, historicSqlMetadataSchema } from './types.js';
|
||||
|
||||
/**
 * Lists every regular file below `root`, recursively.
 *
 * @param root - Directory to scan.
 * @returns Paths relative to `root`, with backslashes replaced by `/`, sorted
 *   with the default string comparison.
 */
async function walk(root: string): Promise<string[]> {
  const entries = await readdir(root, { withFileTypes: true, recursive: true });
  const relativePaths: string[] = [];
  for (const entry of entries) {
    if (!entry.isFile()) {
      continue;
    }
    const absolute = join(entry.parentPath, entry.name);
    // Normalize Windows separators so callers can match on forward slashes.
    relativePaths.push(relative(root, absolute).replace(/\\/g, '/'));
  }
  return relativePaths.sort();
}
|
||||
|
||||
/**
 * Derives a WorkUnit key from a template id.
 *
 * Every run of characters outside `[a-zA-Z0-9]` becomes a single `-`, leading and
 * trailing dashes are dropped, and the result is prefixed with `historic-sql-`.
 */
function safeUnitKey(id: string): string {
  const dashed = id.replace(/[^a-zA-Z0-9]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return 'historic-sql-' + trimmed;
}
|
||||
|
||||
/**
 * Loads `manifest.json` from the staged directory and validates it with
 * `historicSqlManifestSchema`.
 *
 * @throws Error prefixed with `Invalid historic-SQL manifest:` when the file cannot
 *         be read, is not valid JSON, or fails schema validation.
 */
async function readManifest(stagedDir: string) {
  try {
    const manifestText = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    const decoded: unknown = JSON.parse(manifestText);
    return historicSqlManifestSchema.parse(decoded);
  } catch (error) {
    // Read, JSON and schema failures all surface as the same error shape.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid historic-SQL manifest: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Splits a staged historic-SQL directory into one WorkUnit per template.
 *
 * A template is any `templates/<id>/page.md`; its sibling `metadata.json` is parsed
 * for the unit key and label. When `diffSet` is supplied, only templates whose
 * `metadata.json` or `page.md` appears in `added`/`modified` yield a WorkUnit, and
 * deleted `metadata.json`/`page.md` paths are reported through `eviction`.
 *
 * @param stagedDir Root of the staged directory (must contain `manifest.json`).
 * @param diffSet Optional change set; omit to chunk every template.
 * @returns WorkUnits plus eviction info, a reconcile note and a context report taken
 *          from the manifest.
 */
export async function chunkHistoricSqlStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const files = await walk(stagedDir);
  const manifest = await readManifest(stagedDir);
  const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;
  const workUnits: WorkUnit[] = [];
  const templatePages = files.filter((candidate) => /^templates\/[^/]+\/page\.md$/.test(candidate));

  for (const pagePath of templatePages) {
    const metadataPath = pagePath.replace(/\/page\.md$/, '/metadata.json');
    const usagePath = pagePath.replace(/\/page\.md$/, '/usage.json');
    const primary = [metadataPath, pagePath].filter((candidate) => files.includes(candidate));
    // With a diff, only the touched primary files count; without one, all of them do.
    const relevantPrimary = touched ? primary.filter((candidate) => touched.has(candidate)) : primary;
    if (touched && relevantPrimary.length === 0) {
      // Usage-only or untouched templates produce no WorkUnit.
      continue;
    }

    const metadataText = await readFile(join(stagedDir, metadataPath), 'utf-8');
    const metadata = historicSqlMetadataSchema.parse(JSON.parse(metadataText));
    const rawFiles = relevantPrimary.sort();

    const dependencyPaths: string[] = [];
    if (!rawFiles.includes('manifest.json')) {
      dependencyPaths.push('manifest.json');
    }
    if (files.includes(usagePath) && !rawFiles.includes(usagePath)) {
      dependencyPaths.push(usagePath);
    }
    dependencyPaths.sort();

    // Everything that is neither a raw file nor a dependency is listed as a peer.
    const excluded = new Set([...rawFiles, ...dependencyPaths]);
    const peerFileIndex = files.filter((candidate) => !excluded.has(candidate)).sort();

    workUnits.push({
      unitKey: safeUnitKey(metadata.id),
      displayLabel: metadata.title,
      rawFiles,
      dependencyPaths,
      peerFileIndex,
      notes:
        'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
    });
  }

  // Only deleted metadata/page files trigger eviction; deleted usage files do not.
  const deletedPrimary = (diffSet?.deleted ?? [])
    .filter((candidate) => /^templates\/[^/]+\/(metadata\.json|page\.md)$/.test(candidate))
    .sort();

  return {
    workUnits,
    eviction: deletedPrimary.length > 0 ? { deletedRawPaths: deletedPrimary } : undefined,
    reconcileNotes: [`Historic-SQL staged templates=${manifest.templateCount}`],
    contextReport: {
      capped: manifest.capped,
      warnings: manifest.warnings,
    },
  };
}
|
||||
|
||||
/**
 * Describes the scope of a staged historic-SQL directory.
 *
 * The fingerprint is the hex SHA-256 of the JSON-serialised
 * `{ connectionId, dialect, windowStart, windowEnd }` read from the manifest.
 * A path is in scope when it is `manifest.json` or starts with `templates/`.
 */
export async function describeHistoricSqlScope(stagedDir: string): Promise<ScopeDescriptor> {
  const { connectionId, dialect, windowStart, windowEnd } = await readManifest(stagedDir);
  // Key order is part of the hashed text, so keep it fixed.
  const scopeKey = JSON.stringify({ connectionId, dialect, windowStart, windowEnd });
  const hasher = createHash('sha256');
  hasher.update(scopeKey);

  return {
    fingerprint: hasher.digest('hex'),
    isPathInScope: (rawPath) => {
      if (rawPath === 'manifest.json') {
        return true;
      }
      return rawPath.startsWith('templates/');
    },
  };
}
|
||||
|
|
@ -3,13 +3,7 @@ import { tmpdir } from 'node:os';
|
|||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { detectHistoricSqlStagedDir } from './detect.js';
|
||||
import {
|
||||
HISTORIC_SQL_SOURCE_KEY,
|
||||
historicSqlManifestSchema,
|
||||
historicSqlMetadataSchema,
|
||||
historicSqlPullConfigSchema,
|
||||
historicSqlUsageSchema,
|
||||
} from './types.js';
|
||||
import { HISTORIC_SQL_SOURCE_KEY, stagedManifestSchema } from './types.js';
|
||||
|
||||
async function tempDir(): Promise<string> {
|
||||
return mkdtemp(join(tmpdir(), 'historic-sql-detect-'));
|
||||
|
|
@ -21,32 +15,35 @@ async function writeJson(root: string, relPath: string, value: unknown): Promise
|
|||
await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8');
|
||||
}
|
||||
|
||||
/** Returns a schema-validated, empty postgres manifest used as the detection fixture. */
function manifest() {
  const rawManifest = {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: 'conn_1',
    dialect: 'postgres',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    snapshotRowCount: 0,
    touchedTableCount: 0,
    parseFailures: 0,
    warnings: [],
    probeWarnings: [],
  };
  return stagedManifestSchema.parse(rawManifest);
}
|
||||
|
||||
describe('historic-sql staged dir detection', () => {
|
||||
it('detects manifest source', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
await writeJson(stagedDir, 'manifest.json', {
|
||||
source: HISTORIC_SQL_SOURCE_KEY,
|
||||
connectionId: 'conn_1',
|
||||
dialect: 'snowflake',
|
||||
fetchedAt: '2026-05-04T12:00:00.000Z',
|
||||
windowStart: '2026-02-03T12:00:00.000Z',
|
||||
windowEnd: '2026-05-04T12:00:00.000Z',
|
||||
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
|
||||
templateCount: 0,
|
||||
capped: false,
|
||||
warnings: [],
|
||||
templates: [],
|
||||
});
|
||||
await writeJson(stagedDir, 'manifest.json', manifest());
|
||||
|
||||
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
|
||||
});
|
||||
|
||||
it('detects document-shaped template structure without manifest', async () => {
|
||||
it('detects unified table and patterns structure without manifest', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
await writeFile(join(stagedDir, 'not-a-match.txt'), 'x', 'utf-8');
|
||||
await mkdir(join(stagedDir, 'templates', 'fp_1'), { recursive: true });
|
||||
await writeFile(join(stagedDir, 'templates', 'fp_1', 'metadata.json'), '{}', 'utf-8');
|
||||
await writeFile(join(stagedDir, 'templates', 'fp_1', 'page.md'), '# fp_1\n', 'utf-8');
|
||||
await writeJson(stagedDir, 'patterns-input.json', { templates: [] });
|
||||
await writeJson(stagedDir, 'tables/public.orders.json', { table: 'public.orders' });
|
||||
|
||||
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
|
||||
});
|
||||
|
|
@ -58,140 +55,3 @@ describe('historic-sql staged dir detection', () => {
|
|||
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('historic-sql schemas', () => {
  // Values the pull-config parser is expected to fill in when only a dialect is given.
  const pullConfigDefaults = {
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns: [],
    redactionPatterns: [],
    maxTemplatesPerRun: 5000,
  };

  it('defaults disabled optional pull-config fields through the parser', () => {
    const parsed = historicSqlPullConfigSchema.parse({ dialect: 'bigquery' });

    expect(parsed).toEqual({ dialect: 'bigquery', ...pullConfigDefaults, minCalls: 5 });
  });

  it('accepts postgres pull config with a minCalls floor', () => {
    const parsed = historicSqlPullConfigSchema.parse({ dialect: 'postgres', minCalls: 12 });

    expect(parsed).toEqual({ dialect: 'postgres', ...pullConfigDefaults, minCalls: 12 });
  });

  it('accepts postgres manifest fields with defaults for older dialects', () => {
    // Fields common to both manifests exercised below.
    const sharedManifestFields = {
      source: HISTORIC_SQL_SOURCE_KEY,
      fetchedAt: '2026-05-08T12:00:00.000Z',
      windowEnd: '2026-05-08T12:00:00.000Z',
      templateCount: 0,
      capped: false,
      warnings: [],
      templates: [],
    };
    const postgresOnlyFields = {
      degraded: true,
      statsResetAt: '2026-05-01T00:00:00.000Z',
      baselineFirstRun: true,
      pgServerVersion: 'PostgreSQL 16.4',
      deallocCount: 3,
    };

    const postgresManifest = historicSqlManifestSchema.parse({
      ...sharedManifestFields,
      connectionId: 'conn_pg',
      dialect: 'postgres',
      windowStart: '2026-05-08T11:00:00.000Z',
      nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
      ...postgresOnlyFields,
    });
    expect(postgresManifest).toMatchObject({ dialect: 'postgres', ...postgresOnlyFields });

    // A manifest without the postgres fields must parse and pick up the defaults.
    const snowflakeManifest = historicSqlManifestSchema.parse({
      ...sharedManifestFields,
      connectionId: 'conn_sf',
      dialect: 'snowflake',
      windowStart: '2026-05-01T12:00:00.000Z',
      nextSuccessfulCursor: null,
    });
    expect(snowflakeManifest).toMatchObject({
      degraded: false,
      statsResetAt: null,
      baselineFirstRun: false,
      pgServerVersion: null,
      deallocCount: null,
    });
  });

  it('accepts postgres usage stats with mean_runtime_ms and empty samples', () => {
    const usageInput = {
      stats: {
        executions: 25,
        distinct_users: 2,
        first_seen: '2026-05-08T10:00:00.000Z',
        last_seen: '2026-05-08T12:00:00.000Z',
        p50_runtime_ms: null,
        p95_runtime_ms: null,
        mean_runtime_ms: 32.5,
        error_rate: 0,
        rows_produced: 1042,
      },
      literal_slots: [],
      samples: [],
    };

    const parsed = historicSqlUsageSchema.parse(usageInput);

    expect(parsed.stats.mean_runtime_ms).toBe(32.5);
    expect(parsed.samples).toEqual([]);
  });

  it('pins the Notion-compatible metadata envelope', () => {
    const triageSignals = {
      executions_bucket: 'high',
      distinct_users_bucket: 'team',
      error_rate_bucket: 'ok',
      recency_bucket: 'active',
      service_account_only: 'false',
      slot_summary: '1 constant, 0 runtime',
    };
    const metadataInput = {
      id: 'fp_1',
      title: 'snowflake · analytics.orders [fp_1]',
      path: 'templates/fp_1/page.md',
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint: 'fp_1',
        sub_cluster_id: null,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
        triage_signals: triageSignals,
      },
    };

    const parsed = historicSqlMetadataSchema.parse(metadataInput);

    expect(parsed.objectType).toBe('historic_sql_template');
    expect(parsed.lastEditedAt).toBeNull();
    expect(parsed.properties.triage_signals.service_account_only).toBe('false');
  });
});
|
||||
|
|
|
|||
|
|
@ -16,21 +16,9 @@ export async function detectHistoricSqlStagedDir(stagedDir: string): Promise<boo
|
|||
}
|
||||
|
||||
try {
|
||||
const entries = await readdir(join(stagedDir, 'templates'), { withFileTypes: true, recursive: true });
|
||||
const metadataDirs = new Set<string>();
|
||||
const pageDirs = new Set<string>();
|
||||
for (const entry of entries) {
|
||||
if (!entry.isFile()) {
|
||||
continue;
|
||||
}
|
||||
if (entry.name === 'metadata.json') {
|
||||
metadataDirs.add(entry.parentPath);
|
||||
}
|
||||
if (entry.name === 'page.md') {
|
||||
pageDirs.add(entry.parentPath);
|
||||
}
|
||||
}
|
||||
return [...metadataDirs].some((dir) => pageDirs.has(dir));
|
||||
await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8');
|
||||
const entries = await readdir(join(stagedDir, 'tables'), { withFileTypes: true });
|
||||
return entries.some((entry) => entry.isFile() && entry.name.endsWith('.json'));
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,89 @@
|
|||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { asSchema } from 'ai';
|
||||
import { createEmitHistoricSqlEvidenceTool } from './evidence-tool.js';
|
||||
|
||||
describe('emit_historic_sql_evidence tool', () => {
  /** Builds the second argument of `tool.execute` for an ingest session of `sourceKey`. */
  function executeOptions(sourceKey: string, writeFile: unknown) {
    return {
      toolCallId: 'call-1',
      messages: [],
      abortSignal: new AbortController().signal,
      experimental_context: {
        connectionId: 'warehouse',
        session: {
          ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey },
          configService: { writeFile },
        },
      },
    };
  }

  it('exposes an AI SDK v6 tool input schema with top-level object type', async () => {
    const { inputSchema } = createEmitHistoricSqlEvidenceTool();
    const jsonSchema = await asSchema(inputSchema).jsonSchema;

    expect(jsonSchema).toMatchObject({ type: 'object' });
  });

  it('writes table usage evidence to the ignored run evidence directory', async () => {
    const writeFile = vi.fn(async () => ({ success: true, commitHash: null }));
    const tool = createEmitHistoricSqlEvidenceTool();

    const result = await tool.execute!(
      {
        kind: 'table_usage',
        table: 'public.orders',
        rawPath: 'tables/public.orders.json',
        usage: {
          narrative: 'Orders are repeatedly queried by paid status.',
          frequencyTier: 'high',
          commonFilters: ['status'],
          commonJoins: [],
          staleSince: null,
        },
      },
      executeOptions('historic-sql', writeFile) as never,
    );

    expect(result).toBe('Recorded historic-SQL table_usage evidence for public.orders.');
    expect(writeFile).toHaveBeenCalledWith(
      '.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json',
      expect.stringContaining('"kind": "table_usage"'),
      'System User',
      'system@example.com',
      'Record historic-SQL evidence: historic-sql-table-public-orders',
      { skipLock: true },
    );
  });

  it('rejects non-historic ingest sessions', async () => {
    const tool = createEmitHistoricSqlEvidenceTool();

    // The session belongs to the notion source, so the tool must answer with an error string.
    const outcome = tool.execute!(
      {
        kind: 'pattern',
        rawPath: 'patterns-input.json',
        pattern: {
          slug: 'orders',
          title: 'Orders',
          narrative: 'Orders pattern.',
          definitionSql: 'select * from public.orders',
          tablesInvolved: ['public.orders'],
          slRefs: ['orders'],
          constituentTemplateIds: ['pg:1'],
        },
      },
      executeOptions('notion', vi.fn()) as never,
    );

    await expect(outcome).resolves.toContain(
      'Error: emit_historic_sql_evidence is only available during historic-sql ingest',
    );
  });
});
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
import { tool } from 'ai';
|
||||
import { z } from 'zod';
|
||||
import { historicSqlEvidencePath, serializeHistoricSqlEvidence } from './evidence.js';
|
||||
import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js';
|
||||
|
||||
// Author name passed to configService.writeFile when evidence is recorded.
const SYSTEM_AUTHOR = 'System User';
// Author email passed to configService.writeFile alongside SYSTEM_AUTHOR.
const SYSTEM_EMAIL = 'system@example.com';
|
||||
|
||||
/**
 * Input schema of the emit_historic_sql_evidence tool.
 *
 * It is one flat object (not a union) with `table`, `usage` and `pattern` optional;
 * the refinement then enforces which of them `kind` requires:
 * `table_usage` needs `table` and `usage`, `pattern` needs `pattern`.
 */
const emitHistoricSqlEvidenceInputSchema = z
  .object({
    kind: z.enum(['table_usage', 'pattern']),
    table: z.string().min(1).optional(),
    rawPath: z.string().min(1),
    usage: tableUsageOutputSchema.optional(),
    pattern: patternOutputSchema.optional(),
  })
  .superRefine((input, ctx) => {
    // Reports a custom issue on `field` when it is missing from the input.
    const requireField = (field: 'table' | 'usage' | 'pattern', message: string): void => {
      if (input[field]) {
        return;
      }
      ctx.addIssue({ code: 'custom', path: [field], message });
    };

    switch (input.kind) {
      case 'table_usage':
        requireField('table', 'table is required when kind is table_usage');
        requireField('usage', 'usage is required when kind is table_usage');
        break;
      case 'pattern':
        requireField('pattern', 'pattern is required when kind is pattern');
        break;
    }
  });
|
||||
|
||||
/** Validated input accepted by the emit_historic_sql_evidence tool. */
type EmitHistoricSqlEvidenceInput = z.infer<typeof emitHistoricSqlEvidenceInputSchema>;

/**
 * Context the tool reads from `options.experimental_context`, or from the default
 * passed to the factory. Every field is optional; the tool returns an error string
 * when the ingest session, config service or connection id is missing.
 */
interface EmitHistoricSqlEvidenceToolContext {
  /** Connection the evidence is recorded for; copied into the evidence envelope. */
  connectionId?: string | null;
  session?: {
    /** Current ingest run; `sourceKey` must be `historic-sql` for the tool to run. */
    ingest?: { runId: string; sourceKey: string };
    /** Writer used to persist the serialized evidence file. */
    configService?: {
      writeFile(
        path: string,
        content: string,
        author: string,
        authorEmail: string,
        commitMessage: string,
        options?: { skipLock?: boolean },
      ): Promise<unknown>;
    };
  };
}
|
||||
|
||||
/**
 * Computes the WorkUnit key under which a piece of evidence is stored.
 *
 * Table-usage evidence is keyed by its table name and pattern evidence by its slug;
 * in both cases runs of non-alphanumeric characters collapse to `-` and edge dashes
 * are trimmed before the kind-specific prefix is applied.
 */
function unitKeyForEvidence(input: EmitHistoricSqlEvidenceInput): string {
  const isTableUsage = input.kind === 'table_usage';
  const prefix = isTableUsage ? 'historic-sql-table-' : 'historic-sql-pattern-';
  const subject = String(isTableUsage ? input.table : input.pattern?.slug);
  const sanitized = subject.replace(/[^a-zA-Z0-9]+/g, '-').replace(/^-+|-+$/g, '');
  return `${prefix}${sanitized}`;
}
|
||||
|
||||
/**
 * Converts validated tool input into the evidence envelope for its kind.
 *
 * @param input Tool input; only the fields relevant to `input.kind` are copied.
 * @param connectionId Connection the evidence belongs to.
 * @throws Error when the fields required by `input.kind` are missing.
 */
function evidenceEnvelope(input: EmitHistoricSqlEvidenceInput, connectionId: string) {
  const { kind, table, usage, pattern, rawPath } = input;

  if (kind !== 'table_usage') {
    if (!pattern) {
      throw new Error('Invalid historic-SQL pattern evidence input.');
    }
    return { kind: 'pattern' as const, connectionId, rawPath, pattern };
  }

  if (!table || !usage) {
    throw new Error('Invalid historic-SQL table usage evidence input.');
  }
  return { kind: 'table_usage' as const, connectionId, table, rawPath, usage };
}
|
||||
|
||||
/**
 * Creates the emit_historic_sql_evidence tool.
 *
 * On execution the tool resolves its context from `options.experimental_context`
 * (falling back to `defaultContext`), serializes the evidence and writes it through
 * the session's config service at the path derived from the run id and unit key.
 *
 * @param defaultContext Context used when the call carries no `experimental_context`.
 * @returns A tool whose `execute` resolves to a confirmation message, or to an
 *          `Error: ...` string when not running inside a historic-sql ingest session.
 */
export function createEmitHistoricSqlEvidenceTool(defaultContext?: EmitHistoricSqlEvidenceToolContext) {
  const unavailable = 'Error: emit_historic_sql_evidence is only available during historic-sql ingest.';

  return tool({
    description:
      'Record typed historic-SQL evidence for deterministic projection. Use this instead of wiki_write, sl_write_source, sl_edit_source, or context_candidate_write during historic-SQL WorkUnits.',
    inputSchema: emitHistoricSqlEvidenceInputSchema,
    execute: async (input, options): Promise<string> => {
      const context = (options.experimental_context as EmitHistoricSqlEvidenceToolContext | undefined) ?? defaultContext;
      const connectionId = context?.connectionId;
      const ingest = context?.session?.ingest;
      const configService = context?.session?.configService;

      // Refuse to run outside a historic-sql ingest session.
      if (!ingest || ingest.sourceKey !== 'historic-sql') {
        return unavailable;
      }
      if (!configService || !connectionId) {
        return unavailable;
      }

      const unitKey = unitKeyForEvidence(input);
      const evidence = evidenceEnvelope(input, connectionId);
      const content = serializeHistoricSqlEvidence(evidence);
      const targetPath = historicSqlEvidencePath(ingest.runId, unitKey);
      const commitMessage = `Record historic-SQL evidence: ${unitKey}`;

      await configService.writeFile(targetPath, content, SYSTEM_AUTHOR, SYSTEM_EMAIL, commitMessage, {
        skipLock: true,
      });

      const label = evidence.kind === 'table_usage' ? evidence.table : evidence.pattern.slug;
      return `Recorded historic-SQL ${input.kind} evidence for ${label}.`;
    },
  });
}
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
historicSqlEvidenceEnvelopeSchema,
|
||||
historicSqlEvidencePath,
|
||||
historicSqlPatternEvidenceSchema,
|
||||
historicSqlTableUsageEvidenceSchema,
|
||||
} from './evidence.js';
|
||||
|
||||
describe('historic-sql evidence contracts', () => {
  it('validates table usage evidence emitted by table digest WorkUnits', () => {
    const tableUsageEvidence = {
      kind: 'table_usage',
      connectionId: 'warehouse',
      table: 'public.orders',
      rawPath: 'tables/public.orders.json',
      usage: {
        narrative: 'Orders are repeatedly queried for paid/refunded lifecycle analysis.',
        frequencyTier: 'high',
        commonFilters: ['status', 'created_at'],
        commonGroupBys: ['status'],
        commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
        staleSince: null,
      },
    };

    const { table, usage } = historicSqlTableUsageEvidenceSchema.parse(tableUsageEvidence);

    expect(table).toBe('public.orders');
    expect(usage.frequencyTier).toBe('high');
  });

  it('validates pattern evidence emitted by the patterns WorkUnit', () => {
    const patternEvidence = {
      kind: 'pattern',
      connectionId: 'warehouse',
      rawPath: 'patterns-input.json',
      pattern: {
        slug: 'order-lifecycle-analysis',
        title: 'Order Lifecycle Analysis',
        narrative: 'Analysts compare order status changes by customer segment.',
        definitionSql: 'select status, count(*) from public.orders group by status',
        tablesInvolved: ['public.orders', 'public.customers'],
        slRefs: ['orders', 'customers'],
        constituentTemplateIds: ['pg:1', 'pg:2'],
      },
    };

    // Output of the envelope union must still satisfy the pattern-specific schema.
    const viaEnvelope = historicSqlEvidenceEnvelopeSchema.parse(patternEvidence);
    const parsed = historicSqlPatternEvidenceSchema.parse(viaEnvelope);

    expect(parsed.kind).toBe('pattern');
    expect(parsed.pattern.slug).toBe('order-lifecycle-analysis');
  });

  it('builds a stable ignored evidence path from run and WorkUnit identity', () => {
    const evidencePath = historicSqlEvidencePath('run-1', 'historic-sql-table-public-orders');

    expect(evidencePath).toBe('.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json');
  });
});
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
import { z } from 'zod';
|
||||
import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js';
|
||||
|
||||
/**
 * Sanitises a value for use as a single path segment of an evidence file path.
 *
 * Runs of characters outside `[a-zA-Z0-9._-]` collapse to `-`, and leading/trailing
 * dashes are trimmed.
 *
 * @param value Raw identifier (run id or WorkUnit key).
 * @returns The sanitised, non-empty segment.
 * @throws Error when nothing usable remains, or when the segment is `.` or `..`.
 */
function safeEvidenceSegment(value: string): string {
  const segment = value.replace(/[^a-zA-Z0-9._-]+/g, '-').replace(/^-+|-+$/g, '');
  // `.` is an allowed character, so `.` and `..` would otherwise survive sanitising
  // and let the resulting path resolve outside the evidence directory.
  if (!segment || segment === '.' || segment === '..') {
    throw new Error(`Invalid historic-SQL evidence path segment: ${value}`);
  }
  return segment;
}
|
||||
|
||||
/** Evidence describing how one table is used; `usage` follows `tableUsageOutputSchema`. */
export const historicSqlTableUsageEvidenceSchema = z.object({
  kind: z.literal('table_usage'),
  connectionId: z.string().min(1),
  table: z.string().min(1),
  rawPath: z.string().min(1),
  usage: tableUsageOutputSchema,
});
export type HistoricSqlTableUsageEvidence = z.infer<typeof historicSqlTableUsageEvidenceSchema>;

/** Evidence describing one query pattern; `pattern` follows `patternOutputSchema`. */
export const historicSqlPatternEvidenceSchema = z.object({
  kind: z.literal('pattern'),
  connectionId: z.string().min(1),
  rawPath: z.string().min(1),
  pattern: patternOutputSchema,
});
export type HistoricSqlPatternEvidence = z.infer<typeof historicSqlPatternEvidenceSchema>;

/** Union of both evidence shapes, discriminated on `kind`. */
export const historicSqlEvidenceEnvelopeSchema = z.discriminatedUnion('kind', [
  historicSqlTableUsageEvidenceSchema,
  historicSqlPatternEvidenceSchema,
]);
export type HistoricSqlEvidenceEnvelope = z.infer<typeof historicSqlEvidenceEnvelopeSchema>;
|
||||
|
||||
/**
 * Builds the evidence file path for a run and WorkUnit:
 * `.ktx/ingest-evidence/historic-sql/<runId>/<unitKey>.json`, with both
 * identifiers passed through `safeEvidenceSegment`.
 *
 * @throws Error when either identifier does not yield a valid path segment.
 */
export function historicSqlEvidencePath(runId: string, unitKey: string): string {
  const runSegment = safeEvidenceSegment(runId);
  const unitSegment = safeEvidenceSegment(unitKey);
  return ['.ktx/ingest-evidence/historic-sql', runSegment, `${unitSegment}.json`].join('/');
}
|
||||
|
||||
/**
 * Validates evidence against the envelope schema and renders it as JSON with
 * two-space indentation and a trailing newline.
 *
 * @throws When `evidence` does not satisfy `historicSqlEvidenceEnvelopeSchema`.
 */
export function serializeHistoricSqlEvidence(evidence: HistoricSqlEvidenceEnvelope): string {
  const validated = historicSqlEvidenceEnvelopeSchema.parse(evidence);
  const json = JSON.stringify(validated, null, 2);
  return json + '\n';
}
|
||||
|
|
@ -1,48 +1,30 @@
|
|||
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
|
||||
import { mkdtemp } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import type { SourceAdapter } from '../../types.js';
|
||||
import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js';
|
||||
import { pgssBaselinePath } from './stage-pgss.js';
|
||||
import type { HistoricSqlQueryHistoryReader, PostgresPgssReader } from './types.js';
|
||||
import type { HistoricSqlReader } from './types.js';
|
||||
|
||||
async function tempDir(): Promise<string> {
|
||||
return mkdtemp(join(tmpdir(), 'historic-sql-adapter-'));
|
||||
}
|
||||
|
||||
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
|
||||
const target = join(root, relPath);
|
||||
await mkdir(join(target, '..'), { recursive: true });
|
||||
await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8');
|
||||
}
|
||||
|
||||
const sqlAnalysis: SqlAnalysisPort = {
|
||||
async analyzeForFingerprint() {
|
||||
return {
|
||||
fingerprint: 'fp_1',
|
||||
normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
|
||||
tablesTouched: ['analytics.orders'],
|
||||
literalSlots: [{ position: 1, type: 'string', exampleValue: 'paid' }],
|
||||
};
|
||||
throw new Error('legacy analyzeForFingerprint must not be used');
|
||||
},
|
||||
async analyzeBatch() {
|
||||
return new Map();
|
||||
},
|
||||
};
|
||||
|
||||
const reader: HistoricSqlQueryHistoryReader = {
|
||||
async probe() {},
|
||||
async *fetch() {
|
||||
yield {
|
||||
id: 'q1',
|
||||
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'",
|
||||
user: 'analyst',
|
||||
startedAt: '2026-05-04T11:00:00.000Z',
|
||||
endedAt: null,
|
||||
runtimeMs: 10,
|
||||
rowsProduced: 1,
|
||||
success: true,
|
||||
errorMessage: null,
|
||||
};
|
||||
const reader: HistoricSqlReader = {
|
||||
async probe() {
|
||||
return { warnings: [], info: [] };
|
||||
},
|
||||
async *fetchAggregated() {},
|
||||
};
|
||||
|
||||
describe('HistoricSqlSourceAdapter', () => {
|
||||
|
|
@ -50,255 +32,73 @@ describe('HistoricSqlSourceAdapter', () => {
|
|||
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
|
||||
|
||||
expect(adapter.source).toBe('historic-sql');
|
||||
expect(adapter.skillNames).toEqual(['historic_sql_ingest']);
|
||||
expect(adapter.reconcileSkillNames).toEqual(['historic_sql_curator']);
|
||||
expect(adapter.evidenceIndexing).toBe('documents');
|
||||
expect(adapter.triageSupported).toBe(true);
|
||||
expect(adapter.skillNames).toEqual(['historic_sql_table_digest', 'historic_sql_patterns']);
|
||||
expect(adapter.reconcileSkillNames).toEqual([]);
|
||||
expect((adapter as SourceAdapter).evidenceIndexing).toBeUndefined();
|
||||
expect(adapter.triageSupported).toBe(false);
|
||||
});
|
||||
|
||||
it('fetches staged templates through injected reader and SqlAnalysisPort', async () => {
|
||||
it('fetches a unified aggregate snapshot and emits unified WorkUnits', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
const adapter = new HistoricSqlSourceAdapter({
|
||||
sqlAnalysis,
|
||||
reader,
|
||||
queryClient: {},
|
||||
now: () => new Date('2026-05-04T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
await adapter.fetch(
|
||||
{
|
||||
dialect: 'snowflake',
|
||||
windowDays: 90,
|
||||
lastSuccessfulCursor: null,
|
||||
serviceAccountUserPatterns: [],
|
||||
redactionPatterns: [],
|
||||
maxTemplatesPerRun: 5000,
|
||||
},
|
||||
stagedDir,
|
||||
{ connectionId: 'conn_1', sourceKey: 'historic-sql' },
|
||||
);
|
||||
|
||||
await expect(adapter.detect(stagedDir)).resolves.toBe(true);
|
||||
});
|
||||
|
||||
it('reads triage signals from usage.json and metadata properties', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
await writeJson(stagedDir, 'manifest.json', {
|
||||
source: 'historic-sql',
|
||||
connectionId: 'conn_1',
|
||||
dialect: 'snowflake',
|
||||
fetchedAt: '2026-05-04T12:00:00.000Z',
|
||||
windowStart: '2026-02-03T12:00:00.000Z',
|
||||
windowEnd: '2026-05-04T12:00:00.000Z',
|
||||
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
|
||||
templateCount: 1,
|
||||
capped: false,
|
||||
warnings: [],
|
||||
templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }],
|
||||
});
|
||||
await writeJson(stagedDir, 'templates/fp_1/metadata.json', {
|
||||
id: 'fp_1',
|
||||
title: 'snowflake · analytics.orders [fp_1]',
|
||||
path: 'templates/fp_1/page.md',
|
||||
objectType: 'historic_sql_template',
|
||||
lastEditedAt: null,
|
||||
properties: {
|
||||
fingerprint: 'fp_1',
|
||||
sub_cluster_id: null,
|
||||
dialect: 'snowflake',
|
||||
tables_touched: ['analytics.orders'],
|
||||
literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
|
||||
triage_signals: {
|
||||
executions_bucket: 'high',
|
||||
distinct_users_bucket: 'team',
|
||||
error_rate_bucket: 'ok',
|
||||
recency_bucket: 'active',
|
||||
service_account_only: 'false',
|
||||
slot_summary: '1 constant, 0 runtime',
|
||||
},
|
||||
},
|
||||
});
|
||||
await writeFile(join(stagedDir, 'templates/fp_1/page.md'), '# fp_1\n', 'utf-8');
|
||||
await writeJson(stagedDir, 'templates/fp_1/usage.json', {
|
||||
stats: {
|
||||
executions: 20,
|
||||
distinct_users: 3,
|
||||
first_seen: '2026-05-01T00:00:00.000Z',
|
||||
last_seen: '2026-05-04T11:55:00.000Z',
|
||||
p50_runtime_ms: 100,
|
||||
p95_runtime_ms: 200,
|
||||
error_rate: 0,
|
||||
},
|
||||
literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
|
||||
samples: [],
|
||||
});
|
||||
|
||||
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
|
||||
|
||||
await expect(adapter.getTriageSignals(stagedDir, 'fp_1')).resolves.toEqual({
|
||||
objectType: 'historic_sql_template',
|
||||
lastEditedAt: '2026-05-04T11:55:00.000Z',
|
||||
propertyHints: {
|
||||
executions_bucket: 'high',
|
||||
distinct_users_bucket: 'team',
|
||||
error_rate_bucket: 'ok',
|
||||
recency_bucket: 'active',
|
||||
service_account_only: 'false',
|
||||
slot_summary: '1 constant, 0 runtime',
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('dispatches postgres fetches through PGSS staging and writes the baseline only after pull success', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
const baselineRootDir = await tempDir();
|
||||
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
|
||||
const unusedPerExecutionReader: HistoricSqlQueryHistoryReader = {
|
||||
const aggregateReader: HistoricSqlReader = {
|
||||
async probe() {
|
||||
throw new Error('per-execution reader must not be used for postgres');
|
||||
return { warnings: [], info: [] };
|
||||
},
|
||||
async *fetch() {
|
||||
throw new Error('per-execution reader must not be used for postgres');
|
||||
},
|
||||
};
|
||||
const postgresReader: PostgresPgssReader = {
|
||||
async probe() {
|
||||
return { pgServerVersion: 'PostgreSQL 16.4', warnings: [] };
|
||||
},
|
||||
async readSnapshot() {
|
||||
return {
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
deallocCount: 0,
|
||||
rows: [
|
||||
{
|
||||
queryid: '901',
|
||||
userid: '11',
|
||||
username: 'analyst',
|
||||
dbid: '5',
|
||||
database: 'warehouse',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 9,
|
||||
totalExecTime: 90,
|
||||
meanExecTime: 10,
|
||||
totalRows: 18,
|
||||
},
|
||||
],
|
||||
async *fetchAggregated() {
|
||||
yield {
|
||||
templateId: 'pg:1',
|
||||
canonicalSql:
|
||||
'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status',
|
||||
dialect: 'postgres',
|
||||
stats: {
|
||||
executions: 25,
|
||||
distinctUsers: 3,
|
||||
firstSeen: '2026-05-01T00:00:00.000Z',
|
||||
lastSeen: '2026-05-11T00:00:00.000Z',
|
||||
p50RuntimeMs: 10,
|
||||
p95RuntimeMs: 20,
|
||||
errorRate: 0,
|
||||
rowsProduced: 10,
|
||||
},
|
||||
topUsers: [{ user: 'analyst', executions: 25 }],
|
||||
};
|
||||
},
|
||||
};
|
||||
const batchSqlAnalysis: SqlAnalysisPort = {
|
||||
async analyzeForFingerprint() {
|
||||
throw new Error('legacy analyzeForFingerprint must not be used');
|
||||
},
|
||||
async analyzeBatch() {
|
||||
return new Map([
|
||||
[
|
||||
'pg:1',
|
||||
{
|
||||
tablesTouched: ['public.orders', 'public.customers'],
|
||||
columnsByClause: { select: ['status'], join: ['customer_id', 'id'], groupBy: ['status'] },
|
||||
},
|
||||
],
|
||||
]);
|
||||
},
|
||||
};
|
||||
const adapter = new HistoricSqlSourceAdapter({
|
||||
sqlAnalysis,
|
||||
reader: unusedPerExecutionReader,
|
||||
sqlAnalysis: batchSqlAnalysis,
|
||||
reader: aggregateReader,
|
||||
queryClient: {},
|
||||
postgresReader,
|
||||
postgresQueryClient: {
|
||||
async executeQuery() {
|
||||
return { headers: [], rows: [] };
|
||||
},
|
||||
},
|
||||
postgresBaselineRootDir: baselineRootDir,
|
||||
now: () => new Date('2026-05-08T12:00:00.000Z'),
|
||||
now: () => new Date('2026-05-11T00:00:00.000Z'),
|
||||
});
|
||||
|
||||
await adapter.fetch(
|
||||
{
|
||||
dialect: 'postgres',
|
||||
windowDays: 90,
|
||||
lastSuccessfulCursor: null,
|
||||
serviceAccountUserPatterns: [],
|
||||
redactionPatterns: [],
|
||||
maxTemplatesPerRun: 5000,
|
||||
minCalls: 5,
|
||||
},
|
||||
stagedDir,
|
||||
{ connectionId: 'conn_pg', sourceKey: 'historic-sql' },
|
||||
);
|
||||
|
||||
const manifest = JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8')) as {
|
||||
dialect: string;
|
||||
baselineFirstRun: boolean;
|
||||
templates: Array<{ id: string }>;
|
||||
};
|
||||
expect(manifest.dialect).toBe('postgres');
|
||||
expect(manifest.baselineFirstRun).toBe(true);
|
||||
expect(manifest.templates).toEqual([
|
||||
{ id: 'db5_q901', fingerprint: 'fp_1', subClusterId: null, path: 'templates/db5_q901/page.md' },
|
||||
]);
|
||||
await expect(readFile(baselinePath, 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' });
|
||||
|
||||
await adapter.onPullSucceeded({
|
||||
connectionId: 'conn_pg',
|
||||
await adapter.fetch({ dialect: 'postgres', minExecutions: 5 }, stagedDir, {
|
||||
connectionId: 'warehouse',
|
||||
sourceKey: 'historic-sql',
|
||||
syncId: 'sync_pg',
|
||||
trigger: 'scheduled_pull',
|
||||
completedAt: new Date('2026-05-08T12:01:00.000Z'),
|
||||
stagedDir,
|
||||
});
|
||||
|
||||
const baseline = JSON.parse(await readFile(baselinePath, 'utf-8')) as {
|
||||
fetchedAt: string;
|
||||
templates: Record<string, { perUser: Record<string, { calls: number }> }>;
|
||||
};
|
||||
expect(baseline.fetchedAt).toBe('2026-05-08T12:00:00.000Z');
|
||||
expect(baseline.templates.db5_q901.perUser['11'].calls).toBe(9);
|
||||
});
|
||||
|
||||
it('fails postgres fetches clearly when no PGSS reader is configured', async () => {
|
||||
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
|
||||
|
||||
await expect(
|
||||
adapter.fetch(
|
||||
{
|
||||
dialect: 'postgres',
|
||||
windowDays: 90,
|
||||
lastSuccessfulCursor: null,
|
||||
serviceAccountUserPatterns: [],
|
||||
redactionPatterns: [],
|
||||
maxTemplatesPerRun: 5000,
|
||||
minCalls: 5,
|
||||
},
|
||||
await tempDir(),
|
||||
{ connectionId: 'conn_pg', sourceKey: 'historic-sql' },
|
||||
),
|
||||
).rejects.toThrow('Historic SQL Postgres fetch requires deps.postgresReader');
|
||||
});
|
||||
|
||||
it('forwards manifest cursor through onPullSucceeded without changing the SourceAdapter signature', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
await writeJson(stagedDir, 'manifest.json', {
|
||||
source: 'historic-sql',
|
||||
connectionId: 'conn_1',
|
||||
dialect: 'snowflake',
|
||||
fetchedAt: '2026-05-04T12:00:00.000Z',
|
||||
windowStart: '2026-02-03T12:00:00.000Z',
|
||||
windowEnd: '2026-05-04T12:00:00.000Z',
|
||||
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
|
||||
templateCount: 0,
|
||||
capped: false,
|
||||
warnings: [],
|
||||
templates: [],
|
||||
});
|
||||
const onPullSucceeded = vi.fn(async () => {});
|
||||
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {}, onPullSucceeded });
|
||||
const completedAt = new Date('2026-05-04T12:01:00.000Z');
|
||||
|
||||
await adapter.onPullSucceeded({
|
||||
connectionId: 'conn_1',
|
||||
sourceKey: 'historic-sql',
|
||||
syncId: 'sync_1',
|
||||
trigger: 'scheduled_pull',
|
||||
completedAt,
|
||||
stagedDir,
|
||||
});
|
||||
|
||||
expect(onPullSucceeded).toHaveBeenCalledWith({
|
||||
connectionId: 'conn_1',
|
||||
sourceKey: 'historic-sql',
|
||||
syncId: 'sync_1',
|
||||
trigger: 'scheduled_pull',
|
||||
completedAt,
|
||||
stagedDir,
|
||||
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
|
||||
await expect(adapter.detect(stagedDir)).resolves.toBe(true);
|
||||
await expect(adapter.chunk(stagedDir)).resolves.toMatchObject({
|
||||
workUnits: [
|
||||
{ unitKey: 'historic-sql-table-public-customers' },
|
||||
{ unitKey: 'historic-sql-table-public-orders' },
|
||||
{ unitKey: 'historic-sql-patterns-part-0001' },
|
||||
],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,39 +1,16 @@
|
|||
import { readFile } from 'node:fs/promises';
|
||||
import { rm } from 'node:fs/promises';
|
||||
import { join } from 'node:path';
|
||||
import type {
|
||||
ChunkResult,
|
||||
DiffSet,
|
||||
FetchContext,
|
||||
IngestTrigger,
|
||||
ScopeDescriptor,
|
||||
SourceAdapter,
|
||||
TriageSignals,
|
||||
} from '../../types.js';
|
||||
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
|
||||
import type { ChunkResult, DiffSet, FetchContext, ScopeDescriptor, SourceAdapter } from '../../types.js';
|
||||
import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js';
|
||||
import { detectHistoricSqlStagedDir } from './detect.js';
|
||||
import { stageHistoricSqlTemplates } from './stage.js';
|
||||
import {
|
||||
pgssBaselinePath,
|
||||
stagePgStatStatementsTemplates,
|
||||
writePgssBaselineAtomic,
|
||||
type StagePgStatStatementsTemplatesResult,
|
||||
} from './stage-pgss.js';
|
||||
import {
|
||||
historicSqlManifestSchema,
|
||||
historicSqlMetadataSchema,
|
||||
historicSqlPullConfigSchema,
|
||||
historicSqlUsageSchema,
|
||||
type HistoricSqlSourceAdapterDeps,
|
||||
} from './types.js';
|
||||
import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js';
|
||||
import { type HistoricSqlSourceAdapterDeps } from './types.js';
|
||||
|
||||
export class HistoricSqlSourceAdapter implements SourceAdapter {
|
||||
readonly source = 'historic-sql';
|
||||
readonly skillNames = ['historic_sql_ingest'];
|
||||
readonly reconcileSkillNames = ['historic_sql_curator'];
|
||||
readonly evidenceIndexing = 'documents' as const;
|
||||
readonly triageSupported = true;
|
||||
|
||||
private readonly pendingPgssBaselines = new Map<string, StagePgStatStatementsTemplatesResult>();
|
||||
readonly skillNames = ['historic_sql_table_digest', 'historic_sql_patterns'];
|
||||
readonly reconcileSkillNames: string[] = [];
|
||||
readonly triageSupported = false;
|
||||
|
||||
constructor(private readonly deps: HistoricSqlSourceAdapterDeps) {}
|
||||
|
||||
|
|
@ -42,94 +19,27 @@ export class HistoricSqlSourceAdapter implements SourceAdapter {
|
|||
}
|
||||
|
||||
async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
|
||||
const config = historicSqlPullConfigSchema.parse(pullConfig);
|
||||
if (config.dialect === 'postgres') {
|
||||
if (!this.deps.postgresReader) {
|
||||
throw new Error('Historic SQL Postgres fetch requires deps.postgresReader');
|
||||
}
|
||||
const postgresQueryClient = this.deps.postgresQueryClient ?? this.deps.queryClient;
|
||||
if (
|
||||
!postgresQueryClient ||
|
||||
typeof postgresQueryClient !== 'object' ||
|
||||
!('executeQuery' in postgresQueryClient) ||
|
||||
typeof (postgresQueryClient as { executeQuery?: unknown }).executeQuery !== 'function'
|
||||
) {
|
||||
throw new Error('Historic SQL Postgres fetch requires deps.postgresQueryClient with executeQuery(sql, params?)');
|
||||
}
|
||||
const result = await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: ctx.connectionId,
|
||||
queryClient: postgresQueryClient as NonNullable<HistoricSqlSourceAdapterDeps['postgresQueryClient']>,
|
||||
reader: this.deps.postgresReader,
|
||||
sqlAnalysis: this.deps.sqlAnalysis,
|
||||
pullConfig: config,
|
||||
baselinePath: pgssBaselinePath(this.deps.postgresBaselineRootDir, ctx.connectionId),
|
||||
now: this.deps.now?.(),
|
||||
});
|
||||
this.pendingPgssBaselines.set(stagedDir, result);
|
||||
return;
|
||||
}
|
||||
|
||||
await stageHistoricSqlTemplates({
|
||||
await stageHistoricSqlAggregatedSnapshot({
|
||||
stagedDir,
|
||||
connectionId: ctx.connectionId,
|
||||
queryClient: this.deps.queryClient,
|
||||
reader: this.deps.reader,
|
||||
sqlAnalysis: this.deps.sqlAnalysis,
|
||||
pullConfig: config,
|
||||
pullConfig,
|
||||
now: this.deps.now?.(),
|
||||
});
|
||||
if (this.deps.legacyPostgresBaselineRootDir) {
|
||||
await rm(join(this.deps.legacyPostgresBaselineRootDir, ctx.connectionId, ['pgss', 'baseline.json'].join('-')), {
|
||||
force: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
|
||||
return chunkHistoricSqlStagedDir(stagedDir, diffSet);
|
||||
return chunkHistoricSqlUnifiedStagedDir(stagedDir, diffSet);
|
||||
}
|
||||
|
||||
describeScope(stagedDir: string): Promise<ScopeDescriptor> {
|
||||
return describeHistoricSqlScope(stagedDir);
|
||||
}
|
||||
|
||||
async getTriageSignals(stagedDir: string, externalId: string): Promise<TriageSignals> {
|
||||
const manifest = historicSqlManifestSchema.parse(
|
||||
JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8')),
|
||||
);
|
||||
const template = manifest.templates.find((entry) => entry.id === externalId);
|
||||
if (!template) {
|
||||
return {};
|
||||
}
|
||||
const templateDir = template.path.replace(/\/page\.md$/, '');
|
||||
const metadata = historicSqlMetadataSchema.parse(
|
||||
JSON.parse(await readFile(join(stagedDir, templateDir, 'metadata.json'), 'utf-8')),
|
||||
);
|
||||
const usage = historicSqlUsageSchema.parse(
|
||||
JSON.parse(await readFile(join(stagedDir, templateDir, 'usage.json'), 'utf-8')),
|
||||
);
|
||||
|
||||
return {
|
||||
objectType: metadata.objectType,
|
||||
lastEditedAt: usage.stats.last_seen,
|
||||
propertyHints: metadata.properties.triage_signals,
|
||||
};
|
||||
}
|
||||
|
||||
async onPullSucceeded(ctx: {
|
||||
connectionId: string;
|
||||
sourceKey: string;
|
||||
syncId: string;
|
||||
trigger: IngestTrigger;
|
||||
completedAt: Date;
|
||||
stagedDir: string;
|
||||
}): Promise<void> {
|
||||
const manifest = historicSqlManifestSchema.parse(
|
||||
JSON.parse(await readFile(join(ctx.stagedDir, 'manifest.json'), 'utf-8')),
|
||||
);
|
||||
if (manifest.dialect === 'postgres') {
|
||||
const pending = this.pendingPgssBaselines.get(ctx.stagedDir);
|
||||
if (pending) {
|
||||
await writePgssBaselineAtomic(pending.baselinePath, pending.baseline);
|
||||
this.pendingPgssBaselines.delete(ctx.stagedDir);
|
||||
}
|
||||
}
|
||||
await this.deps.onPullSucceeded?.({ ...ctx, nextSuccessfulCursor: manifest.nextSuccessfulCursor });
|
||||
return describeHistoricSqlUnifiedScope(stagedDir);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,304 @@
|
|||
import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import YAML from 'yaml';
|
||||
import { AgentRunnerService } from '../../../agent/index.js';
|
||||
import { initKtxProject, loadKtxProject, type KtxLocalProject } from '../../../project/index.js';
|
||||
import {
|
||||
type SqlAnalysisBatchItem,
|
||||
type SqlAnalysisBatchResult,
|
||||
type SqlAnalysisDialect,
|
||||
type SqlAnalysisPort,
|
||||
} from '../../../sql-analysis/index.js';
|
||||
import { searchLocalSlSources } from '../../../sl/local-sl.js';
|
||||
import { searchLocalKnowledgePages } from '../../../wiki/local-knowledge.js';
|
||||
import { runLocalIngest } from '../../local-ingest.js';
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js';
|
||||
import type { AggregatedTemplate, HistoricSqlReader, HistoricSqlUnifiedPullConfig } from './types.js';
|
||||
|
||||
/**
 * Reader double for the acceptance test: probes cleanly and streams exactly
 * one aggregated Postgres template that joins orders to customers.
 */
class AcceptanceHistoricSqlReader implements HistoricSqlReader {
  /** Reports a probe result with no warnings and no informational notes. */
  async probe() {
    return { warnings: [], info: [] };
  }

  /**
   * Yields the single fixture template. The client, window and config
   * arguments are accepted but never read.
   */
  async *fetchAggregated(
    _client: unknown,
    _window: { start: Date; end: Date },
    _config: HistoricSqlUnifiedPullConfig,
  ): AsyncIterable<AggregatedTemplate> {
    // Built inside the generator so each iteration hands out a fresh object.
    const ordersLifecycle: AggregatedTemplate = {
      templateId: 'pg:orders-lifecycle',
      canonicalSql:
        'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.status = $1 group by o.status, c.segment',
      dialect: 'postgres',
      stats: {
        executions: 42,
        distinctUsers: 4,
        firstSeen: '2026-05-01T00:00:00.000Z',
        lastSeen: '2026-05-11T00:00:00.000Z',
        p50RuntimeMs: 18,
        p95RuntimeMs: 84,
        errorRate: 0,
        rowsProduced: 420,
      },
      topUsers: [{ user: 'analyst@example.test', executions: 42 }],
    };
    yield ordersLifecycle;
  }
}
|
||||
|
||||
/** Throws unless the tool reply, coerced to text, contains the expected acknowledgement. */
function expectEvidenceAck(result: unknown, expectedAck: string, failurePrefix: string): void {
  const text = String(result);
  if (!text.includes(expectedAck)) {
    throw new Error(`${failurePrefix}: ${text}`);
  }
}

/**
 * Agent runner double that replays one scripted `emit_historic_sql_evidence`
 * call per historic-SQL WorkUnit instead of running a model loop.
 */
class HistoricSqlAcceptanceAgentRunner extends AgentRunnerService {
  override runLoop = vi.fn(async (params: any) => {
    // Only ingest WorkUnit loops are scripted; any other loop stops right away.
    const isIngestWorkUnit = params.telemetryTags?.operationName === 'ingest-bundle-wu';

    if (isIngestWorkUnit) {
      const emitEvidence = params.toolSet.emit_historic_sql_evidence;
      if (!emitEvidence?.execute) {
        throw new Error('emit_historic_sql_evidence tool was not available to the historic-SQL WorkUnit');
      }

      // Unit keys that match no case fall through without emitting anything.
      switch (params.telemetryTags.unitKey) {
        case 'historic-sql-table-public-orders': {
          const result = await emitEvidence.execute(
            {
              kind: 'table_usage',
              table: 'public.orders',
              rawPath: 'tables/public.orders.json',
              usage: {
                narrative: 'Analysts repeatedly inspect paid order lifecycle by customer segment.',
                frequencyTier: 'high',
                commonFilters: ['status'],
                commonGroupBys: ['status', 'segment'],
                commonJoins: [{ table: 'public.customers', on: ['customer_id', 'id'] }],
                staleSince: null,
              },
            },
            { toolCallId: 'historic-sql-orders-usage' },
          );
          expectEvidenceAck(result, 'Recorded historic-SQL table_usage evidence', 'Unexpected orders evidence result');
          break;
        }
        case 'historic-sql-table-public-customers': {
          const result = await emitEvidence.execute(
            {
              kind: 'table_usage',
              table: 'public.customers',
              rawPath: 'tables/public.customers.json',
              usage: {
                narrative: 'Customers provide segment context for paid order lifecycle analysis.',
                frequencyTier: 'mid',
                commonFilters: [],
                commonGroupBys: ['segment'],
                commonJoins: [{ table: 'public.orders', on: ['id', 'customer_id'] }],
                staleSince: null,
              },
            },
            { toolCallId: 'historic-sql-customers-usage' },
          );
          expectEvidenceAck(
            result,
            'Recorded historic-SQL table_usage evidence',
            'Unexpected customers evidence result',
          );
          break;
        }
        case 'historic-sql-patterns-part-0001': {
          const result = await emitEvidence.execute(
            {
              kind: 'pattern',
              rawPath: 'patterns-input/part-0001.json',
              pattern: {
                slug: 'paid-order-lifecycle',
                title: 'Paid Order Lifecycle',
                narrative: 'Analysts join orders and customers to compare paid order lifecycle by segment.',
                definitionSql:
                  'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status, c.segment',
                tablesInvolved: ['public.orders', 'public.customers'],
                slRefs: ['orders', 'customers'],
                constituentTemplateIds: ['pg:orders-lifecycle'],
              },
            },
            { toolCallId: 'historic-sql-pattern' },
          );
          expectEvidenceAck(result, 'Recorded historic-SQL pattern evidence', 'Unexpected pattern evidence result');
          break;
        }
        default:
          break;
      }
    }

    return { stopReason: 'natural' as const };
  });

  constructor() {
    // The stub provider returns an empty object as the model.
    super({ llmProvider: { getModel: () => ({}) as never } as never });
  }
}
|
||||
|
||||
/**
 * Builds the SQL-analysis double for the acceptance test. The batch API
 * reports the same orders/customers analysis for every submitted item, and
 * the legacy per-statement API always throws.
 */
function acceptanceSqlAnalysis(): SqlAnalysisPort {
  const analyzeBatch = vi.fn(
    async (
      items: SqlAnalysisBatchItem[],
      _dialect: SqlAnalysisDialect,
    ): Promise<Map<string, SqlAnalysisBatchResult>> => {
      const resultsById = new Map<string, SqlAnalysisBatchResult>();
      for (const item of items) {
        // A fresh result object per item, keyed by the item's id.
        resultsById.set(item.id, {
          tablesTouched: ['public.orders', 'public.customers'],
          columnsByClause: {
            select: ['status', 'segment'],
            where: ['status'],
            join: ['customer_id', 'id'],
            groupBy: ['status', 'segment'],
          },
        });
      }
      return resultsById;
    },
  );

  return {
    analyzeForFingerprint: async () => {
      throw new Error('analyzeForFingerprint should not be used by unified historic-SQL ingest');
    },
    analyzeBatch,
  };
}
|
||||
|
||||
/**
 * Overwrites the project's `ktx.yaml` with a historic-SQL enabled Postgres
 * connection named `warehouse`, reloads the project, and seeds a schema shard
 * describing `public.orders` and `public.customers`.
 *
 * @param project - Project whose `projectDir` receives the new `ktx.yaml`.
 * @returns The project as reloaded from disk after the config rewrite.
 */
async function writeHistoricSqlProject(project: KtxLocalProject): Promise<KtxLocalProject> {
  // YAML nesting is carried by indentation, so each level is indented two
  // spaces deeper than its parent. With every line at the same depth,
  // `warehouse`, `driver`, `historicSql`, ... would all be siblings under
  // `connections` and the connection itself would be empty.
  const configYaml = [
    'project: warehouse',
    'connections:',
    '  warehouse:',
    '    driver: postgres',
    '    historicSql:',
    '      enabled: true',
    '      dialect: postgres',
    '      minExecutions: 2',
    'ingest:',
    '  adapters:',
    '    - historic-sql',
    '  embeddings:',
    '    backend: deterministic',
    'storage:',
    '  state: sqlite',
    '  search: sqlite-fts5',
    '  git:',
    '    auto_commit: false',
    '    author: KTX Test <system@ktx.local>',
    '',
  ].join('\n');
  await writeFile(join(project.projectDir, 'ktx.yaml'), configYaml, 'utf-8');

  // Reload so the returned project reflects the config that was just written.
  const loaded = await loadKtxProject({ projectDir: project.projectDir });

  const schemaShard = {
    tables: {
      orders: {
        table: 'public.orders',
        columns: [
          { name: 'id', type: 'string' },
          { name: 'status', type: 'string' },
          { name: 'customer_id', type: 'string' },
        ],
      },
      customers: {
        table: 'public.customers',
        columns: [
          { name: 'id', type: 'string' },
          { name: 'segment', type: 'string' },
        ],
      },
    },
  };
  await loaded.fileStore.writeFile(
    'semantic-layer/warehouse/_schema/public.yaml',
    YAML.stringify(schemaShard),
    'KTX Test',
    'system@ktx.local',
    'Seed schema shard',
  );

  return loaded;
}
|
||||
|
||||
describe('historic-SQL local ingest retrieval acceptance', () => {
  // Fresh scratch directory per test, removed again afterwards.
  let scratchDir: string;

  beforeEach(async () => {
    scratchDir = await mkdtemp(join(tmpdir(), 'ktx-historic-sql-acceptance-'));
  });

  afterEach(async () => {
    await rm(scratchDir, { recursive: true, force: true });
  });

  it('projects table and pattern evidence into semantic-layer and wiki retrieval surfaces', async () => {
    const searchQuery = 'paid order lifecycle';

    // Arrange: a project with the historic-SQL connection plus scripted doubles.
    const project = await writeHistoricSqlProject(
      await initKtxProject({ projectDir: join(scratchDir, 'project'), projectName: 'warehouse' }),
    );
    const sqlAnalysis = acceptanceSqlAnalysis();
    const agentRunner = new HistoricSqlAcceptanceAgentRunner();
    const adapter = new HistoricSqlSourceAdapter({
      reader: new AcceptanceHistoricSqlReader(),
      queryClient: {},
      sqlAnalysis,
      now: () => new Date('2026-05-11T00:00:00.000Z'),
    });

    // Act: run the full local ingest for the historic-SQL adapter.
    const ingest = await runLocalIngest({
      project,
      adapters: [adapter],
      adapter: 'historic-sql',
      connectionId: 'warehouse',
      jobId: 'historic-sql-retrieval-acceptance',
      agentRunner,
    });

    // Assert: one batch analysis, three successful WorkUnits.
    expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1);
    expect(ingest.result.failedWorkUnits).toEqual([]);
    expect(ingest.result.workUnitCount).toBe(3);
    expect(agentRunner.runLoop).toHaveBeenCalledTimes(3);

    const postProcessor = ingest.report.body.postProcessor;
    expect(postProcessor).toBeDefined();
    if (!postProcessor) {
      throw new Error('Expected historic-SQL post-processor result');
    }
    expect(postProcessor).toMatchObject({
      sourceKey: 'historic-sql',
      status: 'success',
      result: {
        tableUsageMerged: 2,
        patternPagesWritten: 1,
      },
    });
    expect(postProcessor.touchedSources).toEqual(
      expect.arrayContaining([
        { connectionId: 'warehouse', sourceName: 'customers' },
        { connectionId: 'warehouse', sourceName: 'orders' },
      ]),
    );

    // Assert: the evidence landed in the schema shard and the pattern page.
    const schemaShardPath = join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml');
    const patternPagePath = join(project.projectDir, 'knowledge/global/historic-sql/paid-order-lifecycle.md');
    await expect(readFile(schemaShardPath, 'utf-8')).resolves.toContain(
      'Analysts repeatedly inspect paid order lifecycle by customer segment.',
    );
    await expect(readFile(patternPagePath, 'utf-8')).resolves.toContain('Paid Order Lifecycle');

    // Assert: both retrieval surfaces find the new content after a reload.
    const reloaded = await loadKtxProject({ projectDir: project.projectDir });
    const semanticLayerHits = searchLocalSlSources(reloaded, {
      connectionId: 'warehouse',
      query: searchQuery,
      limit: 5,
    });
    await expect(semanticLayerHits).resolves.toEqual(
      expect.arrayContaining([
        expect.objectContaining({
          name: 'orders',
          frequencyTier: 'high',
          snippet: expect.stringContaining('<mark>'),
          matchReasons: expect.arrayContaining(['lexical']),
        }),
      ]),
    );
    const knowledgeHits = searchLocalKnowledgePages(reloaded, { query: searchQuery, userId: 'local', limit: 5 });
    await expect(knowledgeHits).resolves.toEqual([
      expect.objectContaining({
        key: 'historic-sql/paid-order-lifecycle',
        summary: 'Paid Order Lifecycle',
        matchReasons: expect.arrayContaining(['lexical']),
      }),
    ]);
  });
});
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES,
|
||||
isHistoricSqlPatternInputShardPath,
|
||||
serializedStagedPatternsInputByteLength,
|
||||
splitHistoricSqlPatternInputs,
|
||||
} from './pattern-inputs.js';
|
||||
import type { StagedPatternsInput } from './types.js';
|
||||
|
||||
type PatternTemplate = StagedPatternsInput['templates'][number];
|
||||
|
||||
/**
 * Builds a pattern-template fixture with fixed usage buckets and the Postgres
 * dialect; only the id, touched tables and (optionally) the SQL vary.
 */
function template(id: string, tablesTouched: string[], canonicalSql = 'select 1'): PatternTemplate {
  const fixture: PatternTemplate = {
    id,
    canonicalSql,
    tablesTouched,
    executionsBucket: '10-100',
    distinctUsersBucket: '2-5',
    dialect: 'postgres',
  };
  return fixture;
}
|
||||
|
||||
describe('historic-SQL pattern input sharding', () => {
  // Projects templates to their ids, for compact ordering assertions.
  const idsOf = (templates: readonly PatternTemplate[]): string[] => templates.map((entry) => entry.id);

  it('keeps the audit input complete while sharding only cross-table pattern candidates', () => {
    const maxBytes = 760;
    const padding = 'x'.repeat(260);
    const largeSql = `select * from public.orders join public.customers on true where marker = '${padding}'`;
    const input: StagedPatternsInput = {
      templates: [
        template('single-table-orders', ['public.orders']),
        template('orders-customers-2', ['public.orders', 'public.customers'], largeSql),
        template('orders-customers-1', ['public.customers', 'public.orders'], largeSql),
        template('orders-customers-payments', ['public.orders', 'public.customers', 'public.payments'], largeSql),
      ],
    };

    const { auditInput, shards, warnings } = splitHistoricSqlPatternInputs(input, { maxBytes });
    const shardedTemplates = shards.flatMap((shard) => shard.input.templates);

    // The audit input keeps every template, ordered by id.
    expect(idsOf(auditInput.templates)).toEqual([
      'orders-customers-1',
      'orders-customers-2',
      'orders-customers-payments',
      'single-table-orders',
    ]);
    expect(shards.length).toBeGreaterThan(1);
    expect(shards.map((shard) => shard.path)).toEqual([
      'patterns-input/part-0001.json',
      'patterns-input/part-0002.json',
      'patterns-input/part-0003.json',
    ]);
    // Shards list the widest table set first and never the single-table template.
    expect(idsOf(shardedTemplates)).toEqual(['orders-customers-payments', 'orders-customers-1', 'orders-customers-2']);
    expect(shards.every((shard) => shard.byteLength <= maxBytes)).toBe(true);
    expect(shardedTemplates.some((entry) => entry.id === 'single-table-orders')).toBe(false);
    expect(warnings).toEqual([]);
  });

  it('omits a single oversized template from shards and reports a manifest warning', () => {
    const oversized = template(
      'oversized-cross-table',
      ['public.orders', 'public.customers'],
      `select * from public.orders join public.customers on true where payload = '${'x'.repeat(500)}'`,
    );
    const input: StagedPatternsInput = { templates: [oversized] };

    const result = splitHistoricSqlPatternInputs(input, { maxBytes: 240 });

    expect(idsOf(result.auditInput.templates)).toEqual(['oversized-cross-table']);
    expect(result.shards).toEqual([]);
    expect(result.warnings).toEqual(['patterns_input_template_too_large:oversized-cross-table']);
  });

  it('recognizes only generated pattern shard paths', () => {
    const generated = ['patterns-input/part-0001.json', 'patterns-input/part-0012.json'];
    const foreign = ['patterns-input.json', 'patterns-input/part-1.json', 'patterns-input/readme.md'];

    for (const path of generated) {
      expect(isHistoricSqlPatternInputShardPath(path)).toBe(true);
    }
    for (const path of foreign) {
      expect(isHistoricSqlPatternInputShardPath(path)).toBe(false);
    }
  });

  it('uses a production byte budget below read_raw_file maximum size', () => {
    const emptyInputBytes = serializedStagedPatternsInputByteLength({ templates: [] });

    expect(HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES).toBeLessThan(120_000);
    expect(emptyInputBytes).toBeGreaterThan(0);
  });
});
|
||||
|
|
@ -0,0 +1,99 @@
|
|||
import { Buffer } from 'node:buffer';
|
||||
import type { StagedPatternsInput } from './types.js';
|
||||
|
||||
/** Directory, relative to the staged root, under which pattern shards are written. */
export const HISTORIC_SQL_PATTERN_WORKUNIT_DIR = 'patterns-input';

/**
 * Default byte budget for one serialized shard. The accompanying spec requires
 * this to stay below 120,000 bytes.
 */
export const HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES = 110_000;

/**
 * Matches shard paths of the form `patterns-input/part-NNNN.json`.
 *
 * Shard numbers are zero-padded to at least four digits, not truncated to
 * four, so the pattern accepts four or more digits. Otherwise a shard numbered
 * 10000 or above would be generated but not recognized.
 */
export const HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE = /^patterns-input\/part-\d{4,}\.json$/;

/** One template entry of a staged patterns input. */
type PatternTemplate = StagedPatternsInput['templates'][number];

/** A single shard: where it is written, what it contains, and its serialized size. */
export interface HistoricSqlPatternInputShard {
  /** Shard path relative to the staged root. */
  path: string;
  /** Templates assigned to this shard. */
  input: StagedPatternsInput;
  /** UTF-8 byte length of the serialized `input`. */
  byteLength: number;
}

/** Outcome of splitting a staged patterns input into shards. */
export interface HistoricSqlPatternInputSplitResult {
  /** Every input template, ordered by id. */
  auditInput: StagedPatternsInput;
  /** Shards holding the templates that touch two or more tables. */
  shards: HistoricSqlPatternInputShard[];
  /** One entry per template that was too large to fit in any shard. */
  warnings: string[];
}

/** Options accepted by the splitter. */
export interface HistoricSqlPatternInputSplitOptions {
  /** Per-shard byte budget; defaults to `HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES`. */
  maxBytes?: number;
}
|
||||
|
||||
/** Returns true when `path` matches the generated pattern-shard path pattern. */
export function isHistoricSqlPatternInputShardPath(path: string): boolean {
  const match = HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE.exec(path);
  return match !== null;
}
|
||||
|
||||
/** Serializes the input as two-space indented JSON followed by one trailing newline. */
export function serializeStagedPatternsInput(input: StagedPatternsInput): string {
  const prettyJson = JSON.stringify(input, null, 2);
  return prettyJson + '\n';
}
|
||||
|
||||
/** Returns the UTF-8 byte length of the input's serialized form. */
export function serializedStagedPatternsInputByteLength(input: StagedPatternsInput): number {
  const serialized = serializeStagedPatternsInput(input);
  return Buffer.byteLength(serialized, 'utf-8');
}
|
||||
|
||||
/** Returns a copy of `templates` ordered by id; the input array is left untouched. */
function sortedAuditTemplates(templates: readonly PatternTemplate[]): PatternTemplate[] {
  const byId = (left: PatternTemplate, right: PatternTemplate): number => left.id.localeCompare(right.id);
  return templates.slice().sort(byId);
}
|
||||
|
||||
/** Orders two strings by UTF-16 code units, the order `Array.prototype.sort()` uses by default. */
function compareByCodeUnits(left: string, right: string): number {
  if (left === right) return 0;
  return left < right ? -1 : 1;
}

/**
 * Selects the templates that touch at least two tables and orders them for
 * sharding.
 *
 * Each returned template is a shallow copy whose `tablesTouched` is sorted; the
 * input templates are not mutated. Ordering is: more tables first, then by
 * table signature, then by id.
 *
 * @param templates - All staged templates, in any order.
 * @returns The cross-table candidates in sharding order.
 */
function sortedPatternCandidates(templates: readonly PatternTemplate[]): PatternTemplate[] {
  return templates
    .filter((template) => template.tablesTouched.length >= 2)
    .map((template) => ({ ...template, tablesTouched: [...template.tablesTouched].sort() }))
    .sort((left, right) => {
      const cardinality = right.tablesTouched.length - left.tablesTouched.length;
      if (cardinality !== 0) return cardinality;
      // The signature is compared by code units, not by locale collation.
      // Collation may treat the NUL separator as ignorable, which would make
      // distinct table sets such as ['a', 'bc'] and ['ab', 'c'] compare equal
      // and interleave. Code-unit order also matches the default sort applied
      // to `tablesTouched` above.
      const tableSignature = compareByCodeUnits(left.tablesTouched.join('\0'), right.tablesTouched.join('\0'));
      if (tableSignature !== 0) return tableSignature;
      return left.id.localeCompare(right.id);
    });
}
|
||||
|
||||
/** Builds the shard path for `index`, zero-padding the number to at least four digits. */
function shardPath(index: number): string {
  const ordinal = index.toString().padStart(4, '0');
  return [HISTORIC_SQL_PATTERN_WORKUNIT_DIR, `part-${ordinal}.json`].join('/');
}
|
||||
|
||||
/**
 * Splits a staged patterns input into byte-bounded shards.
 *
 * The audit input always contains every template. Only the candidates returned
 * by `sortedPatternCandidates` are sharded, greedily in that order: a shard is
 * closed as soon as adding the next candidate would exceed the budget. A
 * candidate that exceeds the budget on its own is left out of all shards and
 * reported as a `patterns_input_template_too_large:<id>` warning.
 *
 * @param input - Staged templates to split.
 * @param options - Optional per-shard byte budget override.
 * @returns The audit input, the shards numbered from 1, and any warnings.
 */
export function splitHistoricSqlPatternInputs(
  input: StagedPatternsInput,
  options: HistoricSqlPatternInputSplitOptions = {},
): HistoricSqlPatternInputSplitResult {
  const budget = options.maxBytes ?? HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES;
  const auditInput: StagedPatternsInput = { templates: sortedAuditTemplates(input.templates) };
  const sizeOf = (templates: PatternTemplate[]): number => serializedStagedPatternsInputByteLength({ templates });

  const shards: HistoricSqlPatternInputShard[] = [];
  const warnings: string[] = [];
  let pending: PatternTemplate[] = [];

  // Turns the pending templates into the next numbered shard, if there are any.
  const closeShard = (): void => {
    if (pending.length === 0) {
      return;
    }
    const shardInput: StagedPatternsInput = { templates: pending };
    shards.push({
      path: shardPath(shards.length + 1),
      input: shardInput,
      byteLength: serializedStagedPatternsInputByteLength(shardInput),
    });
    pending = [];
  };

  for (const candidate of sortedPatternCandidates(input.templates)) {
    // A candidate that cannot fit even alone is skipped with a warning.
    if (sizeOf([candidate]) > budget) {
      warnings.push(`patterns_input_template_too_large:${candidate.id}`);
      continue;
    }

    const wouldOverflow = pending.length > 0 && sizeOf([...pending, candidate]) > budget;
    if (wouldOverflow) {
      closeShard();
    }
    pending.push(candidate);
  }

  closeShard();

  return { auditInput, shards, warnings };
}
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import YAML from 'yaml';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { HistoricSqlProjectionPostProcessor } from './post-processor.js';
|
||||
|
||||
/** Creates a uniquely named working directory under the OS temp dir and returns its path. */
async function tempWorkdir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-post-processor-');
  return mkdtemp(prefix);
}
|
||||
|
||||
/**
 * Writes `value` as two-space indented JSON with a trailing newline to
 * `relPath` under `root`, creating missing parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  const parentDir = join(target, '..');
  await mkdir(parentDir, { recursive: true });
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(target, payload, 'utf-8');
}
|
||||
|
||||
// End-to-end check of the post-processor against a real temp directory: seed a
// schema shard, a raw-source manifest and one table_usage evidence file, run the
// processor, then confirm the usage narrative landed in the shard on disk.
describe('HistoricSqlProjectionPostProcessor', () => {
  it('projects current run evidence before the ingest squash commit', async () => {
    const root = await tempWorkdir();
    const shardFile = join(root, 'semantic-layer/warehouse/_schema/public.yaml');

    // Schema shard with a single table that has no usage section yet.
    await mkdir(join(root, 'semantic-layer/warehouse/_schema'), { recursive: true });
    const seedShard = YAML.stringify({
      tables: { orders: { table: 'public.orders', columns: [{ name: 'id', type: 'string' }] } },
    });
    await writeFile(shardFile, seedShard, 'utf-8');

    // Raw-source manifest and table payload for sync-1.
    const manifest = {
      source: 'historic-sql',
      connectionId: 'warehouse',
      dialect: 'postgres',
      fetchedAt: '2026-05-11T00:00:00.000Z',
      windowStart: '2026-02-10T00:00:00.000Z',
      windowEnd: '2026-05-11T00:00:00.000Z',
      snapshotRowCount: 1,
      touchedTableCount: 1,
      parseFailures: 0,
      warnings: [],
      probeWarnings: [],
      staleArchiveAfterDays: 90,
    };
    await writeJson(root, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', manifest);
    await writeJson(root, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', {
      table: 'public.orders',
    });

    // Evidence produced by run-1 that the processor is expected to project.
    const evidence = {
      kind: 'table_usage',
      connectionId: 'warehouse',
      table: 'public.orders',
      rawPath: 'tables/public.orders.json',
      usage: {
        narrative: 'Orders are repeatedly queried by lifecycle status.',
        frequencyTier: 'high',
        commonFilters: ['status'],
        commonJoins: [],
        staleSince: null,
      },
    };
    await writeJson(root, '.ktx/ingest-evidence/historic-sql/run-1/orders.json', evidence);

    const processor = new HistoricSqlProjectionPostProcessor();
    const outcome = await processor.run({
      connectionId: 'warehouse',
      sourceKey: 'historic-sql',
      syncId: 'sync-1',
      jobId: 'job-1',
      runId: 'run-1',
      workdir: root,
      parseArtifacts: null,
    });

    expect(outcome.errors).toEqual([]);
    expect(outcome.warnings).toEqual([]);
    expect(outcome.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]);
    expect(outcome.result).toMatchObject({ tableUsageMerged: 1 });

    const shardAfter = readFile(shardFile, 'utf-8');
    await expect(shardAfter).resolves.toContain('Orders are repeatedly queried by lifecycle status.');
  });
});
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
import type { IngestBundlePostProcessorInput, IngestBundlePostProcessorPort, IngestBundlePostProcessorResult } from '../../ports.js';
|
||||
import { createSimpleGit } from '../../../core/git-env.js';
|
||||
import { projectHistoricSqlEvidence } from './projection.js';
|
||||
|
||||
/**
 * Stages and commits files changed by the historic-SQL projection.
 *
 * Only paths reported by `git status` that start with `semantic-layer/` or
 * `knowledge/global/historic-sql/` are staged. The function is a no-op when
 * `workdir` is not a git repository, when no matching path is dirty, or when
 * nothing ends up staged.
 *
 * NOTE(review): the commit is not path-restricted, so anything already staged
 * in the index before this call would presumably be included — confirm callers
 * always start from a clean index.
 *
 * @param workdir - Working directory handed to `createSimpleGit`.
 */
async function commitProjectionChanges(workdir: string): Promise<void> {
  const repo = createSimpleGit(workdir);

  // A failed repo check is treated the same as "not a repository".
  const insideRepo = await repo.checkIsRepo().catch(() => false);
  if (!insideRepo) {
    return;
  }

  const projectedPrefixes = ['semantic-layer/', 'knowledge/global/historic-sql/'];
  const summary = await repo.status();
  const projectedPaths: string[] = [];
  for (const entry of summary.files) {
    if (projectedPrefixes.some((prefix) => entry.path.startsWith(prefix))) {
      projectedPaths.push(entry.path);
    }
  }
  if (projectedPaths.length === 0) {
    return;
  }

  await repo.add(projectedPaths);

  // Re-check the index: staging may have produced no effective change.
  const stagedNames = await repo.diff(['--cached', '--name-only']);
  if (stagedNames.trim() === '') {
    return;
  }

  await repo.commit('Project historic SQL evidence', { '--author': 'System User <system@example.com>' });
}
|
||||
|
||||
/**
 * Ingest-bundle post-processor that projects historic-SQL evidence into the
 * working tree and then commits the resulting changes.
 */
export class HistoricSqlProjectionPostProcessor implements IngestBundlePostProcessorPort {
  /**
   * Runs the projection for the given bundle and commits whatever it changed.
   *
   * @param input - Post-processor input; only `workdir`, `connectionId`,
   *   `syncId` and `runId` are read here.
   * @returns The projection as `result`, with its warnings and touched
   *   sources surfaced alongside; `errors` is always empty.
   */
  async run(input: IngestBundlePostProcessorInput): Promise<IngestBundlePostProcessorResult> {
    const { workdir, connectionId, syncId, runId } = input;

    const projection = await projectHistoricSqlEvidence({ workdir, connectionId, syncId, runId });

    // Commit only after the projection finished writing its files.
    await commitProjectionChanges(workdir);

    const { warnings, touchedSources } = projection;
    return { result: projection, warnings, errors: [], touchedSources };
  }
}
|
||||
|
|
@ -4,7 +4,7 @@ import {
|
|||
HistoricSqlGrantsMissingError,
|
||||
HistoricSqlVersionUnsupportedError,
|
||||
} from './errors.js';
|
||||
import { PostgresPgssQueryHistoryReader } from './postgres-pgss-query-history-reader.js';
|
||||
import { PostgresPgssReader } from './postgres-pgss-reader.js';
|
||||
|
||||
interface FakeQueryResult {
|
||||
headers: string[];
|
||||
|
|
@ -35,7 +35,7 @@ function executedSql(client: ReturnType<typeof queryClient>, index: number): str
|
|||
return call[0];
|
||||
}
|
||||
|
||||
describe('PostgresPgssQueryHistoryReader', () => {
|
||||
describe('PostgresPgssReader aggregate path', () => {
|
||||
it('probes version, extension presence, grants, and tracking state', async () => {
|
||||
const client = queryClient([
|
||||
{
|
||||
|
|
@ -47,11 +47,12 @@ describe('PostgresPgssQueryHistoryReader', () => {
|
|||
{ headers: ['track'], rows: [['top']] },
|
||||
{ headers: ['max'], rows: [['5000']] },
|
||||
]);
|
||||
const reader = new PostgresPgssQueryHistoryReader();
|
||||
const reader = new PostgresPgssReader();
|
||||
|
||||
await expect(reader.probe(client)).resolves.toEqual({
|
||||
pgServerVersion: 'PostgreSQL 16.4 on x86_64-apple-darwin',
|
||||
warnings: [],
|
||||
info: [],
|
||||
});
|
||||
|
||||
expect(executedSql(client, 0)).toContain("current_setting('server_version_num')::int");
|
||||
|
|
@ -69,12 +70,8 @@ describe('PostgresPgssQueryHistoryReader', () => {
|
|||
headers: ['server_version_num', 'server_version'],
|
||||
rows: [[130012, 'PostgreSQL 13.12']],
|
||||
},
|
||||
{
|
||||
headers: ['stats_reset', 'dealloc'],
|
||||
rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]],
|
||||
},
|
||||
]);
|
||||
const reader = new PostgresPgssQueryHistoryReader();
|
||||
const reader = new PostgresPgssReader();
|
||||
|
||||
const promise = reader.probe(client);
|
||||
await expect(promise).rejects.toMatchObject({
|
||||
|
|
@ -95,7 +92,7 @@ describe('PostgresPgssQueryHistoryReader', () => {
|
|||
},
|
||||
new Error('relation "pg_stat_statements" does not exist'),
|
||||
]);
|
||||
const reader = new PostgresPgssQueryHistoryReader();
|
||||
const reader = new PostgresPgssReader();
|
||||
|
||||
const promise = reader.probe(client);
|
||||
await expect(promise).rejects.toMatchObject({
|
||||
|
|
@ -113,7 +110,7 @@ describe('PostgresPgssQueryHistoryReader', () => {
|
|||
},
|
||||
new Error('pg_stat_statements must be loaded via shared_preload_libraries'),
|
||||
]);
|
||||
const reader = new PostgresPgssQueryHistoryReader();
|
||||
const reader = new PostgresPgssReader();
|
||||
|
||||
const promise = reader.probe(client);
|
||||
await expect(promise).rejects.toMatchObject({
|
||||
|
|
@ -134,7 +131,7 @@ describe('PostgresPgssQueryHistoryReader', () => {
|
|||
{ headers: ['?column?'], rows: [[1]] },
|
||||
{ headers: ['has_role'], rows: [[false]] },
|
||||
]);
|
||||
const reader = new PostgresPgssQueryHistoryReader();
|
||||
const reader = new PostgresPgssReader();
|
||||
|
||||
const promise = reader.probe(client);
|
||||
await expect(promise).rejects.toMatchObject({
|
||||
|
|
@ -156,17 +153,18 @@ describe('PostgresPgssQueryHistoryReader', () => {
|
|||
{ headers: ['track'], rows: [['none']] },
|
||||
{ headers: ['max'], rows: [['5000']] },
|
||||
]);
|
||||
const reader = new PostgresPgssQueryHistoryReader();
|
||||
const reader = new PostgresPgssReader();
|
||||
|
||||
await expect(reader.probe(client)).resolves.toEqual({
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
warnings: [
|
||||
"pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config",
|
||||
],
|
||||
info: [],
|
||||
});
|
||||
});
|
||||
|
||||
it('warns when pg_stat_statements.max is below the recommended floor', async () => {
|
||||
it('returns an info note when pg_stat_statements.max is below the recommended floor', async () => {
|
||||
const client = queryClient([
|
||||
{
|
||||
headers: ['server_version_num', 'server_version'],
|
||||
|
|
@ -177,105 +175,68 @@ describe('PostgresPgssQueryHistoryReader', () => {
|
|||
{ headers: ['track'], rows: [['top']] },
|
||||
{ headers: ['max'], rows: [['1000']] },
|
||||
]);
|
||||
const reader = new PostgresPgssQueryHistoryReader();
|
||||
const reader = new PostgresPgssReader();
|
||||
|
||||
await expect(reader.probe(client)).resolves.toEqual({
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
warnings: [
|
||||
warnings: [],
|
||||
info: [
|
||||
'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn',
|
||||
],
|
||||
});
|
||||
});
|
||||
|
||||
it('reads a parameterized pg_stat_statements snapshot and stats info', async () => {
|
||||
const client = queryClient([
|
||||
{
|
||||
headers: [
|
||||
'queryid',
|
||||
'userid',
|
||||
'username',
|
||||
'dbid',
|
||||
'database',
|
||||
'query',
|
||||
'calls',
|
||||
'total_exec_time',
|
||||
'mean_exec_time',
|
||||
'total_rows',
|
||||
],
|
||||
it('aggregates pg_stat_statements rows by queryid and query', async () => {
|
||||
const executeQuery = vi.fn(async (sql: string, params?: unknown[]) => {
|
||||
if (sql.includes('pg_stat_statements_info')) {
|
||||
return { headers: ['stats_reset', 'dealloc'], rows: [['2026-05-01T00:00:00.000Z', 1]] };
|
||||
}
|
||||
expect(sql).toContain('GROUP BY queryid, query');
|
||||
expect(sql).toContain('HAVING SUM(calls) >= $1');
|
||||
expect(params).toEqual([5]);
|
||||
return {
|
||||
headers: ['template_id', 'canonical_sql', 'executions', 'distinct_users', 'mean_ms', 'rows_produced', 'top_users'],
|
||||
rows: [
|
||||
[
|
||||
'922337203685477580',
|
||||
'16384',
|
||||
'analyst',
|
||||
'16385',
|
||||
'warehouse',
|
||||
'SELECT count(*) FROM public.orders WHERE status = $1',
|
||||
'123',
|
||||
'select status from public.orders',
|
||||
'42',
|
||||
'2100.5',
|
||||
'50.0119',
|
||||
'9001',
|
||||
],
|
||||
[
|
||||
'922337203685477581',
|
||||
'16386',
|
||||
'unknown',
|
||||
'16385',
|
||||
'warehouse',
|
||||
'SELECT * FROM public.customers WHERE id = $1',
|
||||
5,
|
||||
30,
|
||||
6,
|
||||
5,
|
||||
'3',
|
||||
'11.5',
|
||||
'100',
|
||||
JSON.stringify([{ user: 'analyst', executions: 40 }]),
|
||||
],
|
||||
],
|
||||
},
|
||||
{
|
||||
headers: ['stats_reset', 'dealloc'],
|
||||
rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]],
|
||||
},
|
||||
]);
|
||||
const reader = new PostgresPgssQueryHistoryReader();
|
||||
|
||||
await expect(reader.readSnapshot(client, { minCalls: 5, maxTemplates: 500 })).resolves.toEqual({
|
||||
statsResetAt: '2026-05-01T00:00:00.000Z',
|
||||
deallocCount: 7,
|
||||
rows: [
|
||||
{
|
||||
queryid: '922337203685477580',
|
||||
userid: '16384',
|
||||
username: 'analyst',
|
||||
dbid: '16385',
|
||||
database: 'warehouse',
|
||||
query: 'SELECT count(*) FROM public.orders WHERE status = $1',
|
||||
calls: 42,
|
||||
totalExecTime: 2100.5,
|
||||
meanExecTime: 50.0119,
|
||||
totalRows: 9001,
|
||||
},
|
||||
{
|
||||
queryid: '922337203685477581',
|
||||
userid: '16386',
|
||||
username: 'unknown',
|
||||
dbid: '16385',
|
||||
database: 'warehouse',
|
||||
query: 'SELECT * FROM public.customers WHERE id = $1',
|
||||
calls: 5,
|
||||
totalExecTime: 30,
|
||||
meanExecTime: 6,
|
||||
totalRows: 5,
|
||||
},
|
||||
],
|
||||
};
|
||||
});
|
||||
|
||||
const snapshotSql = executedSql(client, 0);
|
||||
expect(snapshotSql).toContain('FROM pg_stat_statements s');
|
||||
expect(snapshotSql).toContain('LEFT JOIN pg_roles');
|
||||
expect(snapshotSql).toContain('LEFT JOIN pg_database');
|
||||
expect(snapshotSql).toContain('WHERE s.toplevel = true');
|
||||
expect(snapshotSql).toContain('AND s.calls >= $1');
|
||||
expect(snapshotSql).toContain('ORDER BY s.total_exec_time DESC');
|
||||
expect(snapshotSql).toContain('LIMIT $2');
|
||||
expect(client.executeQuery.mock.calls[0]?.[1]).toEqual([5, 500]);
|
||||
expect(executedSql(client, 1)).toBe('SELECT stats_reset, dealloc FROM pg_stat_statements_info');
|
||||
const reader = new PostgresPgssReader();
|
||||
const rows = [];
|
||||
for await (const row of reader.fetchAggregated(
|
||||
{ executeQuery },
|
||||
{ start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') },
|
||||
{ dialect: 'postgres', minExecutions: 5, windowDays: 90, concurrency: 12, filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 },
|
||||
)) {
|
||||
rows.push(row);
|
||||
}
|
||||
|
||||
expect(rows).toEqual([
|
||||
{
|
||||
templateId: '123',
|
||||
canonicalSql: 'select status from public.orders',
|
||||
dialect: 'postgres',
|
||||
stats: {
|
||||
executions: 42,
|
||||
distinctUsers: 3,
|
||||
firstSeen: '2026-05-01T00:00:00.000Z',
|
||||
lastSeen: '2026-05-11T00:00:00.000Z',
|
||||
p50RuntimeMs: 11.5,
|
||||
p95RuntimeMs: 11.5,
|
||||
errorRate: 0,
|
||||
rowsProduced: 100,
|
||||
},
|
||||
topUsers: [{ user: 'analyst', executions: 40 }],
|
||||
},
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
|
@ -3,12 +3,13 @@ import {
|
|||
HistoricSqlGrantsMissingError,
|
||||
HistoricSqlVersionUnsupportedError,
|
||||
} from './errors.js';
|
||||
import type {
|
||||
KtxPostgresQueryClient,
|
||||
PostgresPgssProbeResult,
|
||||
PostgresPgssReader,
|
||||
PostgresPgssRow,
|
||||
PostgresPgssSnapshot,
|
||||
import {
|
||||
aggregatedTemplateSchema,
|
||||
type AggregatedTemplate,
|
||||
type HistoricSqlTimeWindow,
|
||||
type HistoricSqlUnifiedPullConfig,
|
||||
type KtxPostgresQueryClient,
|
||||
type PostgresPgssProbeResult,
|
||||
} from './types.js';
|
||||
|
||||
interface QueryResultLike {
|
||||
|
|
@ -18,37 +19,35 @@ interface QueryResultLike {
|
|||
error?: string;
|
||||
}
|
||||
|
||||
const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info';
|
||||
const VERSION_SQL = `
|
||||
SELECT current_setting('server_version_num')::int AS server_version_num,
|
||||
version() AS server_version
|
||||
`.trim();
|
||||
|
||||
const EXTENSION_PROBE_SQL = 'SELECT 1 FROM pg_stat_statements LIMIT 1';
|
||||
const GRANTS_PROBE_SQL = "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role";
|
||||
const TRACKING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.track') AS track";
|
||||
const MAX_SETTING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.max') AS max";
|
||||
const RECOMMENDED_PGSS_MAX = 5000;
|
||||
const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info';
|
||||
|
||||
const SNAPSHOT_SQL = `
|
||||
SELECT
|
||||
s.queryid::text AS queryid,
|
||||
s.userid::text AS userid,
|
||||
COALESCE(r.rolname, 'unknown') AS username,
|
||||
s.dbid::text AS dbid,
|
||||
d.datname AS database,
|
||||
s.query,
|
||||
s.calls,
|
||||
s.total_exec_time,
|
||||
s.mean_exec_time,
|
||||
s.rows AS total_rows
|
||||
FROM pg_stat_statements s
|
||||
LEFT JOIN pg_roles r ON s.userid = r.oid
|
||||
LEFT JOIN pg_database d ON s.dbid = d.oid
|
||||
WHERE s.toplevel = true
|
||||
AND s.calls >= $1
|
||||
ORDER BY s.total_exec_time DESC
|
||||
LIMIT $2
|
||||
const AGGREGATE_SQL = `
|
||||
SELECT queryid::text AS template_id,
|
||||
query AS canonical_sql,
|
||||
SUM(calls)::bigint AS executions,
|
||||
COUNT(DISTINCT userid) AS distinct_users,
|
||||
SUM(total_exec_time) / NULLIF(SUM(calls), 0) AS mean_ms,
|
||||
SUM(rows)::bigint AS rows_produced,
|
||||
COALESCE(
|
||||
json_agg(json_build_object('user', rolname, 'executions', calls) ORDER BY calls DESC)
|
||||
FILTER (WHERE userid IS NOT NULL),
|
||||
'[]'::json
|
||||
)::text AS top_users
|
||||
FROM pg_stat_statements
|
||||
LEFT JOIN pg_roles ON pg_roles.oid = pg_stat_statements.userid
|
||||
WHERE toplevel = true
|
||||
GROUP BY queryid, query
|
||||
HAVING SUM(calls) >= $1
|
||||
ORDER BY SUM(total_exec_time) DESC
|
||||
`.trim();
|
||||
|
||||
const POSTGRES_EXTENSION_REMEDIATION = [
|
||||
|
|
@ -78,7 +77,7 @@ async function execute(client: KtxPostgresQueryClient, sql: string, params?: unk
|
|||
return result;
|
||||
}
|
||||
|
||||
function indexes(headers: string[]): Map<string, number> {
|
||||
function indexByHeader(headers: string[]): Map<string, number> {
|
||||
const out = new Map<string, number>();
|
||||
headers.forEach((header, index) => out.set(header.toLowerCase(), index));
|
||||
return out;
|
||||
|
|
@ -113,12 +112,21 @@ function requiredFiniteNumber(raw: unknown, field: string): number {
|
|||
return number;
|
||||
}
|
||||
|
||||
function nullableInteger(raw: unknown): number | null {
|
||||
/**
 * Reads a mandatory numeric field and truncates it toward zero.
 *
 * Validation is delegated to `requiredFiniteNumber`, so its failure behavior
 * for missing or non-finite values applies unchanged.
 *
 * @param raw - Raw cell value from a query result.
 * @param field - Field name passed through to `requiredFiniteNumber`.
 */
function requiredInteger(raw: unknown, field: string): number {
  const finite = requiredFiniteNumber(raw, field);
  return Math.trunc(finite);
}
|
||||
|
||||
function nullableNumber(raw: unknown): number | null {
|
||||
if (raw === null || raw === undefined || raw === '') {
|
||||
return null;
|
||||
}
|
||||
const number = typeof raw === 'number' ? raw : Number(raw);
|
||||
return Number.isFinite(number) ? Math.trunc(number) : null;
|
||||
return Number.isFinite(number) ? number : null;
|
||||
}
|
||||
|
||||
/**
 * Parses an optional numeric value and truncates it toward zero.
 *
 * @param raw - Raw cell value from a query result.
 * @returns The truncated number, or `null` when `nullableNumber` yields `null`.
 */
function nullableInteger(raw: unknown): number | null {
  const parsed = nullableNumber(raw);
  if (parsed === null) {
    return null;
  }
  return Math.trunc(parsed);
}
|
||||
|
||||
function nullableIsoTimestamp(raw: unknown): string | null {
|
||||
|
|
@ -137,7 +145,7 @@ function firstRow(result: QueryResultLike, context: string): { row: unknown[]; h
|
|||
if (!row) {
|
||||
throw new Error(`Postgres historic-SQL ${context} query returned no rows`);
|
||||
}
|
||||
return { row, headers: indexes(result.headers) };
|
||||
return { row, headers: indexByHeader(result.headers) };
|
||||
}
|
||||
|
||||
function isMissingPgssRelation(error: unknown): boolean {
|
||||
|
|
@ -167,22 +175,30 @@ function grantsMissingError(): HistoricSqlGrantsMissingError {
|
|||
});
|
||||
}
|
||||
|
||||
function mapSnapshotRow(row: unknown[], headerIndexes: Map<string, number>): PostgresPgssRow {
|
||||
return {
|
||||
queryid: requiredString(value(row, headerIndexes, 'queryid'), 'queryid'),
|
||||
userid: requiredString(value(row, headerIndexes, 'userid'), 'userid'),
|
||||
username: nullableString(value(row, headerIndexes, 'username')),
|
||||
dbid: requiredString(value(row, headerIndexes, 'dbid'), 'dbid'),
|
||||
database: nullableString(value(row, headerIndexes, 'database')),
|
||||
query: requiredString(value(row, headerIndexes, 'query'), 'query'),
|
||||
calls: Math.trunc(requiredFiniteNumber(value(row, headerIndexes, 'calls'), 'calls')),
|
||||
totalExecTime: requiredFiniteNumber(value(row, headerIndexes, 'total_exec_time'), 'total_exec_time'),
|
||||
meanExecTime: requiredFiniteNumber(value(row, headerIndexes, 'mean_exec_time'), 'mean_exec_time'),
|
||||
totalRows: Math.trunc(requiredFiniteNumber(value(row, headerIndexes, 'total_rows'), 'total_rows')),
|
||||
};
|
||||
/**
 * Decodes the JSON-encoded `top_users` column into a list of user/execution pairs.
 *
 * Parsing is best-effort: an empty value, invalid JSON, a non-array payload, or
 * any error raised while reading entries yields an empty list. Entries that are
 * not objects, or whose `executions` cannot be read as a number, are skipped.
 *
 * @param raw - Raw cell value, expected to hold a JSON array as text.
 * @returns One item per usable entry, in the order they appear in the payload.
 */
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
  const serialized = nullableString(raw);
  if (!serialized) {
    return [];
  }

  const users: Array<{ user: string | null; executions: number }> = [];
  try {
    const decoded: unknown = JSON.parse(serialized);
    if (!Array.isArray(decoded)) {
      return [];
    }
    for (const candidate of decoded) {
      if (!candidate || typeof candidate !== 'object') {
        continue;
      }
      const fields = candidate as { user?: unknown; executions?: unknown };
      const user = nullableString(fields.user);
      const executions = nullableInteger(fields.executions);
      if (executions !== null) {
        users.push({ user, executions });
      }
    }
  } catch {
    // Any failure discards the whole payload rather than returning a partial list.
    return [];
  }
  return users;
}
|
||||
|
||||
export class PostgresPgssQueryHistoryReader implements PostgresPgssReader {
|
||||
export class PostgresPgssReader {
|
||||
async probe(client: unknown): Promise<PostgresPgssProbeResult> {
|
||||
const pgClient = queryClient(client);
|
||||
const versionResult = await execute(pgClient, VERSION_SQL);
|
||||
|
|
@ -231,32 +247,47 @@ export class PostgresPgssQueryHistoryReader implements PostgresPgssReader {
|
|||
const pgssMax = nullableInteger(value(maxRow, maxHeaders, 'max'));
|
||||
|
||||
const warnings: string[] = [];
|
||||
const info: string[] = [];
|
||||
if (track === 'none') {
|
||||
warnings.push('pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config');
|
||||
}
|
||||
if (pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX) {
|
||||
warnings.push(
|
||||
info.push(
|
||||
`pg_stat_statements.max is ${pgssMax}; set it to at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`,
|
||||
);
|
||||
}
|
||||
|
||||
return { pgServerVersion, warnings };
|
||||
return { pgServerVersion, warnings, info };
|
||||
}
|
||||
|
||||
async readSnapshot(
|
||||
async *fetchAggregated(
|
||||
client: unknown,
|
||||
options: { minCalls: number; maxTemplates: number },
|
||||
): Promise<PostgresPgssSnapshot> {
|
||||
window: HistoricSqlTimeWindow,
|
||||
config: HistoricSqlUnifiedPullConfig,
|
||||
): AsyncIterable<AggregatedTemplate> {
|
||||
const pgClient = queryClient(client);
|
||||
const snapshotResult = await execute(pgClient, SNAPSHOT_SQL, [options.minCalls, options.maxTemplates]);
|
||||
const snapshotHeaders = indexes(snapshotResult.headers);
|
||||
const statsResult = await execute(pgClient, STATS_INFO_SQL);
|
||||
const { row: statsRow, headers: statsHeaders } = firstRow(statsResult, 'stats-info');
|
||||
|
||||
return {
|
||||
statsResetAt: nullableIsoTimestamp(value(statsRow, statsHeaders, 'stats_reset')),
|
||||
deallocCount: nullableInteger(value(statsRow, statsHeaders, 'dealloc')),
|
||||
rows: snapshotResult.rows.map((row) => mapSnapshotRow(row, snapshotHeaders)),
|
||||
};
|
||||
const firstSeen = nullableIsoTimestamp(value(statsRow, statsHeaders, 'stats_reset')) ?? window.start.toISOString();
|
||||
const result = await execute(pgClient, AGGREGATE_SQL, [config.minExecutions]);
|
||||
const indexes = indexByHeader(result.headers);
|
||||
for (const row of result.rows) {
|
||||
yield aggregatedTemplateSchema.parse({
|
||||
templateId: requiredString(value(row, indexes, 'template_id'), 'template_id'),
|
||||
canonicalSql: requiredString(value(row, indexes, 'canonical_sql'), 'canonical_sql'),
|
||||
dialect: 'postgres',
|
||||
stats: {
|
||||
executions: requiredInteger(value(row, indexes, 'executions'), 'executions'),
|
||||
distinctUsers: requiredInteger(value(row, indexes, 'distinct_users'), 'distinct_users'),
|
||||
firstSeen,
|
||||
lastSeen: window.end.toISOString(),
|
||||
p50RuntimeMs: nullableNumber(value(row, indexes, 'mean_ms')),
|
||||
p95RuntimeMs: nullableNumber(value(row, indexes, 'mean_ms')),
|
||||
errorRate: 0,
|
||||
rowsProduced: nullableInteger(value(row, indexes, 'rows_produced')),
|
||||
},
|
||||
topUsers: parseTopUsers(value(row, indexes, 'top_users')),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,372 @@
|
|||
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import YAML from 'yaml';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { projectHistoricSqlEvidence } from './projection.js';
|
||||
|
||||
/**
 * Creates a uniquely named scratch directory under the OS temp directory.
 *
 * @returns The absolute path of the newly created directory.
 */
async function tempWorkdir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-projection-');
  const created = await mkdtemp(prefix);
  return created;
}
|
||||
|
||||
/**
 * Writes `content` verbatim (UTF-8) to `relPath` beneath `root`, creating any
 * missing parent directories first.
 *
 * @param root - Directory the relative path is resolved against.
 * @param relPath - Destination path relative to `root`.
 * @param content - Exact text to write.
 */
async function writeText(root: string, relPath: string, content: string): Promise<void> {
  const filePath = join(root, relPath);
  const parentDir = join(filePath, '..');
  await mkdir(parentDir, { recursive: true });
  await writeFile(filePath, content, 'utf-8');
}
|
||||
|
||||
/**
 * Serializes `value` as pretty-printed JSON (2-space indent, trailing newline)
 * and writes it via `writeText`.
 *
 * @param root - Directory the relative path is resolved against.
 * @param relPath - Destination path relative to `root`.
 * @param value - Any JSON-serializable value.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const payload = JSON.stringify(value, null, 2) + '\n';
  await writeText(root, relPath, payload);
}
|
||||
|
||||
describe('projectHistoricSqlEvidence', () => {
|
||||
it('merges table usage into matching _schema shards and preserves external usage keys', async () => {
|
||||
const workdir = await tempWorkdir();
|
||||
await writeText(
|
||||
workdir,
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
YAML.stringify({
|
||||
tables: {
|
||||
orders: {
|
||||
table: 'public.orders',
|
||||
usage: {
|
||||
narrative: 'Old generated usage.',
|
||||
frequencyTier: 'low',
|
||||
commonFilters: ['old_status'],
|
||||
commonJoins: [],
|
||||
ownerNote: 'keep me',
|
||||
},
|
||||
columns: [{ name: 'id', type: 'string' }],
|
||||
},
|
||||
},
|
||||
}),
|
||||
);
|
||||
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
|
||||
source: 'historic-sql',
|
||||
connectionId: 'warehouse',
|
||||
dialect: 'postgres',
|
||||
fetchedAt: '2026-05-11T00:00:00.000Z',
|
||||
windowStart: '2026-02-10T00:00:00.000Z',
|
||||
windowEnd: '2026-05-11T00:00:00.000Z',
|
||||
snapshotRowCount: 1,
|
||||
touchedTableCount: 1,
|
||||
parseFailures: 0,
|
||||
warnings: [],
|
||||
probeWarnings: [],
|
||||
staleArchiveAfterDays: 90,
|
||||
});
|
||||
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
|
||||
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/orders.json', {
|
||||
kind: 'table_usage',
|
||||
connectionId: 'warehouse',
|
||||
table: 'public.orders',
|
||||
rawPath: 'tables/public.orders.json',
|
||||
usage: {
|
||||
narrative: 'Orders are repeatedly queried for lifecycle analysis.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status', 'created_at'],
|
||||
commonGroupBys: ['status'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
staleSince: null,
|
||||
},
|
||||
});
|
||||
|
||||
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
|
||||
|
||||
expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]);
|
||||
const shard = YAML.parse(await readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8'));
|
||||
expect(shard.tables.orders.usage).toEqual({
|
||||
ownerNote: 'keep me',
|
||||
narrative: 'Orders are repeatedly queried for lifecycle analysis.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status', 'created_at'],
|
||||
commonGroupBys: ['status'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
staleSince: null,
|
||||
});
|
||||
});
|
||||
|
||||
it('writes pattern pages, reuses similar slugs, and marks missing old pattern pages stale', async () => {
|
||||
const workdir = await tempWorkdir();
|
||||
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
|
||||
source: 'historic-sql',
|
||||
connectionId: 'warehouse',
|
||||
dialect: 'postgres',
|
||||
fetchedAt: '2026-05-11T00:00:00.000Z',
|
||||
windowStart: '2026-02-10T00:00:00.000Z',
|
||||
windowEnd: '2026-05-11T00:00:00.000Z',
|
||||
snapshotRowCount: 2,
|
||||
touchedTableCount: 2,
|
||||
parseFailures: 0,
|
||||
warnings: [],
|
||||
probeWarnings: [],
|
||||
staleArchiveAfterDays: 90,
|
||||
});
|
||||
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
|
||||
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
|
||||
await writeText(
|
||||
workdir,
|
||||
'knowledge/global/historic-sql/old-order-lifecycle.md',
|
||||
[
|
||||
'---',
|
||||
YAML.stringify({
|
||||
summary: 'Old order lifecycle page',
|
||||
tags: ['historic-sql', 'pattern'],
|
||||
refs: [],
|
||||
sl_refs: ['orders'],
|
||||
usage_mode: 'auto',
|
||||
source: 'historic-sql',
|
||||
tables: ['public.orders', 'public.customers'],
|
||||
fingerprints: ['pg:1'],
|
||||
}).trimEnd(),
|
||||
'---',
|
||||
'',
|
||||
'Old body',
|
||||
'',
|
||||
].join('\n'),
|
||||
);
|
||||
await writeText(
|
||||
workdir,
|
||||
'knowledge/global/historic-sql/retired-pattern.md',
|
||||
[
|
||||
'---',
|
||||
YAML.stringify({
|
||||
summary: 'Retired pattern',
|
||||
tags: ['historic-sql', 'pattern'],
|
||||
refs: [],
|
||||
sl_refs: [],
|
||||
usage_mode: 'auto',
|
||||
source: 'historic-sql',
|
||||
tables: ['public.tickets'],
|
||||
fingerprints: ['pg:9'],
|
||||
}).trimEnd(),
|
||||
'---',
|
||||
'',
|
||||
'Retired body',
|
||||
'',
|
||||
].join('\n'),
|
||||
);
|
||||
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', {
|
||||
kind: 'pattern',
|
||||
connectionId: 'warehouse',
|
||||
rawPath: 'patterns-input.json',
|
||||
pattern: {
|
||||
slug: 'order-lifecycle-analysis',
|
||||
title: 'Order Lifecycle Analysis',
|
||||
narrative: 'Analysts compare order status with customer segment.',
|
||||
definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id',
|
||||
tablesInvolved: ['public.orders', 'public.customers'],
|
||||
slRefs: ['orders', 'customers'],
|
||||
constituentTemplateIds: ['pg:1', 'pg:2'],
|
||||
},
|
||||
});
|
||||
|
||||
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
|
||||
|
||||
expect(result.patternPagesWritten).toBe(1);
|
||||
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/old-order-lifecycle.md'), 'utf-8')).resolves.toContain(
|
||||
'Order Lifecycle Analysis',
|
||||
);
|
||||
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/retired-pattern.md'), 'utf-8')).resolves.toContain(
|
||||
'stale_since: "2026-05-11T00:00:00.000Z"',
|
||||
);
|
||||
});
|
||||
|
||||
it('writes a reappearing pattern to the active slug instead of reusing an archived page key', async () => {
|
||||
const workdir = await tempWorkdir();
|
||||
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
|
||||
source: 'historic-sql',
|
||||
connectionId: 'warehouse',
|
||||
dialect: 'postgres',
|
||||
fetchedAt: '2026-05-11T00:00:00.000Z',
|
||||
windowStart: '2026-02-10T00:00:00.000Z',
|
||||
windowEnd: '2026-05-11T00:00:00.000Z',
|
||||
snapshotRowCount: 2,
|
||||
touchedTableCount: 2,
|
||||
parseFailures: 0,
|
||||
warnings: [],
|
||||
probeWarnings: [],
|
||||
staleArchiveAfterDays: 30,
|
||||
});
|
||||
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
|
||||
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
|
||||
await writeText(
|
||||
workdir,
|
||||
'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md',
|
||||
[
|
||||
'---',
|
||||
YAML.stringify({
|
||||
summary: 'Archived order lifecycle page',
|
||||
tags: ['historic-sql', 'pattern', 'archived'],
|
||||
refs: [],
|
||||
sl_refs: ['orders'],
|
||||
usage_mode: 'auto',
|
||||
source: 'historic-sql',
|
||||
tables: ['public.orders', 'public.customers'],
|
||||
fingerprints: ['pg:1'],
|
||||
stale_since: '2026-01-01T00:00:00.000Z',
|
||||
}).trimEnd(),
|
||||
'---',
|
||||
'',
|
||||
'Archived body',
|
||||
'',
|
||||
].join('\n'),
|
||||
);
|
||||
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', {
|
||||
kind: 'pattern',
|
||||
connectionId: 'warehouse',
|
||||
rawPath: 'patterns-input.json',
|
||||
pattern: {
|
||||
slug: 'order-lifecycle-analysis',
|
||||
title: 'Order Lifecycle Analysis',
|
||||
narrative: 'Analysts compare order status with customer segment again.',
|
||||
definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id',
|
||||
tablesInvolved: ['public.orders', 'public.customers'],
|
||||
slRefs: ['orders', 'customers'],
|
||||
constituentTemplateIds: ['pg:1', 'pg:2'],
|
||||
},
|
||||
});
|
||||
|
||||
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
|
||||
|
||||
expect(result.patternPagesWritten).toBe(1);
|
||||
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain(
|
||||
'Order Lifecycle Analysis',
|
||||
);
|
||||
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain(
|
||||
'Archived body',
|
||||
);
|
||||
await expect(
|
||||
readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/order-lifecycle-analysis.md'), 'utf-8'),
|
||||
).rejects.toMatchObject({ code: 'ENOENT' });
|
||||
});
|
||||
|
||||
it('leaves already archived pattern pages stable when they are still absent', async () => {
  // Arrange: an empty snapshot and a single page that already lives under _archived/.
  const workdir = await tempWorkdir();
  const readPage = (relativePath: string) => readFile(join(workdir, relativePath), 'utf-8');
  const archivedFrontmatter = YAML.stringify({
    summary: 'Retired pattern',
    tags: ['historic-sql', 'pattern', 'archived'],
    refs: [],
    sl_refs: [],
    usage_mode: 'auto',
    source: 'historic-sql',
    tables: ['public.tickets'],
    fingerprints: ['pg:9'],
    stale_since: '2026-01-01T00:00:00.000Z',
  }).trimEnd();
  const archivedPage = ['---', archivedFrontmatter, '---', '', 'Archived retired body', ''].join('\n');

  await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
    source: 'historic-sql',
    connectionId: 'warehouse',
    dialect: 'postgres',
    fetchedAt: '2026-05-11T00:00:00.000Z',
    windowStart: '2026-02-10T00:00:00.000Z',
    windowEnd: '2026-05-11T00:00:00.000Z',
    snapshotRowCount: 0,
    touchedTableCount: 0,
    parseFailures: 0,
    warnings: [],
    probeWarnings: [],
    staleArchiveAfterDays: 30,
  });
  await writeText(workdir, 'knowledge/global/historic-sql/_archived/retired-pattern.md', archivedPage);

  // Act
  const result = await projectHistoricSqlEvidence({
    workdir,
    connectionId: 'warehouse',
    syncId: 'sync-1',
    runId: 'run-1',
  });

  // Assert: nothing is re-archived or re-marked, and no nested _archived/_archived copy appears.
  expect(result.archivedPatternPages).toBe(0);
  expect(result.stalePatternPagesMarked).toBe(0);
  await expect(readPage('knowledge/global/historic-sql/_archived/retired-pattern.md')).resolves.toContain(
    'Archived retired body',
  );
  await expect(readPage('knowledge/global/historic-sql/_archived/_archived/retired-pattern.md')).rejects.toMatchObject({
    code: 'ENOENT',
  });
});
|
||||
|
||||
it('marks missing table usage stale and deletes legacy historic SQL query pages', async () => {
  // Arrange: a schema shard with prior usage, an empty snapshot, and one legacy query page.
  const workdir = await tempWorkdir();
  const shardPath = 'semantic-layer/warehouse/_schema/public.yaml';
  const legacyPagePath = 'knowledge/global/historic-sql/legacy-template.md';

  const shardYaml = YAML.stringify({
    tables: {
      orders: {
        table: 'public.orders',
        usage: {
          narrative: 'Orders were active before.',
          frequencyTier: 'high',
          commonFilters: ['status'],
          commonGroupBys: ['status'],
          commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
          ownerNote: 'keep analyst annotation',
        },
        columns: [{ name: 'id', type: 'string' }],
      },
    },
  });
  await writeText(workdir, shardPath, shardYaml);

  await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
    source: 'historic-sql',
    connectionId: 'warehouse',
    dialect: 'postgres',
    fetchedAt: '2026-05-11T00:00:00.000Z',
    windowStart: '2026-02-10T00:00:00.000Z',
    windowEnd: '2026-05-11T00:00:00.000Z',
    snapshotRowCount: 0,
    touchedTableCount: 0,
    parseFailures: 0,
    warnings: [],
    probeWarnings: [],
    staleArchiveAfterDays: 90,
  });

  const legacyFrontmatter = YAML.stringify({
    summary: 'Legacy template page',
    tags: ['historic-sql', 'query-pattern'],
    refs: [],
    sl_refs: ['orders'],
    usage_mode: 'auto',
    source: 'historic-sql',
    tables: ['public.orders'],
    fingerprints: ['legacy:1'],
  }).trimEnd();
  await writeText(workdir, legacyPagePath, ['---', legacyFrontmatter, '---', '', 'Legacy body', ''].join('\n'));

  // Act
  const result = await projectHistoricSqlEvidence({
    workdir,
    connectionId: 'warehouse',
    syncId: 'sync-1',
    runId: 'run-1',
  });

  // Assert: counters and touched sources.
  expect(result.staleTablesMarked).toBe(1);
  expect(result.legacyPagesDeleted).toBe(1);
  expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]);

  // Assert: generated usage fields are reset while the analyst-owned key survives.
  const shard = YAML.parse(await readFile(join(workdir, shardPath), 'utf-8'));
  expect(shard.tables.orders.usage).toEqual({
    ownerNote: 'keep analyst annotation',
    narrative: 'No recent historic SQL usage was observed in the latest snapshot.',
    frequencyTier: 'unused',
    commonFilters: [],
    commonGroupBys: [],
    commonJoins: [],
    staleSince: '2026-05-11T00:00:00.000Z',
  });

  // Assert: the legacy page is gone.
  await expect(readFile(join(workdir, legacyPagePath), 'utf-8')).rejects.toMatchObject({
    code: 'ENOENT',
  });
});
|
||||
});
|
||||
334
packages/context/src/ingest/adapters/historic-sql/projection.ts
Normal file
334
packages/context/src/ingest/adapters/historic-sql/projection.ts
Normal file
|
|
@ -0,0 +1,334 @@
|
|||
import { access, mkdir, readdir, readFile, rename, rm, writeFile } from 'node:fs/promises';
|
||||
import { dirname, join, relative } from 'node:path';
|
||||
import YAML from 'yaml';
|
||||
import { rawSourcesDirForSync } from '../../raw-sources-paths.js';
|
||||
import { mergeUsagePreservingExternal } from '../live-database/manifest.js';
|
||||
import { historicSqlEvidenceEnvelopeSchema, type HistoricSqlEvidenceEnvelope } from './evidence.js';
|
||||
import type { TableUsageOutput } from './skill-schemas.js';
|
||||
import { stagedManifestSchema } from './types.js';
|
||||
|
||||
/** Arguments accepted by `projectHistoricSqlEvidence`. */
export interface HistoricSqlProjectionInput {
  /** Root directory that all raw-source, evidence, semantic-layer and knowledge paths are resolved against. */
  workdir: string;
  /** Connection whose staged raw sources and schema shards are projected. */
  connectionId: string;
  /** Sync whose staged manifest and table files are read. */
  syncId: string;
  /** Ingest run whose evidence files are read. */
  runId: string;
}
|
||||
|
||||
/** Counters and bookkeeping returned by `projectHistoricSqlEvidence`. */
export interface HistoricSqlProjectionResult {
  /** Schema-shard tables whose usage changed after merging matching table evidence. */
  tableUsageMerged: number;
  /** Schema-shard tables whose usage was replaced by the stale placeholder. */
  staleTablesMarked: number;
  /** Pattern pages written from pattern evidence. */
  patternPagesWritten: number;
  /** Existing pattern pages tagged `stale` because no evidence rewrote them. */
  stalePatternPagesMarked: number;
  /** Pattern pages moved under `_archived/`. */
  archivedPatternPages: number;
  /** Legacy query pages removed. */
  legacyPagesDeleted: number;
  /** Distinct (connection, source) pairs whose usage was modified. */
  touchedSources: { connectionId: string; sourceName: string }[];
  /** Free-form warnings collected during projection. */
  warnings: string[];
}
|
||||
|
||||
/** Shape read from (and written back to) a schema shard YAML file. */
interface ManifestShard {
  /** Table entries keyed by table name; unknown keys on an entry are allowed by the index signature. */
  tables?: Record<
    string,
    {
      table?: string;
      usage?: Record<string, unknown>;
      columns?: unknown[];
      [key: string]: unknown;
    }
  >;
}
|
||||
|
||||
/** A markdown page with YAML frontmatter, as loaded from the historic-SQL knowledge directory. */
interface HistoricSqlPatternPage {
  /** Page path relative to the knowledge root, without the `.md` extension. */
  key: string;
  /** Path of the file the page was read from. */
  path: string;
  /** Parsed frontmatter mapping. */
  frontmatter: Record<string, unknown>;
  /** Everything after the closing frontmatter fence. */
  content: string;
}
|
||||
|
||||
/**
 * Normalises a slug for use as a knowledge page key.
 *
 * Lower-cases the input, collapses every run of characters other than
 * `a-z`, `0-9`, `/` and `-` into a single `-`, then strips leading and
 * trailing dashes.
 */
function safeKnowledgeSlug(value: string): string {
  const lowered = value.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9/-]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return trimmed;
}
|
||||
|
||||
/** Resolves to `true` when `access(path)` succeeds and to `false` when it rejects for any reason. */
async function pathExists(path: string): Promise<boolean> {
  return access(path).then(
    () => true,
    () => false,
  );
}
|
||||
|
||||
/**
 * Lists every regular file beneath `root`.
 *
 * Returned paths are relative to `root`, use `/` as the separator, and are
 * sorted with the default string ordering. A missing `root` yields an empty
 * list. Entries that are neither directories nor regular files are ignored.
 */
async function walkFiles(root: string): Promise<string[]> {
  if (!(await pathExists(root))) return [];

  const collected: string[] = [];
  const pending: string[] = [root];
  // Explicit work stack instead of recursion; visit order is irrelevant because the result is sorted.
  for (let dir = pending.pop(); dir !== undefined; dir = pending.pop()) {
    for (const entry of await readdir(dir, { withFileTypes: true })) {
      const absolute = join(dir, entry.name);
      if (entry.isDirectory()) {
        pending.push(absolute);
        continue;
      }
      if (entry.isFile()) {
        collected.push(relative(root, absolute).replace(/\\/g, '/'));
      }
    }
  }
  return collected.sort();
}
|
||||
|
||||
/** Reads `path` as UTF-8 text and returns the parsed JSON value, untyped. */
async function readJson(path: string): Promise<unknown> {
  const text = await readFile(path, 'utf-8');
  const parsed: unknown = JSON.parse(text);
  return parsed;
}
|
||||
|
||||
/**
 * Serialises `value` as YAML and writes it to `path` via a sibling
 * `<path>.tmp` file that is then renamed over the target. The parent
 * directory is created first if needed.
 */
async function writeYamlAtomic(path: string, value: unknown): Promise<void> {
  const parentDir = dirname(path);
  await mkdir(parentDir, { recursive: true });

  const stagingPath = `${path}.tmp`;
  const serialized = YAML.stringify(value, { indent: 2, lineWidth: 0 });
  await writeFile(stagingPath, serialized, 'utf-8');
  await rename(stagingPath, path);
}
|
||||
|
||||
/**
 * Returns the last non-empty dot-separated segment of `tableRef`
 * (for example `public.orders` gives `orders`). When there is no non-empty
 * segment the input is returned unchanged.
 */
function tableSourceName(tableRef: string): string {
  const segments = tableRef.split('.').filter((segment) => segment.length > 0);
  return segments.pop() ?? tableRef;
}
|
||||
|
||||
/**
 * Builds the placeholder usage record for a table with no recent historic
 * SQL activity. `staleSince` is set to `fetchedAt`.
 */
function staleUsage(fetchedAt: string) {
  // `as const` keeps the literal type 'unused' on the returned property.
  const frequencyTier = 'unused' as const;
  const narrative = 'No recent historic SQL usage was observed in the latest snapshot.';
  return {
    narrative,
    frequencyTier,
    commonFilters: [],
    commonGroupBys: [],
    commonJoins: [],
    staleSince: fetchedAt,
  };
}
|
||||
|
||||
/**
 * Loads every `.json` file under `.ktx/ingest-evidence/historic-sql/<runId>`
 * and validates each against `historicSqlEvidenceEnvelopeSchema`.
 * Files are read one at a time, in the sorted order produced by `walkFiles`.
 */
async function loadEvidence(workdir: string, runId: string): Promise<HistoricSqlEvidenceEnvelope[]> {
  const root = join(workdir, '.ktx/ingest-evidence/historic-sql', runId);
  const jsonFiles = (await walkFiles(root)).filter((candidate) => candidate.endsWith('.json'));

  const envelopes: HistoricSqlEvidenceEnvelope[] = [];
  for (const file of jsonFiles) {
    const raw = await readJson(join(root, file));
    envelopes.push(historicSqlEvidenceEnvelopeSchema.parse(raw));
  }
  return envelopes;
}
|
||||
|
||||
/**
 * Renders the markdown body of a pattern page: title heading, narrative,
 * a fenced `sql` block with the representative query, then bullet lists of
 * the involved tables and constituent template ids. The output ends with a
 * single trailing newline.
 */
function renderPatternMarkdown(pattern: HistoricSqlEvidenceEnvelope & { kind: 'pattern' }): string {
  const { title, narrative, definitionSql, tablesInvolved, constituentTemplateIds } = pattern.pattern;

  const lines: string[] = [
    `# ${title}`,
    '',
    narrative,
    '',
    '## Representative SQL',
    '',
    '```sql',
    definitionSql,
    '```',
    '',
    '## Tables',
    '',
  ];
  for (const table of tablesInvolved) {
    lines.push(`- ${table}`);
  }
  lines.push('', '## Constituent Templates', '');
  for (const id of constituentTemplateIds) {
    lines.push(`- ${id}`);
  }
  lines.push('');

  return lines.join('\n');
}
|
||||
|
||||
/**
 * Fraction of entries in `left` that also occur in `right`.
 * Duplicates in `left` are counted individually. Returns 0 when `left` is empty.
 */
function overlapRatio(left: string[], right: string[]): number {
  if (left.length === 0) return 0;

  const lookup = new Set(right);
  let shared = 0;
  for (const value of left) {
    if (lookup.has(value)) shared += 1;
  }
  return shared / left.length;
}
|
||||
|
||||
/**
 * Splits a markdown document into YAML frontmatter and body.
 *
 * Returns `null` when `raw` does not start with a `---` fenced frontmatter
 * block. Frontmatter that parses to `null`/`undefined` becomes `{}`.
 */
function parseMarkdownPage(key: string, path: string, raw: string): HistoricSqlPatternPage | null {
  const parts = /^---\n([\s\S]*?)\n---\n?([\s\S]*)$/.exec(raw);
  if (parts === null) return null;

  const [, frontmatterYaml = '', content = ''] = parts;
  const frontmatter = (YAML.parse(frontmatterYaml) ?? {}) as Record<string, unknown>;
  return { key, path, frontmatter, content };
}
|
||||
|
||||
/**
 * True when the page's frontmatter has `source: 'historic-sql'` and its
 * `tags` array contains both `historic-sql` and `pattern`.
 */
function isHistoricPatternPage(page: HistoricSqlPatternPage): boolean {
  if (page.frontmatter.source !== 'historic-sql') return false;

  const rawTags = page.frontmatter.tags;
  if (!Array.isArray(rawTags)) return false;

  return rawTags.includes('historic-sql') && rawTags.includes('pattern');
}
|
||||
|
||||
/**
 * True for pages with `source: 'historic-sql'` that are tagged
 * `query-pattern` but not `pattern`.
 */
function isLegacyQueryPage(page: HistoricSqlPatternPage): boolean {
  if (page.frontmatter.source !== 'historic-sql') return false;

  const rawTags = page.frontmatter.tags;
  if (!Array.isArray(rawTags)) return false;

  if (!rawTags.includes('query-pattern')) return false;
  return !rawTags.includes('pattern');
}
|
||||
|
||||
/** True when the page key starts with `_archived/` or its `tags` array contains `archived`. */
function isArchivedPatternPage(page: HistoricSqlPatternPage): boolean {
  if (page.key.startsWith('_archived/')) return true;

  const rawTags = page.frontmatter.tags;
  return Array.isArray(rawTags) ? rawTags.includes('archived') : false;
}
|
||||
|
||||
/** Returns the string elements of `value` in order, or `[]` when `value` is not an array. */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) return [];

  const strings: string[] = [];
  for (const entry of value) {
    if (typeof entry === 'string') strings.push(entry);
  }
  return strings;
}
|
||||
|
||||
/**
 * Renders a markdown page: `---` fenced YAML frontmatter, a blank line,
 * the trimmed content, and a trailing newline.
 *
 * When `frontmatter.stale_since` is a string, the first plain
 * `stale_since: <value>` occurrence in the serialised YAML is rewritten
 * with the value wrapped in double quotes.
 */
function renderMarkdownPage(frontmatter: Record<string, unknown>, content: string): string {
  const serialized = YAML.stringify(frontmatter, { indent: 2, lineWidth: 0 }).trimEnd();
  const staleSince = frontmatter.stale_since;
  const yaml =
    typeof staleSince === 'string'
      ? serialized.replace(`stale_since: ${staleSince}`, `stale_since: "${staleSince}"`)
      : serialized;

  return ['---', yaml, '---', '', content.trim(), ''].join('\n');
}
|
||||
|
||||
/** Concatenates the page's `tables` and `fingerprints` frontmatter strings, tables first. */
function existingPageSignals(page: HistoricSqlPatternPage): string[] {
  const tables = stringArray(page.frontmatter.tables);
  const fingerprints = stringArray(page.frontmatter.fingerprints);
  return tables.concat(fingerprints);
}
|
||||
|
||||
/**
 * Decides whether a stale page is old enough to archive.
 *
 * Returns `true` only when `staleSince` is a string, both timestamps parse
 * with `Date.parse`, and `fetchedAt` is strictly more than `days` days
 * after `staleSince`.
 */
function shouldArchive(staleSince: unknown, fetchedAt: string, days: number): boolean {
  if (typeof staleSince !== 'string') return false;

  const staleTime = Date.parse(staleSince);
  if (!Number.isFinite(staleTime)) return false;

  const fetchedTime = Date.parse(fetchedAt);
  if (!Number.isFinite(fetchedTime)) return false;

  const thresholdMs = days * 24 * 60 * 60 * 1000;
  const elapsedMs = fetchedTime - staleTime;
  return elapsedMs > thresholdMs;
}
|
||||
|
||||
/**
 * Reads every `.md` file under `root` and returns those that parse as a
 * frontmatter page. The page key is the root-relative path without the
 * `.md` suffix. Files that `parseMarkdownPage` rejects are skipped.
 */
async function loadPatternPages(root: string): Promise<HistoricSqlPatternPage[]> {
  const markdownFiles = (await walkFiles(root)).filter((candidate) => candidate.endsWith('.md'));

  const pages: HistoricSqlPatternPage[] = [];
  for (const file of markdownFiles) {
    const path = join(root, file);
    const raw = await readFile(path, 'utf-8');
    const page = parseMarkdownPage(file.replace(/\.md$/, ''), path, raw);
    if (page === null) continue;
    pages.push(page);
  }
  return pages;
}
|
||||
|
||||
/**
 * Collects the `table` string of every `.json` file under `<rawDir>/tables`.
 * Files whose JSON is not an object with a string `table` property are ignored.
 */
async function currentStagedTables(rawDir: string): Promise<Set<string>> {
  const tablesRoot = join(rawDir, 'tables');
  const jsonFiles = (await walkFiles(tablesRoot)).filter((candidate) => candidate.endsWith('.json'));

  const staged = new Set<string>();
  for (const file of jsonFiles) {
    const parsed = await readJson(join(tablesRoot, file));
    if (typeof parsed !== 'object' || parsed === null) continue;
    if (!('table' in parsed)) continue;
    if (typeof parsed.table === 'string') {
      staged.add(parsed.table);
    }
  }
  return staged;
}
|
||||
|
||||
/**
 * Projects one historic-SQL ingest run onto the workdir.
 *
 * Steps, in order:
 * 1. Reads the staged manifest and staged table files for `syncId`, and the
 *    evidence files for `runId`.
 * 2. For every schema shard under `semantic-layer/<connectionId>/_schema`,
 *    merges matching table-usage evidence into each table entry. Entries
 *    that have usage, no matching evidence, and no staged table file get the
 *    stale placeholder instead. A shard is rewritten only when at least one
 *    of its entries actually changed.
 * 3. Writes one knowledge page per pattern evidence under
 *    `knowledge/global/historic-sql`, reusing an existing active pattern
 *    page when at least 60% of the incoming tables/template ids appear in
 *    that page's `tables`/`fingerprints`.
 * 4. For active pattern pages that were not rewritten: archives them under
 *    `_archived/` once they have been stale for more than
 *    `manifest.staleArchiveAfterDays`, otherwise tags them `stale`.
 * 5. Deletes legacy query pages.
 *
 * @param input Workdir plus the connection, sync and run identifiers.
 * @returns Counters for each kind of change and the list of touched sources.
 * @throws If the manifest or an evidence file fails schema validation, or on
 *   any file-system or parse error.
 */
export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInput): Promise<HistoricSqlProjectionResult> {
  const result: HistoricSqlProjectionResult = {
    tableUsageMerged: 0,
    staleTablesMarked: 0,
    patternPagesWritten: 0,
    stalePatternPagesMarked: 0,
    archivedPatternPages: 0,
    legacyPagesDeleted: 0,
    touchedSources: [],
    warnings: [],
  };

  const touchedKeys = new Set<string>();
  // Records a source as touched at most once per run.
  const markTouched = (sourceName: string): void => {
    const key = `${input.connectionId}:${sourceName}`;
    if (touchedKeys.has(key)) return;
    touchedKeys.add(key);
    result.touchedSources.push({ connectionId: input.connectionId, sourceName });
  };

  const rawDir = join(input.workdir, rawSourcesDirForSync(input.connectionId, 'historic-sql', input.syncId));
  const manifest = stagedManifestSchema.parse(await readJson(join(rawDir, 'manifest.json')));
  const currentTables = await currentStagedTables(rawDir);
  const evidence = await loadEvidence(input.workdir, input.runId);
  const tableEvidence = evidence.filter(
    (entry): entry is HistoricSqlEvidenceEnvelope & { kind: 'table_usage' } => entry.kind === 'table_usage',
  );
  const patternEvidence = evidence.filter(
    (entry): entry is HistoricSqlEvidenceEnvelope & { kind: 'pattern' } => entry.kind === 'pattern',
  );

  // --- Semantic-layer schema shards -------------------------------------
  const schemaRoot = join(input.workdir, 'semantic-layer', input.connectionId, '_schema');
  const shardFiles = (await walkFiles(schemaRoot)).filter(
    (candidate) => candidate.endsWith('.yaml') || candidate.endsWith('.yml'),
  );
  for (const file of shardFiles) {
    const path = join(schemaRoot, file);
    const shard = (YAML.parse(await readFile(path, 'utf-8')) ?? {}) as ManifestShard;
    if (!shard.tables) continue;

    // Track real changes rather than comparing the raw file text with re-serialised YAML:
    // a shard that merely uses different formatting must not be rewritten when nothing changed.
    let shardChanged = false;

    for (const [tableName, entry] of Object.entries(shard.tables)) {
      const tableRef = entry.table ?? tableName;
      const matchingEvidence = tableEvidence.find(
        (candidate) => candidate.table === tableRef || tableSourceName(candidate.table) === tableName,
      );

      if (matchingEvidence) {
        const merged = mergeUsagePreservingExternal(entry.usage as TableUsageOutput | undefined, matchingEvidence.usage);
        if (JSON.stringify(entry.usage ?? null) !== JSON.stringify(merged ?? null)) {
          entry.usage = merged as Record<string, unknown>;
          result.tableUsageMerged += 1;
          shardChanged = true;
          markTouched(tableSourceName(matchingEvidence.table));
        }
      } else if (entry.usage && !currentTables.has(tableRef)) {
        const merged = mergeUsagePreservingExternal(entry.usage as TableUsageOutput | undefined, staleUsage(manifest.fetchedAt));
        if (JSON.stringify(entry.usage ?? null) !== JSON.stringify(merged ?? null)) {
          entry.usage = merged as Record<string, unknown>;
          result.staleTablesMarked += 1;
          shardChanged = true;
          markTouched(tableSourceName(tableRef));
        }
      }
    }

    if (shardChanged) {
      await writeYamlAtomic(path, shard);
    }
  }

  // --- Knowledge pages ----------------------------------------------------
  const wikiRoot = join(input.workdir, 'knowledge/global/historic-sql');
  await mkdir(wikiRoot, { recursive: true });
  const allPages = await loadPatternPages(wikiRoot);
  const activePages = allPages.filter((page) => !isArchivedPatternPage(page));
  const patternPages = activePages.filter(isHistoricPatternPage);
  const writtenKeys = new Set<string>();

  for (const pattern of patternEvidence) {
    const incomingSignals = [...pattern.pattern.tablesInvolved, ...pattern.pattern.constituentTemplateIds];
    const reusable = patternPages.find((page) => overlapRatio(incomingSignals, existingPageSignals(page)) >= 0.6);
    const key = reusable?.key ?? safeKnowledgeSlug(pattern.pattern.slug);
    const pagePath = join(wikiRoot, `${key}.md`);
    const frontmatter = {
      summary: pattern.pattern.title,
      tags: ['historic-sql', 'pattern'],
      refs: [],
      sl_refs: pattern.pattern.slRefs,
      usage_mode: 'auto',
      source: 'historic-sql',
      tables: pattern.pattern.tablesInvolved,
      representative_sql: pattern.pattern.definitionSql,
      fingerprints: pattern.pattern.constituentTemplateIds,
    };
    await mkdir(dirname(pagePath), { recursive: true });
    await writeFile(pagePath, renderMarkdownPage(frontmatter, renderPatternMarkdown(pattern)), 'utf-8');
    writtenKeys.add(key);
    result.patternPagesWritten += 1;
  }

  for (const page of patternPages) {
    if (writtenKeys.has(page.key)) continue;

    if (shouldArchive(page.frontmatter.stale_since, manifest.fetchedAt, manifest.staleArchiveAfterDays)) {
      const archivePath = join(wikiRoot, '_archived', `${page.key}.md`);
      const archivedTags = [...new Set([...stringArray(page.frontmatter.tags), 'archived'])];
      await mkdir(dirname(archivePath), { recursive: true });
      await writeFile(archivePath, renderMarkdownPage({ ...page.frontmatter, tags: archivedTags }, page.content), 'utf-8');
      await rm(page.path, { force: true });
      result.archivedPatternPages += 1;
      continue;
    }

    // Keep the timestamp from when the page first went stale. Resetting it to the current
    // fetchedAt on every run would restart the clock each sync, so the archive threshold
    // could only be crossed when two consecutive syncs are further apart than the threshold.
    const previousStaleSince = page.frontmatter.stale_since;
    const staleSince =
      typeof previousStaleSince === 'string' && Number.isFinite(Date.parse(previousStaleSince))
        ? previousStaleSince
        : manifest.fetchedAt;
    const staleTags = [...new Set([...stringArray(page.frontmatter.tags), 'stale'])];
    await writeFile(
      page.path,
      renderMarkdownPage({ ...page.frontmatter, tags: staleTags, stale_since: staleSince }, page.content),
      'utf-8',
    );
    result.stalePatternPagesMarked += 1;
  }

  // Legacy pages are removed wherever they live, including under _archived/.
  for (const page of allPages.filter(isLegacyQueryPage)) {
    await rm(page.path, { force: true });
    result.legacyPagesDeleted += 1;
  }

  return result;
}
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { compileHistoricSqlRedactionPatterns, redactHistoricSqlText } from './redaction.js';
|
||||
|
||||
describe('historic-SQL redaction', () => {
  it('redacts regex matches and supports the (?i) case-insensitive prefix', () => {
    const patterns = ['sk_live_[A-Za-z0-9]+', '(?i)secret_token_[a-z0-9]+'];
    const input =
      "select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'";
    const expected = "select * from public.api_events where api_key = '[REDACTED]' and note = '[REDACTED]'";

    const compiled = compileHistoricSqlRedactionPatterns(patterns);

    expect(redactHistoricSqlText(input, compiled)).toBe(expected);
  });

  it('returns the original SQL text when no redaction patterns are configured', () => {
    const input = "select * from public.orders where status = 'paid'";
    const noRedactors = compileHistoricSqlRedactionPatterns([]);

    expect(redactHistoricSqlText(input, noRedactors)).toBe(input);
  });

  it('throws a config-focused error for invalid redaction regex patterns', () => {
    const compileBroken = () => compileHistoricSqlRedactionPatterns(['[broken']);

    expect(compileBroken).toThrow('Invalid historicSql.redactionPatterns entry "[broken"');
  });

  it('throws a config-focused error for empty redaction regex patterns', () => {
    const compileBlank = () => compileHistoricSqlRedactionPatterns([' ']);

    expect(compileBlank).toThrow('Invalid historicSql.redactionPatterns entry " "');
  });
});
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
/** A configured redaction pattern paired with its compiled regular expression. */
export interface HistoricSqlRedactionPattern {
  /** The pattern exactly as configured (untrimmed, including any `(?i)` prefix). */
  pattern: string;
  /** Global regular expression compiled from the pattern. */
  expression: RegExp;
}

/** Leading marker that switches a pattern to case-insensitive matching. */
const CASE_INSENSITIVE_PREFIX = '(?i)';
/** Replacement text substituted for every match. */
const REDACTION_TOKEN = '[REDACTED]';

/**
 * Compiles one configured pattern. The pattern is trimmed; a leading `(?i)`
 * is stripped and turned into the `i` flag. All expressions are global.
 */
function compileRedactionPattern(pattern: string): HistoricSqlRedactionPattern {
  const candidate = pattern.trim();
  const ignoreCase = candidate.startsWith(CASE_INSENSITIVE_PREFIX);
  const body = ignoreCase ? candidate.slice(CASE_INSENSITIVE_PREFIX.length) : candidate;
  if (body.length === 0) {
    throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": pattern must not be empty`);
  }

  const flags = ignoreCase ? 'gi' : 'g';
  let expression: RegExp;
  try {
    expression = new RegExp(body, flags);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": ${reason}`);
  }
  return { pattern, expression };
}

/**
 * Compiles the configured redaction patterns, in order.
 *
 * @throws Error naming the offending entry when a pattern is empty after
 *   trimming/prefix removal or is not a valid regular expression.
 */
export function compileHistoricSqlRedactionPatterns(patterns: readonly string[]): HistoricSqlRedactionPattern[] {
  return patterns.map((pattern) => compileRedactionPattern(pattern));
}

/**
 * Applies every redactor to `text` in order, replacing each match with
 * `[REDACTED]`. Each expression's `lastIndex` is reset to 0 before use.
 */
export function redactHistoricSqlText(text: string, redactors: readonly HistoricSqlRedactionPattern[]): string {
  return redactors.reduce((current, { expression }) => {
    expression.lastIndex = 0;
    return current.replace(expression, REDACTION_TOKEN);
  }, text);
}
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import { z } from 'zod';
|
||||
import {
|
||||
patternOutputSchema,
|
||||
patternsArraySchema,
|
||||
tableUsageOutputSchema,
|
||||
} from './skill-schemas.js';
|
||||
|
||||
describe('historic-sql skill schemas', () => {
  it('accepts table usage output and preserves future keys', () => {
    // Factory so the parsed input and the expectation never share an object.
    const usageWithExtraKey = () => ({
      narrative: 'Orders are queried for paid/refunded lifecycle analysis.',
      frequencyTier: 'high',
      commonFilters: ['status', 'created_at'],
      commonGroupBys: ['status'],
      commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
      staleSince: null,
      analystNote: 'preserve me',
    });

    const parsed = tableUsageOutputSchema.parse(usageWithExtraKey());

    expect(parsed).toMatchObject(usageWithExtraKey());
  });

  it('rejects invalid frequency tiers', () => {
    const outcome = tableUsageOutputSchema.safeParse({
      narrative: 'Orders are queried often.',
      frequencyTier: 'sometimes',
      commonFilters: [],
      commonJoins: [],
    });

    expect(outcome.success).toBe(false);
  });

  it('accepts pattern outputs used for wiki projection', () => {
    const orderLifecyclePattern = () => ({
      slug: 'order-lifecycle-analysis',
      title: 'Order Lifecycle Analysis',
      narrative: 'Teams inspect order status by customer and month.',
      definitionSql: 'select status, count(*) from public.orders group by status',
      tablesInvolved: ['public.orders', 'public.customers'],
      slRefs: ['orders', 'customers'],
      constituentTemplateIds: ['template_1', 'template_2'],
    });

    const [first] = patternsArraySchema.parse([orderLifecyclePattern()]);

    expect(first).toEqual(orderLifecyclePattern());
  });

  it('exports zod schemas that can produce JSON schema for prompt prefixes', () => {
    const jsonSchemas = [z.toJSONSchema(tableUsageOutputSchema), z.toJSONSchema(patternOutputSchema)];

    for (const jsonSchema of jsonSchemas) {
      expect(jsonSchema).toMatchObject({ type: 'object' });
    }
  });
});
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
import { z } from 'zod';
|
||||
|
||||
/** One entry of `commonJoins`: the joined table and the columns joined on. */
const commonJoinSchema = z.object({
  table: z.string(),
  on: z.array(z.string()),
});

/**
 * Table-usage output schema.
 *
 * `commonGroupBys` and `staleSince` are optional; `staleSince` may also be
 * `null` and must otherwise be an ISO datetime string. Unknown keys are kept
 * on the parsed value (`passthrough`).
 */
export const tableUsageOutputSchema = z
  .object({
    narrative: z.string(),
    frequencyTier: z.enum(['high', 'mid', 'low', 'unused']),
    commonFilters: z.array(z.string()),
    commonGroupBys: z.array(z.string()).optional(),
    commonJoins: z.array(commonJoinSchema),
    staleSince: z.iso.datetime().nullable().optional(),
  })
  .passthrough();

/** Parsed shape of {@link tableUsageOutputSchema}. */
export type TableUsageOutput = z.infer<typeof tableUsageOutputSchema>;
|
||||
|
||||
/** Field validators for a single pattern output; every field is required. */
const patternOutputShape = {
  slug: z.string(),
  title: z.string(),
  narrative: z.string(),
  definitionSql: z.string(),
  tablesInvolved: z.array(z.string()),
  slRefs: z.array(z.string()),
  constituentTemplateIds: z.array(z.string()),
};

/** Schema for one pattern output. */
export const patternOutputSchema = z.object(patternOutputShape);

/** Parsed shape of {@link patternOutputSchema}. */
export type PatternOutput = z.infer<typeof patternOutputSchema>;

/** Schema for a list of pattern outputs. */
export const patternsArraySchema = z.array(patternOutputSchema);
|
||||
|
|
@ -33,7 +33,7 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => {
|
|||
const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);
|
||||
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
|
||||
|
||||
await expect(reader.probe(client)).resolves.toBeUndefined();
|
||||
await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] });
|
||||
|
||||
expect(client.executeQuery).toHaveBeenCalledWith(
|
||||
'SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY LIMIT 1',
|
||||
|
|
@ -62,130 +62,85 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => {
|
|||
await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
|
||||
});
|
||||
|
||||
it('fetches query-history rows with cursor and maps them into RawQueryRow shape', async () => {
|
||||
it('fetches aggregated Snowflake query templates', async () => {
|
||||
const client = queryClient([
|
||||
{
|
||||
headers: [
|
||||
'QUERY_ID',
|
||||
'QUERY_TEXT',
|
||||
'USER_NAME',
|
||||
'ROLE_NAME',
|
||||
'WAREHOUSE_NAME',
|
||||
'DATABASE_NAME',
|
||||
'SCHEMA_NAME',
|
||||
'START_TIME',
|
||||
'END_TIME',
|
||||
'TOTAL_ELAPSED_TIME',
|
||||
'ROWS_PRODUCED',
|
||||
'EXECUTION_STATUS',
|
||||
'ERROR_CODE',
|
||||
'ERROR_MESSAGE',
|
||||
'template_id',
|
||||
'canonical_sql',
|
||||
'executions',
|
||||
'distinct_users',
|
||||
'first_seen',
|
||||
'last_seen',
|
||||
'p50_ms',
|
||||
'p95_ms',
|
||||
'error_rate',
|
||||
'rows_produced',
|
||||
'top_users',
|
||||
],
|
||||
rows: [
|
||||
[
|
||||
'01a',
|
||||
"SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'",
|
||||
'ANALYST_A',
|
||||
'ANALYST_ROLE',
|
||||
'WH_XS',
|
||||
'ANALYTICS',
|
||||
'PUBLIC',
|
||||
'2026-05-04T10:00:00.000Z',
|
||||
'2026-05-04T10:00:01.250Z',
|
||||
1250,
|
||||
'hash-1',
|
||||
'select status from orders',
|
||||
42,
|
||||
3,
|
||||
'2026-05-01T00:00:00.000Z',
|
||||
'2026-05-11T00:00:00.000Z',
|
||||
12,
|
||||
'SUCCESS',
|
||||
null,
|
||||
null,
|
||||
],
|
||||
[
|
||||
'01b',
|
||||
'SELECT * FROM MISSING_TABLE',
|
||||
'ANALYST_B',
|
||||
'ANALYST_ROLE',
|
||||
'WH_XS',
|
||||
'ANALYTICS',
|
||||
'PUBLIC',
|
||||
new Date('2026-05-04T10:05:00.000Z'),
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
'FAILED_WITH_ERROR',
|
||||
'002003',
|
||||
'SQL compilation error',
|
||||
40,
|
||||
0.05,
|
||||
100,
|
||||
JSON.stringify([{ user: 'ANALYST', executions: 1 }]),
|
||||
],
|
||||
],
|
||||
totalRows: 2,
|
||||
totalRows: 1,
|
||||
},
|
||||
]);
|
||||
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
|
||||
|
||||
const rows = [];
|
||||
for await (const row of reader.fetch(
|
||||
for await (const row of reader.fetchAggregated(
|
||||
client,
|
||||
{
|
||||
start: new Date('2026-05-01T00:00:00.000Z'),
|
||||
end: new Date('2026-05-04T12:00:00.000Z'),
|
||||
},
|
||||
'2026-05-03T00:00:00.000Z',
|
||||
{ start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') },
|
||||
{ dialect: 'snowflake', minExecutions: 5, windowDays: 90, concurrency: 12, filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 },
|
||||
)) {
|
||||
rows.push(row);
|
||||
}
|
||||
|
||||
expect(client.executeQuery).toHaveBeenCalledTimes(1);
|
||||
const sql = firstQuery(client);
|
||||
expect(sql).toContain('FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY');
|
||||
expect(sql).toContain("START_TIME >= '2026-05-03T00:00:00.000Z'::TIMESTAMP_TZ");
|
||||
expect(sql).toContain("START_TIME < '2026-05-04T12:00:00.000Z'::TIMESTAMP_TZ");
|
||||
expect(sql).toContain('ORDER BY START_TIME ASC, QUERY_ID ASC');
|
||||
expect(sql).toContain('ROWS_PRODUCED');
|
||||
|
||||
expect(rows).toEqual([
|
||||
expect(sql).toContain('SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY');
|
||||
expect(sql).toContain('COUNT(*) AS executions');
|
||||
expect(sql).toContain('GROUP BY query_hash');
|
||||
expect(sql).toContain('HAVING COUNT(*) >= 5');
|
||||
expect(rows).toMatchObject([
|
||||
{
|
||||
id: '01a',
|
||||
sql: "SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'",
|
||||
user: 'ANALYST_A',
|
||||
startedAt: '2026-05-04T10:00:00.000Z',
|
||||
endedAt: '2026-05-04T10:00:01.250Z',
|
||||
runtimeMs: 1250,
|
||||
rowsProduced: 12,
|
||||
success: true,
|
||||
errorMessage: null,
|
||||
},
|
||||
{
|
||||
id: '01b',
|
||||
sql: 'SELECT * FROM MISSING_TABLE',
|
||||
user: 'ANALYST_B',
|
||||
startedAt: '2026-05-04T10:05:00.000Z',
|
||||
endedAt: null,
|
||||
runtimeMs: null,
|
||||
rowsProduced: null,
|
||||
success: false,
|
||||
errorMessage: '002003: SQL compilation error',
|
||||
templateId: 'hash-1',
|
||||
stats: {
|
||||
executions: 42,
|
||||
errorRate: 0.05,
|
||||
},
|
||||
topUsers: [{ user: 'ANALYST', executions: 1 }],
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('uses the window start when no cursor is available', async () => {
|
||||
const client = queryClient([{ headers: ['QUERY_ID'], rows: [], totalRows: 0 }]);
|
||||
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
|
||||
|
||||
for await (const _row of reader.fetch(client, {
|
||||
start: new Date('2026-02-03T12:00:00.000Z'),
|
||||
end: new Date('2026-05-04T12:00:00.000Z'),
|
||||
})) {
|
||||
throw new Error('empty result should not yield rows');
|
||||
}
|
||||
|
||||
const sql = firstQuery(client);
|
||||
expect(sql).toContain("START_TIME >= '2026-02-03T12:00:00.000Z'::TIMESTAMP_TZ");
|
||||
});
|
||||
|
||||
it('throws a clear error when the query client cannot execute SQL', async () => {
|
||||
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
|
||||
|
||||
await expect(async () => {
|
||||
for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) {
|
||||
for await (const _row of reader.fetchAggregated(
|
||||
{},
|
||||
{ start: new Date(), end: new Date() },
|
||||
{
|
||||
dialect: 'snowflake',
|
||||
minExecutions: 5,
|
||||
windowDays: 90,
|
||||
concurrency: 12,
|
||||
filters: { dropTrivialProbes: true },
|
||||
redactionPatterns: [],
|
||||
staleArchiveAfterDays: 90,
|
||||
},
|
||||
)) {
|
||||
throw new Error('unreachable');
|
||||
}
|
||||
}).rejects.toThrow('Historic SQL Snowflake reader requires a query client with executeQuery(query)');
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
import { HistoricSqlGrantsMissingError } from './errors.js';
|
||||
import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js';
|
||||
import {
|
||||
aggregatedTemplateSchema,
|
||||
type AggregatedTemplate,
|
||||
type HistoricSqlTimeWindow,
|
||||
type HistoricSqlUnifiedPullConfig,
|
||||
} from './types.js';
|
||||
|
||||
interface QueryResultLike {
|
||||
headers: string[];
|
||||
|
|
@ -52,32 +57,6 @@ function timestampLiteral(value: Date | string): string {
|
|||
return `'${date.toISOString().replace(/'/g, "''")}'::TIMESTAMP_TZ`;
|
||||
}
|
||||
|
||||
/**
 * Builds the SELECT against SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY for a time window.
 *
 * @param window - Time window; `window.end` is the exclusive upper bound.
 * @param cursor - When present (not null/undefined) it replaces `window.start` as the
 *   inclusive lower bound.
 * @returns The SQL text with leading/trailing whitespace trimmed.
 */
function queryHistorySql(window: HistoricSqlTimeWindow, cursor?: string | null): string {
  const lowerBound = timestampLiteral(cursor ?? window.start);
  const upperBound = timestampLiteral(window.end);
  const statement = `
SELECT
QUERY_ID,
QUERY_TEXT,
USER_NAME,
ROLE_NAME,
WAREHOUSE_NAME,
DATABASE_NAME,
SCHEMA_NAME,
START_TIME,
END_TIME,
TOTAL_ELAPSED_TIME,
ROWS_PRODUCED,
EXECUTION_STATUS,
ERROR_CODE,
ERROR_MESSAGE
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
WHERE START_TIME >= ${lowerBound}
AND START_TIME < ${upperBound}
AND QUERY_TEXT IS NOT NULL
ORDER BY START_TIME ASC, QUERY_ID ASC`;
  return statement.trim();
}
|
||||
|
||||
function indexByHeader(headers: string[]): Map<string, number> {
|
||||
const out = new Map<string, number>();
|
||||
headers.forEach((header, index) => {
|
||||
|
|
@ -87,7 +66,7 @@ function indexByHeader(headers: string[]): Map<string, number> {
|
|||
}
|
||||
|
||||
/**
 * Reads one cell from a result row by column name.
 *
 * The name is upper-cased before the lookup, so callers may pass either `template_id`
 * or `TEMPLATE_ID`. NOTE(review): this presumes the header index stores upper-cased
 * keys — confirm against `indexByHeader`.
 *
 * @param row - Positional cell values of a single result row.
 * @param indexes - Map from header name to its position in `row`.
 * @param name - Column name to read (case-insensitive).
 * @returns The cell at the column's position, or `null` when the column is not indexed.
 */
function value(row: unknown[], indexes: Map<string, number>, name: string): unknown {
  const index = indexes.get(name.toUpperCase());
  return index === undefined ? null : row[index];
}
|
||||
|
||||
|
|
@ -118,6 +97,18 @@ function nullableNumber(raw: unknown): number | null {
|
|||
return number;
|
||||
}
|
||||
|
||||
/**
 * Converts a raw cell to a number, rejecting anything `nullableNumber` maps to `null`.
 *
 * @param raw - Raw cell value.
 * @param field - Column name, used only in the error message.
 * @returns The parsed number.
 * @throws Error when `nullableNumber(raw)` is `null`.
 */
function requiredNumber(raw: unknown, field: string): number {
  const parsed = nullableNumber(raw);
  if (parsed !== null) {
    return parsed;
  }
  throw new Error(`Snowflake QUERY_HISTORY row has invalid ${field}: ${String(raw)}`);
}
|
||||
|
||||
/**
 * Like `requiredNumber`, but drops any fractional part via `Math.trunc` (towards zero).
 *
 * @param raw - Raw cell value.
 * @param field - Column name, forwarded for the error message.
 * @returns The truncated number.
 * @throws Error whenever `requiredNumber` throws.
 */
function requiredInteger(raw: unknown, field: string): number {
  const parsed = requiredNumber(raw, field);
  return Math.trunc(parsed);
}
|
||||
|
||||
function nullableInteger(raw: unknown): number | null {
|
||||
const number = nullableNumber(raw);
|
||||
return number === null ? null : Math.trunc(number);
|
||||
|
|
@ -135,46 +126,50 @@ function isoTimestamp(raw: unknown, field: string): string {
|
|||
return date.toISOString();
|
||||
}
|
||||
|
||||
function nullableIsoTimestamp(raw: unknown): string | null {
|
||||
if (raw === null || raw === undefined || raw === '') {
|
||||
return null;
|
||||
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
|
||||
const text = nullableString(raw);
|
||||
if (!text) {
|
||||
return [];
|
||||
}
|
||||
return isoTimestamp(raw, 'END_TIME');
|
||||
}
|
||||
|
||||
function executionSucceeded(status: string | null, errorCode: string | null, errorMessage: string | null): boolean {
|
||||
if (errorCode || errorMessage) {
|
||||
return false;
|
||||
try {
|
||||
const parsed = JSON.parse(text) as unknown;
|
||||
if (!Array.isArray(parsed)) {
|
||||
return [];
|
||||
}
|
||||
return parsed.flatMap((entry) => {
|
||||
if (!entry || typeof entry !== 'object') {
|
||||
return [];
|
||||
}
|
||||
const user = nullableString((entry as { user?: unknown }).user);
|
||||
const executions = nullableInteger((entry as { executions?: unknown }).executions);
|
||||
return executions === null ? [] : [{ user, executions }];
|
||||
});
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
return status === null || status.toUpperCase().startsWith('SUCCESS');
|
||||
}
|
||||
|
||||
function combinedErrorMessage(errorCode: string | null, errorMessage: string | null): string | null {
|
||||
if (errorCode && errorMessage) {
|
||||
return `${errorCode}: ${errorMessage}`;
|
||||
}
|
||||
return errorMessage ?? errorCode;
|
||||
/**
 * Maps one row of the aggregated QUERY_HISTORY result into an `AggregatedTemplate`,
 * validating the assembled object with `aggregatedTemplateSchema.parse`.
 *
 * @param row - Positional cell values of a single result row.
 * @param indexes - Header-name to position map for `row`.
 * @returns The schema-parsed template; `dialect` is always `'snowflake'`.
 * @throws Whatever the required-field helpers or the schema parse throw on invalid data.
 */
function mapAggregatedRow(row: unknown[], indexes: Map<string, number>): AggregatedTemplate {
  const cell = (column: string): unknown => value(row, indexes, column);

  // Fields are read in the same order as the output object so the first invalid
  // column is the one reported.
  const templateId = requiredString(cell('template_id'), 'template_id');
  const canonicalSql = requiredString(cell('canonical_sql'), 'canonical_sql');
  const stats = {
    executions: requiredInteger(cell('executions'), 'executions'),
    distinctUsers: requiredInteger(cell('distinct_users'), 'distinct_users'),
    firstSeen: isoTimestamp(cell('first_seen'), 'first_seen'),
    lastSeen: isoTimestamp(cell('last_seen'), 'last_seen'),
    p50RuntimeMs: nullableNumber(cell('p50_ms')),
    p95RuntimeMs: nullableNumber(cell('p95_ms')),
    errorRate: requiredNumber(cell('error_rate'), 'error_rate'),
    rowsProduced: nullableInteger(cell('rows_produced')),
  };
  const topUsers = parseTopUsers(cell('top_users'));

  return aggregatedTemplateSchema.parse({
    templateId,
    canonicalSql,
    dialect: 'snowflake',
    stats,
    topUsers,
  });
}
|
||||
|
||||
/**
 * Maps one raw QUERY_HISTORY result row into a `HistoricSqlRawQueryRow`.
 *
 * @param row - Positional cell values of a single result row.
 * @param indexes - Header-name to position map for `row`.
 * @returns The mapped row; `success` and `errorMessage` are derived from
 *   EXECUTION_STATUS, ERROR_CODE and ERROR_MESSAGE.
 * @throws Whatever `requiredString` / `isoTimestamp` throw for QUERY_ID, QUERY_TEXT or START_TIME.
 */
function mapRow(row: unknown[], indexes: Map<string, number>): HistoricSqlRawQueryRow {
  const cell = (column: string): unknown => value(row, indexes, column);

  const errorCode = nullableString(cell('ERROR_CODE'));
  const errorMessage = nullableString(cell('ERROR_MESSAGE'));
  const rowsProduced = nullableInteger(cell('ROWS_PRODUCED'));

  const id = requiredString(cell('QUERY_ID'), 'QUERY_ID');
  const sql = requiredString(cell('QUERY_TEXT'), 'QUERY_TEXT');
  const user = nullableString(cell('USER_NAME'));
  const startedAt = isoTimestamp(cell('START_TIME'), 'START_TIME');
  const endedAt = nullableIsoTimestamp(cell('END_TIME'));
  const runtimeMs = nullableNumber(cell('TOTAL_ELAPSED_TIME'));
  const status = nullableString(cell('EXECUTION_STATUS'));

  return {
    id,
    sql,
    user,
    startedAt,
    endedAt,
    runtimeMs,
    rowsProduced,
    success: executionSucceeded(status, errorCode, errorMessage),
    errorMessage: combinedErrorMessage(errorCode, errorMessage),
  };
}
|
||||
|
||||
export class SnowflakeHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader {
|
||||
async probe(client: unknown): Promise<void> {
|
||||
export class SnowflakeHistoricSqlQueryHistoryReader {
|
||||
async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> {
|
||||
let result: QueryResultLike;
|
||||
try {
|
||||
result = await queryClient(client).executeQuery(PROBE_SQL);
|
||||
|
|
@ -184,20 +179,42 @@ export class SnowflakeHistoricSqlQueryHistoryReader implements HistoricSqlQueryH
|
|||
if (result.error) {
|
||||
throw grantsError(result.error);
|
||||
}
|
||||
return { warnings: [], info: [] };
|
||||
}
|
||||
|
||||
async *fetch(
|
||||
async *fetchAggregated(
|
||||
client: unknown,
|
||||
window: HistoricSqlTimeWindow,
|
||||
cursor?: string | null,
|
||||
): AsyncIterable<HistoricSqlRawQueryRow> {
|
||||
const result = await queryClient(client).executeQuery(queryHistorySql(window, cursor));
|
||||
config: HistoricSqlUnifiedPullConfig,
|
||||
): AsyncIterable<AggregatedTemplate> {
|
||||
const sql = `
|
||||
SELECT
|
||||
query_hash AS template_id,
|
||||
MIN(query_text) AS canonical_sql,
|
||||
COUNT(*) AS executions,
|
||||
COUNT(DISTINCT user_name) AS distinct_users,
|
||||
MIN(start_time) AS first_seen,
|
||||
MAX(start_time) AS last_seen,
|
||||
APPROX_PERCENTILE(total_elapsed_time, 0.50) AS p50_ms,
|
||||
APPROX_PERCENTILE(total_elapsed_time, 0.95) AS p95_ms,
|
||||
DIV0(COUNT_IF(execution_status != 'SUCCESS'), COUNT(*)) AS error_rate,
|
||||
SUM(rows_produced) AS rows_produced,
|
||||
ARRAY_AGG(OBJECT_CONSTRUCT('user', user_name, 'executions', 1)) WITHIN GROUP (ORDER BY start_time DESC)::string AS top_users
|
||||
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
|
||||
WHERE query_text IS NOT NULL
|
||||
AND query_type IN ('SELECT', 'MERGE')
|
||||
AND start_time >= ${timestampLiteral(window.start)}
|
||||
AND start_time < ${timestampLiteral(window.end)}
|
||||
GROUP BY query_hash
|
||||
HAVING COUNT(*) >= ${config.minExecutions}
|
||||
ORDER BY executions DESC`.trim();
|
||||
const result = await queryClient(client).executeQuery(sql);
|
||||
if (result.error) {
|
||||
throw grantsError(result.error);
|
||||
}
|
||||
const indexes = indexByHeader(result.headers);
|
||||
for (const row of result.rows) {
|
||||
yield mapRow(row, indexes);
|
||||
yield mapAggregatedRow(row, indexes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,152 +0,0 @@
|
|||
import { mkdir, mkdtemp, readdir, readFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { dirname, join, relative, sep } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { stagePgStatStatementsTemplates, writePgssBaselineAtomic, type PgssBaseline } from './stage-pgss.js';
import type { HistoricSqlPullConfig, KtxPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
|
||||
|
||||
const FIXTURE_ROOT = join(__dirname, '__fixtures__/postgres');
|
||||
|
||||
/** Shape of a golden fixture's `input.json`, as loaded by `readFixture`. */
interface GoldenFixture {
  /** Fixture name. */
  name: string;
  /** Timestamp string handed to `new Date(...)` and used as the staging clock. */
  now: string;
  /** Connection id passed to the stager and used in the baseline cache path. */
  connectionId: string;
  /** Value the fake reader returns from `probe()`. */
  probe: {
    pgServerVersion: string;
    warnings: string[];
  };
  /** Data the fake reader returns from `readSnapshot()` (rows capped at `maxTemplates`). */
  snapshot: {
    statsResetAt: string | null;
    deallocCount: number | null;
    rows: PostgresPgssRow[];
  };
  /** Pull configuration passed to the stager. */
  pullConfig: HistoricSqlPullConfig & { dialect: 'postgres' };
  /** Canned analysis results, keyed by the exact SQL text being analysed. */
  analysisBySql: Record<
    string,
    {
      fingerprint: string;
      normalizedSql: string;
      tablesTouched: string[];
      literalSlots: [];
      error?: string;
    }
  >;
  /** Baseline written to disk before staging; `null` means no baseline file is written. */
  baseline: PgssBaseline | null;
  /** Baseline the staging result is expected to equal. */
  expectedBaseline: PgssBaseline;
  /** Expected staged files by relative path, compared either as parsed JSON or as raw text. */
  expectedFiles: Record<string, { json?: unknown; text?: string }>;
}
|
||||
|
||||
/**
 * Loads `<FIXTURE_ROOT>/<name>/input.json` and parses it as a `GoldenFixture`.
 * The parsed JSON is cast, not validated.
 *
 * @param name - Fixture directory name under `FIXTURE_ROOT`.
 */
async function readFixture(name: string): Promise<GoldenFixture> {
  const inputPath = join(FIXTURE_ROOT, name, 'input.json');
  const contents = await readFile(inputPath, 'utf-8');
  return JSON.parse(contents) as GoldenFixture;
}
|
||||
|
||||
/**
 * Creates a fresh directory under the OS temp directory.
 *
 * @param prefix - Name prefix handed to `mkdtemp`.
 * @returns The path of the created directory.
 */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return mkdtemp(template);
}
|
||||
|
||||
/** Stub query client whose `executeQuery` always resolves to an empty result set. */
function fakePgClient(): KtxPostgresQueryClient {
  return {
    executeQuery: async () => ({ headers: [], rows: [] }),
  };
}
|
||||
|
||||
/**
 * Builds a reader that serves the fixture's canned probe result and snapshot.
 *
 * @param fixture - Fixture providing `probe` and `snapshot`.
 * @returns A reader whose `readSnapshot` keeps only the first `options.maxTemplates` rows.
 */
function fixtureReader(fixture: GoldenFixture): PostgresPgssReader {
  return {
    probe: async () => fixture.probe,
    readSnapshot: async (_client, options) => {
      const { statsResetAt, deallocCount, rows } = fixture.snapshot;
      return {
        statsResetAt,
        deallocCount,
        rows: rows.slice(0, options.maxTemplates),
      };
    },
  };
}
|
||||
|
||||
/**
 * Builds a SQL-analysis stub backed by the fixture's `analysisBySql` table.
 *
 * SQL text with no entry in the table resolves to an empty analysis carrying an
 * `error` that names the missing SQL.
 *
 * @param fixture - Fixture providing the canned analysis results.
 */
function fixtureSqlAnalysis(fixture: GoldenFixture): SqlAnalysisPort {
  return {
    analyzeForFingerprint: async (sql) =>
      fixture.analysisBySql[sql] ?? {
        fingerprint: '',
        normalizedSql: '',
        tablesTouched: [],
        literalSlots: [],
        error: `missing fixture analysis for ${sql}`,
      },
  };
}
|
||||
|
||||
/**
 * Writes the fixture's starting baseline to `path`; does nothing when the fixture has none.
 *
 * @param path - Destination of the baseline file.
 * @param baseline - Baseline to write, or `null` to skip writing.
 */
async function writeFixtureBaseline(path: string, baseline: PgssBaseline | null): Promise<void> {
  if (baseline) {
    await writePgssBaselineAtomic(path, baseline);
  }
}
|
||||
|
||||
/**
 * Recursively lists every non-directory entry beneath `current`, as paths relative to `root`.
 *
 * Paths are returned with forward slashes on every platform so they can be compared
 * with path keys written with `/` (presumably how fixture `expectedFiles` keys are
 * written — confirm against the fixtures).
 *
 * @param root - Directory the returned paths are relative to.
 * @param current - Directory to scan; defaults to `root` and is advanced by the recursion.
 * @returns Relative file paths in directory-listing order (not sorted).
 */
async function listFiles(root: string, current = root): Promise<string[]> {
  const entries = await readdir(current, { withFileTypes: true });
  const files: string[] = [];
  for (const entry of entries) {
    const fullPath = join(current, entry.name);
    if (entry.isDirectory()) {
      files.push(...(await listFiles(root, fullPath)));
    } else {
      // `relative` uses the platform separator (`\` on Windows); normalise it to `/`.
      files.push(relative(root, fullPath).split(sep).join('/'));
    }
  }
  return files;
}
|
||||
|
||||
/**
 * Asserts that the staged directory contains exactly the expected files with the
 * expected contents.
 *
 * The sorted set of paths must match first. Each file is then compared as parsed JSON
 * when its expectation has a `json` key, otherwise as raw text against `text`.
 *
 * @param stagedDir - Directory produced by the stager.
 * @param expectedFiles - Expected files keyed by path relative to `stagedDir`.
 */
async function expectGoldenFiles(stagedDir: string, expectedFiles: GoldenFixture['expectedFiles']): Promise<void> {
  const stagedPaths = await listFiles(stagedDir);
  const goldenPaths = Object.keys(expectedFiles).sort();
  expect(stagedPaths.sort()).toEqual(goldenPaths);

  for (const goldenPath of goldenPaths) {
    const golden = expectedFiles[goldenPath];
    const contents = await readFile(join(stagedDir, goldenPath), 'utf-8');
    if (!('json' in golden)) {
      expect(contents).toBe(golden.text);
      continue;
    }
    expect(JSON.parse(contents)).toEqual(golden.json);
  }
}
|
||||
|
||||
// Runs the stager once per committed fixture and compares both the staged files and
// the returned baseline with the fixture's expectations.
describe('stagePgStatStatementsTemplates golden fixtures', () => {
  const fixtureNames = ['first-run', 'normal-delta', 'reset-detected', 'version-change', 'eviction-churn'] as const;

  it.each(fixtureNames)('matches the committed %s golden output', async (fixtureName) => {
    const fixture = await readFixture(fixtureName);

    // Each case gets its own temp root holding the staged output and the baseline cache.
    const root = await tempDir(`pgss-golden-${fixtureName}-`);
    const stagedDir = join(root, 'staged');
    const baselinePath = join(root, 'cache', fixture.connectionId, 'pgss-baseline.json');
    await mkdir(dirname(baselinePath), { recursive: true });
    await writeFixtureBaseline(baselinePath, fixture.baseline);

    const { baseline } = await stagePgStatStatementsTemplates({
      stagedDir,
      connectionId: fixture.connectionId,
      queryClient: fakePgClient(),
      reader: fixtureReader(fixture),
      sqlAnalysis: fixtureSqlAnalysis(fixture),
      pullConfig: fixture.pullConfig,
      baselinePath,
      now: new Date(fixture.now),
    });

    await expectGoldenFiles(stagedDir, fixture.expectedFiles);
    expect(baseline).toEqual(fixture.expectedBaseline);
  });
});
|
||||
|
|
@ -1,652 +0,0 @@
|
|||
import { mkdtemp, readFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import {
|
||||
pgssBaselinePath,
|
||||
readPgssBaseline,
|
||||
stagePgStatStatementsTemplates,
|
||||
writePgssBaselineAtomic,
|
||||
type PgssBaseline,
|
||||
} from './stage-pgss.js';
|
||||
import { historicSqlManifestSchema, historicSqlMetadataSchema, historicSqlUsageSchema } from './types.js';
|
||||
import type { KtxPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
|
||||
|
||||
/**
 * Creates a fresh directory under the OS temp directory.
 *
 * @param prefix - Name prefix handed to `mkdtemp`.
 * @returns The path of the created directory.
 */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return mkdtemp(template);
}
|
||||
|
||||
/**
 * Reads a UTF-8 file beneath `root` and parses it as JSON.
 * The result is cast to `T` without validation.
 *
 * @param root - Base directory.
 * @param relPath - File path relative to `root`.
 */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const contents = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(contents) as T;
}
|
||||
|
||||
/** Stub query client whose `executeQuery` always resolves to an empty result set. */
function fakePgClient(): KtxPostgresQueryClient {
  return {
    executeQuery: async () => ({ headers: [], rows: [] }),
  };
}
|
||||
|
||||
/**
 * Builds a snapshot row for tests from default values plus overrides.
 *
 * @param overrides - Must supply `queryid` and `query`; any other field replaces its default.
 * @returns The defaults merged with `overrides` (overrides win).
 */
function row(overrides: Partial<PostgresPgssRow> & Pick<PostgresPgssRow, 'queryid' | 'query'>): PostgresPgssRow {
  const defaults = {
    userid: '11',
    username: 'analyst',
    dbid: '5',
    database: 'warehouse',
    calls: 10,
    totalExecTime: 250,
    meanExecTime: 25,
    totalRows: 20,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Builds a reader whose `probe` and `readSnapshot` are `vi.fn` mocks serving canned data.
 *
 * Defaults are applied with `??`, so an explicit `null` for `statsResetAt` or
 * `deallocCount` is replaced by the default just like an omitted value.
 *
 * @param input - Canned probe/snapshot values; only `rows` is required.
 * @returns A reader whose `readSnapshot` keeps only the first `options.maxTemplates` rows.
 */
function fakeReader(input: {
  pgServerVersion?: string;
  warnings?: string[];
  statsResetAt?: string | null;
  deallocCount?: number | null;
  rows: PostgresPgssRow[];
}): PostgresPgssReader {
  return {
    probe: vi.fn(async () => {
      const pgServerVersion = input.pgServerVersion ?? 'PostgreSQL 16.4';
      const warnings = input.warnings ?? [];
      return { pgServerVersion, warnings };
    }),
    readSnapshot: vi.fn(async (_client, options) => {
      const statsResetAt = input.statsResetAt ?? '2026-05-08T08:00:00.000Z';
      const deallocCount = input.deallocCount ?? 0;
      const rows = input.rows.slice(0, options.maxTemplates);
      return { statsResetAt, deallocCount, rows };
    }),
  };
}
|
||||
|
||||
/**
 * SQL-analysis stub keyed on substrings of the SQL text, checked in this order:
 * `broken` yields an empty analysis with `error: 'parse failed'`; `customers` yields the
 * customers fingerprint; anything else yields the orders fingerprint.
 */
const sqlAnalysis: SqlAnalysisPort = {
  analyzeForFingerprint: async (sql) => {
    const recognised = (fingerprint: string, table: string, normalizedSql: string) => ({
      fingerprint,
      normalizedSql,
      tablesTouched: [table],
      literalSlots: [],
    });

    if (sql.includes('broken')) {
      return {
        fingerprint: '',
        normalizedSql: '',
        tablesTouched: [],
        literalSlots: [],
        error: 'parse failed',
      };
    }
    return sql.includes('customers')
      ? recognised('fp_customers', 'analytics.customers', 'SELECT count(*) FROM analytics.customers')
      : recognised('fp_orders', 'analytics.orders', 'SELECT count(*) FROM analytics.orders WHERE status = $1');
  },
};
|
||||
|
||||
/**
 * Builds the Postgres pull configuration shared by the staging tests.
 *
 * @param maxTemplatesPerRun - Per-run template cap; defaults to 5000.
 * @returns A new config object with every other field fixed.
 */
function postgresPullConfig(maxTemplatesPerRun = 5000) {
  const config = {
    dialect: 'postgres' as const,
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns: ['^svc_'],
    redactionPatterns: ['secret'],
    maxTemplatesPerRun,
    minCalls: 5,
  };
  return config;
}
|
||||
|
||||
describe('stagePgStatStatementsTemplates', () => {
|
||||
it('stages first-run PGSS templates as degraded aggregate templates and builds a next baseline', async () => {
|
||||
const stagedDir = await tempDir('pgss-stage-first-');
|
||||
const baselineRootDir = await tempDir('pgss-baseline-first-');
|
||||
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
|
||||
|
||||
const result = await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'conn_pg',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
warnings: ['pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config'],
|
||||
deallocCount: 2,
|
||||
rows: [
|
||||
row({
|
||||
queryid: '101',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 10,
|
||||
totalExecTime: 250,
|
||||
totalRows: 20,
|
||||
}),
|
||||
row({
|
||||
queryid: '102',
|
||||
query: 'SELECT * FROM pg_catalog.pg_class',
|
||||
calls: 50,
|
||||
totalExecTime: 500,
|
||||
}),
|
||||
row({
|
||||
queryid: '103',
|
||||
query: 'BEGIN',
|
||||
calls: 75,
|
||||
totalExecTime: 75,
|
||||
}),
|
||||
row({
|
||||
queryid: '104',
|
||||
query: 'SELECT broken FROM analytics.orders',
|
||||
calls: 8,
|
||||
totalExecTime: 80,
|
||||
}),
|
||||
],
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(),
|
||||
baselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
|
||||
expect(manifest).toMatchObject({
|
||||
source: 'historic-sql',
|
||||
connectionId: 'conn_pg',
|
||||
dialect: 'postgres',
|
||||
fetchedAt: '2026-05-08T12:00:00.000Z',
|
||||
windowEnd: '2026-05-08T12:00:00.000Z',
|
||||
nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
|
||||
templateCount: 1,
|
||||
capped: false,
|
||||
degraded: true,
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
baselineFirstRun: true,
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
deallocCount: 2,
|
||||
});
|
||||
expect(manifest.warnings).toEqual([
|
||||
'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config',
|
||||
'pgss_dealloc_count:2; pg_stat_statements.max may be too low, causing template eviction churn',
|
||||
'baseline_first_run:no_previous_pgss_baseline',
|
||||
'analysis_failed:db5_q104',
|
||||
]);
|
||||
expect(manifest.templates).toEqual([
|
||||
{
|
||||
id: 'db5_q101',
|
||||
fingerprint: 'fp_orders',
|
||||
subClusterId: null,
|
||||
path: 'templates/db5_q101/page.md',
|
||||
},
|
||||
]);
|
||||
|
||||
const metadata = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q101/metadata.json'));
|
||||
expect(metadata).toMatchObject({
|
||||
id: 'db5_q101',
|
||||
title: 'postgres · analytics.orders [db5_q101]',
|
||||
path: 'templates/db5_q101/page.md',
|
||||
objectType: 'historic_sql_template',
|
||||
lastEditedAt: null,
|
||||
properties: {
|
||||
fingerprint: 'fp_orders',
|
||||
sub_cluster_id: null,
|
||||
dialect: 'postgres',
|
||||
tables_touched: ['analytics.orders'],
|
||||
literal_slots: [],
|
||||
},
|
||||
});
|
||||
expect(metadata.properties.triage_signals).toEqual({
|
||||
executions_bucket: 'mid',
|
||||
distinct_users_bucket: 'solo',
|
||||
error_rate_bucket: 'ok',
|
||||
recency_bucket: 'active',
|
||||
service_account_only: 'false',
|
||||
runtime_bucket: 'fast',
|
||||
});
|
||||
|
||||
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q101/usage.json'));
|
||||
expect(usage).toEqual({
|
||||
stats: {
|
||||
executions: 10,
|
||||
distinct_users: 1,
|
||||
first_seen: '2026-05-08T12:00:00.000Z',
|
||||
last_seen: '2026-05-08T12:00:00.000Z',
|
||||
p50_runtime_ms: null,
|
||||
p95_runtime_ms: null,
|
||||
mean_runtime_ms: 25,
|
||||
error_rate: 0,
|
||||
rows_produced: 20,
|
||||
},
|
||||
literal_slots: [],
|
||||
samples: [],
|
||||
});
|
||||
|
||||
expect(await readFile(join(stagedDir, 'templates/db5_q101/page.md'), 'utf-8')).toContain(
|
||||
'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
);
|
||||
expect(result.baselinePath).toBe(baselinePath);
|
||||
expect(result.baseline.templates.db5_q101.perUser['11']).toEqual({
|
||||
calls: 10,
|
||||
totalExecTime: 250,
|
||||
totalRows: 20,
|
||||
});
|
||||
await expect(readPgssBaseline(baselinePath)).resolves.toBeNull();
|
||||
});
|
||||
|
||||
it('warns when pg_stat_statements reports dealloc churn', async () => {
|
||||
const root = await tempDir('pgss-churn-');
|
||||
const stagedDir = join(root, 'staged');
|
||||
const baselinePath = join(root, 'cache', 'warehouse', 'pgss-baseline.json');
|
||||
|
||||
await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'warehouse',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
rows: [
|
||||
row({
|
||||
queryid: '901',
|
||||
query: 'SELECT COUNT(*) FROM public.orders WHERE status = $1',
|
||||
calls: 20,
|
||||
totalExecTime: 500,
|
||||
meanExecTime: 25,
|
||||
}),
|
||||
],
|
||||
deallocCount: 3,
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(50),
|
||||
baselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = await readJson<{ warnings: string[]; deallocCount: number }>(stagedDir, 'manifest.json');
|
||||
expect(manifest.deallocCount).toBe(3);
|
||||
expect(manifest.warnings).toContain(
|
||||
'pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn',
|
||||
);
|
||||
});
|
||||
|
||||
it('uses the saved cumulative baseline to stage only positive deltas on later runs', async () => {
|
||||
const stagedDir = await tempDir('pgss-stage-delta-');
|
||||
const baselineRootDir = await tempDir('pgss-baseline-delta-');
|
||||
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
|
||||
const baseline: PgssBaseline = {
|
||||
version: 1,
|
||||
fetchedAt: '2026-05-08T10:00:00.000Z',
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
templates: {
|
||||
db5_q201: {
|
||||
firstObservedAt: '2026-05-08T09:00:00.000Z',
|
||||
perUser: {
|
||||
'11': { calls: 10, totalExecTime: 100, totalRows: 50 },
|
||||
'12': { calls: 5, totalExecTime: 50, totalRows: 25 },
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
await writePgssBaselineAtomic(baselinePath, baseline);
|
||||
|
||||
await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'conn_pg',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
rows: [
|
||||
row({
|
||||
queryid: '201',
|
||||
userid: '11',
|
||||
username: 'analyst',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 12,
|
||||
totalExecTime: 160,
|
||||
totalRows: 58,
|
||||
}),
|
||||
row({
|
||||
queryid: '201',
|
||||
userid: '12',
|
||||
username: 'svc_loader',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 5,
|
||||
totalExecTime: 50,
|
||||
totalRows: 25,
|
||||
}),
|
||||
row({
|
||||
queryid: '202',
|
||||
userid: '13',
|
||||
username: 'analyst_2',
|
||||
query: 'SELECT count(*) FROM analytics.customers',
|
||||
calls: 7,
|
||||
totalExecTime: 210,
|
||||
totalRows: 7,
|
||||
}),
|
||||
],
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(),
|
||||
baselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
|
||||
expect(manifest.baselineFirstRun).toBe(false);
|
||||
expect(manifest.windowStart).toBe('2026-05-08T10:00:00.000Z');
|
||||
expect(manifest.templateCount).toBe(2);
|
||||
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q202', 'db5_q201']);
|
||||
|
||||
const usage201 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q201/usage.json'));
|
||||
expect(usage201.stats).toMatchObject({
|
||||
executions: 2,
|
||||
distinct_users: 1,
|
||||
first_seen: '2026-05-08T09:00:00.000Z',
|
||||
last_seen: '2026-05-08T12:00:00.000Z',
|
||||
mean_runtime_ms: 30,
|
||||
rows_produced: 8,
|
||||
});
|
||||
const metadata201 = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q201/metadata.json'));
|
||||
expect(metadata201.properties.triage_signals.service_account_only).toBe('false');
|
||||
|
||||
const usage202 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q202/usage.json'));
|
||||
expect(usage202.stats).toMatchObject({
|
||||
executions: 7,
|
||||
distinct_users: 1,
|
||||
first_seen: '2026-05-08T12:00:00.000Z',
|
||||
mean_runtime_ms: 30,
|
||||
rows_produced: 7,
|
||||
});
|
||||
});
|
||||
|
||||
it('keeps matching queryid values from different databases as distinct templates and baseline entries', async () => {
|
||||
const stagedDir = await tempDir('pgss-stage-db-key-');
|
||||
const baselineRootDir = await tempDir('pgss-baseline-db-key-');
|
||||
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
|
||||
await writePgssBaselineAtomic(baselinePath, {
|
||||
version: 1,
|
||||
fetchedAt: '2026-05-08T10:00:00.000Z',
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
templates: {
|
||||
db5_q701: {
|
||||
firstObservedAt: '2026-05-08T09:00:00.000Z',
|
||||
perUser: {
|
||||
'11': { calls: 10, totalExecTime: 100, totalRows: 50 },
|
||||
},
|
||||
},
|
||||
db6_q701: {
|
||||
firstObservedAt: '2026-05-08T09:30:00.000Z',
|
||||
perUser: {
|
||||
'11': { calls: 4, totalExecTime: 40, totalRows: 20 },
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const result = await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'conn_pg',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
rows: [
|
||||
row({
|
||||
queryid: '701',
|
||||
dbid: '5',
|
||||
database: 'warehouse',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 12,
|
||||
totalExecTime: 160,
|
||||
totalRows: 58,
|
||||
}),
|
||||
row({
|
||||
queryid: '701',
|
||||
dbid: '6',
|
||||
database: 'app',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 9,
|
||||
totalExecTime: 130,
|
||||
totalRows: 35,
|
||||
}),
|
||||
],
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(),
|
||||
baselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
|
||||
expect(manifest.templates.map((template) => template.id).sort()).toEqual(['db5_q701', 'db6_q701']);
|
||||
|
||||
const warehouseUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q701/usage.json'));
|
||||
expect(warehouseUsage.stats).toMatchObject({
|
||||
executions: 2,
|
||||
rows_produced: 8,
|
||||
first_seen: '2026-05-08T09:00:00.000Z',
|
||||
});
|
||||
|
||||
const appUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db6_q701/usage.json'));
|
||||
expect(appUsage.stats).toMatchObject({
|
||||
executions: 5,
|
||||
rows_produced: 15,
|
||||
first_seen: '2026-05-08T09:30:00.000Z',
|
||||
});
|
||||
|
||||
expect(result.baseline.templates.db5_q701.perUser['11']).toEqual({
|
||||
calls: 12,
|
||||
totalExecTime: 160,
|
||||
totalRows: 58,
|
||||
});
|
||||
expect(result.baseline.templates.db6_q701.perUser['11']).toEqual({
|
||||
calls: 9,
|
||||
totalExecTime: 130,
|
||||
totalRows: 35,
|
||||
});
|
||||
});
|
||||
|
||||
it('treats stats_reset advancement and major-version changes as fresh baselines', async () => {
|
||||
const resetStagedDir = await tempDir('pgss-stage-reset-');
|
||||
const resetBaselineRootDir = await tempDir('pgss-baseline-reset-');
|
||||
const resetBaselinePath = pgssBaselinePath(resetBaselineRootDir, 'conn_pg');
|
||||
await writePgssBaselineAtomic(resetBaselinePath, {
|
||||
version: 1,
|
||||
fetchedAt: '2026-05-08T10:00:00.000Z',
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
templates: {
|
||||
db5_q301: {
|
||||
firstObservedAt: '2026-05-08T09:00:00.000Z',
|
||||
perUser: {
|
||||
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
await stagePgStatStatementsTemplates({
|
||||
stagedDir: resetStagedDir,
|
||||
connectionId: 'conn_pg',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
statsResetAt: '2026-05-08T11:00:00.000Z',
|
||||
rows: [
|
||||
row({
|
||||
queryid: '301',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 3,
|
||||
totalExecTime: 90,
|
||||
totalRows: 9,
|
||||
}),
|
||||
],
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(),
|
||||
baselinePath: resetBaselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const resetManifest = historicSqlManifestSchema.parse(await readJson(resetStagedDir, 'manifest.json'));
|
||||
expect(resetManifest.baselineFirstRun).toBe(true);
|
||||
expect(resetManifest.warnings).toContain(
|
||||
'baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z',
|
||||
);
|
||||
const resetUsage = historicSqlUsageSchema.parse(await readJson(resetStagedDir, 'templates/db5_q301/usage.json'));
|
||||
expect(resetUsage.stats.executions).toBe(3);
|
||||
|
||||
const versionStagedDir = await tempDir('pgss-stage-version-');
|
||||
const versionBaselineRootDir = await tempDir('pgss-baseline-version-');
|
||||
const versionBaselinePath = pgssBaselinePath(versionBaselineRootDir, 'conn_pg');
|
||||
await writePgssBaselineAtomic(versionBaselinePath, {
|
||||
version: 1,
|
||||
fetchedAt: '2026-05-08T10:00:00.000Z',
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
pgServerVersion: 'PostgreSQL 15.7',
|
||||
templates: {
|
||||
db5_q302: {
|
||||
firstObservedAt: '2026-05-08T09:00:00.000Z',
|
||||
perUser: {
|
||||
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
await stagePgStatStatementsTemplates({
|
||||
stagedDir: versionStagedDir,
|
||||
connectionId: 'conn_pg',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
rows: [
|
||||
row({
|
||||
queryid: '302',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 4,
|
||||
totalExecTime: 80,
|
||||
totalRows: 8,
|
||||
}),
|
||||
],
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(),
|
||||
baselinePath: versionBaselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const versionManifest = historicSqlManifestSchema.parse(await readJson(versionStagedDir, 'manifest.json'));
|
||||
expect(versionManifest.baselineFirstRun).toBe(true);
|
||||
expect(versionManifest.warnings).toContain('baseline_reset:pg_server_major changed from 15 to 16');
|
||||
});
|
||||
|
||||
it('handles scoped counter regressions without forcing a global first-run baseline', async () => {
|
||||
const stagedDir = await tempDir('pgss-stage-scoped-');
|
||||
const baselineRootDir = await tempDir('pgss-baseline-scoped-');
|
||||
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
|
||||
await writePgssBaselineAtomic(baselinePath, {
|
||||
version: 1,
|
||||
fetchedAt: '2026-05-08T10:00:00.000Z',
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
pgServerVersion: 'PostgreSQL 16.4',
|
||||
templates: {
|
||||
db5_q401: {
|
||||
firstObservedAt: '2026-05-08T09:00:00.000Z',
|
||||
perUser: {
|
||||
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
|
||||
'12': { calls: 50, totalExecTime: 500, totalRows: 250 },
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'conn_pg',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
statsResetAt: '2026-05-08T08:00:00.000Z',
|
||||
rows: [
|
||||
row({
|
||||
queryid: '401',
|
||||
userid: '11',
|
||||
username: 'analyst',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 2,
|
||||
totalExecTime: 30,
|
||||
totalRows: 6,
|
||||
}),
|
||||
row({
|
||||
queryid: '401',
|
||||
userid: '12',
|
||||
username: 'svc_loader',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 55,
|
||||
totalExecTime: 650,
|
||||
totalRows: 275,
|
||||
}),
|
||||
],
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(),
|
||||
baselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
|
||||
expect(manifest.baselineFirstRun).toBe(false);
|
||||
expect(manifest.warnings).toContain('scoped_reset:dbid=5 queryid=401 userid=11');
|
||||
|
||||
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q401/usage.json'));
|
||||
expect(usage.stats).toMatchObject({
|
||||
executions: 7,
|
||||
distinct_users: 2,
|
||||
mean_runtime_ms: 25.714285714285715,
|
||||
rows_produced: 31,
|
||||
});
|
||||
});
|
||||
|
||||
it('ranks and caps selected PGSS templates after skip and analysis filtering', async () => {
|
||||
const stagedDir = await tempDir('pgss-stage-cap-');
|
||||
const baselineRootDir = await tempDir('pgss-baseline-cap-');
|
||||
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
|
||||
|
||||
await stagePgStatStatementsTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'conn_pg',
|
||||
queryClient: fakePgClient(),
|
||||
reader: fakeReader({
|
||||
rows: [
|
||||
row({
|
||||
queryid: '501',
|
||||
username: 'analyst_a',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 2,
|
||||
totalExecTime: 20,
|
||||
}),
|
||||
row({
|
||||
queryid: '502',
|
||||
username: 'analyst_b',
|
||||
query: 'SELECT count(*) FROM analytics.customers',
|
||||
calls: 20,
|
||||
totalExecTime: 200,
|
||||
}),
|
||||
row({
|
||||
queryid: '503',
|
||||
username: 'analyst_c',
|
||||
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
|
||||
calls: 10,
|
||||
totalExecTime: 100,
|
||||
}),
|
||||
],
|
||||
}),
|
||||
sqlAnalysis,
|
||||
pullConfig: postgresPullConfig(2),
|
||||
baselinePath,
|
||||
now: new Date('2026-05-08T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
|
||||
expect(manifest.capped).toBe(true);
|
||||
expect(manifest.warnings).toContain('templates_truncated: kept 2 of 3 templates');
|
||||
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q502', 'db5_q503']);
|
||||
});
|
||||
});
|
||||
|
|
@ -1,508 +0,0 @@
|
|||
import { mkdir, readFile, rename, writeFile } from 'node:fs/promises';
|
||||
import { dirname, join } from 'node:path';
|
||||
import { z } from 'zod';
|
||||
import type { SqlAnalysisFingerprintResult, SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import {
|
||||
HISTORIC_SQL_OBJECT_TYPE,
|
||||
HISTORIC_SQL_SOURCE_KEY,
|
||||
historicSqlPullConfigSchema,
|
||||
type HistoricSqlManifest,
|
||||
type HistoricSqlMetadata,
|
||||
type HistoricSqlPullConfig,
|
||||
type HistoricSqlUsage,
|
||||
type KtxPostgresQueryClient,
|
||||
type PostgresPgssAggregateRow,
|
||||
type PostgresPgssReader,
|
||||
type PostgresPgssRow,
|
||||
} from './types.js';
|
||||
|
||||
/** Version tag stored in (and required of) every persisted baseline file. */
const PGSS_BASELINE_VERSION = 1 as const;

/** Cumulative counters recorded for one user of one template. */
const pgssCounterSchema = z.object({
  calls: z.number().int().nonnegative(),
  totalExecTime: z.number().nonnegative(),
  totalRows: z.number().int().nonnegative(),
});

/** One baseline entry: when the template was first observed plus its counters keyed by user id. */
const pgssBaselineTemplateSchema = z.object({
  firstObservedAt: z.string().datetime(),
  perUser: z.record(z.string(), pgssCounterSchema),
});

/** Shape of the baseline file; used to validate it on both read and write. */
const pgssBaselineSchema = z.object({
  version: z.literal(PGSS_BASELINE_VERSION),
  fetchedAt: z.string().datetime(),
  statsResetAt: z.string().datetime().nullable(),
  pgServerVersion: z.string(),
  templates: z.record(z.string(), pgssBaselineTemplateSchema),
});

/** Validated baseline document, inferred from {@link pgssBaselineSchema}. */
export type PgssBaseline = z.infer<typeof pgssBaselineSchema>;
|
||||
|
||||
/** Everything `stagePgStatStatementsTemplates` needs for a single staging run. */
export interface StagePgStatStatementsTemplatesInput {
  /** Directory that receives `manifest.json` and the `templates/<id>/` artifacts. */
  stagedDir: string;
  /** Connection identifier copied into the manifest. */
  connectionId: string;
  /** Client handed to `reader.probe` and `reader.readSnapshot`. */
  queryClient: KtxPostgresQueryClient;
  /** Provides the probe result and the snapshot rows. */
  reader: PostgresPgssReader;
  /** Analyzer whose `analyzeForFingerprint` is invoked for each candidate template. */
  sqlAnalysis: SqlAnalysisPort;
  /** Pull configuration; it is re-validated and must use the `postgres` dialect. */
  pullConfig: HistoricSqlPullConfig;
  /** File the previous baseline is read from. */
  baselinePath: string;
  /** Clock override; the current time is used when omitted. */
  now?: Date;
}
|
||||
|
||||
/** Outcome of `stagePgStatStatementsTemplates`. */
export interface StagePgStatStatementsTemplatesResult {
  /** The `baselinePath` that was supplied in the input, echoed back. */
  baselinePath: string;
  /** Baseline rebuilt from the snapshot that was just read. */
  baseline: PgssBaseline;
}
|
||||
|
||||
/** Cumulative counters for one (template, user) pair; same fields as `pgssCounterSchema`. */
interface PgssBaselineCounter {
  /** Cumulative call count. */
  calls: number;
  /** Cumulative execution time. */
  totalExecTime: number;
  /** Cumulative row count. */
  totalRows: number;
}
|
||||
|
||||
/** Per-template accumulator that `aggregatePgssRows` mutates while folding snapshot rows. */
interface PgssAggregateMutable {
  /** Template id in the `db<dbid>_q<queryid>` form. */
  id: string;
  queryid: string;
  dbid: string;
  database: string | null;
  /** Query text taken from the first row seen for this template. */
  query: string;
  /** Sum of the non-negative per-user call deltas. */
  deltaCalls: number;
  /** Sum of the non-negative per-user execution-time deltas. */
  deltaExecTime: number;
  /** Sum of the non-negative per-user row deltas. */
  deltaRows: number;
  /** User names that contributed at least one new call. */
  users: Set<string>;
  /** First-observed timestamp carried over from the baseline, or the fetch time. */
  firstObservedAt: string;
}
|
||||
|
||||
/** An aggregated template paired with its successful SQL analysis. */
interface AnalyzedPgssTemplate {
  /** Delta statistics for the template. */
  aggregate: PostgresPgssAggregateRow;
  /** Fingerprint analysis of the template's query text. */
  analysis: SqlAnalysisFingerprintResult;
}
|
||||
|
||||
/** Baseline used when no usable previous counter exists, so the delta equals the raw counter. */
const ZERO_COUNTER: PgssBaselineCounter = { calls: 0, totalExecTime: 0, totalRows: 0 };
|
||||
|
||||
/** Value passed as `maxTemplates` when reading a snapshot. */
const PGSS_SNAPSHOT_READ_LIMIT = 5000;

/** Matches statements that begin with one of the listed non-query keywords (case-insensitive). */
const PGSS_HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET|BEGIN|COMMIT|ROLLBACK|VACUUM|ANALYZE)\b/i;

/** Matches statements that mention INFORMATION_SCHEMA, pg_catalog., pg_toast. or a pg_stat_ name. */
const PGSS_HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|pg_catalog\.|pg_toast\.|pg_stat_)/i;
|
||||
|
||||
/**
 * Builds the template id for a row.
 *
 * @param row - Anything carrying `dbid` and `queryid`.
 * @returns The id in the form `db<dbid>_q<queryid>`.
 */
function pgssTemplateId(row: Pick<PostgresPgssRow, 'dbid' | 'queryid'>): string {
  const { dbid, queryid } = row;
  return `db${dbid}_q${queryid}`;
}
|
||||
|
||||
/**
 * Resolves the baseline file location for a connection.
 *
 * @param rootDir - Cache root; when absent, `.ktx/cache/historic-sql` under the current working directory is used.
 * @param connectionId - Connection whose baseline is wanted; becomes a directory segment.
 * @returns `<root>/<connectionId>/pgss-baseline.json`.
 */
export function pgssBaselinePath(rootDir: string | undefined, connectionId: string): string {
  const cacheRoot = rootDir ?? join(process.cwd(), '.ktx/cache/historic-sql');
  return join(cacheRoot, connectionId, 'pgss-baseline.json');
}
|
||||
|
||||
/**
 * Loads and validates the baseline stored at `path`.
 *
 * @param path - Baseline file to read.
 * @returns The parsed baseline, or `null` when the failure carries the `ENOENT` code.
 * @throws Any other failure (read error, invalid JSON, schema mismatch) is rethrown unchanged.
 */
export async function readPgssBaseline(path: string): Promise<PgssBaseline | null> {
  try {
    const raw = await readFile(path, 'utf-8');
    return pgssBaselineSchema.parse(JSON.parse(raw));
  } catch (error) {
    const isMissingFile = Boolean(error) && typeof error === 'object' && 'code' in error && error.code === 'ENOENT';
    if (!isMissingFile) {
      throw error;
    }
    return null;
  }
}
|
||||
|
||||
/**
 * Validates `baseline` and stores it at `path`.
 *
 * The document is first written to `<path>.tmp` and then renamed onto `path`, so the final
 * location is only ever replaced by a completely written file.
 *
 * @param path - Destination file; its parent directory is created when missing.
 * @param baseline - Baseline to persist; rejected by schema validation before anything is written.
 */
export async function writePgssBaselineAtomic(path: string, baseline: PgssBaseline): Promise<void> {
  const validated = pgssBaselineSchema.parse(baseline);
  const serialized = `${JSON.stringify(validated, null, 2)}\n`;
  const stagingPath = `${path}.tmp`;

  await mkdir(dirname(path), { recursive: true });
  await writeFile(stagingPath, serialized, 'utf-8');
  await rename(stagingPath, path);
}
|
||||
|
||||
/**
 * Stages one pull of pg_stat_statements data as historic-SQL artifacts under `input.stagedDir`.
 *
 * In order: validates the pull config, probes the server, loads the previous baseline, reads a
 * snapshot, checks whether the baseline must be discarded, converts cumulative counters into
 * deltas, drops hard-skipped statements, analyzes each remaining query, ranks and caps the
 * result, then writes `templates/<id>/{metadata.json,page.md,usage.json}` and `manifest.json`.
 *
 * The next baseline is built and returned; this function does not persist it.
 *
 * @param input - See {@link StagePgStatStatementsTemplatesInput}.
 * @returns The baseline path from the input together with the baseline rebuilt from the snapshot.
 * @throws Error when the validated pull config does not use the `postgres` dialect.
 */
export async function stagePgStatStatementsTemplates(
  input: StagePgStatStatementsTemplatesInput,
): Promise<StagePgStatStatementsTemplatesResult> {
  const config = historicSqlPullConfigSchema.parse(input.pullConfig);
  if (config.dialect !== 'postgres') {
    throw new Error(`stagePgStatStatementsTemplates requires dialect postgres, got ${config.dialect}`);
  }

  const now = input.now ?? new Date();
  const fetchedAt = now.toISOString();
  const probe = await input.reader.probe(input.queryClient);
  // Copy so that warnings added below never mutate the probe result.
  const warnings = [...probe.warnings];
  const baseline = await readPgssBaseline(input.baselinePath);
  const snapshot = await input.reader.readSnapshot(input.queryClient, {
    minCalls: config.minCalls,
    maxTemplates: PGSS_SNAPSHOT_READ_LIMIT,
  });
  if (snapshot.deallocCount !== null && snapshot.deallocCount > 0) {
    warnings.push(
      `pgss_dealloc_count:${snapshot.deallocCount}; pg_stat_statements.max may be too low, causing template eviction churn`,
    );
  }
  const reset = detectBaselineReset({
    baseline,
    snapshotStatsResetAt: snapshot.statsResetAt,
    currentPgServerVersion: probe.pgServerVersion,
  });
  warnings.push(...reset.warnings);

  // A baseline that was discarded by a reset must not influence anything derived below.
  const effectiveBaseline = reset.baselineFirstRun ? null : baseline;

  const aggregates = aggregatePgssRows({
    rows: snapshot.rows,
    baseline,
    baselineFirstRun: reset.baselineFirstRun,
    fetchedAt,
    warnings,
  }).filter((aggregate) => !shouldSkipPgssSql(aggregate.query));

  // Analysis failures are reported as warnings and the template is left out.
  const analyzed: AnalyzedPgssTemplate[] = [];
  for (const aggregate of aggregates) {
    const analysis = await input.sqlAnalysis.analyzeForFingerprint(aggregate.query, 'postgres');
    if (analysis.error || !analysis.fingerprint || !analysis.normalizedSql) {
      warnings.push(`analysis_failed:${aggregate.id}`);
      continue;
    }
    analyzed.push({ aggregate, analysis });
  }

  const selected = selectPgssTemplates(analyzed, config.maxTemplatesPerRun);
  const capped = selected.length < analyzed.length;
  if (capped) {
    warnings.push(`templates_truncated: kept ${selected.length} of ${analyzed.length} templates`);
  }

  await mkdir(input.stagedDir, { recursive: true });
  const templates: HistoricSqlManifest['templates'] = [];
  for (const template of selected) {
    const staged = buildPgssStagedTemplate(template, config, now);
    const basePath = `templates/${staged.metadata.id}`;
    await writeJson(input.stagedDir, `${basePath}/metadata.json`, staged.metadata);
    await writeText(input.stagedDir, `${basePath}/page.md`, staged.pageMarkdown);
    await writeJson(input.stagedDir, `${basePath}/usage.json`, staged.usage);
    templates.push({
      id: staged.metadata.id,
      fingerprint: staged.metadata.properties.fingerprint,
      subClusterId: staged.metadata.properties.sub_cluster_id,
      path: staged.metadata.path,
    });
  }

  await writeJson(input.stagedDir, 'manifest.json', {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: input.connectionId,
    dialect: 'postgres',
    fetchedAt,
    // After a reset the deltas are counted from zero, so the window starts at the stats reset
    // (or now), not at the fetch time of the baseline that was just discarded.
    windowStart: effectiveBaseline?.fetchedAt ?? snapshot.statsResetAt ?? fetchedAt,
    windowEnd: fetchedAt,
    nextSuccessfulCursor: fetchedAt,
    templateCount: selected.length,
    capped,
    warnings,
    degraded: true,
    statsResetAt: snapshot.statsResetAt,
    baselineFirstRun: reset.baselineFirstRun,
    pgServerVersion: probe.pgServerVersion,
    deallocCount: snapshot.deallocCount,
    templates,
  } satisfies HistoricSqlManifest);

  return {
    baselinePath: input.baselinePath,
    baseline: buildNextBaseline({
      rows: snapshot.rows,
      fetchedAt,
      statsResetAt: snapshot.statsResetAt,
      pgServerVersion: probe.pgServerVersion,
      previousBaseline: effectiveBaseline,
    }),
  };
}
|
||||
|
||||
/**
 * Decides whether the stored baseline can still be used for delta computation.
 *
 * The baseline is treated as absent ("first run") when there is none, when the snapshot's
 * stats-reset instant is later than the baseline's, or when the PostgreSQL major version changed.
 *
 * @param input.baseline - Previously stored baseline, or `null`.
 * @param input.snapshotStatsResetAt - Stats-reset timestamp reported with the current snapshot.
 * @param input.currentPgServerVersion - Server version string reported by the probe.
 * @returns `baselineFirstRun` plus one warning per reason that triggered it.
 */
function detectBaselineReset(input: {
  baseline: PgssBaseline | null;
  snapshotStatsResetAt: string | null;
  currentPgServerVersion: string;
}): { baselineFirstRun: boolean; warnings: string[] } {
  const { baseline, snapshotStatsResetAt, currentPgServerVersion } = input;
  if (!baseline) {
    return { baselineFirstRun: true, warnings: ['baseline_first_run:no_previous_pgss_baseline'] };
  }

  const warnings: string[] = [];
  if (
    baseline.statsResetAt &&
    snapshotStatsResetAt &&
    isEarlierInstant(baseline.statsResetAt, snapshotStatsResetAt)
  ) {
    warnings.push(`baseline_reset:stats_reset advanced from ${baseline.statsResetAt} to ${snapshotStatsResetAt}`);
  }

  const previousMajor = postgresMajor(baseline.pgServerVersion);
  const currentMajor = postgresMajor(currentPgServerVersion);
  if (previousMajor && currentMajor && previousMajor !== currentMajor) {
    warnings.push(`baseline_reset:pg_server_major changed from ${previousMajor} to ${currentMajor}`);
  }

  return { baselineFirstRun: warnings.length > 0, warnings };
}

/**
 * Reports whether timestamp `left` denotes an earlier instant than `right`.
 *
 * Parsed instants are compared so that the same moment written with a different fractional-second
 * precision (`…00.000Z` vs `…00Z`) is not mistaken for an advancement. When either value cannot
 * be parsed, plain string ordering is used as a fallback.
 */
function isEarlierInstant(left: string, right: string): boolean {
  const leftMs = Date.parse(left);
  const rightMs = Date.parse(right);
  if (Number.isNaN(leftMs) || Number.isNaN(rightMs)) {
    return left < right;
  }
  return leftMs < rightMs;
}

/**
 * Extracts the major version number from a server version string.
 *
 * @param version - Either a banner containing `PostgreSQL <n>` or a bare `<n>[.…]` version.
 * @returns The major version digits, or `null` when neither form matches.
 */
function postgresMajor(version: string): string | null {
  return version.match(/PostgreSQL\s+(\d+)/i)?.[1] ?? version.match(/^(\d+)(?:\.|$)/)?.[1] ?? null;
}
|
||||
|
||||
/**
 * Folds per-user snapshot rows into one delta aggregate per template.
 *
 * For every row the previous counter is looked up in the baseline (ignored entirely on a first
 * run) and subtracted. Rows whose call count did not move are ignored unless this is a first run.
 * Negative deltas are clamped to zero, and only templates with at least one new call are returned.
 *
 * @param input.rows - Snapshot rows, one per (template, user).
 * @param input.baseline - Previous baseline, if any.
 * @param input.baselineFirstRun - When true, every counter is measured from zero.
 * @param input.fetchedAt - Timestamp used as `firstObservedAt` for templates absent from the baseline.
 * @param input.warnings - Receives scoped-reset warnings; mutated in place.
 * @returns Aggregates in the order their template was first encountered.
 */
function aggregatePgssRows(input: {
  rows: PostgresPgssRow[];
  baseline: PgssBaseline | null;
  baselineFirstRun: boolean;
  fetchedAt: string;
  warnings: string[];
}): PostgresPgssAggregateRow[] {
  const { rows, baseline, baselineFirstRun, fetchedAt, warnings } = input;
  const byTemplate = new Map<string, PgssAggregateMutable>();

  for (const row of rows) {
    const templateId = pgssTemplateId(row);
    const priorTemplate = baselineFirstRun ? undefined : baseline?.templates[templateId];
    const prior = scopedCounterBaseline(row, priorTemplate?.perUser[row.userid], baselineFirstRun, warnings);

    const callsDelta = row.calls - prior.calls;
    if (!baselineFirstRun && callsDelta === 0) {
      continue;
    }
    const execTimeDelta = row.totalExecTime - prior.totalExecTime;
    const rowsDelta = row.totalRows - prior.totalRows;

    let bucket = byTemplate.get(templateId);
    if (bucket === undefined) {
      bucket = {
        id: templateId,
        queryid: row.queryid,
        dbid: row.dbid,
        database: row.database,
        query: row.query,
        deltaCalls: 0,
        deltaExecTime: 0,
        deltaRows: 0,
        users: new Set<string>(),
        firstObservedAt: priorTemplate?.firstObservedAt ?? fetchedAt,
      };
      byTemplate.set(templateId, bucket);
    }

    bucket.deltaCalls += Math.max(0, callsDelta);
    bucket.deltaExecTime += Math.max(0, execTimeDelta);
    bucket.deltaRows += Math.max(0, rowsDelta);
    // Only users who actually ran the query since the baseline count as active.
    if (callsDelta > 0) {
      bucket.users.add(row.username ?? 'unknown');
    }
  }

  const result: PostgresPgssAggregateRow[] = [];
  for (const bucket of byTemplate.values()) {
    if (bucket.deltaCalls <= 0) {
      continue;
    }
    result.push({
      id: bucket.id,
      queryid: bucket.queryid,
      dbid: bucket.dbid,
      database: bucket.database,
      query: bucket.query,
      deltaCalls: bucket.deltaCalls,
      deltaExecTime: bucket.deltaExecTime,
      deltaRows: bucket.deltaRows,
      meanExecTime: bucket.deltaExecTime / Math.max(bucket.deltaCalls, 1),
      distinctUsersDelta: bucket.users.size,
      users: [...bucket.users].sort(),
      firstObservedAt: bucket.firstObservedAt,
    });
  }
  return result;
}
|
||||
|
||||
/**
 * Picks the counter a row's cumulative values should be measured against.
 *
 * @param row - Current snapshot row.
 * @param baselineCounter - Counter stored for the same template and user, if any.
 * @param baselineFirstRun - When true the stored counter is ignored.
 * @param warnings - Receives a `scoped_reset:` entry when the stored counter exceeds the row.
 * @returns The stored counter, or `ZERO_COUNTER` when it is missing, ignored, or has regressed.
 */
function scopedCounterBaseline(
  row: PostgresPgssRow,
  baselineCounter: PgssBaselineCounter | undefined,
  baselineFirstRun: boolean,
  warnings: string[],
): PgssBaselineCounter {
  if (baselineFirstRun || !baselineCounter) {
    return ZERO_COUNTER;
  }

  // Any counter lower than what was recorded means this entry restarted from zero.
  const regressed =
    row.calls < baselineCounter.calls ||
    row.totalExecTime < baselineCounter.totalExecTime ||
    row.totalRows < baselineCounter.totalRows;
  if (!regressed) {
    return baselineCounter;
  }

  warnings.push(`scoped_reset:dbid=${row.dbid} queryid=${row.queryid} userid=${row.userid}`);
  return ZERO_COUNTER;
}
|
||||
|
||||
/**
 * Tells whether a statement is excluded from staging.
 *
 * @param sql - Query text of the template.
 * @returns True when either hard-skip pattern (leading keyword or referenced name) matches.
 */
function shouldSkipPgssSql(sql: string): boolean {
  const skipPatterns = [PGSS_HARD_SKIP_PREFIX_RE, PGSS_HARD_SKIP_TABLE_RE];
  return skipPatterns.some((pattern) => pattern.test(sql));
}
|
||||
|
||||
/**
 * Ranks templates and keeps at most `maxTemplatesPerRun` of them.
 *
 * The score is `users.length * log1p(deltaCalls)`; higher scores come first and ties are broken
 * by comparing template ids with `localeCompare`.
 *
 * @param templates - Analyzed candidates; the array itself is not reordered.
 * @param maxTemplatesPerRun - Upper bound on the number of templates returned.
 * @returns The best-ranked templates, highest score first.
 */
function selectPgssTemplates(templates: AnalyzedPgssTemplate[], maxTemplatesPerRun: number): AnalyzedPgssTemplate[] {
  const ranked = templates.map((template) => {
    const { users, deltaCalls } = template.aggregate;
    return { template, score: users.length * Math.log1p(deltaCalls) };
  });

  ranked.sort((left, right) => {
    const scoreGap = right.score - left.score;
    if (scoreGap) {
      return scoreGap;
    }
    return left.template.aggregate.id.localeCompare(right.template.aggregate.id);
  });

  return ranked.slice(0, maxTemplatesPerRun).map(({ template }) => template);
}
|
||||
|
||||
/**
 * Produces the three staged artifacts (metadata, markdown page, usage) for one template.
 *
 * @param template - Aggregated statistics together with the SQL analysis.
 * @param config - Pull configuration; its service-account patterns feed the triage signals.
 * @param now - Staging time, reported as the template's last-seen timestamp.
 * @returns Metadata, page markdown and usage statistics for the template.
 */
function buildPgssStagedTemplate(
  template: AnalyzedPgssTemplate,
  config: HistoricSqlPullConfig,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const { aggregate, analysis } = template;
  const id = aggregate.id;
  const lastSeen = now.toISOString();
  const tablesTouched = [...analysis.tablesTouched].sort();
  // The title names the alphabetically first table, or a generic label when none was found.
  const titleTable = tablesTouched[0] ?? 'query';

  const triageSignals = buildPgssTriageSignals({
    executions: aggregate.deltaCalls,
    distinctUsers: aggregate.distinctUsersDelta,
    firstSeen: aggregate.firstObservedAt,
    lastSeen,
    meanRuntimeMs: aggregate.meanExecTime,
    serviceAccountOnly: isServiceAccountOnly(aggregate.users, config.serviceAccountUserPatterns),
    now,
  });

  const metadata: HistoricSqlMetadata = {
    id,
    title: `postgres · ${titleTable} [${id.slice(0, 12)}]`,
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint: analysis.fingerprint,
      sub_cluster_id: null,
      dialect: 'postgres',
      tables_touched: tablesTouched,
      literal_slots: [],
      triage_signals: triageSignals,
    },
  };

  const usage: HistoricSqlUsage = {
    stats: {
      executions: aggregate.deltaCalls,
      distinct_users: aggregate.distinctUsersDelta,
      first_seen: aggregate.firstObservedAt,
      last_seen: lastSeen,
      p50_runtime_ms: null,
      p95_runtime_ms: null,
      mean_runtime_ms: aggregate.meanExecTime,
      error_rate: 0,
      rows_produced: aggregate.deltaRows,
    },
    literal_slots: [],
    samples: [],
  };

  const pageMarkdown = renderTemplatePage(id, analysis.normalizedSql, tablesTouched);
  return { metadata, pageMarkdown, usage };
}
|
||||
|
||||
/**
 * Converts raw template statistics into coarse string buckets.
 *
 * Executions: below 3 is `low`, below 50 is `mid`, otherwise `high`. Distinct users: up to 1 is
 * `solo`, up to 5 is `team`, otherwise `broad`. The error-rate bucket is always `ok`.
 * `firstSeen` is accepted but not used.
 *
 * @returns A flat record of bucket names keyed by signal.
 */
function buildPgssTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  firstSeen: string;
  lastSeen: string;
  meanRuntimeMs: number;
  serviceAccountOnly: boolean;
  now: Date;
}): Record<string, string> {
  const { executions, distinctUsers, lastSeen, meanRuntimeMs, serviceAccountOnly, now } = input;

  let executionsBucket = 'high';
  if (executions < 3) {
    executionsBucket = 'low';
  } else if (executions < 50) {
    executionsBucket = 'mid';
  }

  let distinctUsersBucket = 'broad';
  if (distinctUsers <= 1) {
    distinctUsersBucket = 'solo';
  } else if (distinctUsers <= 5) {
    distinctUsersBucket = 'team';
  }

  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: distinctUsersBucket,
    error_rate_bucket: 'ok',
    recency_bucket: recencyBucket(lastSeen, now),
    service_account_only: String(serviceAccountOnly),
    runtime_bucket: runtimeBucket(meanRuntimeMs),
  };
}
|
||||
|
||||
/**
 * Classifies a mean runtime.
 *
 * @param meanRuntimeMs - Mean runtime in milliseconds.
 * @returns `fast` below 100, `moderate` below 1000, otherwise `slow`.
 */
function runtimeBucket(meanRuntimeMs: number): string {
  return meanRuntimeMs < 100 ? 'fast' : meanRuntimeMs < 1000 ? 'moderate' : 'slow';
}
|
||||
|
||||
/**
 * Classifies how long ago a template was last seen.
 *
 * @param lastSeen - Timestamp string of the last observation.
 * @param now - Reference time; timestamps after it count as age zero.
 * @returns `active` up to 14 days, `warm` up to 60 days, otherwise `cold`.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const msPerDay = 86400000;
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / msPerDay);
  return ageDays <= 14 ? 'active' : ageDays <= 60 ? 'warm' : 'cold';
}
|
||||
|
||||
/**
 * Tells whether every user of a template matches a service-account pattern.
 *
 * @param users - User names that ran the template.
 * @param patterns - Regular-expression sources; a leading `(?i)` requests case-insensitive matching.
 * @returns False when either list is empty; otherwise true only if each user matches some pattern.
 * @throws SyntaxError when a pattern is not a valid regular expression.
 */
function isServiceAccountOnly(users: string[], patterns: string[]): boolean {
  if (users.length === 0 || patterns.length === 0) {
    return false;
  }
  const regexes = patterns.map(compileUserPattern);
  return users.every((user) => regexes.some((regex) => regex.test(user)));
}

/**
 * Compiles one configured pattern.
 *
 * JavaScript rejects the `(?i)` inline flag that other regex dialects accept, so a leading
 * `(?i)` is translated into the `i` flag instead of failing the whole run.
 */
function compileUserPattern(pattern: string): RegExp {
  const inlineIgnoreCase = '(?i)';
  return pattern.startsWith(inlineIgnoreCase)
    ? new RegExp(pattern.slice(inlineIgnoreCase.length), 'i')
    : new RegExp(pattern);
}
|
||||
|
||||
/**
 * Renders the markdown page for a template.
 *
 * @param id - Template id, used as the page heading.
 * @param normalizedSql - SQL placed inside a fenced `sql` block.
 * @param tablesTouched - Table names, emitted as one bullet each in the given order.
 * @returns The page text, ending with a newline.
 */
function renderTemplatePage(id: string, normalizedSql: string, tablesTouched: string[]): string {
  const lines: string[] = [`# ${id}`, '', '## Normalized SQL', '```sql', normalizedSql, '```', '', '## Tables touched'];
  for (const table of tablesTouched) {
    lines.push(`- ${table}`);
  }
  // The trailing empty entry yields the final newline after joining.
  lines.push('');
  return lines.join('\n');
}
|
||||
|
||||
/**
 * Builds the baseline to store after this run from the raw snapshot rows.
 *
 * Each row contributes its cumulative counters under its template id and user id. A template's
 * `firstObservedAt` is carried over from the previous baseline when present, else set to
 * `fetchedAt`. Only templates present in `rows` appear in the result.
 *
 * @param input.rows - Snapshot rows, one per (template, user).
 * @param input.fetchedAt - Timestamp of this run.
 * @param input.statsResetAt - Stats-reset timestamp reported with the snapshot.
 * @param input.pgServerVersion - Server version string to record.
 * @param input.previousBaseline - Baseline to inherit first-observed timestamps from, or `null`.
 * @returns A baseline document tagged with the current baseline version.
 */
function buildNextBaseline(input: {
  rows: PostgresPgssRow[];
  fetchedAt: string;
  statsResetAt: string | null;
  pgServerVersion: string;
  previousBaseline: PgssBaseline | null;
}): PgssBaseline {
  const { rows, fetchedAt, statsResetAt, pgServerVersion, previousBaseline } = input;
  const templates: PgssBaseline['templates'] = {};

  for (const row of rows) {
    const templateId = pgssTemplateId(row);
    let entry = templates[templateId];
    if (!entry) {
      const inherited = previousBaseline?.templates[templateId];
      entry = { firstObservedAt: inherited?.firstObservedAt ?? fetchedAt, perUser: {} };
      templates[templateId] = entry;
    }
    const { calls, totalExecTime, totalRows } = row;
    entry.perUser[row.userid] = { calls, totalExecTime, totalRows };
  }

  return { version: PGSS_BASELINE_VERSION, fetchedAt, statsResetAt, pgServerVersion, templates };
}
|
||||
|
||||
/**
 * Writes `value` as two-space-indented JSON followed by a newline.
 *
 * @param root - Base directory.
 * @param relPath - Path of the file relative to `root`.
 * @param value - Data to serialize.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  await writeText(root, relPath, serialized);
}
|
||||
|
||||
/**
 * Writes UTF-8 text to `relPath` under `root`, creating missing parent directories first.
 *
 * @param root - Base directory.
 * @param relPath - Path of the file relative to `root`.
 * @param value - Text to write.
 */
async function writeText(root: string, relPath: string, value: string): Promise<void> {
  const destination = join(root, relPath);
  const parentDir = dirname(destination);
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, value, 'utf-8');
}
|
||||
|
|
@ -0,0 +1,358 @@
|
|||
import { mkdtemp, readFile, readdir } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it, vi } from 'vitest';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js';
|
||||
import type { AggregatedTemplate, HistoricSqlReader } from './types.js';
|
||||
|
||||
/** Creates a fresh directory under the OS temp dir, prefixed `historic-sql-unified-stage-`, and returns its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-unified-stage-');
  return mkdtemp(prefix);
}
|
||||
|
||||
/**
 * Reads and parses a JSON file located at `relPath` under `root`.
 *
 * The result is cast to `T` without any runtime validation.
 */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const contents = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(contents) as T;
}
|
||||
|
||||
/**
 * Test fixture factory for an aggregated template.
 *
 * `templateId` and `canonicalSql` are mandatory; `dialect`, `stats` and `topUsers` fall back to
 * fixed defaults (postgres, 42 executions by 3 users, a single `analyst` top user).
 */
function aggregate(overrides: Partial<AggregatedTemplate> & { templateId: string; canonicalSql: string }): AggregatedTemplate {
  const { templateId, canonicalSql, dialect, stats, topUsers } = overrides;
  return {
    templateId,
    canonicalSql,
    dialect: dialect ?? 'postgres',
    stats: stats ?? {
      executions: 42,
      distinctUsers: 3,
      firstSeen: '2026-05-01T00:00:00.000Z',
      lastSeen: '2026-05-11T00:00:00.000Z',
      p50RuntimeMs: 20,
      p95RuntimeMs: 80,
      errorRate: 0,
      rowsProduced: 100,
    },
    topUsers: topUsers ?? [{ user: 'analyst', executions: 40 }],
  };
}
|
||||
|
||||
describe('stageHistoricSqlAggregatedSnapshot', () => {
|
||||
it('batch parses templates and writes stable table and patterns artifacts', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
const reader: HistoricSqlReader = {
|
||||
async probe() {
|
||||
return { warnings: ['pg_stat_statements.track is none; aggregation still proceeds'], info: [] };
|
||||
},
|
||||
async *fetchAggregated() {
|
||||
yield aggregate({
|
||||
templateId: 'orders-by-status',
|
||||
canonicalSql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status',
|
||||
});
|
||||
yield aggregate({
|
||||
templateId: 'service-account-only',
|
||||
canonicalSql: 'select * from public.orders where id = $1',
|
||||
stats: {
|
||||
executions: 20,
|
||||
distinctUsers: 1,
|
||||
firstSeen: '2026-05-01T00:00:00.000Z',
|
||||
lastSeen: '2026-05-11T00:00:00.000Z',
|
||||
p50RuntimeMs: 5,
|
||||
p95RuntimeMs: 10,
|
||||
errorRate: 0,
|
||||
rowsProduced: 1,
|
||||
},
|
||||
topUsers: [{ user: 'svc_loader', executions: 20 }],
|
||||
});
|
||||
yield aggregate({
|
||||
templateId: 'bad-parse',
|
||||
canonicalSql: 'select broken from',
|
||||
});
|
||||
},
|
||||
};
|
||||
const sqlAnalysis: SqlAnalysisPort = {
|
||||
analyzeForFingerprint: vi.fn(),
|
||||
analyzeBatch: vi.fn(async () => new Map([
|
||||
[
|
||||
'orders-by-status',
|
||||
{
|
||||
tablesTouched: ['public.orders', 'public.customers'],
|
||||
columnsByClause: {
|
||||
select: ['status'],
|
||||
where: ['created_at'],
|
||||
join: ['customer_id'],
|
||||
groupBy: ['status'],
|
||||
},
|
||||
},
|
||||
],
|
||||
['bad-parse', { tablesTouched: [], columnsByClause: {}, error: 'parse failed' }],
|
||||
])),
|
||||
};
|
||||
|
||||
await stageHistoricSqlAggregatedSnapshot({
|
||||
stagedDir,
|
||||
connectionId: 'warehouse',
|
||||
queryClient: {},
|
||||
reader,
|
||||
sqlAnalysis,
|
||||
pullConfig: {
|
||||
dialect: 'postgres',
|
||||
filters: {
|
||||
serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' },
|
||||
},
|
||||
},
|
||||
now: new Date('2026-05-11T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1);
|
||||
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith(
|
||||
[
|
||||
{
|
||||
id: 'orders-by-status',
|
||||
sql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status',
|
||||
},
|
||||
{ id: 'bad-parse', sql: 'select broken from' },
|
||||
],
|
||||
'postgres',
|
||||
);
|
||||
|
||||
expect(await readdir(join(stagedDir, 'tables'))).toEqual(['public.customers.json', 'public.orders.json']);
|
||||
|
||||
const manifest = await readJson<Record<string, unknown>>(stagedDir, 'manifest.json');
|
||||
expect(manifest).toMatchObject({
|
||||
source: 'historic-sql',
|
||||
connectionId: 'warehouse',
|
||||
dialect: 'postgres',
|
||||
snapshotRowCount: 3,
|
||||
touchedTableCount: 2,
|
||||
parseFailures: 1,
|
||||
warnings: ['parse_failed:bad-parse'],
|
||||
probeWarnings: ['pg_stat_statements.track is none; aggregation still proceeds'],
|
||||
staleArchiveAfterDays: 90,
|
||||
});
|
||||
|
||||
const orders = await readJson<Record<string, any>>(stagedDir, 'tables/public.orders.json');
|
||||
expect(orders).toMatchObject({
|
||||
table: 'public.orders',
|
||||
stats: {
|
||||
executionsBucket: '10-100',
|
||||
distinctUsersBucket: '2-5',
|
||||
errorRateBucket: 'none',
|
||||
p95RuntimeBucket: '<100ms',
|
||||
recencyBucket: 'current',
|
||||
},
|
||||
columnsByClause: {
|
||||
select: [['status', 'high']],
|
||||
where: [['created_at', 'high']],
|
||||
join: [['customer_id', 'high']],
|
||||
groupBy: [['status', 'high']],
|
||||
},
|
||||
observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }],
|
||||
topTemplates: [
|
||||
{
|
||||
id: 'orders-by-status',
|
||||
topUsers: [{ user: 'analyst' }],
|
||||
},
|
||||
],
|
||||
});
|
||||
expect(orders.topTemplates[0].canonicalSql).toContain('group by o.status');
|
||||
|
||||
const patterns = await readJson<Record<string, any>>(stagedDir, 'patterns-input.json');
|
||||
expect(patterns.templates).toEqual([
|
||||
{
|
||||
id: 'orders-by-status',
|
||||
canonicalSql: expect.stringContaining('public.orders'),
|
||||
tablesTouched: ['public.customers', 'public.orders'],
|
||||
executionsBucket: '10-100',
|
||||
distinctUsersBucket: '2-5',
|
||||
dialect: 'postgres',
|
||||
},
|
||||
]);
|
||||
});
|
||||
|
||||
it('redacts configured SQL substrings in staged artifacts while analyzing original SQL', async () => {
  // Arrange: one template whose SQL embeds two secrets. The second secret is
  // mixed-case and is targeted by a pattern carrying a leading `(?i)`.
  const stagedDir = await tempDir();
  const templateId = 'api-events-with-secret';
  const secrets = ['sk_live_abc123', 'Secret_Token_9f'];
  const originalSql =
    "select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'";

  const reader: HistoricSqlReader = {
    async probe() {
      return { warnings: [], info: [] };
    },
    async *fetchAggregated() {
      yield aggregate({
        templateId,
        canonicalSql: originalSql,
        stats: {
          executions: 15,
          distinctUsers: 2,
          firstSeen: '2026-05-01T00:00:00.000Z',
          lastSeen: '2026-05-11T00:00:00.000Z',
          p50RuntimeMs: 12,
          p95RuntimeMs: 25,
          errorRate: 0,
          rowsProduced: 15,
        },
      });
    },
  };

  const sqlAnalysis: SqlAnalysisPort = {
    analyzeForFingerprint: vi.fn(),
    analyzeBatch: vi.fn(async () => new Map([
      [
        templateId,
        {
          tablesTouched: ['public.api_events'],
          columnsByClause: {
            select: [],
            where: ['api_key', 'note'],
            join: [],
            groupBy: [],
          },
        },
      ],
    ])),
  };

  // Act
  await stageHistoricSqlAggregatedSnapshot({
    stagedDir,
    connectionId: 'warehouse',
    queryClient: {},
    reader,
    sqlAnalysis,
    pullConfig: {
      dialect: 'postgres',
      redactionPatterns: ['sk_live_[A-Za-z0-9]+', '(?i)secret_token_[a-z0-9]+'],
    },
    now: new Date('2026-05-11T12:00:00.000Z'),
  });

  // Assert: the analyzer must receive the SQL exactly as the reader produced it.
  expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith(
    [{ id: templateId, sql: originalSql }],
    'postgres',
  );

  // Assert: neither staged artifact leaks a secret, and both carry the marker.
  const tableJson = await readFile(join(stagedDir, 'tables/public.api_events.json'), 'utf-8');
  const patternsJson = await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8');
  const stagedArtifacts = [tableJson, patternsJson];
  for (const artifact of stagedArtifacts) {
    for (const secret of secrets) {
      expect(artifact).not.toContain(secret);
    }
  }
  for (const artifact of stagedArtifacts) {
    expect(artifact).toContain('[REDACTED]');
  }
});
|
||||
|
||||
it('preserves full patterns audit input and writes bounded cross-table pattern shards', async () => {
  const stagedDir = await tempDir();
  // ~8 KB literal payload so each join template is individually large.
  const largeSql = `select * from public.orders o join public.customers c on c.id = o.customer_id where payload = '${'x'.repeat(8000)}'`;

  // All templates share the same observation window and a zero error rate.
  const statsFor = (
    executions: number,
    distinctUsers: number,
    p50RuntimeMs: number,
    p95RuntimeMs: number,
    rowsProduced: number,
  ) => ({
    executions,
    distinctUsers,
    firstSeen: '2026-05-01T00:00:00.000Z',
    lastSeen: '2026-05-11T00:00:00.000Z',
    p50RuntimeMs,
    p95RuntimeMs,
    errorRate: 0,
    rowsProduced,
  });

  // Builds one fake analysis result; fresh arrays on every call.
  const analysisFor = (tablesTouched: string[], where: string[], join: string[]) => ({
    tablesTouched,
    columnsByClause: {
      select: [],
      where,
      join,
      groupBy: [],
    },
  });

  const reader: HistoricSqlReader = {
    async probe() {
      return { warnings: [], info: [] };
    },
    async *fetchAggregated() {
      yield aggregate({
        templateId: 'orders-customers-a',
        canonicalSql: largeSql,
        stats: statsFor(25, 4, 15, 90, 250),
      });
      yield aggregate({
        templateId: 'orders-customers-b',
        canonicalSql: largeSql.replace('payload', 'payload_b'),
        stats: statsFor(22, 3, 20, 95, 220),
      });
      yield aggregate({
        templateId: 'orders-single-table',
        canonicalSql: 'select count(*) from public.orders',
        stats: statsFor(30, 2, 10, 20, 30),
      });
    },
  };

  const sqlAnalysis: SqlAnalysisPort = {
    analyzeForFingerprint: vi.fn(),
    analyzeBatch: vi.fn(async () => new Map([
      [
        'orders-customers-a',
        analysisFor(['public.orders', 'public.customers'], ['payload'], ['customer_id', 'id']),
      ],
      [
        'orders-customers-b',
        analysisFor(['public.orders', 'public.customers'], ['payload_b'], ['customer_id', 'id']),
      ],
      [
        'orders-single-table',
        analysisFor(['public.orders'], [], []),
      ],
    ])),
  };

  await stageHistoricSqlAggregatedSnapshot({
    stagedDir,
    connectionId: 'warehouse',
    queryClient: {},
    reader,
    sqlAnalysis,
    pullConfig: { dialect: 'postgres' },
    now: new Date('2026-05-11T12:00:00.000Z'),
  });

  const templateIds = (doc: Record<string, any>): string[] => doc.templates.map((entry: any) => entry.id);

  // The audit file keeps every template, including the single-table one.
  const audit = await readJson<Record<string, any>>(stagedDir, 'patterns-input.json');
  expect(templateIds(audit)).toEqual([
    'orders-customers-a',
    'orders-customers-b',
    'orders-single-table',
  ]);

  // The first shard only carries the two cross-table templates.
  const firstShard = await readJson<Record<string, any>>(stagedDir, 'patterns-input/part-0001.json');
  expect(templateIds(firstShard)).toEqual(['orders-customers-a', 'orders-customers-b']);
  expect(firstShard.templates.some((entry: any) => entry.id === 'orders-single-table')).toBe(false);

  const manifest = await readJson<Record<string, any>>(stagedDir, 'manifest.json');
  expect(manifest.warnings).toEqual([]);
});
|
||||
});
|
||||
|
|
@ -0,0 +1,308 @@
|
|||
import { mkdir, writeFile } from 'node:fs/promises';
|
||||
import { dirname, join } from 'node:path';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import {
|
||||
bucketDistinctUsers,
|
||||
bucketErrorRate,
|
||||
bucketExecutions,
|
||||
bucketFrequency,
|
||||
bucketP95Runtime,
|
||||
bucketRecency,
|
||||
} from './buckets.js';
|
||||
import { splitHistoricSqlPatternInputs } from './pattern-inputs.js';
|
||||
import {
|
||||
compileHistoricSqlRedactionPatterns,
|
||||
redactHistoricSqlText,
|
||||
type HistoricSqlRedactionPattern,
|
||||
} from './redaction.js';
|
||||
import {
|
||||
HISTORIC_SQL_SOURCE_KEY,
|
||||
aggregatedTemplateSchema,
|
||||
historicSqlUnifiedPullConfigSchema,
|
||||
type AggregatedTemplate,
|
||||
type HistoricSqlReader,
|
||||
type HistoricSqlUnifiedPullConfig,
|
||||
type StagedPatternsInput,
|
||||
type StagedTableInput,
|
||||
} from './types.js';
|
||||
|
||||
/** Arguments accepted by `stageHistoricSqlAggregatedSnapshot`. */
interface StageHistoricSqlAggregatedSnapshotInput {
  /** Directory that receives the staged JSON files; created if missing. */
  stagedDir: string;
  /** Copied verbatim into the manifest. */
  connectionId: string;
  /** Opaque handle forwarded to `reader.probe` and `reader.fetchAggregated`. */
  queryClient: unknown;
  /** Source of aggregated query templates. */
  reader: HistoricSqlReader;
  /** Analyzer used to resolve tables and columns for each template. */
  sqlAnalysis: SqlAnalysisPort;
  /** Raw pull configuration; validated with `historicSqlUnifiedPullConfigSchema`. */
  pullConfig: unknown;
  /** Reference "now" for the fetch window and manifest; defaults to `new Date()`. */
  now?: Date;
}
|
||||
|
||||
/** A template paired with the de-duplicated, sorted results of SQL analysis. */
interface ParsedTemplate {
  /** The aggregated template as it will be staged. */
  template: AggregatedTemplate;
  /** Non-empty table names referenced by the template. */
  tablesTouched: string[];
  /** Column names grouped by the clause they were reported under. */
  columnsByClause: Record<string, string[]>;
}
|
||||
|
||||
/** Mutable per-table rollup built up by `addTemplate`. */
interface TableAccumulator {
  /** Name of the table this accumulator describes. */
  table: string;
  /** Sum of executions over every template added. */
  executions: number;
  /** Largest per-template distinct-user count seen (not a union of users). */
  distinctUsers: number;
  /** Sum of `errorRate * executions`; divided by `executions` when staged. */
  errorRateNumerator: number;
  /** Largest non-null per-template p95 runtime, or null if none was seen. */
  p95RuntimeMs: number | null;
  /** Greatest `lastSeen` string among the added templates. */
  lastSeen: string;
  /** clause -> column -> executions attributed to that column. */
  columnsByClause: Map<string, Map<string, number>>;
  /** other table -> comma-joined sorted join columns -> executions. */
  observedJoins: Map<string, Map<string, number>>;
  /** Every template added so far; trimmed when the table is staged. */
  topTemplates: AggregatedTemplate[];
}
|
||||
|
||||
/**
 * Matches a statement that is nothing but `SELECT 1`, `SELECT NOW()`,
 * `SELECT CURRENT_TIMESTAMP` or `SELECT VERSION()`, with an optional trailing
 * semicolon and surrounding whitespace (case-insensitive).
 */
const TRIVIAL_SQL_RE = /^\s*SELECT\s+(1|NOW\(\)|CURRENT_TIMESTAMP|VERSION\(\))\s*;?\s*$/i;

/** Matches statements that begin with SHOW, DESCRIBE, DESC, EXPLAIN, USE or SET. */
const NOISE_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i;

/**
 * Matches SQL text containing, at a word boundary, `INFORMATION_SCHEMA`,
 * `SNOWFLAKE.ACCOUNT_USAGE`, anything starting with `pg_`, or `system.`.
 * NOTE(review): this runs against the raw SQL text, so string literals and
 * user identifiers that start with `pg_` match as well — confirm that is intended.
 */
const SYSTEM_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i;
|
||||
|
||||
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline) to
 * `relPath` under `root`, creating any missing parent directories first.
 *
 * @param root - Base directory.
 * @param relPath - Path of the file relative to `root`.
 * @param value - Value to serialize with `JSON.stringify`.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  await mkdir(dirname(target), { recursive: true });
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(target, serialized, 'utf-8');
}
|
||||
|
||||
/**
 * Compiled regular expressions keyed by their source text.
 *
 * `compilePatterns` is invoked once per template while filtering, always with
 * the same handful of configured patterns, so compiling each source only once
 * avoids rebuilding identical RegExp objects for every row.
 */
const compiledPatternCache = new Map<string, RegExp>();

/**
 * Compiles each pattern source into a flag-less `RegExp`.
 *
 * Results are memoized by source text; the returned instances are shared
 * between calls. This is safe because they are created without the `g`/`y`
 * flags and therefore carry no `lastIndex` state between `test` calls.
 *
 * @param patterns - Regular-expression sources.
 * @returns One `RegExp` per input pattern, in the same order.
 * @throws SyntaxError if a pattern is not a valid regular expression.
 */
function compilePatterns(patterns: string[]): RegExp[] {
  return patterns.map((pattern) => {
    let compiled = compiledPatternCache.get(pattern);
    if (compiled === undefined) {
      compiled = new RegExp(pattern);
      compiledPatternCache.set(pattern, compiled);
    }
    return compiled;
  });
}
|
||||
|
||||
/**
 * Reports whether a non-empty `value` is matched by at least one pattern.
 *
 * @returns `false` for `null`, the empty string, or when nothing matches.
 */
function matchesAny(value: string | null, patterns: RegExp[]): boolean {
  if (!value) {
    return false;
  }
  for (const candidate of patterns) {
    if (candidate.test(value)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Decides from the SQL text alone whether a template is noise.
 *
 * Noise-prefixed statements and statements referencing system tables are
 * always dropped. Trivial probes are dropped unless
 * `config.filters.dropTrivialProbes` is explicitly `false`.
 */
function shouldDropBySql(sql: string, config: HistoricSqlUnifiedPullConfig): boolean {
  if (NOISE_PREFIX_RE.test(sql)) {
    return true;
  }
  if (SYSTEM_TABLE_RE.test(sql)) {
    return true;
  }
  const dropTrivialProbes = config.filters.dropTrivialProbes !== false;
  return dropTrivialProbes && TRIVIAL_SQL_RE.test(sql);
}
|
||||
|
||||
/**
 * Applies the service-account filter to a template.
 *
 * Nothing is dropped when the filter is absent, in `mark-only` mode, or has no
 * patterns. Otherwise a template counts as "service only" when its `topUsers`
 * have a positive execution total and all of it belongs to users matching a
 * service-account pattern. Mode `exclude` drops service-only templates; any
 * other mode drops the templates that are not service-only.
 */
function shouldDropByUsers(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean {
  const service = config.filters.serviceAccounts;
  if (!service) {
    return false;
  }
  if (service.mode === 'mark-only') {
    return false;
  }
  if (service.patterns.length === 0) {
    return false;
  }

  const patterns = compilePatterns(service.patterns);
  let matchingExecutions = 0;
  let allExecutions = 0;
  for (const entry of template.topUsers) {
    allExecutions += entry.executions;
    if (matchesAny(entry.user, patterns)) {
      matchingExecutions += entry.executions;
    }
  }

  const serviceOnly = allExecutions > 0 && matchingExecutions >= allExecutions;
  if (service.mode === 'exclude') {
    return serviceOnly;
  }
  return !serviceOnly;
}
|
||||
|
||||
/**
 * Drops a template whose error rate is strictly above, and whose execution
 * count is strictly below, the `config.filters.dropFailedBelow` thresholds.
 * Returns `false` when no threshold is configured.
 */
function shouldDropByFailure(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean {
  const threshold = config.filters.dropFailedBelow;
  if (!threshold) {
    return false;
  }
  const { errorRate, executions } = template.stats;
  return errorRate > threshold.errorRate && executions < threshold.executions;
}
|
||||
|
||||
/**
 * Runs the SQL-text, service-account and failure filters in that order and
 * reports whether any of them rejects the template.
 */
function shouldDropTemplate(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean {
  return (
    shouldDropBySql(template.canonicalSql, config) ||
    shouldDropByUsers(template, config) ||
    shouldDropByFailure(template, config)
  );
}
|
||||
|
||||
/**
 * Returns the template with its `canonicalSql` passed through the redactors.
 *
 * With no redactors the input object itself is returned; otherwise a shallow
 * copy is produced and the input is left untouched.
 */
function redactTemplateSql(
  template: AggregatedTemplate,
  redactors: readonly HistoricSqlRedactionPattern[],
): AggregatedTemplate {
  const hasRedactors = redactors.length > 0;
  return hasRedactors
    ? { ...template, canonicalSql: redactHistoricSqlText(template.canonicalSql, redactors) }
    : template;
}
|
||||
|
||||
/** Adds `executions` to the running count of `column` under `clause`. */
function recordColumn(acc: TableAccumulator, clause: string, column: string, executions: number): void {
  let counts = acc.columnsByClause.get(clause);
  if (counts === undefined) {
    counts = new Map<string, number>();
    acc.columnsByClause.set(clause, counts);
  }
  const previous = counts.get(column) ?? 0;
  counts.set(column, previous + executions);
}
|
||||
|
||||
/**
 * Adds `executions` to the join observed between `acc.table` and `otherTable`.
 *
 * The join is keyed by the de-duplicated, sorted column names joined with
 * commas. Nothing is recorded when that key is empty.
 */
function recordJoin(acc: TableAccumulator, otherTable: string, columns: string[], executions: number): void {
  const key = Array.from(new Set(columns)).sort().join(',');
  if (key.length === 0) {
    return;
  }
  const counts = acc.observedJoins.get(otherTable) ?? new Map<string, number>();
  const previous = counts.get(key) ?? 0;
  counts.set(key, previous + executions);
  acc.observedJoins.set(otherTable, counts);
}
|
||||
|
||||
/**
 * Creates an empty accumulator for `table`: zeroed counters, no p95 runtime,
 * `lastSeen` at the Unix epoch, and fresh empty collections.
 */
function accumulatorFor(table: string): TableAccumulator {
  const empty: TableAccumulator = {
    table,
    executions: 0,
    distinctUsers: 0,
    errorRateNumerator: 0,
    p95RuntimeMs: null,
    // Epoch sorts before any real timestamp, so the first template always wins.
    lastSeen: '1970-01-01T00:00:00.000Z',
    columnsByClause: new Map(),
    observedJoins: new Map(),
    topTemplates: [],
  };
  return empty;
}
|
||||
|
||||
/**
 * Folds one parsed template into the accumulator of a table it touches.
 *
 * Updates the execution total, the maximum distinct-user count, the weighted
 * error-rate numerator, the maximum p95 runtime and the latest `lastSeen`,
 * then records column usage per clause and one join entry per other table.
 */
function addTemplate(acc: TableAccumulator, parsed: ParsedTemplate): void {
  const { template, tablesTouched, columnsByClause } = parsed;
  const stats = template.stats;
  const executions = stats.executions;

  acc.executions += executions;
  acc.distinctUsers = Math.max(acc.distinctUsers, stats.distinctUsers);
  acc.errorRateNumerator += stats.errorRate * executions;

  // A null p95 on the template leaves whatever the accumulator already holds.
  if (stats.p95RuntimeMs !== null) {
    acc.p95RuntimeMs =
      acc.p95RuntimeMs === null ? stats.p95RuntimeMs : Math.max(acc.p95RuntimeMs, stats.p95RuntimeMs);
  }

  // Plain string comparison of the two timestamps.
  if (stats.lastSeen > acc.lastSeen) {
    acc.lastSeen = stats.lastSeen;
  }

  for (const [clause, columns] of Object.entries(columnsByClause)) {
    for (const column of columns) {
      recordColumn(acc, clause, column, executions);
    }
  }

  // The template's join columns are attributed to every other table it touches.
  const joinColumns = columnsByClause.join ?? [];
  for (const otherTable of tablesTouched) {
    if (otherTable === acc.table) {
      continue;
    }
    recordJoin(acc, otherTable, joinColumns, executions);
  }

  acc.topTemplates.push(template);
}
|
||||
|
||||
/**
 * Converts a table accumulator into its staged, bucketed representation.
 *
 * Clauses are ordered by name; columns within a clause by descending count and
 * then name. Joins are ordered by other table and then by their column list.
 * Only the five most-executed templates (ties broken by id) are kept, each
 * with at most five user names.
 *
 * @param acc - Accumulated statistics for one table.
 * @param now - Reference time passed to the recency bucket.
 */
function toStagedTable(acc: TableAccumulator, now: Date): StagedTableInput {
  const totalExecutions = acc.executions;
  const errorRate = totalExecutions > 0 ? acc.errorRateNumerator / totalExecutions : 0;

  const rankColumns = (counts: Map<string, number>): Array<[string, string]> =>
    Array.from(counts)
      .sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]))
      .map(([column, count]) => [column, bucketFrequency(count, totalExecutions)] as [string, string]);

  const clausesByName = Array.from(acc.columnsByClause).sort(([left], [right]) => left.localeCompare(right));
  const columnsByClause: Record<string, Array<[string, string]>> = Object.fromEntries(
    clausesByName.map(([clause, counts]) => [clause, rankColumns(counts)]),
  );

  const observedJoins = Array.from(acc.observedJoins).flatMap(([withTable, byColumns]) =>
    Array.from(byColumns, ([columns, count]) => ({
      withTable,
      // The key was built by joining column names with commas.
      on: columns.split(',').filter(Boolean),
      freq: bucketFrequency(count, totalExecutions),
    })),
  );
  observedJoins.sort(
    (left, right) => left.withTable.localeCompare(right.withTable) || left.on.join(',').localeCompare(right.on.join(',')),
  );

  const byPopularity = (left: AggregatedTemplate, right: AggregatedTemplate): number =>
    right.stats.executions - left.stats.executions || left.templateId.localeCompare(right.templateId);
  const topTemplates = acc.topTemplates
    .slice()
    .sort(byPopularity)
    .slice(0, 5)
    .map((template) => ({
      id: template.templateId,
      canonicalSql: template.canonicalSql,
      // Only the user name is staged, not the per-user execution count.
      topUsers: template.topUsers.slice(0, 5).map((entry) => ({ user: entry.user })),
    }));

  return {
    table: acc.table,
    stats: {
      executionsBucket: bucketExecutions(totalExecutions),
      distinctUsersBucket: bucketDistinctUsers(acc.distinctUsers),
      errorRateBucket: bucketErrorRate(errorRate),
      p95RuntimeBucket: bucketP95Runtime(acc.p95RuntimeMs),
      recencyBucket: bucketRecency(acc.lastSeen, now),
    },
    columnsByClause,
    observedJoins,
    topTemplates,
  };
}
|
||||
|
||||
/**
 * Builds the patterns input: one entry per parsed template, with bucketed
 * statistics and sorted table names, ordered by template id.
 */
function toPatternsInput(parsedTemplates: ParsedTemplate[]): StagedPatternsInput {
  const templates = parsedTemplates.map((parsed) => {
    const { template } = parsed;
    return {
      id: template.templateId,
      canonicalSql: template.canonicalSql,
      // Copy before sorting so the parsed template is not mutated.
      tablesTouched: parsed.tablesTouched.slice().sort(),
      executionsBucket: bucketExecutions(template.stats.executions),
      distinctUsersBucket: bucketDistinctUsers(template.stats.distinctUsers),
      dialect: template.dialect,
    };
  });
  templates.sort((left, right) => left.id.localeCompare(right.id));
  return { templates };
}
|
||||
|
||||
/**
 * Pulls aggregated query templates from the reader and stages them on disk.
 *
 * Steps, in order:
 *  1. Validate `pullConfig`, compile redaction patterns and derive the fetch
 *     window (`windowDays` back from `now`).
 *  2. Probe the reader, then stream its aggregated rows, validating each and
 *     discarding the ones rejected by `shouldDropTemplate`.
 *  3. Analyze the surviving templates in one batch using their original SQL.
 *     Templates without a result, or with an error, are recorded as
 *     `parse_failed:<id>` warnings; templates touching no table are skipped.
 *  4. Redact the SQL of the remaining templates and roll them up per table.
 *  5. Write `tables/<table>.json`, `patterns-input.json`, the pattern shards
 *     and finally `manifest.json` under `stagedDir`.
 *
 * @param input - Staging directory, connection id, reader, analyzer and config.
 */
export async function stageHistoricSqlAggregatedSnapshot(input: StageHistoricSqlAggregatedSnapshotInput): Promise<void> {
  const { stagedDir, queryClient, reader, sqlAnalysis } = input;
  const config = historicSqlUnifiedPullConfigSchema.parse(input.pullConfig);
  const redactors = compileHistoricSqlRedactionPatterns(config.redactionPatterns);
  const now = input.now ?? new Date();
  const millisPerDay = 24 * 60 * 60 * 1000;
  const windowStart = new Date(now.getTime() - config.windowDays * millisPerDay);
  const probe = await reader.probe(queryClient);

  // Collect the rows that survive filtering; count every row that was fetched.
  const snapshot: AggregatedTemplate[] = [];
  let snapshotRowCount = 0;
  for await (const row of reader.fetchAggregated(queryClient, { start: windowStart, end: now }, config)) {
    snapshotRowCount += 1;
    const template = aggregatedTemplateSchema.parse(row);
    if (shouldDropTemplate(template, config)) {
      continue;
    }
    snapshot.push(template);
  }

  // Analysis sees the unredacted SQL; redaction is applied only to staged copies.
  const analysisRequests = snapshot.map((template) => ({ id: template.templateId, sql: template.canonicalSql }));
  const analysis = await sqlAnalysis.analyzeBatch(analysisRequests, config.dialect);

  const warnings: string[] = [];
  const parsedTemplates: ParsedTemplate[] = [];
  for (const template of snapshot) {
    const result = analysis.get(template.templateId);
    if (!result || result.error) {
      warnings.push(`parse_failed:${template.templateId}`);
      continue;
    }
    const tablesTouched = [...new Set(result.tablesTouched)].filter((table) => table.length > 0).sort();
    if (tablesTouched.length === 0) {
      continue;
    }
    const columnsByClause = Object.fromEntries(
      Object.entries(result.columnsByClause).map(([clause, columns]) => [clause, [...new Set(columns)].sort()]),
    );
    parsedTemplates.push({
      template: redactTemplateSql(template, redactors),
      tablesTouched,
      columnsByClause,
    });
  }

  // Each template contributes to the accumulator of every table it touches.
  const byTable = new Map<string, TableAccumulator>();
  for (const parsed of parsedTemplates) {
    for (const table of parsed.tablesTouched) {
      let acc = byTable.get(table);
      if (acc === undefined) {
        acc = accumulatorFor(table);
        byTable.set(table, acc);
      }
      addTemplate(acc, parsed);
    }
  }

  await mkdir(stagedDir, { recursive: true });
  const tablesInOrder = [...byTable.entries()].sort(([left], [right]) => left.localeCompare(right));
  for (const [table, acc] of tablesInOrder) {
    await writeJson(stagedDir, `tables/${table}.json`, toStagedTable(acc, now));
  }

  const patternInputSplit = splitHistoricSqlPatternInputs(toPatternsInput(parsedTemplates));
  const allWarnings = [...warnings, ...patternInputSplit.warnings];
  await writeJson(stagedDir, 'patterns-input.json', patternInputSplit.auditInput);
  for (const shard of patternInputSplit.shards) {
    await writeJson(stagedDir, shard.path, shard.input);
  }

  const parseFailures = allWarnings.filter((warning) => warning.startsWith('parse_failed:')).length;
  await writeJson(stagedDir, 'manifest.json', {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: input.connectionId,
    dialect: config.dialect,
    fetchedAt: now.toISOString(),
    windowStart: windowStart.toISOString(),
    windowEnd: now.toISOString(),
    snapshotRowCount,
    touchedTableCount: byTable.size,
    parseFailures,
    warnings: allWarnings,
    probeWarnings: probe.warnings,
    staleArchiveAfterDays: config.staleArchiveAfterDays,
  });
}
|
||||
|
|
@ -1,798 +0,0 @@
|
|||
import { mkdtemp, readFile, readdir } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
import { stageHistoricSqlTemplates } from './stage.js';
|
||||
import {
|
||||
historicSqlManifestSchema,
|
||||
historicSqlMetadataSchema,
|
||||
historicSqlUsageSchema,
|
||||
type HistoricSqlQueryHistoryReader,
|
||||
type HistoricSqlRawQueryRow,
|
||||
} from './types.js';
|
||||
|
||||
/** Creates a fresh temporary directory prefixed `historic-sql-stage-` and returns its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-stage-');
  return mkdtemp(prefix);
}
|
||||
|
||||
/**
 * Reads `relPath` under `root` as UTF-8 and parses it as JSON.
 * The result is cast to `T` without any runtime validation.
 */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const raw = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(raw) as T;
}
|
||||
|
||||
/**
 * Builds an in-memory reader for tests: `probe` does nothing and `fetch`
 * yields the given rows in order.
 */
function fakeReader(rows: HistoricSqlRawQueryRow[]): HistoricSqlQueryHistoryReader {
  const reader: HistoricSqlQueryHistoryReader = {
    async probe() {},
    async *fetch() {
      yield* rows;
    },
  };
  return reader;
}
|
||||
|
||||
/**
 * Analyzer stub with two canned answers: SQL containing `paid` maps to the
 * paid-orders fingerprint, everything else to the refunds fingerprint.
 */
const fakeSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const isPaidOrdersQuery = sql.includes('paid');
    if (!isPaidOrdersQuery) {
      return {
        fingerprint: 'fp_refunds',
        normalizedSql: 'SELECT count(*) FROM analytics.refunds WHERE state = ?',
        tablesTouched: ['analytics.refunds'],
        literalSlots: [{ position: 1, type: 'string', exampleValue: 'complete' }],
      };
    }
    return {
      fingerprint: 'fp_paid_orders',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [
        { position: 1, type: 'string', exampleValue: 'paid' },
        { position: 2, type: 'date', exampleValue: '2026-04-01' },
      ],
    };
  },
};
|
||||
|
||||
/**
 * Analyzer stub that always returns the `fp_order_status` fingerprint; the
 * single literal slot's example value is `refunded` when the SQL contains
 * `'refunded'` and `paid` otherwise.
 */
const categoricalSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    let status = 'paid';
    if (sql.includes("'refunded'")) {
      status = 'refunded';
    }
    return {
      fingerprint: 'fp_order_status',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue: status }],
    };
  },
};
|
||||
|
||||
/**
 * Six successful query rows: three analysts each ran the `paid` query and then
 * the `refunded` query. Rows start one minute apart from 10:00, runtimes rise
 * by 10 ms from 100, and row counts are 11-13 for `paid` and 21-23 for
 * `refunded`.
 */
function categoricalRows(): HistoricSqlRawQueryRow[] {
  const statuses = ['paid', 'refunded'];
  const analysts = ['analyst-a', 'analyst-b', 'analyst-c'];

  return statuses.flatMap((status, statusIndex) =>
    analysts.map((user, analystIndex): HistoricSqlRawQueryRow => {
      // Position of this row in the overall sequence (0..5).
      const ordinal = statusIndex * analysts.length + analystIndex;
      return {
        id: `${status}-${analystIndex + 1}`,
        sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
        user,
        startedAt: `2026-05-04T10:0${ordinal}:00.000Z`,
        endedAt: null,
        runtimeMs: 100 + ordinal * 10,
        rowsProduced: (statusIndex + 1) * 10 + analystIndex + 1,
        success: true,
        errorMessage: null,
      };
    }),
  );
}
|
||||
|
||||
/**
 * Analyzer stub that always returns the `fp_diverse_samples` fingerprint and
 * reports whatever value follows `status = '…'` in the SQL as the slot's
 * example value, or `unknown` when no such literal is present.
 */
const diverseSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const statusMatch = /status = '([^']+)'/.exec(sql);
    const value = statusMatch?.[1] ?? 'unknown';
    return {
      fingerprint: 'fp_diverse_samples',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue: value }],
    };
  },
};
|
||||
|
||||
/**
 * Analyzer stub for the classification-matrix fixtures.
 *
 * SQL mentioning `stale_orders` maps to a fixed single-slot date fingerprint.
 * Any other SQL maps to `fp_classification_matrix`, whose five slots carry the
 * region, plan, status, amount and created_at literals extracted from the SQL
 * (falling back to `unknown`, `0` and `2026-05-01` respectively).
 */
const classificationMatrixSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    if (sql.includes('stale_orders')) {
      return {
        fingerprint: 'fp_stale_date',
        normalizedSql: 'SELECT count(*) FROM analytics.stale_orders WHERE created_at >= ?',
        tablesTouched: ['analytics.stale_orders'],
        literalSlots: [{ position: 1, type: 'date', exampleValue: '2026-04-01' }],
      };
    }

    // First capture group of `pattern` in the SQL, or `fallback` when absent.
    const capture = (pattern: RegExp, fallback: string): string => sql.match(pattern)?.[1] ?? fallback;
    const quotedValue = (field: string): string => capture(new RegExp(`${field} = '([^']+)'`), 'unknown');

    const amount = capture(/amount >= (\d+)/, '0');
    const asOf = capture(/created_at >= '([^']+)'/, '2026-05-01');

    return {
      fingerprint: 'fp_classification_matrix',
      normalizedSql:
        'SELECT count(*) FROM analytics.orders WHERE region = ? AND plan = ? AND status = ? AND amount >= ? AND created_at >= ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [
        { position: 1, type: 'string', exampleValue: quotedValue('region') },
        { position: 2, type: 'string', exampleValue: quotedValue('plan') },
        { position: 3, type: 'string', exampleValue: quotedValue('status') },
        { position: 4, type: 'number', exampleValue: amount },
        { position: 5, type: 'date', exampleValue: asOf },
      ],
    };
  },
};
|
||||
|
||||
/**
 * Twenty-one successful query rows for the classification-matrix fixtures.
 *
 * The first twenty hit `analytics.orders` with region always `us`, status
 * `paid` for the first ten and `refunded` after, plan `enterprise` except
 * `self_serve` on the last one, a distinct amount per row, a created_at date
 * that advances every five rows, and four analysts in rotation. The final row
 * is a single query against `analytics.stale_orders`.
 */
function classificationMatrixRows(): HistoricSqlRawQueryRow[] {
  const rows: HistoricSqlRawQueryRow[] = [];

  for (let index = 0; index < 20; index += 1) {
    const status = index < 10 ? 'paid' : 'refunded';
    const plan = index === 19 ? 'self_serve' : 'enterprise';
    const amount = 100 + index;
    const dayOfMonth = String(1 + Math.floor(index / 5)).padStart(2, '0');
    const asOf = `2026-05-${dayOfMonth}`;
    const minute = String(index).padStart(2, '0');

    rows.push({
      id: `matrix-${index + 1}`,
      sql: `SELECT count(*) FROM analytics.orders WHERE region = 'us' AND plan = '${plan}' AND status = '${status}' AND amount >= ${amount} AND created_at >= '${asOf}'`,
      user: `analyst-${(index % 4) + 1}`,
      startedAt: `2026-05-04T10:${minute}:00.000Z`,
      endedAt: null,
      runtimeMs: 100 + index,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    });
  }

  rows.push({
    id: 'stale-date-1',
    sql: "SELECT count(*) FROM analytics.stale_orders WHERE created_at >= '2026-04-01'",
    user: 'analyst-1',
    startedAt: '2026-05-04T11:00:00.000Z',
    endedAt: null,
    runtimeMs: 75,
    rowsProduced: 1,
    success: true,
    errorMessage: null,
  });

  return rows;
}
|
||||
|
||||
describe('stageHistoricSqlTemplates', () => {
|
||||
it('compresses rows by fingerprint into document-shaped staged templates', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
|
||||
await stageHistoricSqlTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'conn_1',
|
||||
queryClient: {},
|
||||
reader: fakeReader([
|
||||
{
|
||||
id: 'q1',
|
||||
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01' AND email = 'analyst@example.com'",
|
||||
user: 'analyst@example.com',
|
||||
startedAt: '2026-05-04T10:00:00.000Z',
|
||||
endedAt: '2026-05-04T10:00:01.000Z',
|
||||
runtimeMs: 100,
|
||||
rowsProduced: 1,
|
||||
success: true,
|
||||
errorMessage: null,
|
||||
},
|
||||
{
|
||||
id: 'q2',
|
||||
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-05-01' AND email = 'analyst-2@example.com'",
|
||||
user: 'analyst-2@example.com',
|
||||
startedAt: '2026-05-04T11:00:00.000Z',
|
||||
endedAt: '2026-05-04T11:00:01.000Z',
|
||||
runtimeMs: 300,
|
||||
rowsProduced: 1,
|
||||
success: true,
|
||||
errorMessage: null,
|
||||
},
|
||||
]),
|
||||
sqlAnalysis: fakeSqlAnalysis,
|
||||
pullConfig: {
|
||||
dialect: 'snowflake',
|
||||
windowDays: 90,
|
||||
lastSuccessfulCursor: null,
|
||||
serviceAccountUserPatterns: ['^svc_'],
|
||||
redactionPatterns: ['[\\w.+-]+@[\\w-]+\\.[\\w.-]+'],
|
||||
maxTemplatesPerRun: 5000,
|
||||
minCalls: 5,
|
||||
},
|
||||
now: new Date('2026-05-04T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
|
||||
expect(manifest).toMatchObject({
|
||||
source: 'historic-sql',
|
||||
connectionId: 'conn_1',
|
||||
dialect: 'snowflake',
|
||||
nextSuccessfulCursor: '2026-05-04T11:00:00.000Z',
|
||||
templateCount: 1,
|
||||
capped: false,
|
||||
});
|
||||
|
||||
const files = (await readdir(join(stagedDir, 'templates', 'fp_paid_orders'))).sort();
|
||||
expect(files).toEqual(['metadata.json', 'page.md', 'usage.json']);
|
||||
|
||||
const metadata = historicSqlMetadataSchema.parse(
|
||||
await readJson(stagedDir, 'templates/fp_paid_orders/metadata.json'),
|
||||
);
|
||||
expect(metadata).toEqual({
|
||||
id: 'fp_paid_orders',
|
||||
title: 'snowflake · analytics.orders [fp_pai]',
|
||||
path: 'templates/fp_paid_orders/page.md',
|
||||
objectType: 'historic_sql_template',
|
||||
lastEditedAt: null,
|
||||
properties: {
|
||||
fingerprint: 'fp_paid_orders',
|
||||
sub_cluster_id: null,
|
||||
dialect: 'snowflake',
|
||||
tables_touched: ['analytics.orders'],
|
||||
literal_slots: [
|
||||
{ position: 1, type: 'string', classification: 'constant' },
|
||||
{ position: 2, type: 'date', classification: 'runtime' },
|
||||
],
|
||||
triage_signals: {
|
||||
executions_bucket: 'low',
|
||||
distinct_users_bucket: 'team',
|
||||
error_rate_bucket: 'ok',
|
||||
recency_bucket: 'active',
|
||||
service_account_only: 'false',
|
||||
slot_summary: '1 constant, 1 runtime',
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
const page = await readFile(join(stagedDir, 'templates/fp_paid_orders/page.md'), 'utf-8');
|
||||
expect(page).toContain('## Normalized SQL');
|
||||
expect(page).toContain('SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?');
|
||||
expect(page).toContain('- analytics.orders');
|
||||
|
||||
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_paid_orders/usage.json'));
|
||||
expect(usage.stats).toMatchObject({
|
||||
executions: 2,
|
||||
distinct_users: 2,
|
||||
first_seen: '2026-05-04T10:00:00.000Z',
|
||||
last_seen: '2026-05-04T11:00:00.000Z',
|
||||
p50_runtime_ms: 100,
|
||||
p95_runtime_ms: 300,
|
||||
error_rate: 0,
|
||||
});
|
||||
expect(usage.samples).toHaveLength(1);
|
||||
expect(usage.samples[0].bound_sql).toContain('<redacted>');
|
||||
expect(usage.samples[0].bound_sql).not.toContain('analyst@example.com');
|
||||
expect(usage.samples[0].bound_sql).not.toContain('analyst-2@example.com');
|
||||
});
|
||||
|
||||
it('skips hard-noise SQL and caps templates deterministically', async () => {
|
||||
const stagedDir = await tempDir();
|
||||
|
||||
await stageHistoricSqlTemplates({
|
||||
stagedDir,
|
||||
connectionId: 'conn_1',
|
||||
queryClient: {},
|
||||
reader: fakeReader([
|
||||
{
|
||||
id: 'show-1',
|
||||
sql: 'SHOW TABLES',
|
||||
user: 'analyst',
|
||||
startedAt: '2026-05-04T10:00:00.000Z',
|
||||
endedAt: null,
|
||||
runtimeMs: null,
|
||||
success: true,
|
||||
errorMessage: null,
|
||||
},
|
||||
{
|
||||
id: 'q3',
|
||||
sql: "SELECT count(*) FROM analytics.refunds WHERE state = 'complete'",
|
||||
user: 'analyst',
|
||||
startedAt: '2026-05-04T11:00:00.000Z',
|
||||
endedAt: null,
|
||||
runtimeMs: 50,
|
||||
success: true,
|
||||
errorMessage: null,
|
||||
},
|
||||
{
|
||||
id: 'q4',
|
||||
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01'",
|
||||
user: 'analyst',
|
||||
startedAt: '2026-05-04T11:30:00.000Z',
|
||||
endedAt: null,
|
||||
runtimeMs: 40,
|
||||
success: true,
|
||||
errorMessage: null,
|
||||
},
|
||||
]),
|
||||
sqlAnalysis: fakeSqlAnalysis,
|
||||
pullConfig: {
|
||||
dialect: 'bigquery',
|
||||
windowDays: 7,
|
||||
lastSuccessfulCursor: '2026-05-01T00:00:00.000Z',
|
||||
serviceAccountUserPatterns: [],
|
||||
redactionPatterns: [],
|
||||
maxTemplatesPerRun: 1,
|
||||
minCalls: 5,
|
||||
},
|
||||
now: new Date('2026-05-04T12:00:00.000Z'),
|
||||
});
|
||||
|
||||
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
|
||||
expect(manifest.templateCount).toBe(1);
|
||||
expect(manifest.capped).toBe(true);
|
||||
expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
|
||||
expect(manifest.templates.map((template) => template.id)).toEqual(['fp_paid_orders']);
|
||||
});
|
||||
|
||||
// Rows sharing the fingerprint `fp_order_status` are expected to be staged as two
// sub-clustered templates: one whose only literal value is 'paid', one for 'refunded'.
it('splits categorical fingerprints into one document directory per dominant value', async () => {
  const stagedDir = await tempDir();
  const reader = fakeReader(categoricalRows());
  const now = new Date('2026-05-04T12:00:00.000Z');

  await stageHistoricSqlTemplates({
    stagedDir,
    connectionId: 'conn_1',
    queryClient: {},
    reader,
    sqlAnalysis: categoricalSqlAnalysis,
    pullConfig: {
      dialect: 'snowflake',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: [],
      maxTemplatesPerRun: 5000,
      minCalls: 5,
    },
    now,
  });

  // Manifest: both variants are listed, compared in id order.
  const rawManifest = await readJson(stagedDir, 'manifest.json');
  const manifest = historicSqlManifestSchema.parse(rawManifest);
  const summaries = manifest.templates.map(({ id, fingerprint, subClusterId, path }) => ({
    id,
    fingerprint,
    subClusterId,
    path,
  }));
  summaries.sort((a, b) => a.id.localeCompare(b.id));

  expect(manifest.templateCount).toBe(2);
  expect(summaries).toEqual([
    {
      id: 'fp_order_status__cat_2b2ff2318877',
      fingerprint: 'fp_order_status',
      subClusterId: 'cat_2b2ff2318877',
      path: 'templates/fp_order_status__cat_2b2ff2318877/page.md',
    },
    {
      id: 'fp_order_status__cat_34f037ddcbfa',
      fingerprint: 'fp_order_status',
      subClusterId: 'cat_34f037ddcbfa',
      path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
    },
  ]);

  // Metadata of the 'paid' variant.
  const rawPaidMetadata = await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/metadata.json');
  const paidMetadata = historicSqlMetadataSchema.parse(rawPaidMetadata);
  expect(paidMetadata).toMatchObject({
    id: 'fp_order_status__cat_34f037ddcbfa',
    title: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
    path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
    properties: {
      fingerprint: 'fp_order_status',
      sub_cluster_id: 'cat_34f037ddcbfa',
      dialect: 'snowflake',
      tables_touched: ['analytics.orders'],
      literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
    },
  });

  // Usage of the 'paid' variant only counts the 'paid' executions.
  const rawPaidUsage = await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/usage.json');
  const paidUsage = historicSqlUsageSchema.parse(rawPaidUsage);
  expect(paidUsage.stats).toMatchObject({
    executions: 3,
    distinct_users: 3,
    first_seen: '2026-05-04T10:00:00.000Z',
    last_seen: '2026-05-04T10:02:00.000Z',
    rows_produced: 36,
  });
  expect(paidUsage.literal_slots).toEqual([{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }]);

  // Usage of the 'refunded' variant only counts the 'refunded' executions.
  const rawRefundedUsage = await readJson(stagedDir, 'templates/fp_order_status__cat_2b2ff2318877/usage.json');
  const refundedUsage = historicSqlUsageSchema.parse(rawRefundedUsage);
  expect(refundedUsage.stats).toMatchObject({
    executions: 3,
    distinct_users: 3,
    first_seen: '2026-05-04T10:03:00.000Z',
    last_seen: '2026-05-04T10:05:00.000Z',
    rows_produced: 66,
  });
  expect(refundedUsage.literal_slots).toEqual([
    { position: 1, distinct_values: 1, top_values: [['refunded', 3]] },
  ]);
});
|
||||
|
||||
// Pins the per-slot classification written to metadata for the matrix fixture and for
// the stale-date fixture, together with the resulting `slot_summary` triage signal.
it('classifies literal slots across the spec matrix and stale-date demotion', async () => {
  const stagedDir = await tempDir();
  const reader = fakeReader(classificationMatrixRows());
  const now = new Date('2026-05-04T12:00:00.000Z');

  await stageHistoricSqlTemplates({
    stagedDir,
    connectionId: 'conn_1',
    queryClient: {},
    reader,
    sqlAnalysis: classificationMatrixSqlAnalysis,
    pullConfig: {
      dialect: 'snowflake',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: [],
      maxTemplatesPerRun: 5000,
      minCalls: 5,
    },
    now,
  });

  const rawManifest = await readJson(stagedDir, 'manifest.json');
  const manifest = historicSqlManifestSchema.parse(rawManifest);
  const matrixTemplates = manifest.templates.filter(
    ({ fingerprint }) => fingerprint === 'fp_classification_matrix',
  );
  expect(matrixTemplates).toHaveLength(2);
  expect(matrixTemplates.every(({ subClusterId }) => subClusterId?.startsWith('cat_'))).toBe(true);

  const matrixTemplate = matrixTemplates[0];
  if (!matrixTemplate) {
    throw new Error('expected classification matrix template');
  }
  // metadata.json is written next to the variant's page.md.
  const matrixMetadataPath = matrixTemplate.path.replace('/page.md', '/metadata.json');
  const matrixMetadata = historicSqlMetadataSchema.parse(await readJson(stagedDir, matrixMetadataPath));
  expect(matrixMetadata.properties.literal_slots).toMatchInlineSnapshot(`
    [
      {
        "classification": "constant",
        "position": 1,
        "type": "string",
      },
      {
        "classification": "constant",
        "position": 2,
        "type": "string",
      },
      {
        "classification": "categorical",
        "position": 3,
        "type": "string",
      },
      {
        "classification": "runtime",
        "position": 4,
        "type": "number",
      },
      {
        "classification": "runtime",
        "position": 5,
        "type": "date",
      },
    ]
  `);
  expect(matrixMetadata.properties.triage_signals.slot_summary).toBe('2 constant, 2 runtime');

  const rawStaleMetadata = await readJson(stagedDir, 'templates/fp_stale_date/metadata.json');
  const staleMetadata = historicSqlMetadataSchema.parse(rawStaleMetadata);
  expect(staleMetadata.properties.literal_slots).toMatchInlineSnapshot(`
    [
      {
        "classification": "runtime",
        "position": 1,
        "type": "date",
      },
    ]
  `);
  expect(staleMetadata.properties.triage_signals.slot_summary).toBe('0 constant, 1 runtime');
});
|
||||
|
||||
// With `maxTemplatesPerRun: 1` the two categorical variants of one fingerprint must be
// truncated to a single staged template, and the manifest must report the truncation.
it('applies the templates-per-run cap after categorical expansion', async () => {
  const stagedDir = await tempDir();
  const reader = fakeReader(categoricalRows());
  const now = new Date('2026-05-04T12:00:00.000Z');

  await stageHistoricSqlTemplates({
    stagedDir,
    connectionId: 'conn_1',
    queryClient: {},
    reader,
    sqlAnalysis: categoricalSqlAnalysis,
    pullConfig: {
      dialect: 'snowflake',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: [],
      maxTemplatesPerRun: 1,
      minCalls: 5,
    },
    now,
  });

  const rawManifest = await readJson(stagedDir, 'manifest.json');
  const { templateCount, capped, warnings, templates } = historicSqlManifestSchema.parse(rawManifest);
  expect(templateCount).toBe(1);
  expect(capped).toBe(true);
  expect(warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
  expect(templates).toHaveLength(1);
  expect(templates[0].id).toMatch(/^fp_order_status__cat_/);
});
|
||||
|
||||
// The single reader row below carries no `rowsProduced` field, so neither the aggregate
// stats nor the first sample may expose a `rows_produced` property.
it('omits rows_produced for BigQuery templates when reader rows have no row counts', async () => {
  const stagedDir = await tempDir();
  const reader = fakeReader([
    {
      id: 'bq-1',
      sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'",
      user: 'analyst-a@example.com',
      startedAt: '2026-05-04T10:00:00.000Z',
      endedAt: null,
      runtimeMs: 100,
      success: true,
      errorMessage: null,
    },
  ]);
  const now = new Date('2026-05-04T12:00:00.000Z');

  await stageHistoricSqlTemplates({
    stagedDir,
    connectionId: 'conn_bq',
    queryClient: {},
    reader,
    sqlAnalysis: fakeSqlAnalysis,
    pullConfig: {
      dialect: 'bigquery',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: [],
      maxTemplatesPerRun: 5000,
      minCalls: 5,
    },
    now,
  });

  const rawUsage = await readJson(stagedDir, 'templates/fp_paid_orders/usage.json');
  const { stats, samples } = historicSqlUsageSchema.parse(rawUsage);
  expect(stats).not.toHaveProperty('rows_produced');
  expect(samples[0]).not.toHaveProperty('rows_produced');
});
|
||||
|
||||
// Eleven statuses each get an older failed run (10:MM) and a newer successful run (11:MM).
// Only five samples may be kept: all successful, all with different statuses, newest first.
it('keeps at most five diverse samples, preferring recent successful representatives per literal tuple', async () => {
  const stagedDir = await tempDir();
  const statuses = [
    'paid',
    'refunded',
    'pending',
    'failed',
    'trial',
    'cancelled',
    'draft',
    'returned',
    'review',
    'held',
    'archived',
  ];

  const rows: HistoricSqlRawQueryRow[] = [];
  statuses.forEach((status, index) => {
    // The status index doubles as the minute component of both timestamps.
    const minute = String(index).padStart(2, '0');
    rows.push(
      {
        id: `${status}-old`,
        sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
        user: 'analyst-a',
        startedAt: `2026-05-04T10:${minute}:00.000Z`,
        endedAt: null,
        runtimeMs: 100,
        rowsProduced: 1,
        success: false,
        errorMessage: 'old failed sample',
      },
      {
        id: `${status}-new`,
        sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
        user: 'analyst-a',
        startedAt: `2026-05-04T11:${minute}:00.000Z`,
        endedAt: null,
        runtimeMs: 90,
        rowsProduced: 2,
        success: true,
        errorMessage: null,
      },
    );
  });

  await stageHistoricSqlTemplates({
    stagedDir,
    connectionId: 'conn_1',
    queryClient: {},
    reader: fakeReader(rows),
    sqlAnalysis: diverseSqlAnalysis,
    pullConfig: {
      dialect: 'snowflake',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: [],
      maxTemplatesPerRun: 5000,
      minCalls: 5,
    },
    now: new Date('2026-05-04T12:00:00.000Z'),
  });

  const rawUsage = await readJson(stagedDir, 'templates/fp_diverse_samples/usage.json');
  const { samples } = historicSqlUsageSchema.parse(rawUsage);
  const sampledStatuses = samples.map((sample) => sample.bound_sql.match(/status = '([^']+)'/)?.[1]);
  const sampledStartTimes = samples.map((sample) => sample.started_at);

  expect(samples).toHaveLength(5);
  expect(samples.every((sample) => sample.success)).toBe(true);
  expect(new Set(sampledStatuses).size).toBe(5);
  expect(sampledStartTimes).toEqual([
    '2026-05-04T11:10:00.000Z',
    '2026-05-04T11:09:00.000Z',
    '2026-05-04T11:08:00.000Z',
    '2026-05-04T11:07:00.000Z',
    '2026-05-04T11:06:00.000Z',
  ]);
});
|
||||
|
||||
// Two single-execution templates compete for one slot (`maxTemplatesPerRun: 1`); the one
// last seen in May must be kept over the one last seen in February.
it('uses recency as a tie-breaker when the templates-per-run cap overflows', async () => {
  const stagedDir = await tempDir();

  // Fingerprints the SQL by which of the two tables it mentions.
  const sqlAnalysis: SqlAnalysisPort = {
    analyzeForFingerprint: async (sql) => {
      const table = sql.includes('fresh_orders') ? 'fresh_orders' : 'stale_orders';
      return {
        fingerprint: `fp_${table}`,
        normalizedSql: `SELECT count(*) FROM analytics.${table}`,
        tablesTouched: [`analytics.${table}`],
        literalSlots: [],
      };
    },
  };

  const reader = fakeReader([
    {
      id: 'stale-1',
      sql: 'SELECT count(*) FROM analytics.stale_orders',
      user: 'analyst-a',
      startedAt: '2026-02-04T10:00:00.000Z',
      endedAt: null,
      runtimeMs: 100,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    },
    {
      id: 'fresh-1',
      sql: 'SELECT count(*) FROM analytics.fresh_orders',
      user: 'analyst-a',
      startedAt: '2026-05-04T10:00:00.000Z',
      endedAt: null,
      runtimeMs: 100,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    },
  ]);

  await stageHistoricSqlTemplates({
    stagedDir,
    connectionId: 'conn_1',
    queryClient: {},
    reader,
    sqlAnalysis,
    pullConfig: {
      dialect: 'snowflake',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: [],
      maxTemplatesPerRun: 1,
      minCalls: 5,
    },
    now: new Date('2026-05-04T12:00:00.000Z'),
  });

  const rawManifest = await readJson(stagedDir, 'manifest.json');
  const keptIds = historicSqlManifestSchema.parse(rawManifest).templates.map(({ id }) => id);
  expect(keptIds).toEqual(['fp_fresh_orders']);
});
|
||||
|
||||
// `'['` is not a valid regular expression. The run must still complete, flag the problem
// in the manifest warnings, and write no bound-SQL samples at all.
it('does not persist bound SQL samples when redaction patterns are invalid', async () => {
  const stagedDir = await tempDir();

  const reader = fakeReader([
    {
      id: 'q1',
      sql: "SELECT * FROM analytics.orders WHERE email = 'analyst@example.com'",
      user: 'analyst@example.com',
      startedAt: '2026-05-04T10:00:00.000Z',
      endedAt: null,
      runtimeMs: 100,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    },
  ]);

  await stageHistoricSqlTemplates({
    stagedDir,
    connectionId: 'conn_1',
    queryClient: {},
    reader,
    sqlAnalysis: {
      analyzeForFingerprint: async () => ({
        fingerprint: 'fp_redaction',
        normalizedSql: 'SELECT * FROM analytics.orders WHERE email = ?',
        tablesTouched: ['analytics.orders'],
        literalSlots: [{ position: 1, type: 'string', exampleValue: 'analyst@example.com' }],
      }),
    },
    pullConfig: {
      dialect: 'snowflake',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: ['['],
      maxTemplatesPerRun: 5000,
      minCalls: 5,
    },
    now: new Date('2026-05-04T12:00:00.000Z'),
  });

  const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
  const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_redaction/usage.json'));
  const flaggedInvalidPattern = manifest.warnings.some((warning) =>
    warning.startsWith('redaction_skipped:invalid_redaction_pattern'),
  );
  expect(flaggedInvalidPattern).toBe(true);
  expect(usage.samples).toEqual([]);
});
|
||||
});
|
||||
|
|
@ -1,630 +0,0 @@
|
|||
import { createHash } from 'node:crypto';
|
||||
import { mkdir, writeFile } from 'node:fs/promises';
|
||||
import { dirname, join } from 'node:path';
|
||||
import type {
|
||||
SqlAnalysisFingerprintResult,
|
||||
SqlAnalysisLiteralSlot,
|
||||
SqlAnalysisLiteralSlotType,
|
||||
SqlAnalysisPort,
|
||||
} from '../../../sql-analysis/index.js';
|
||||
import {
|
||||
HISTORIC_SQL_OBJECT_TYPE,
|
||||
HISTORIC_SQL_SOURCE_KEY,
|
||||
historicSqlPullConfigSchema,
|
||||
historicSqlRawQueryRowSchema,
|
||||
type HistoricSqlLiteralSlotClassification,
|
||||
type HistoricSqlManifest,
|
||||
type HistoricSqlMetadata,
|
||||
type HistoricSqlPullConfig,
|
||||
type HistoricSqlQueryHistoryReader,
|
||||
type HistoricSqlRawQueryRow,
|
||||
type HistoricSqlUsage,
|
||||
} from './types.js';
|
||||
|
||||
/** Arguments accepted by `stageHistoricSqlTemplates`. */
interface StageHistoricSqlTemplatesInput {
  /** Directory that receives `manifest.json` and the `templates/` tree. */
  stagedDir: string;
  /** Copied verbatim into the manifest. */
  connectionId: string;
  /** Opaque handle handed to the reader's `probe` and `fetch`; never inspected here. */
  queryClient: unknown;
  reader: HistoricSqlQueryHistoryReader;
  sqlAnalysis: SqlAnalysisPort;
  pullConfig: HistoricSqlPullConfig;
  /** End of the pull window; the current time is used when omitted. */
  now?: Date;
}

/** One recorded occurrence of a literal slot value. */
interface SlotObservation {
  /** The slot value after redaction. */
  value: string;
  /** `startedAt` of the row the value was taken from. */
  rowStartedAt: string;
}

/** Everything recorded for one literal slot position. */
interface SlotStats {
  position: number;
  type: SqlAnalysisLiteralSlotType;
  /** Occurrence count per redacted value. */
  values: Map<string, number>;
  /** One entry per recorded occurrence, in recording order. */
  observations: SlotObservation[];
}

/** Rows and slot statistics gathered for one fingerprint. */
interface TemplateAccumulator {
  fingerprint: string;
  normalizedSql: string;
  tablesTouched: Set<string>;
  rows: { row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }[];
  /** Keyed by slot position. */
  slotStats: Map<number, SlotStats>;
}

/** A literal slot together with its classification. */
interface ClassifiedLiteralSlot {
  position: number;
  type: SqlAnalysisLiteralSlotType;
  classification: HistoricSqlLiteralSlotClassification;
}

/** A template ready to be staged: a whole fingerprint, or one categorical sub-cluster of it. */
interface TemplateVariant {
  /** The fingerprint, or `<fingerprint>__<subClusterId>` for a sub-cluster. */
  id: string;
  fingerprint: string;
  /** `null` when the fingerprint was not split. */
  subClusterId: string | null;
  normalizedSql: string;
  tablesTouched: Set<string>;
  rows: { row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }[];
  /** Keyed by slot position. */
  slotStats: Map<number, SlotStats>;
  slotClassifications: ClassifiedLiteralSlot[];
}

/** One categorical slot position and the value a row has at it. */
interface CategoricalTupleEntry {
  position: number;
  value: string;
}

/** Redaction settings used while staging. */
interface RedactionPolicy {
  redactors: RegExp[];
  /** When `false`, no samples are written for any template. */
  samplesAllowed: boolean;
}
|
||||
|
||||
// Matches SQL that begins (after optional whitespace) with SHOW, DESCRIBE, DESC, EXPLAIN, USE or SET, case-insensitively.
const HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i;
|
||||
// Matches SQL text that mentions INFORMATION_SCHEMA, SNOWFLAKE.ACCOUNT_USAGE, a word starting with `pg_`,
// or a `system.` qualifier (case-insensitive). The match is purely textual, so it also fires when one of
// these appears inside a string literal or comment.
const HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i;
|
||||
|
||||
/**
 * Pulls query history through `input.reader`, groups it into templates by SQL fingerprint,
 * and writes the staged result under `input.stagedDir`.
 *
 * Steps, in order:
 * 1. Validate `input.pullConfig`. The pull window starts at `lastSuccessfulCursor` when one is
 *    set, otherwise `windowDays` before `now`, and always ends at `now`.
 * 2. Probe the reader, then stream its rows. Every row advances `nextSuccessfulCursor` to the
 *    greatest `startedAt` seen, including rows that are skipped afterwards.
 * 3. Drop rows rejected by `shouldSkipSql`. Rows whose analysis reports an error or lacks a
 *    fingerprint or normalized SQL are dropped with an `analysis_failed:<row id>` warning.
 * 4. Accumulate the remaining rows per fingerprint, expand categorical sub-clusters, and keep
 *    at most `maxTemplatesPerRun` templates (a `templates_truncated` warning records any cut).
 * 5. Write `metadata.json`, `page.md` and `usage.json` for each kept template, then `manifest.json`.
 *
 * @param input Staging directory, connection id, reader, SQL analysis port, pull config and optional clock.
 * @returns Resolves once the manifest has been written.
 */
export async function stageHistoricSqlTemplates(input: StageHistoricSqlTemplatesInput): Promise<void> {
  const { stagedDir, queryClient, reader, sqlAnalysis } = input;
  const config = historicSqlPullConfigSchema.parse(input.pullConfig);
  const now = input.now ?? new Date();
  const cursor = config.lastSuccessfulCursor;
  const windowStart = cursor ? new Date(cursor) : new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000);
  const warnings: string[] = [];
  const redaction = compileRedactors(config.redactionPatterns, warnings);
  const groups = new Map<string, TemplateAccumulator>();
  let latestStartedAt: string | null = null;

  await reader.probe(queryClient);

  const history = reader.fetch(queryClient, { start: windowStart, end: now }, cursor);
  for await (const rawRow of history) {
    const row = historicSqlRawQueryRowSchema.parse(rawRow);
    // The cursor advances before filtering, so skipped rows still move it forward.
    if (!latestStartedAt || row.startedAt > latestStartedAt) {
      latestStartedAt = row.startedAt;
    }
    if (shouldSkipSql(row.sql)) {
      continue;
    }

    const analysis = await sqlAnalysis.analyzeForFingerprint(row.sql, config.dialect);
    const { fingerprint, normalizedSql } = analysis;
    if (analysis.error || !fingerprint || !normalizedSql) {
      warnings.push(`analysis_failed:${row.id}`);
      continue;
    }

    // The first row seen for a fingerprint supplies the group's normalized SQL.
    let group = groups.get(fingerprint);
    if (!group) {
      group = {
        fingerprint,
        normalizedSql,
        tablesTouched: new Set<string>(),
        rows: [],
        slotStats: new Map<number, SlotStats>(),
      };
      groups.set(fingerprint, group);
    }

    for (const table of analysis.tablesTouched) {
      group.tablesTouched.add(table);
    }
    for (const slot of analysis.literalSlots) {
      recordSlot(group.slotStats, slot, redaction.redactors, row.startedAt);
    }
    group.rows.push({ row, analysis });
  }

  const expanded = expandCategoricalTemplates([...groups.values()], redaction.redactors);
  const selected = selectTemplates(expanded, config.maxTemplatesPerRun, now);
  const capped = selected.length < expanded.length;
  if (capped) {
    warnings.push(`templates_truncated: kept ${selected.length} of ${expanded.length} templates`);
  }

  await mkdir(stagedDir, { recursive: true });
  const templates: HistoricSqlManifest['templates'] = [];
  for (const template of selected) {
    const { metadata, pageMarkdown, usage } = buildStagedTemplate(template, config, redaction, now);
    const basePath = `templates/${metadata.id}`;
    await writeJson(stagedDir, `${basePath}/metadata.json`, metadata);
    await writeText(stagedDir, `${basePath}/page.md`, pageMarkdown);
    await writeJson(stagedDir, `${basePath}/usage.json`, usage);
    templates.push({
      id: metadata.id,
      fingerprint: metadata.properties.fingerprint,
      subClusterId: metadata.properties.sub_cluster_id,
      path: metadata.path,
    });
  }

  await writeJson(stagedDir, 'manifest.json', {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: input.connectionId,
    dialect: config.dialect,
    fetchedAt: now.toISOString(),
    windowStart: windowStart.toISOString(),
    windowEnd: now.toISOString(),
    nextSuccessfulCursor: latestStartedAt,
    templateCount: selected.length,
    capped,
    warnings,
    degraded: false,
    statsResetAt: null,
    baselineFirstRun: false,
    pgServerVersion: null,
    deallocCount: null,
    templates,
  } satisfies HistoricSqlManifest);
}
|
||||
|
||||
/**
 * Tells whether a statement is excluded from staging.
 *
 * @param sql Raw SQL text of one history row.
 * @returns `true` when the text matches `HARD_SKIP_PREFIX_RE` or `HARD_SKIP_TABLE_RE`.
 */
function shouldSkipSql(sql: string): boolean {
  if (HARD_SKIP_PREFIX_RE.test(sql)) {
    return true;
  }
  return HARD_SKIP_TABLE_RE.test(sql);
}
|
||||
|
||||
/**
 * Records one occurrence of a literal slot into `slotStats`, mutating the map in place.
 *
 * The slot's example value is redacted first; only the redacted text is counted and stored.
 *
 * @param slotStats Per-position statistics to update; an entry is created on first use.
 * @param slot Literal slot reported by SQL analysis for one row.
 * @param redactors Patterns passed to `redactText`.
 * @param rowStartedAt `startedAt` of the row the slot came from.
 */
function recordSlot(
  slotStats: Map<number, SlotStats>,
  slot: SqlAnalysisLiteralSlot,
  redactors: RegExp[],
  rowStartedAt: string,
): void {
  const value = redactText(slot.exampleValue, redactors);

  let stats = slotStats.get(slot.position);
  if (stats === undefined) {
    // The first occurrence fixes the slot type recorded for this position.
    stats = {
      position: slot.position,
      type: slot.type,
      values: new Map<string, number>(),
      observations: [],
    };
    slotStats.set(slot.position, stats);
  }

  const timesSeen = stats.values.get(value) ?? 0;
  stats.values.set(value, timesSeen + 1);
  stats.observations.push({ value, rowStartedAt });
}
|
||||
|
||||
/**
 * Expands every fingerprint group into its template variants and concatenates the results
 * in group order.
 *
 * @param groups Accumulated groups, one per fingerprint.
 * @param redactors Patterns forwarded to `expandTemplateGroup`.
 * @returns All variants produced by `expandTemplateGroup`, flattened.
 */
function expandCategoricalTemplates(groups: TemplateAccumulator[], redactors: RegExp[]): TemplateVariant[] {
  const variants: TemplateVariant[] = [];
  for (const group of groups) {
    for (const variant of expandTemplateGroup(group, redactors)) {
      variants.push(variant);
    }
  }
  return variants;
}
|
||||
|
||||
/**
 * Turns one fingerprint group into the template variants that get staged.
 *
 * Slots are classified once over the whole group. When no slot is categorical the group
 * becomes a single variant whose id is the fingerprint. Otherwise rows are bucketed by their
 * values at the categorical positions, and each bucket becomes a variant with id
 * `<fingerprint>__<subClusterId>`, its own slot statistics, and the group-level classifications.
 *
 * @param group Rows and slot statistics accumulated for one fingerprint.
 * @param redactors Patterns applied to slot values before bucketing and counting.
 * @returns The variants sorted by id; empty when the group has no row with a non-empty `startedAt` first.
 */
function expandTemplateGroup(group: TemplateAccumulator, redactors: RegExp[]): TemplateVariant[] {
  const { fingerprint, normalizedSql, tablesTouched } = group;
  const chronological = group.rows
    .slice()
    .sort((a, b) => a.row.startedAt.localeCompare(b.row.startedAt));
  const earliest = chronological[0]?.row.startedAt;
  if (!earliest) {
    return [];
  }

  const slotClassifications = classifySlots(group.slotStats, chronological.length, earliest);
  const categoricalPositions: number[] = [];
  for (const slot of slotClassifications) {
    if (slot.classification === 'categorical') {
      categoricalPositions.push(slot.position);
    }
  }
  categoricalPositions.sort((a, b) => a - b);

  if (categoricalPositions.length === 0) {
    const whole: TemplateVariant = {
      id: fingerprint,
      fingerprint,
      subClusterId: null,
      normalizedSql,
      tablesTouched,
      rows: chronological,
      slotStats: group.slotStats,
      slotClassifications,
    };
    return [whole];
  }

  // Bucket rows by the JSON form of their categorical value tuple.
  const buckets = new Map<string, { tuple: CategoricalTupleEntry[]; rows: TemplateAccumulator['rows'] }>();
  for (const entry of chronological) {
    const tuple = categoricalTuple(entry.analysis.literalSlots, categoricalPositions, redactors);
    const key = JSON.stringify(tuple);
    const bucket = buckets.get(key);
    if (bucket) {
      bucket.rows.push(entry);
    } else {
      buckets.set(key, { tuple, rows: [entry] });
    }
  }

  const variants: TemplateVariant[] = [];
  for (const bucket of buckets.values()) {
    const subClusterId = subClusterIdForTuple(bucket.tuple);
    variants.push({
      id: `${fingerprint}__${subClusterId}`,
      fingerprint,
      subClusterId,
      normalizedSql,
      tablesTouched,
      rows: bucket.rows,
      // Statistics are recomputed from the bucket's own rows only.
      slotStats: collectSlotStats(bucket.rows, redactors),
      slotClassifications,
    });
  }
  return variants.sort((a, b) => a.id.localeCompare(b.id));
}
|
||||
|
||||
/**
 * Classifies every recorded slot, ordered by ascending position.
 *
 * @param slotStats Per-position slot statistics.
 * @param executions Number of executions the statistics were gathered over.
 * @param firstSeen Earliest `startedAt` among those executions.
 * @returns One `{ position, type, classification }` entry per slot.
 */
function classifySlots(
  slotStats: Map<number, SlotStats>,
  executions: number,
  firstSeen: string,
): ClassifiedLiteralSlot[] {
  const byPosition = Array.from(slotStats.values()).sort((a, b) => a.position - b.position);
  const classified: ClassifiedLiteralSlot[] = [];
  for (const slot of byPosition) {
    classified.push({
      position: slot.position,
      type: slot.type,
      classification: classifySlot(slot, executions, firstSeen),
    });
  }
  return classified;
}
|
||||
|
||||
/**
 * Builds fresh slot statistics from a set of rows by recording every literal slot of every row.
 *
 * @param rows Rows paired with their SQL analysis.
 * @param redactors Patterns forwarded to `recordSlot`.
 * @returns A new map of statistics keyed by slot position.
 */
function collectSlotStats(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redactors: RegExp[],
): Map<number, SlotStats> {
  const collected = new Map<number, SlotStats>();
  rows.forEach(({ row, analysis }) => {
    analysis.literalSlots.forEach((slot) => {
      recordSlot(collected, slot, redactors, row.startedAt);
    });
  });
  return collected;
}
|
||||
|
||||
/**
 * Extracts a row's redacted values at the categorical slot positions.
 *
 * @param literalSlots Literal slots reported for one row.
 * @param categoricalPositions Positions to read, in the order they should appear in the result.
 * @param redactors Patterns applied to each slot value.
 * @returns One `{ position, value }` per requested position; `'<missing>'` stands in for a
 *   position the row has no slot for.
 */
function categoricalTuple(
  literalSlots: SqlAnalysisLiteralSlot[],
  categoricalPositions: number[],
  redactors: RegExp[],
): CategoricalTupleEntry[] {
  const redactedByPosition = new Map<number, string>();
  for (const slot of literalSlots) {
    redactedByPosition.set(slot.position, redactText(slot.exampleValue, redactors));
  }
  return categoricalPositions.map((position) => {
    const value = redactedByPosition.get(position) ?? '<missing>';
    return { position, value };
  });
}
|
||||
|
||||
/**
 * Derives a sub-cluster id from a categorical value tuple.
 *
 * @param tuple Categorical positions and values identifying the sub-cluster.
 * @returns `cat_` followed by the first 12 hex characters of the SHA-256 of the tuple's JSON form.
 */
function subClusterIdForTuple(tuple: CategoricalTupleEntry[]): string {
  const serialized = JSON.stringify(tuple);
  const digest = createHash('sha256').update(serialized).digest('hex');
  return `cat_${digest.slice(0, 12)}`;
}
|
||||
|
||||
/**
 * Produces the three staged documents for one template variant.
 *
 * @param template Variant to stage; assumed to contain at least one row, since the first and
 *   last rows are read without a guard.
 * @param config Validated pull configuration (dialect and service-account patterns are used).
 * @param redaction Redaction policy forwarded to sample selection.
 * @param now Reference time for the recency triage signal.
 * @returns `metadata` (id, title, path, properties), `pageMarkdown` (rendered page), and
 *   `usage` (aggregate stats, per-slot value counts, samples). `rows_produced` is present in
 *   the stats only when `sumRowsProduced` returns a non-null total.
 */
function buildStagedTemplate(
  template: TemplateVariant,
  config: HistoricSqlPullConfig,
  redaction: RedactionPolicy,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const { id, fingerprint, subClusterId, slotClassifications } = template;

  const rows = template.rows.map(({ row }) => row);
  rows.sort((a, b) => a.startedAt.localeCompare(b.startedAt));
  const executions = rows.length;
  const firstSeen = rows[0].startedAt;
  const lastSeen = rows[executions - 1].startedAt;

  // Single pass over the rows for users, failures and runtimes.
  const users = new Set<string>();
  const runtimes: number[] = [];
  let errorCount = 0;
  for (const row of rows) {
    if (row.user) {
      users.add(row.user);
    }
    if (!row.success) {
      errorCount += 1;
    }
    if (typeof row.runtimeMs === 'number') {
      runtimes.push(row.runtimeMs);
    }
  }
  runtimes.sort((a, b) => a - b);
  const distinctUsers = users.size;
  const errorRate = executions === 0 ? 0 : errorCount / executions;

  const triageSignals = buildTriageSignals({
    executions,
    distinctUsers,
    errorRate,
    lastSeen,
    now,
    serviceAccountOnly: isServiceAccountOnly(rows, config.serviceAccountUserPatterns),
    slotClassifications: slotClassifications.map(({ classification }) => classification),
  });

  const tablesTouched = Array.from(template.tablesTouched).sort();
  // The title falls back to 'query' when no table was recorded.
  const firstTable = tablesTouched[0] ?? 'query';
  const rowsProduced = sumRowsProduced(rows);

  const metadata: HistoricSqlMetadata = {
    id,
    title: buildTemplateTitle(config.dialect, firstTable, fingerprint, subClusterId),
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint,
      sub_cluster_id: subClusterId,
      dialect: config.dialect,
      tables_touched: tablesTouched,
      literal_slots: slotClassifications,
      triage_signals: triageSignals,
    },
  };

  const pageMarkdown = renderTemplatePage(id, template.normalizedSql, tablesTouched);

  // Per slot: the ten most frequent values, ties broken by value text.
  const literalSlots: HistoricSqlUsage['literal_slots'] = Array.from(template.slotStats.values())
    .sort((a, b) => a.position - b.position)
    .map((slot) => ({
      position: slot.position,
      distinct_values: slot.values.size,
      top_values: Array.from(slot.values.entries())
        .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))
        .slice(0, 10),
    }));

  return {
    metadata,
    pageMarkdown,
    usage: {
      stats: {
        executions,
        distinct_users: distinctUsers,
        first_seen: firstSeen,
        last_seen: lastSeen,
        p50_runtime_ms: percentile(runtimes, 0.5),
        p95_runtime_ms: percentile(runtimes, 0.95),
        error_rate: errorRate,
        ...(rowsProduced === null ? {} : { rows_produced: rowsProduced }),
      },
      literal_slots: literalSlots,
      samples: selectSamples(template.rows, redaction),
    },
  };
}
|
||||
|
||||
// Slot types that `isMovingTemporalSlot` treats as temporal.
const TEMPORAL_SLOT_TYPES = new Set<SqlAnalysisLiteralSlotType>(['date', 'timestamp']);
|
||||
|
||||
/**
 * Tells whether a `date` slot value lies before the day the template was first seen.
 *
 * The comparison is a plain string comparison between `value` and the first ten characters
 * of `firstSeen`.
 *
 * @param slot Slot statistics; only `type` is read.
 * @param value Candidate slot value.
 * @param firstSeen Earliest `startedAt` of the template.
 * @returns `true` only for `date` slots whose value parses as a date and sorts before that prefix.
 */
function isStaleDateConstant(slot: SlotStats, value: string, firstSeen: string): boolean {
  if (slot.type !== 'date') {
    return false;
  }
  if (parseTemporalSlotValue(value) === null) {
    return false;
  }
  const firstSeenDay = firstSeen.slice(0, 10);
  return value < firstSeenDay;
}
|
||||
|
||||
/**
 * Tells whether a temporal slot's literal moves in one direction as the rows progress in time.
 *
 * @param slot Slot statistics including every recorded observation.
 * @returns `false` for non-temporal slots, for slots with fewer than two distinct values, and
 *   when any observation has an unparseable row timestamp or literal. Otherwise `true` when the
 *   literals, ordered by row start time, never decrease or never increase.
 */
function isMovingTemporalSlot(slot: SlotStats): boolean {
  const isTemporal = TEMPORAL_SLOT_TYPES.has(slot.type);
  if (!isTemporal || slot.values.size < 2) {
    return false;
  }

  const paired: Array<{ startedAtMs: number; literalMs: number }> = [];
  for (const { rowStartedAt, value } of slot.observations) {
    const startedAtMs = Date.parse(rowStartedAt);
    const literalMs = parseTemporalSlotValue(value);
    // A single unparseable observation disqualifies the whole slot.
    if (Number.isNaN(startedAtMs) || literalMs === null) {
      return false;
    }
    paired.push({ startedAtMs, literalMs });
  }

  paired.sort((a, b) => a.startedAtMs - b.startedAtMs);
  return isMonotonic(paired.map(({ literalMs }) => literalMs));
}
|
||||
|
||||
/**
 * Parses a slot value with `Date.parse`.
 *
 * @param value Slot value text.
 * @returns Milliseconds since the epoch, or `null` when the text does not parse.
 */
function parseTemporalSlotValue(value: string): number | null {
  const epochMs = Date.parse(value);
  if (Number.isNaN(epochMs)) {
    return null;
  }
  return epochMs;
}
|
||||
|
||||
/**
 * Tells whether a sequence never decreases or never increases.
 *
 * @param values Sequence to inspect.
 * @returns `false` for fewer than two values; otherwise `true` unless the sequence contains
 *   both a rise and a drop between neighbours.
 */
function isMonotonic(values: number[]): boolean {
  if (values.length < 2) {
    return false;
  }

  const hasDrop = values.some((value, index) => index > 0 && value < values[index - 1]);
  const hasRise = values.some((value, index) => index > 0 && value > values[index - 1]);
  return !hasDrop || !hasRise;
}
|
||||
|
||||
/**
 * Classifies one literal slot as `constant`, `categorical` or `runtime`.
 *
 * Rules, first match wins:
 * 1. `constant`: exactly one distinct value, or the most frequent value covers at least 95% of
 *    executions, and in either case that value is not a stale date constant.
 * 2. `runtime`: the slot is a moving temporal slot.
 * 3. `categorical`: 2 to 10 distinct values, each covering at least 5% of executions.
 * 4. `runtime`: everything else.
 *
 * @param slot Statistics of the slot to classify.
 * @param executions Number of executions the statistics cover.
 * @param firstSeen Earliest `startedAt` among those executions.
 * @returns The classification.
 */
function classifySlot(
  slot: SlotStats,
  executions: number,
  firstSeen: string,
): HistoricSqlLiteralSlotClassification {
  const byFrequency = Array.from(slot.values.entries()).sort((a, b) => b[1] - a[1]);
  const distinct = byFrequency.length;
  const top = byFrequency[0];
  const topValue = top?.[0] ?? '';
  const topCount = top?.[1] ?? 0;
  const stale = isStaleDateConstant(slot, topValue, firstSeen);

  const dominated = distinct === 1 || (executions > 0 && topCount / executions >= 0.95);
  if (dominated && !stale) {
    return 'constant';
  }
  if (isMovingTemporalSlot(slot)) {
    return 'runtime';
  }

  const isCategorical =
    executions > 0 &&
    distinct >= 2 &&
    distinct <= 10 &&
    byFrequency.every((entry) => entry[1] / executions >= 0.05);
  return isCategorical ? 'categorical' : 'runtime';
}
|
||||
|
||||
/**
 * Buckets a template's aggregate numbers into string-valued triage signals.
 *
 * - `executions_bucket`: `low` below 3, `mid` below 50, otherwise `high`.
 * - `distinct_users_bucket`: `solo` up to 1, `team` up to 5, otherwise `broad`.
 * - `error_rate_bucket`: `ok` up to 0.01, `noisy` up to 0.1, otherwise `broken`.
 * - `recency_bucket`: result of `recencyBucket(lastSeen, now)`.
 * - `service_account_only`: the flag rendered as a string.
 * - `slot_summary`: counts of `constant` and `runtime` slots; other classifications are not counted.
 *
 * @param input Aggregates of one template.
 * @returns The signals keyed by name.
 */
function buildTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  errorRate: number;
  lastSeen: string;
  now: Date;
  serviceAccountOnly: boolean;
  slotClassifications: HistoricSqlLiteralSlotClassification[];
}): Record<string, string> {
  const { executions, distinctUsers, errorRate } = input;

  let constantCount = 0;
  let runtimeCount = 0;
  for (const classification of input.slotClassifications) {
    if (classification === 'runtime') {
      runtimeCount += 1;
    } else if (classification === 'constant') {
      constantCount += 1;
    }
  }

  let executionsBucket = 'high';
  if (executions < 3) {
    executionsBucket = 'low';
  } else if (executions < 50) {
    executionsBucket = 'mid';
  }

  let distinctUsersBucket = 'broad';
  if (distinctUsers <= 1) {
    distinctUsersBucket = 'solo';
  } else if (distinctUsers <= 5) {
    distinctUsersBucket = 'team';
  }

  let errorRateBucket = 'broken';
  if (errorRate <= 0.01) {
    errorRateBucket = 'ok';
  } else if (errorRate <= 0.1) {
    errorRateBucket = 'noisy';
  }

  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: distinctUsersBucket,
    error_rate_bucket: errorRateBucket,
    recency_bucket: recencyBucket(input.lastSeen, input.now),
    service_account_only: String(input.serviceAccountOnly),
    slot_summary: `${constantCount} constant, ${runtimeCount} runtime`,
  };
}
|
||||
|
||||
/**
 * Buckets how long ago a template was last seen.
 *
 * @param lastSeen Timestamp text of the most recent execution.
 * @param now Reference time.
 * @returns `active` up to 14 days of age, `warm` up to 60 days, otherwise `cold`. A `lastSeen`
 *   after `now` counts as age zero; one that does not parse as a date yields `cold`.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const msPerDay = 24 * 60 * 60 * 1000;
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / msPerDay);
  return ageDays <= 14 ? 'active' : ageDays <= 60 ? 'warm' : 'cold';
}
|
||||
|
||||
/**
 * Tells whether every known user of a template matches a service-account pattern.
 *
 * @param rows Rows of the template; rows without a user are ignored.
 * @param patterns Regular-expression sources; each is compiled with `new RegExp`.
 * @returns `false` when there are no users or no patterns; otherwise `true` only if every user
 *   matches at least one pattern.
 */
function isServiceAccountOnly(rows: HistoricSqlRawQueryRow[], patterns: string[]): boolean {
  const users: string[] = [];
  for (const { user } of rows) {
    if (user) {
      users.push(user);
    }
  }
  if (users.length === 0 || patterns.length === 0) {
    return false;
  }

  const matchers = patterns.map((pattern) => new RegExp(pattern));
  for (const user of users) {
    if (!matchers.some((matcher) => matcher.test(user))) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Builds the human-readable title for a template page.
 *
 * The title has the form `<dialect> · <firstTable> [<fp>]`, where `<fp>` is the
 * first six characters of the fingerprint. When a sub-cluster id is present
 * (non-null and non-empty), the last six characters of that id are appended
 * after a colon: `[<fp>:<sub>]`.
 *
 * @param dialect SQL dialect of the template.
 * @param firstTable Table name shown in the title.
 * @param fingerprint Template fingerprint; only its first six characters are used.
 * @param subClusterId Optional sub-cluster id; only its last six characters are used.
 * @returns The formatted title.
 */
function buildTemplateTitle(
  dialect: HistoricSqlPullConfig['dialect'],
  firstTable: string,
  fingerprint: string,
  subClusterId: string | null,
): string {
  const shortFingerprint = fingerprint.slice(0, 6);
  const tag = subClusterId ? `${shortFingerprint}:${subClusterId.slice(-6)}` : shortFingerprint;
  return `${dialect} · ${firstTable} [${tag}]`;
}
|
||||
|
||||
/**
 * Renders the Markdown page for a template.
 *
 * The page consists of a level-one heading with the fingerprint, the
 * normalized SQL inside a fenced `sql` code block, and a bullet list of the
 * tables touched. The output always ends with a newline.
 *
 * @param fingerprint Template fingerprint, used verbatim as the page heading.
 * @param normalizedSql SQL text placed verbatim inside the code fence.
 * @param tablesTouched Table names, one bullet each, in the given order.
 * @returns The Markdown document as a single string.
 */
function renderTemplatePage(fingerprint: string, normalizedSql: string, tablesTouched: string[]): string {
  const tableBullets = tablesTouched.map((table) => `- ${table}`);
  const lines = [
    `# ${fingerprint}`,
    '',
    '## Normalized SQL',
    '```sql',
    normalizedSql,
    '```',
    '',
    '## Tables touched',
    ...tableBullets,
    // Trailing empty entry so the joined text ends with a newline.
    '',
  ];
  return lines.join('\n');
}
|
||||
|
||||
/**
 * Picks up to five representative sample executions for a template.
 *
 * Returns no samples at all when the redaction policy disallows them.
 * Otherwise rows are de-duplicated by their literal tuple (the slots'
 * `exampleValue`s in position order); among rows sharing a tuple, a successful
 * row wins over a failed one and, within the same outcome, the most recent row
 * wins. The surviving rows are ordered newest first and truncated to five.
 *
 * Only `bound_sql` is passed through the redactors; the other fields are
 * copied from the row as-is. `rows_produced` is emitted only when the row
 * defines it (a `null` value is kept, `undefined` is omitted).
 *
 * @param rows Raw rows paired with their SQL analysis result.
 * @param redaction Compiled redaction policy.
 * @returns The sample list for the usage document.
 */
function selectSamples(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redaction: RedactionPolicy,
): HistoricSqlUsage['samples'] {
  if (!redaction.samplesAllowed) {
    return [];
  }

  type Entry = (typeof rows)[number];
  const newestFirst = (left: Entry, right: Entry): number =>
    right.row.startedAt.localeCompare(left.row.startedAt);

  // Successful rows first, newest first within each outcome, so the first row
  // seen for a literal tuple is the preferred representative.
  const preferred = [...rows].sort((left, right) => {
    if (left.row.success !== right.row.success) {
      return left.row.success ? -1 : 1;
    }
    return newestFirst(left, right);
  });

  const byLiteralTuple = new Map<string, Entry>();
  for (const entry of preferred) {
    const key = [...entry.analysis.literalSlots]
      .sort((left, right) => left.position - right.position)
      .map((slot) => slot.exampleValue)
      .join('\u001f');
    if (!byLiteralTuple.has(key)) {
      byLiteralTuple.set(key, entry);
    }
  }

  return [...byLiteralTuple.values()]
    .sort(newestFirst)
    .slice(0, 5)
    .map(({ row }) => ({
      started_at: row.startedAt,
      user: row.user,
      bound_sql: redactText(row.sql, redaction.redactors),
      ...(row.rowsProduced === undefined ? {} : { rows_produced: row.rowsProduced }),
      runtime_ms: row.runtimeMs,
      success: row.success,
    }));
}
|
||||
|
||||
/**
 * Keeps the highest-ranked templates, up to `maxTemplatesPerRun`.
 *
 * Each template is scored once with `rankTemplate`. Templates are ordered by
 * descending score, with ties broken by ascending `id` so the selection is
 * deterministic. The input array is not mutated.
 *
 * @param templates Candidate templates.
 * @param maxTemplatesPerRun Maximum number of templates to return.
 * @param now Reference time forwarded to `rankTemplate`.
 * @returns The selected templates in rank order.
 */
function selectTemplates(templates: TemplateVariant[], maxTemplatesPerRun: number, now: Date): TemplateVariant[] {
  const scored = templates.map((template) => ({ template, score: rankTemplate(template, now) }));
  scored.sort((left, right) => right.score - left.score || left.template.id.localeCompare(right.template.id));
  return scored.slice(0, maxTemplatesPerRun).map(({ template }) => template);
}
|
||||
|
||||
/**
 * Scores a template for selection: more distinct users, more rows and more
 * recent activity all raise the score.
 *
 * score = distinctUsers * log1p(rowCount) * 1 / (1 + ageDays / 30)
 *
 * `ageDays` is the age of the latest `startedAt` among the rows, clamped at
 * zero; a template with no rows is treated as 365 days old. Rows without a
 * user do not count towards `distinctUsers`. The latest timestamp is found by
 * string comparison of `startedAt`.
 *
 * @param template Template whose rows are scored.
 * @param now Reference time for the recency weight.
 * @returns A non-negative score for valid timestamps; higher ranks first.
 */
function rankTemplate(template: TemplateVariant, now: Date): number {
  const msPerDay = 86400000;
  const distinctUsers = new Set<string>();
  let latestStartedAt: string | null = null;

  for (const { row } of template.rows) {
    if (row.user) {
      distinctUsers.add(row.user);
    }
    if (latestStartedAt === null || row.startedAt > latestStartedAt) {
      latestStartedAt = row.startedAt;
    }
  }

  const ageDays =
    latestStartedAt === null
      ? 365
      : Math.max(0, (now.getTime() - new Date(latestStartedAt).getTime()) / msPerDay);
  const recencyWeight = 1 / (1 + ageDays / 30);
  return distinctUsers.size * Math.log1p(template.rows.length) * recencyWeight;
}
|
||||
|
||||
/**
 * Returns the nearest-rank percentile of `values`.
 *
 * The element at rank `ceil(n * percentileValue)` (1-based, clamped to the
 * array bounds) is returned. The array is indexed directly and never sorted
 * here, so callers are expected to pass it already sorted ascending.
 *
 * @param values Sample values; not mutated.
 * @param percentileValue Fraction in `[0, 1]`; out-of-range values are clamped.
 * @returns The selected value, or `null` when `values` is empty or the
 *   fraction is NaN.
 */
function percentile(values: number[], percentileValue: number): number | null {
  if (values.length === 0) {
    return null;
  }
  const rank = Math.ceil(values.length * percentileValue) - 1;
  const index = Math.min(values.length - 1, Math.max(0, rank));
  // A NaN fraction produces a NaN index; fall back to null instead of leaking
  // `undefined` through the `number | null` return type.
  return values[index] ?? null;
}
|
||||
|
||||
/**
 * Totals `rowsProduced` across the rows that report a numeric value.
 *
 * Rows whose `rowsProduced` is `null` or missing are skipped.
 *
 * @param rows Raw query rows.
 * @returns The sum, or `null` when no row reports a numeric `rowsProduced`
 *   (which distinguishes "unknown" from a genuine total of zero).
 */
function sumRowsProduced(rows: HistoricSqlRawQueryRow[]): number | null {
  let total = 0;
  let sawNumber = false;
  for (const row of rows) {
    if (typeof row.rowsProduced === 'number') {
      total += row.rowsProduced;
      sawNumber = true;
    }
  }
  return sawNumber ? total : null;
}
|
||||
|
||||
/**
 * Compiles redaction patterns into global regular expressions.
 *
 * A pattern that fails to compile is skipped, a
 * `redaction_skipped:invalid_redaction_pattern:<pattern>:<reason>` warning is
 * appended to `warnings`, and sample output is disabled for the whole policy
 * (`samplesAllowed` becomes `false`), since redaction can no longer be
 * applied as configured.
 *
 * @param patterns Regular-expression sources.
 * @param warnings Mutable warning list; invalid patterns are reported here.
 * @returns The compiled redactors and whether samples may be emitted.
 */
function compileRedactors(patterns: string[], warnings: string[]): RedactionPolicy {
  const redactors: RegExp[] = [];
  let samplesAllowed = true;

  for (const pattern of patterns) {
    try {
      redactors.push(new RegExp(pattern, 'g'));
    } catch (error: unknown) {
      samplesAllowed = false;
      const reason = error instanceof Error ? error.message : String(error);
      warnings.push(`redaction_skipped:invalid_redaction_pattern:${pattern}:${reason}`);
    }
  }

  return { redactors, samplesAllowed };
}
|
||||
|
||||
/**
 * Replaces every match of each redactor with the literal `<redacted>`.
 *
 * Redactors are applied in order, each to the output of the previous one.
 * How many matches are replaced per redactor follows the regex's own flags
 * (all matches for a global regex, only the first otherwise).
 *
 * @param value Text to redact.
 * @param redactors Regular expressions to apply.
 * @returns The redacted text.
 */
function redactText(value: string, redactors: RegExp[]): string {
  let redacted = value;
  for (const regex of redactors) {
    redacted = redacted.replace(regex, '<redacted>');
  }
  return redacted;
}
|
||||
|
||||
/**
 * Serialises `value` as two-space-indented JSON followed by a newline and
 * writes it under the staging directory via `writeText`.
 *
 * @param stagedDir Root of the staging directory.
 * @param relPath Path of the file relative to `stagedDir`.
 * @param value Value passed to `JSON.stringify`.
 */
async function writeJson(stagedDir: string, relPath: string, value: unknown): Promise<void> {
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  await writeText(stagedDir, relPath, serialized);
}
|
||||
|
||||
/**
 * Writes `value` as UTF-8 text to `relPath` under `stagedDir`, creating any
 * missing parent directories first.
 *
 * `relPath` is joined onto `stagedDir` as-is; nothing in this function
 * prevents a path containing `..` from resolving outside `stagedDir`.
 *
 * @param stagedDir Root of the staging directory.
 * @param relPath Path of the file relative to `stagedDir`.
 * @param value File contents.
 */
async function writeText(stagedDir: string, relPath: string, value: string): Promise<void> {
  const target = join(stagedDir, relPath);
  const parentDir = dirname(target);
  await mkdir(parentDir, { recursive: true });
  await writeFile(target, value, 'utf-8');
}
|
||||
|
|
@ -0,0 +1,98 @@
|
|||
import { describe, expect, it } from 'vitest';
|
||||
import {
|
||||
aggregatedTemplateSchema,
|
||||
historicSqlUnifiedPullConfigSchema,
|
||||
stagedManifestSchema,
|
||||
stagedPatternsInputSchema,
|
||||
stagedTableInputSchema,
|
||||
} from './types.js';
|
||||
|
||||
describe('historic-sql unified contracts', () => {
|
||||
it('parses minExecutions and accepts minCalls as a one-release alias', () => {
|
||||
expect(historicSqlUnifiedPullConfigSchema.parse({ dialect: 'postgres', minExecutions: 9 })).toMatchObject({
|
||||
dialect: 'postgres',
|
||||
minExecutions: 9,
|
||||
windowDays: 90,
|
||||
concurrency: 12,
|
||||
redactionPatterns: [],
|
||||
staleArchiveAfterDays: 90,
|
||||
});
|
||||
|
||||
expect(historicSqlUnifiedPullConfigSchema.parse({ dialect: 'postgres', minCalls: 7 }).minExecutions).toBe(7);
|
||||
});
|
||||
|
||||
it('validates aggregate templates from warehouse readers', () => {
|
||||
const parsed = aggregatedTemplateSchema.parse({
|
||||
templateId: 'pg:123',
|
||||
canonicalSql: 'select status, count(*) from public.orders group by status',
|
||||
dialect: 'postgres',
|
||||
stats: {
|
||||
executions: 42,
|
||||
distinctUsers: 3,
|
||||
firstSeen: '2026-05-01T00:00:00.000Z',
|
||||
lastSeen: '2026-05-11T00:00:00.000Z',
|
||||
p50RuntimeMs: 12.5,
|
||||
p95RuntimeMs: 40,
|
||||
errorRate: 0,
|
||||
rowsProduced: 100,
|
||||
},
|
||||
topUsers: [{ user: 'analyst', executions: 40 }],
|
||||
});
|
||||
|
||||
expect(parsed.templateId).toBe('pg:123');
|
||||
expect(parsed.topUsers).toEqual([{ user: 'analyst', executions: 40 }]);
|
||||
});
|
||||
|
||||
it('validates staged table, patterns, and manifest artifacts', () => {
|
||||
expect(
|
||||
stagedTableInputSchema.parse({
|
||||
table: 'public.orders',
|
||||
stats: {
|
||||
executionsBucket: '10-100',
|
||||
distinctUsersBucket: '2-5',
|
||||
errorRateBucket: 'none',
|
||||
p95RuntimeBucket: '<100ms',
|
||||
recencyBucket: 'current',
|
||||
},
|
||||
columnsByClause: {
|
||||
select: [['status', 'high']],
|
||||
where: [['created_at', 'mid']],
|
||||
},
|
||||
observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }],
|
||||
topTemplates: [{ id: 'pg:123', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }],
|
||||
}).table,
|
||||
).toBe('public.orders');
|
||||
|
||||
expect(
|
||||
stagedPatternsInputSchema.parse({
|
||||
templates: [
|
||||
{
|
||||
id: 'pg:123',
|
||||
canonicalSql: 'select * from public.orders',
|
||||
tablesTouched: ['public.orders'],
|
||||
executionsBucket: '10-100',
|
||||
distinctUsersBucket: '2-5',
|
||||
dialect: 'postgres',
|
||||
},
|
||||
],
|
||||
}).templates,
|
||||
).toHaveLength(1);
|
||||
|
||||
expect(
|
||||
stagedManifestSchema.parse({
|
||||
source: 'historic-sql',
|
||||
connectionId: 'warehouse',
|
||||
dialect: 'postgres',
|
||||
fetchedAt: '2026-05-11T00:00:00.000Z',
|
||||
windowStart: '2026-02-10T00:00:00.000Z',
|
||||
windowEnd: '2026-05-11T00:00:00.000Z',
|
||||
snapshotRowCount: 2,
|
||||
touchedTableCount: 1,
|
||||
parseFailures: 1,
|
||||
warnings: ['parse_failed:bad'],
|
||||
probeWarnings: [],
|
||||
staleArchiveAfterDays: 90,
|
||||
}).staleArchiveAfterDays,
|
||||
).toBe(90);
|
||||
});
|
||||
});
|
||||
|
|
@ -2,200 +2,161 @@ import { z } from 'zod';
|
|||
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
|
||||
|
||||
export const HISTORIC_SQL_SOURCE_KEY = 'historic-sql' as const;
|
||||
export const HISTORIC_SQL_OBJECT_TYPE = 'historic_sql_template' as const;
|
||||
|
||||
const historicSqlDialectSchema = z.enum(['snowflake', 'bigquery', 'postgres']);
|
||||
export type HistoricSqlDialect = z.infer<typeof historicSqlDialectSchema>;
|
||||
|
||||
export const historicSqlPullConfigSchema = z.object({
|
||||
const filterModeSchema = z.enum(['exclude', 'include', 'mark-only']);
|
||||
|
||||
/**
 * Type guard for plain-object-like values: any non-null, non-array value
 * whose `typeof` is `'object'`.
 *
 * @param value Value to inspect.
 * @returns `true` when `value` can be treated as a string-keyed record.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}
|
||||
|
||||
export const historicSqlUnifiedPullConfigSchema = z.preprocess((value) => {
|
||||
if (!isRecord(value)) {
|
||||
return value;
|
||||
}
|
||||
const next: Record<string, unknown> = { ...value };
|
||||
if (next.minExecutions === undefined && typeof next.minCalls === 'number') {
|
||||
next.minExecutions = next.minCalls;
|
||||
}
|
||||
if (!next.filters && Array.isArray(next.serviceAccountUserPatterns)) {
|
||||
next.filters = {
|
||||
serviceAccounts: { patterns: next.serviceAccountUserPatterns, mode: 'exclude' },
|
||||
dropTrivialProbes: true,
|
||||
};
|
||||
}
|
||||
return next;
|
||||
}, z.object({
|
||||
dialect: historicSqlDialectSchema,
|
||||
windowDays: z.number().int().min(1).max(365).default(90),
|
||||
lastSuccessfulCursor: z.string().datetime().nullable().default(null),
|
||||
serviceAccountUserPatterns: z.array(z.string()).default([]),
|
||||
windowDays: z.number().int().positive().default(90),
|
||||
minExecutions: z.number().int().nonnegative().default(5),
|
||||
concurrency: z.number().int().positive().default(12),
|
||||
filters: z.object({
|
||||
serviceAccounts: z.object({
|
||||
patterns: z.array(z.string()).default([]),
|
||||
mode: filterModeSchema.default('exclude'),
|
||||
}).optional(),
|
||||
orchestrators: z.object({
|
||||
mode: filterModeSchema.default('mark-only'),
|
||||
}).optional(),
|
||||
dropTrivialProbes: z.boolean().default(true),
|
||||
dropFailedBelow: z.object({
|
||||
errorRate: z.number().min(0).max(1),
|
||||
executions: z.number().int().nonnegative(),
|
||||
}).optional(),
|
||||
}).default({ dropTrivialProbes: true }),
|
||||
redactionPatterns: z.array(z.string()).default([]),
|
||||
maxTemplatesPerRun: z.number().int().min(1).max(5000).default(5000),
|
||||
minCalls: z.number().int().min(1).default(5),
|
||||
staleArchiveAfterDays: z.number().int().positive().default(90),
|
||||
}));
|
||||
|
||||
export type HistoricSqlUnifiedPullConfig = z.infer<typeof historicSqlUnifiedPullConfigSchema>;
|
||||
|
||||
export const aggregatedTemplateSchema = z.object({
|
||||
templateId: z.string().min(1),
|
||||
canonicalSql: z.string().min(1),
|
||||
dialect: historicSqlDialectSchema,
|
||||
stats: z.object({
|
||||
executions: z.number().int().nonnegative(),
|
||||
distinctUsers: z.number().int().nonnegative(),
|
||||
firstSeen: z.iso.datetime(),
|
||||
lastSeen: z.iso.datetime(),
|
||||
p50RuntimeMs: z.number().nonnegative().nullable(),
|
||||
p95RuntimeMs: z.number().nonnegative().nullable(),
|
||||
errorRate: z.number().min(0).max(1),
|
||||
rowsProduced: z.number().int().nonnegative().nullable(),
|
||||
}),
|
||||
topUsers: z.array(z.object({
|
||||
user: z.string().nullable(),
|
||||
executions: z.number().int().nonnegative(),
|
||||
})).default([]),
|
||||
});
|
||||
export type HistoricSqlPullConfig = z.infer<typeof historicSqlPullConfigSchema>;
|
||||
export type AggregatedTemplate = z.infer<typeof aggregatedTemplateSchema>;
|
||||
|
||||
export const stagedTableInputSchema = z.object({
|
||||
table: z.string().min(1),
|
||||
stats: z.object({
|
||||
executionsBucket: z.string(),
|
||||
distinctUsersBucket: z.string(),
|
||||
errorRateBucket: z.string(),
|
||||
p95RuntimeBucket: z.string(),
|
||||
recencyBucket: z.string(),
|
||||
}),
|
||||
columnsByClause: z.record(z.string(), z.array(z.tuple([z.string(), z.string()]))),
|
||||
observedJoins: z.array(z.object({
|
||||
withTable: z.string(),
|
||||
on: z.array(z.string()),
|
||||
freq: z.string(),
|
||||
})),
|
||||
topTemplates: z.array(z.object({
|
||||
id: z.string(),
|
||||
canonicalSql: z.string(),
|
||||
topUsers: z.array(z.object({ user: z.string().nullable() })),
|
||||
})),
|
||||
});
|
||||
export type StagedTableInput = z.infer<typeof stagedTableInputSchema>;
|
||||
|
||||
export const stagedPatternsInputSchema = z.object({
|
||||
templates: z.array(z.object({
|
||||
id: z.string(),
|
||||
canonicalSql: z.string(),
|
||||
tablesTouched: z.array(z.string()),
|
||||
executionsBucket: z.string(),
|
||||
distinctUsersBucket: z.string(),
|
||||
dialect: historicSqlDialectSchema,
|
||||
})),
|
||||
});
|
||||
export type StagedPatternsInput = z.infer<typeof stagedPatternsInputSchema>;
|
||||
|
||||
export const stagedManifestSchema = z.object({
|
||||
source: z.literal(HISTORIC_SQL_SOURCE_KEY),
|
||||
connectionId: z.string().min(1),
|
||||
dialect: historicSqlDialectSchema,
|
||||
fetchedAt: z.iso.datetime(),
|
||||
windowStart: z.iso.datetime(),
|
||||
windowEnd: z.iso.datetime(),
|
||||
snapshotRowCount: z.number().int().nonnegative(),
|
||||
touchedTableCount: z.number().int().nonnegative(),
|
||||
parseFailures: z.number().int().nonnegative(),
|
||||
warnings: z.array(z.string()),
|
||||
probeWarnings: z.array(z.string()),
|
||||
staleArchiveAfterDays: z.number().int().positive().default(90),
|
||||
});
|
||||
export type StagedManifest = z.infer<typeof stagedManifestSchema>;
|
||||
|
||||
export interface HistoricSqlProbeResult {
|
||||
warnings: string[];
|
||||
info?: string[];
|
||||
}
|
||||
|
||||
export interface HistoricSqlReader {
|
||||
probe(client: unknown): Promise<HistoricSqlProbeResult>;
|
||||
fetchAggregated(
|
||||
client: unknown,
|
||||
window: HistoricSqlTimeWindow,
|
||||
config: HistoricSqlUnifiedPullConfig,
|
||||
): AsyncIterable<AggregatedTemplate>;
|
||||
}
|
||||
|
||||
export interface HistoricSqlTimeWindow {
|
||||
start: Date;
|
||||
end: Date;
|
||||
}
|
||||
|
||||
export const historicSqlRawQueryRowSchema = z.object({
|
||||
id: z.string().min(1),
|
||||
sql: z.string().min(1),
|
||||
user: z.string().nullable().default(null),
|
||||
startedAt: z.string().datetime(),
|
||||
endedAt: z.string().datetime().nullable().default(null),
|
||||
runtimeMs: z.number().nonnegative().nullable().default(null),
|
||||
rowsProduced: z.number().int().nonnegative().nullable().optional(),
|
||||
success: z.boolean().default(true),
|
||||
errorMessage: z.string().nullable().default(null),
|
||||
});
|
||||
export type HistoricSqlRawQueryRow = z.infer<typeof historicSqlRawQueryRowSchema>;
|
||||
|
||||
export interface HistoricSqlQueryHistoryReader {
|
||||
probe(client: unknown): Promise<void>;
|
||||
fetch(
|
||||
client: unknown,
|
||||
window: HistoricSqlTimeWindow,
|
||||
cursor?: string | null,
|
||||
): AsyncIterable<HistoricSqlRawQueryRow>;
|
||||
}
|
||||
|
||||
export interface KtxPostgresQueryClient {
|
||||
executeQuery(sql: string, params?: unknown[]): Promise<{ headers: string[]; rows: unknown[][]; totalRows?: number }>;
|
||||
}
|
||||
|
||||
export interface PostgresPgssProbeResult {
|
||||
export interface PostgresPgssProbeResult extends HistoricSqlProbeResult {
|
||||
pgServerVersion: string;
|
||||
warnings: string[];
|
||||
}
|
||||
|
||||
export interface PostgresPgssSnapshot {
|
||||
statsResetAt: string | null;
|
||||
deallocCount: number | null;
|
||||
rows: PostgresPgssRow[];
|
||||
}
|
||||
|
||||
export interface PostgresPgssReader {
|
||||
probe(client: KtxPostgresQueryClient): Promise<PostgresPgssProbeResult>;
|
||||
readSnapshot(
|
||||
client: KtxPostgresQueryClient,
|
||||
options: { minCalls: number; maxTemplates: number },
|
||||
): Promise<PostgresPgssSnapshot>;
|
||||
}
|
||||
|
||||
export interface PostgresPgssRow {
|
||||
queryid: string;
|
||||
userid: string;
|
||||
username: string | null;
|
||||
dbid: string;
|
||||
database: string | null;
|
||||
query: string;
|
||||
calls: number;
|
||||
totalExecTime: number;
|
||||
meanExecTime: number;
|
||||
totalRows: number;
|
||||
}
|
||||
|
||||
export interface PostgresPgssAggregateRow {
|
||||
id: string;
|
||||
queryid: string;
|
||||
dbid: string;
|
||||
database: string | null;
|
||||
query: string;
|
||||
deltaCalls: number;
|
||||
deltaExecTime: number;
|
||||
deltaRows: number;
|
||||
meanExecTime: number;
|
||||
distinctUsersDelta: number;
|
||||
users: string[];
|
||||
firstObservedAt: string;
|
||||
info: string[];
|
||||
}
|
||||
|
||||
export interface HistoricSqlSourceAdapterDeps {
|
||||
sqlAnalysis: SqlAnalysisPort;
|
||||
reader: HistoricSqlQueryHistoryReader;
|
||||
reader: HistoricSqlReader;
|
||||
queryClient: unknown;
|
||||
postgresReader?: PostgresPgssReader;
|
||||
postgresQueryClient?: KtxPostgresQueryClient;
|
||||
postgresBaselineRootDir?: string;
|
||||
legacyPostgresBaselineRootDir?: string;
|
||||
now?: () => Date;
|
||||
onPullSucceeded?: (ctx: {
|
||||
connectionId: string;
|
||||
sourceKey: string;
|
||||
syncId: string;
|
||||
trigger: import('../../types.js').IngestTrigger;
|
||||
completedAt: Date;
|
||||
stagedDir: string;
|
||||
nextSuccessfulCursor: string | null;
|
||||
}) => Promise<void>;
|
||||
}
|
||||
|
||||
const historicSqlLiteralSlotClassificationSchema = z.enum(['constant', 'runtime', 'categorical']);
|
||||
export type HistoricSqlLiteralSlotClassification = z.infer<typeof historicSqlLiteralSlotClassificationSchema>;
|
||||
|
||||
export const historicSqlMetadataSchema = z.object({
|
||||
id: z.string().min(1),
|
||||
title: z.string().min(1),
|
||||
path: z.string().min(1),
|
||||
objectType: z.literal(HISTORIC_SQL_OBJECT_TYPE),
|
||||
lastEditedAt: z.null(),
|
||||
properties: z.object({
|
||||
fingerprint: z.string().min(1),
|
||||
sub_cluster_id: z.string().nullable(),
|
||||
dialect: historicSqlDialectSchema,
|
||||
tables_touched: z.array(z.string()),
|
||||
literal_slots: z.array(
|
||||
z.object({
|
||||
position: z.number().int().min(1),
|
||||
type: z.enum(['string', 'number', 'timestamp', 'date', 'boolean', 'null', 'unknown']),
|
||||
classification: historicSqlLiteralSlotClassificationSchema,
|
||||
}),
|
||||
),
|
||||
triage_signals: z.record(z.string(), z.string()),
|
||||
}),
|
||||
});
|
||||
export type HistoricSqlMetadata = z.infer<typeof historicSqlMetadataSchema>;
|
||||
|
||||
export const historicSqlUsageSchema = z.object({
|
||||
stats: z.object({
|
||||
executions: z.number().int().nonnegative(),
|
||||
distinct_users: z.number().int().nonnegative(),
|
||||
first_seen: z.string().datetime(),
|
||||
last_seen: z.string().datetime(),
|
||||
p50_runtime_ms: z.number().nonnegative().nullable(),
|
||||
p95_runtime_ms: z.number().nonnegative().nullable(),
|
||||
mean_runtime_ms: z.number().nonnegative().nullable().optional(),
|
||||
error_rate: z.number().min(0).max(1),
|
||||
rows_produced: z.number().int().nonnegative().nullable().optional(),
|
||||
}),
|
||||
literal_slots: z.array(
|
||||
z.object({
|
||||
position: z.number().int().min(1),
|
||||
distinct_values: z.number().int().nonnegative(),
|
||||
top_values: z.array(z.tuple([z.string(), z.number().int().nonnegative()])),
|
||||
}),
|
||||
),
|
||||
samples: z.array(
|
||||
z.object({
|
||||
started_at: z.string().datetime(),
|
||||
user: z.string().nullable(),
|
||||
bound_sql: z.string(),
|
||||
rows_produced: z.number().int().nonnegative().nullable().optional(),
|
||||
runtime_ms: z.number().nonnegative().nullable(),
|
||||
success: z.boolean(),
|
||||
}),
|
||||
),
|
||||
});
|
||||
export type HistoricSqlUsage = z.infer<typeof historicSqlUsageSchema>;
|
||||
|
||||
export const historicSqlManifestSchema = z.object({
|
||||
source: z.literal(HISTORIC_SQL_SOURCE_KEY),
|
||||
connectionId: z.string().min(1),
|
||||
dialect: historicSqlDialectSchema,
|
||||
fetchedAt: z.string().datetime(),
|
||||
windowStart: z.string().datetime(),
|
||||
windowEnd: z.string().datetime(),
|
||||
nextSuccessfulCursor: z.string().datetime().nullable(),
|
||||
templateCount: z.number().int().nonnegative(),
|
||||
capped: z.boolean(),
|
||||
warnings: z.array(z.string()),
|
||||
degraded: z.boolean().default(false),
|
||||
statsResetAt: z.string().datetime().nullable().default(null),
|
||||
baselineFirstRun: z.boolean().default(false),
|
||||
pgServerVersion: z.string().nullable().default(null),
|
||||
deallocCount: z.number().int().nonnegative().nullable().default(null),
|
||||
templates: z.array(
|
||||
z.object({
|
||||
id: z.string().min(1),
|
||||
fingerprint: z.string().min(1),
|
||||
subClusterId: z.string().nullable(),
|
||||
path: z.string().min(1),
|
||||
}),
|
||||
),
|
||||
});
|
||||
export type HistoricSqlManifest = z.infer<typeof historicSqlManifestSchema>;
|
||||
|
|
|
|||
|
|
@ -186,6 +186,62 @@ describe('buildLiveDatabaseManifestShards', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('preserves external usage keys while replacing historic SQL managed keys', () => {
|
||||
const existingUsage = new Map([
|
||||
[
|
||||
'orders',
|
||||
{
|
||||
narrative: 'Old generated usage narrative.',
|
||||
frequencyTier: 'low' as const,
|
||||
commonFilters: ['old_status'],
|
||||
commonJoins: [],
|
||||
ownerNote: 'Pinned analyst note',
|
||||
},
|
||||
],
|
||||
]);
|
||||
|
||||
const result = buildLiveDatabaseManifestShards({
|
||||
connectionType: 'POSTGRESQL',
|
||||
mapColumnType: (nativeType) => nativeType.toLowerCase(),
|
||||
existingUsage,
|
||||
tables: [
|
||||
{
|
||||
name: 'orders',
|
||||
catalog: null,
|
||||
db: 'public',
|
||||
usage: {
|
||||
narrative: 'Fresh generated usage narrative.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonGroupBys: ['created_at'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
},
|
||||
columns: [{ name: 'id', type: 'INTEGER' }],
|
||||
},
|
||||
],
|
||||
joins: [],
|
||||
});
|
||||
|
||||
expect(shardObject(result.shards)).toEqual({
|
||||
public: {
|
||||
tables: {
|
||||
orders: {
|
||||
table: 'public.orders',
|
||||
usage: {
|
||||
ownerNote: 'Pinned analyst note',
|
||||
narrative: 'Fresh generated usage narrative.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonGroupBys: ['created_at'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
},
|
||||
columns: [{ name: 'id', type: 'integer' }],
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it('renders ordered multi-column joins in both directions', () => {
|
||||
const result = buildLiveDatabaseManifestShards({
|
||||
connectionType: 'POSTGRESQL',
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
import type { TableUsageOutput } from '../historic-sql/skill-schemas.js';
|
||||
|
||||
const RELATIONSHIP_MAP: Record<string, string> = {
|
||||
MANY_TO_ONE: 'many_to_one',
|
||||
ONE_TO_MANY: 'one_to_many',
|
||||
|
|
@ -11,6 +13,14 @@ const RELATIONSHIP_INVERSE: Record<string, string> = {
|
|||
};
|
||||
|
||||
const SCAN_MANAGED_DESCRIPTION_KEYS = new Set(['db', 'ai']);
|
||||
const HISTORIC_SQL_MANAGED_USAGE_KEYS = new Set([
|
||||
'narrative',
|
||||
'frequencyTier',
|
||||
'commonFilters',
|
||||
'commonGroupBys',
|
||||
'commonJoins',
|
||||
'staleSince',
|
||||
]);
|
||||
|
||||
export interface LiveDatabaseManifestColumn {
|
||||
name: string;
|
||||
|
|
@ -30,6 +40,7 @@ export interface LiveDatabaseManifestJoinEntry {
|
|||
export interface LiveDatabaseManifestTableEntry {
|
||||
table: string;
|
||||
descriptions?: Record<string, string>;
|
||||
usage?: TableUsageOutput;
|
||||
columns: LiveDatabaseManifestColumn[];
|
||||
joins?: LiveDatabaseManifestJoinEntry[];
|
||||
}
|
||||
|
|
@ -43,6 +54,7 @@ export interface LiveDatabaseManifestTableData {
|
|||
catalog: string | null;
|
||||
db: string | null;
|
||||
descriptions?: Record<string, string>;
|
||||
usage?: TableUsageOutput;
|
||||
columns: Array<{
|
||||
name: string;
|
||||
type: string;
|
||||
|
|
@ -73,6 +85,7 @@ export interface BuildLiveDatabaseManifestShardsInput {
|
|||
mapColumnType: (nativeType: string) => string;
|
||||
existingPreservedJoins?: Map<string, LiveDatabaseManifestJoinEntry[]>;
|
||||
existingDescriptions?: Map<string, LiveDatabaseManifestExistingDescriptions>;
|
||||
existingUsage?: Map<string, TableUsageOutput>;
|
||||
}
|
||||
|
||||
export interface BuildLiveDatabaseManifestShardsResult {
|
||||
|
|
@ -101,6 +114,28 @@ function mergeDescriptionsPreservingExternal(
|
|||
return Object.keys(result).length > 0 ? result : undefined;
|
||||
}
|
||||
|
||||
/**
 * Merges freshly generated table usage into previously stored usage without
 * losing keys that historic-SQL does not manage.
 *
 * - With no incoming usage, a shallow copy of the existing usage (or
 *   `undefined`) is returned unchanged.
 * - Otherwise every existing key listed in `HISTORIC_SQL_MANAGED_USAGE_KEYS`
 *   is dropped, the remaining existing keys are kept, and all incoming keys
 *   are applied on top. Managed keys absent from `incoming` therefore
 *   disappear rather than going stale.
 *
 * Neither argument is mutated.
 *
 * @param existing Usage currently stored for the table, if any.
 * @param incoming Newly generated usage, if any.
 * @returns The merged usage, or `undefined` when the result has no keys.
 */
export function mergeUsagePreservingExternal(
  existing: TableUsageOutput | undefined,
  incoming: TableUsageOutput | undefined,
): TableUsageOutput | undefined {
  if (!incoming) {
    return existing ? { ...existing } : undefined;
  }

  const merged: Record<string, unknown> = {};
  // Preserved external keys go in first so they keep their leading position.
  for (const [key, value] of Object.entries(existing ?? {})) {
    if (HISTORIC_SQL_MANAGED_USAGE_KEYS.has(key)) {
      continue;
    }
    merged[key] = value;
  }
  Object.assign(merged, incoming);

  return Object.keys(merged).length > 0 ? (merged as TableUsageOutput) : undefined;
}
|
||||
|
||||
function getShardKey(connectionType: string, catalog: string | null, db: string | null): string {
|
||||
const normalized = connectionType.toUpperCase();
|
||||
|
||||
|
|
@ -254,6 +289,11 @@ export function buildLiveDatabaseManifestShards(
|
|||
entry.descriptions = tableDescriptions;
|
||||
}
|
||||
|
||||
const usage = mergeUsagePreservingExternal(input.existingUsage?.get(table.name), table.usage);
|
||||
if (usage) {
|
||||
entry.usage = usage;
|
||||
}
|
||||
|
||||
const tableJoins = joinsByTable.get(table.name);
|
||||
if (tableJoins && tableJoins.length > 0) {
|
||||
entry.joins = tableJoins;
|
||||
|
|
|
|||
|
|
@ -318,7 +318,8 @@ export { NOTION_ORG_KNOWLEDGE_WARNING } from './adapters/notion/chunk.js';
|
|||
export { NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN } from './adapters/notion/types.js';
|
||||
export { NotionSourceAdapter, type NotionSourceAdapterDeps } from './adapters/notion/notion.adapter.js';
|
||||
export { NotionClient, type NotionApi, type NotionBotInfo } from './adapters/notion/notion-client.js';
|
||||
export { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './adapters/historic-sql/chunk.js';
|
||||
export { bucketDistinctUsers, bucketErrorRate, bucketExecutions, bucketP95Runtime, bucketRecency } from './adapters/historic-sql/buckets.js';
|
||||
export { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './adapters/historic-sql/chunk-unified.js';
|
||||
export { detectHistoricSqlStagedDir } from './adapters/historic-sql/detect.js';
|
||||
export {
|
||||
HistoricSqlExtensionMissingError,
|
||||
|
|
@ -328,41 +329,55 @@ export {
|
|||
export { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.adapter.js';
|
||||
export { BigQueryHistoricSqlQueryHistoryReader } from './adapters/historic-sql/bigquery-query-history-reader.js';
|
||||
export type { BigQueryHistoricSqlQueryHistoryReaderOptions } from './adapters/historic-sql/bigquery-query-history-reader.js';
|
||||
export { PostgresPgssQueryHistoryReader } from './adapters/historic-sql/postgres-pgss-query-history-reader.js';
|
||||
export { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js';
|
||||
export { SnowflakeHistoricSqlQueryHistoryReader } from './adapters/historic-sql/snowflake-query-history-reader.js';
|
||||
export { stageHistoricSqlTemplates } from './adapters/historic-sql/stage.js';
|
||||
export { stageHistoricSqlAggregatedSnapshot } from './adapters/historic-sql/stage-unified.js';
|
||||
export {
|
||||
pgssBaselinePath,
|
||||
readPgssBaseline,
|
||||
stagePgStatStatementsTemplates,
|
||||
writePgssBaselineAtomic,
|
||||
} from './adapters/historic-sql/stage-pgss.js';
|
||||
export type { PgssBaseline, StagePgStatStatementsTemplatesResult } from './adapters/historic-sql/stage-pgss.js';
|
||||
historicSqlEvidenceEnvelopeSchema,
|
||||
historicSqlEvidencePath,
|
||||
historicSqlPatternEvidenceSchema,
|
||||
historicSqlTableUsageEvidenceSchema,
|
||||
serializeHistoricSqlEvidence,
|
||||
} from './adapters/historic-sql/evidence.js';
|
||||
export type {
|
||||
HistoricSqlEvidenceEnvelope,
|
||||
HistoricSqlPatternEvidence,
|
||||
HistoricSqlTableUsageEvidence,
|
||||
} from './adapters/historic-sql/evidence.js';
|
||||
export { createEmitHistoricSqlEvidenceTool } from './adapters/historic-sql/evidence-tool.js';
|
||||
export { HistoricSqlProjectionPostProcessor } from './adapters/historic-sql/post-processor.js';
|
||||
export { projectHistoricSqlEvidence } from './adapters/historic-sql/projection.js';
|
||||
export type { HistoricSqlProjectionInput, HistoricSqlProjectionResult } from './adapters/historic-sql/projection.js';
|
||||
export {
|
||||
patternOutputSchema,
|
||||
patternsArraySchema,
|
||||
tableUsageOutputSchema,
|
||||
} from './adapters/historic-sql/skill-schemas.js';
|
||||
export type {
|
||||
PatternOutput,
|
||||
TableUsageOutput,
|
||||
} from './adapters/historic-sql/skill-schemas.js';
|
||||
export type {
|
||||
AggregatedTemplate,
|
||||
HistoricSqlDialect,
|
||||
HistoricSqlManifest,
|
||||
HistoricSqlMetadata,
|
||||
HistoricSqlPullConfig,
|
||||
HistoricSqlQueryHistoryReader,
|
||||
HistoricSqlRawQueryRow,
|
||||
HistoricSqlProbeResult,
|
||||
HistoricSqlReader,
|
||||
HistoricSqlSourceAdapterDeps,
|
||||
HistoricSqlTimeWindow,
|
||||
HistoricSqlUsage,
|
||||
HistoricSqlUnifiedPullConfig,
|
||||
KtxPostgresQueryClient,
|
||||
PostgresPgssAggregateRow,
|
||||
PostgresPgssProbeResult,
|
||||
PostgresPgssReader,
|
||||
PostgresPgssRow,
|
||||
PostgresPgssSnapshot,
|
||||
StagedManifest,
|
||||
StagedPatternsInput,
|
||||
StagedTableInput,
|
||||
} from './adapters/historic-sql/types.js';
|
||||
export {
|
||||
HISTORIC_SQL_OBJECT_TYPE,
|
||||
HISTORIC_SQL_SOURCE_KEY,
|
||||
historicSqlManifestSchema,
|
||||
historicSqlMetadataSchema,
|
||||
historicSqlPullConfigSchema,
|
||||
historicSqlRawQueryRowSchema,
|
||||
historicSqlUsageSchema,
|
||||
aggregatedTemplateSchema,
|
||||
historicSqlUnifiedPullConfigSchema,
|
||||
stagedManifestSchema,
|
||||
stagedPatternsInputSchema,
|
||||
stagedTableInputSchema,
|
||||
} from './adapters/historic-sql/types.js';
|
||||
export type { CanonicalPin } from './canonical-pins.js';
|
||||
export { buildCanonicalPinsPromptBlock, selectRelevantCanonicalPins } from './canonical-pins.js';
|
||||
|
|
|
|||
|
|
@ -405,44 +405,44 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
|
|||
);
|
||||
});
|
||||
|
||||
it('reuses document evidence indexing and page triage for historic-SQL WorkUnits', async () => {
|
||||
it('reuses document evidence indexing and page triage for document WorkUnits', async () => {
|
||||
const deps = makeDeps();
|
||||
deps.adapter.source = 'historic-sql';
|
||||
deps.adapter.skillNames = ['historic_sql_ingest'];
|
||||
deps.adapter.reconcileSkillNames = ['historic_sql_curator'];
|
||||
deps.adapter.source = 'notion';
|
||||
deps.adapter.skillNames = ['notion_synthesize'];
|
||||
deps.adapter.reconcileSkillNames = [];
|
||||
deps.adapter.evidenceIndexing = 'documents';
|
||||
deps.adapter.triageSupported = true;
|
||||
deps.adapter.chunk.mockResolvedValue({
|
||||
workUnits: [
|
||||
{ unitKey: 'full', rawFiles: ['templates/full/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
|
||||
{ unitKey: 'skip', rawFiles: ['templates/skip/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
|
||||
{ unitKey: 'full', rawFiles: ['pages/full/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
|
||||
{ unitKey: 'skip', rawFiles: ['pages/skip/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
|
||||
],
|
||||
});
|
||||
deps.diffSetService.compute.mockResolvedValue({
|
||||
added: ['templates/full/metadata.json', 'templates/skip/metadata.json'],
|
||||
added: ['pages/full/metadata.json', 'pages/skip/metadata.json'],
|
||||
modified: [],
|
||||
deleted: [],
|
||||
unchanged: [],
|
||||
});
|
||||
deps.pageTriage.triageRun.mockResolvedValue({
|
||||
enabled: true,
|
||||
fullRawPaths: new Set(['templates/full/metadata.json']),
|
||||
fullRawPaths: new Set(['pages/full/metadata.json']),
|
||||
warnings: [],
|
||||
});
|
||||
const runner = buildRunner(deps);
|
||||
(runner as any).stageRawFilesStage1 = vi.fn().mockResolvedValue({
|
||||
currentHashes: new Map([
|
||||
['templates/full/metadata.json', 'h-full'],
|
||||
['templates/skip/metadata.json', 'h-skip'],
|
||||
['pages/full/metadata.json', 'h-full'],
|
||||
['pages/skip/metadata.json', 'h-skip'],
|
||||
]),
|
||||
rawDirInWorktree: 'raw-sources/c1/historic-sql/s',
|
||||
rawDirInWorktree: 'raw-sources/c1/notion/s',
|
||||
});
|
||||
(runner as any).resolveStagedDir = vi.fn().mockResolvedValue('/tmp/stage/upload-x');
|
||||
|
||||
const result = await runner.run({
|
||||
jobId: 'j1',
|
||||
connectionId: 'c1',
|
||||
sourceKey: 'historic-sql',
|
||||
sourceKey: 'notion',
|
||||
trigger: 'upload',
|
||||
bundleRef: { kind: 'upload', uploadId: 'upload-x' },
|
||||
});
|
||||
|
|
@ -1428,6 +1428,67 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
|
|||
expect(deps.sessionWorktreeService.cleanup).toHaveBeenCalledWith(expect.any(Object), 'success');
|
||||
});
|
||||
|
||||
it('includes historic-sql post-processor output in memory-flow saved counts', async () => {
|
||||
const deps = makeDeps();
|
||||
deps.adapter.source = 'historic-sql';
|
||||
deps.registry.get.mockReturnValue(deps.adapter);
|
||||
deps.adapter.chunk.mockResolvedValue({
|
||||
workUnits: [
|
||||
{
|
||||
unitKey: 'historic-sql-table-public-orders',
|
||||
rawFiles: ['tables/public/orders.json'],
|
||||
peerFileIndex: [],
|
||||
dependencyPaths: [],
|
||||
},
|
||||
],
|
||||
});
|
||||
const postProcessor = {
|
||||
run: vi.fn().mockResolvedValue({
|
||||
result: {
|
||||
tableUsageMerged: 2,
|
||||
staleTablesMarked: 1,
|
||||
patternPagesWritten: 3,
|
||||
stalePatternPagesMarked: 1,
|
||||
archivedPatternPages: 1,
|
||||
legacyPagesDeleted: 1,
|
||||
},
|
||||
warnings: [],
|
||||
errors: [],
|
||||
touchedSources: [{ connectionId: 'c1', sourceName: 'orders' }],
|
||||
}),
|
||||
};
|
||||
const runner = buildRunner(deps, { postProcessors: { 'historic-sql': postProcessor } });
|
||||
(runner as any).stageRawFilesStage1 = vi.fn().mockResolvedValue({
|
||||
currentHashes: new Map([['tables/public/orders.json', 'h1']]),
|
||||
rawDirInWorktree: 'raw-sources/c1/historic-sql/s',
|
||||
});
|
||||
(runner as any).resolveStagedDir = vi.fn().mockResolvedValue('/tmp/stage/upload-x');
|
||||
const memoryFlow = createMemoryFlowLiveBuffer(bundleReplayInput());
|
||||
|
||||
await runner.run(
|
||||
{
|
||||
jobId: 'j1',
|
||||
connectionId: 'c1',
|
||||
sourceKey: 'historic-sql',
|
||||
trigger: 'upload',
|
||||
bundleRef: { kind: 'upload', uploadId: 'upload-x' },
|
||||
},
|
||||
{
|
||||
jobId: 'j1',
|
||||
memoryFlow,
|
||||
startPhase: () => new TestJobContext('j1', null, () => Promise.resolve(), () => Promise.resolve()),
|
||||
},
|
||||
);
|
||||
|
||||
expect(memoryFlow.snapshot().events).toContainEqual(
|
||||
expect.objectContaining({
|
||||
type: 'saved',
|
||||
wikiCount: 6,
|
||||
slCount: 3,
|
||||
}),
|
||||
);
|
||||
});
|
||||
|
||||
it('marks post-processor infrastructure failure as failed and preserves worktree cleanup state', async () => {
|
||||
const deps = makeDeps();
|
||||
deps.adapter.source = 'metricflow';
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ import type { ContextEvidenceIndexSummary, IngestBundleRunnerDeps, PageTriageRun
|
|||
import { buildSyncId, rawSourcesDirForSync } from './raw-sources-paths.js';
|
||||
import {
|
||||
buildStageIndexFromReportBody,
|
||||
postProcessorSavedMemoryCounts,
|
||||
type IngestReportPostProcessorOutcome,
|
||||
type IngestReportSnapshot,
|
||||
} from './reports.js';
|
||||
|
|
@ -1087,11 +1088,12 @@ export class IngestBundleRunner {
|
|||
}
|
||||
const commitSha = mergeResult.touchedPaths.length === 0 ? null : mergeResult.squashSha;
|
||||
const memoryFlowSavedActions = stageIndex.workUnits.flatMap((wu) => wu.actions).concat(reconcileActions);
|
||||
const postProcessorMemoryCounts = postProcessorSavedMemoryCounts(postProcessorOutcome);
|
||||
memoryFlow?.emit({
|
||||
type: 'saved',
|
||||
commitSha,
|
||||
wikiCount: countMemoryFlowActions(memoryFlowSavedActions, 'wiki'),
|
||||
slCount: countMemoryFlowActions(memoryFlowSavedActions, 'sl'),
|
||||
wikiCount: countMemoryFlowActions(memoryFlowSavedActions, 'wiki') + postProcessorMemoryCounts.wikiCount,
|
||||
slCount: countMemoryFlowActions(memoryFlowSavedActions, 'sl') + postProcessorMemoryCounts.slCount,
|
||||
});
|
||||
await stage6?.updateProgress(1.0, commitSha ? `Saved changes (${commitSha.slice(0, 8)})` : 'No changes to save');
|
||||
|
||||
|
|
|
|||
|
|
@ -29,48 +29,10 @@ describe('ingest prompt assets', () => {
|
|||
expect(prompt).not.toMatch(forbiddenProductPattern());
|
||||
});
|
||||
|
||||
it('pins historic-SQL triage rules with synthetic signal fixtures', async () => {
|
||||
it('does not route historic-SQL through page-triage prompt examples', async () => {
|
||||
const prompt = await readFile(new URL('../../prompts/skills/page_triage_classifier.md', import.meta.url), 'utf-8');
|
||||
|
||||
expect(prompt).toContain('signals.objectType === "historic_sql_template"');
|
||||
expect(prompt).toContain('executions_bucket=low AND distinct_users_bucket=solo');
|
||||
expect(prompt).toContain('service_account_only=true AND below the frequency floor');
|
||||
expect(prompt).toContain('shared human usage with mid or high execution volume');
|
||||
|
||||
const fixtures = [
|
||||
{
|
||||
label: 'skip low solo template',
|
||||
objectType: '"objectType": "historic_sql_template"',
|
||||
executions: '"executions_bucket": "low"',
|
||||
users: '"distinct_users_bucket": "solo"',
|
||||
serviceAccount: '"service_account_only": "false"',
|
||||
lane: '-> `skip`',
|
||||
},
|
||||
{
|
||||
label: 'light service-account-only template',
|
||||
objectType: '"objectType": "historic_sql_template"',
|
||||
executions: '"executions_bucket": "high"',
|
||||
users: '"distinct_users_bucket": "solo"',
|
||||
serviceAccount: '"service_account_only": "true"',
|
||||
lane: '-> `light`',
|
||||
},
|
||||
{
|
||||
label: 'full shared human template',
|
||||
objectType: '"objectType": "historic_sql_template"',
|
||||
executions: '"executions_bucket": "high"',
|
||||
users: '"distinct_users_bucket": "team"',
|
||||
serviceAccount: '"service_account_only": "false"',
|
||||
lane: '-> `full`',
|
||||
},
|
||||
];
|
||||
|
||||
for (const fixture of fixtures) {
|
||||
expect(prompt).toContain(fixture.label);
|
||||
expect(prompt).toContain(fixture.objectType);
|
||||
expect(prompt).toContain(fixture.executions);
|
||||
expect(prompt).toContain(fixture.users);
|
||||
expect(prompt).toContain(fixture.serviceAccount);
|
||||
expect(prompt).toContain(fixture.lane);
|
||||
}
|
||||
expect(prompt).not.toContain(['historic_sql', 'template'].join('_'));
|
||||
expect(prompt).not.toContain('service_account_only=true AND below the frequency floor');
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -14,14 +14,14 @@ const adapterSkillNames = [
|
|||
'metabase_ingest',
|
||||
'metricflow_ingest',
|
||||
'notion_synthesize',
|
||||
'historic_sql_ingest',
|
||||
'historic_sql_table_digest',
|
||||
'historic_sql_patterns',
|
||||
'ingest_triage',
|
||||
'knowledge_capture',
|
||||
'sl_capture',
|
||||
] as const;
|
||||
|
||||
const adapterReconcileSkillNames = [
|
||||
'historic_sql_curator',
|
||||
'ingest_triage',
|
||||
'knowledge_capture',
|
||||
'sl_capture',
|
||||
|
|
@ -58,75 +58,37 @@ describe('ingest runtime assets', () => {
|
|||
}
|
||||
|
||||
await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain('# Page Triage Classifier');
|
||||
await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain(
|
||||
'signals.objectType === "historic_sql_template"',
|
||||
);
|
||||
await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain(
|
||||
'service_account_only=true AND below the frequency floor',
|
||||
);
|
||||
await expect(prompts.loadPrompt('skills/light_extraction')).resolves.toContain('# Light Context Extraction');
|
||||
});
|
||||
|
||||
it('packages historic-SQL WorkUnit skill guidance from KTX assets', async () => {
|
||||
it('packages historic-SQL table digest guidance from KTX assets', async () => {
|
||||
const registry = new SkillsRegistryService({ skillsDir });
|
||||
const skills = await registry.listSkills(['historic_sql_ingest'], 'memory_agent');
|
||||
const skills = await registry.listSkills(['historic_sql_table_digest'], 'memory_agent');
|
||||
|
||||
expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_ingest']);
|
||||
expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_table_digest']);
|
||||
|
||||
const [skill] = skills;
|
||||
if (!skill) {
|
||||
throw new Error('historic_sql_ingest skill missing');
|
||||
}
|
||||
|
||||
expect(skill.path.startsWith(skillsDir)).toBe(true);
|
||||
|
||||
const body = await readFile(join(skill.path, 'SKILL.md'), 'utf-8');
|
||||
expect(body).toContain('# Historic SQL Ingest');
|
||||
expect(body).toContain('Read exactly one historic-SQL template WorkUnit');
|
||||
expect(body).toContain('metadata.json');
|
||||
expect(body).toContain('page.md');
|
||||
expect(body).toContain('usage.json');
|
||||
expect(body).toContain('manifest.json');
|
||||
expect(body).toContain('wiki_write');
|
||||
expect(body).toContain('key: "queries/<intent_slug>"');
|
||||
expect(body).toContain('"source": "historic-sql"');
|
||||
expect(body).toContain('representative_sql');
|
||||
expect(body).toContain('fingerprints');
|
||||
expect(body).toContain('usage');
|
||||
expect(body).toContain('SL proposal threshold');
|
||||
expect(body).toContain('Do not group sibling templates');
|
||||
expect(body).toContain('Do not copy sample bound_sql');
|
||||
expect(body).not.toContain('store historic-SQL provenance in the markdown body');
|
||||
const body = await readFile(join(skills[0]!.path, 'SKILL.md'), 'utf-8');
|
||||
expect(body).toContain('# Historic SQL Table Digest');
|
||||
expect(body).toContain('tables/<schema>.<name>.json');
|
||||
expect(body).toContain('tableUsageOutputSchema');
|
||||
expect(body).toContain('emit_historic_sql_evidence');
|
||||
expect(body).toContain('Do not call wiki_write');
|
||||
expect(body).toContain('Do not call sl_write_source');
|
||||
expect(body).not.toMatch(forbiddenProductPattern());
|
||||
});
|
||||
|
||||
it('packages historic-SQL curator reconcile guidance from KTX assets', async () => {
|
||||
it('packages historic-SQL patterns guidance from KTX assets', async () => {
|
||||
const registry = new SkillsRegistryService({ skillsDir });
|
||||
const skills = await registry.listSkills(['historic_sql_curator'], 'memory_agent');
|
||||
const skills = await registry.listSkills(['historic_sql_patterns'], 'memory_agent');
|
||||
|
||||
expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_curator']);
|
||||
expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_patterns']);
|
||||
|
||||
const [skill] = skills;
|
||||
if (!skill) {
|
||||
throw new Error('historic_sql_curator skill missing');
|
||||
}
|
||||
|
||||
expect(skill.path.startsWith(skillsDir)).toBe(true);
|
||||
|
||||
const body = await readFile(join(skill.path, 'SKILL.md'), 'utf-8');
|
||||
expect(body).toContain('# Historic SQL Curator');
|
||||
expect(body).toContain('curator pagination');
|
||||
expect(body).toContain('stage_list');
|
||||
expect(body).toContain('stage_diff');
|
||||
expect(body).toContain('read_raw_span');
|
||||
expect(body).toContain('wiki_search');
|
||||
expect(body).toContain('wiki_read');
|
||||
expect(body).toContain('wiki_write');
|
||||
expect(body).toContain('emit_artifact_resolution');
|
||||
expect(body).toContain('emit_eviction_decision');
|
||||
expect(body).toContain('categorical sub-cluster');
|
||||
expect(body).toContain('historic-sql-demoted');
|
||||
expect(body).toContain('Do not call `context_candidate_write`');
|
||||
const body = await readFile(join(skills[0]!.path, 'SKILL.md'), 'utf-8');
|
||||
expect(body).toContain('# Historic SQL Patterns');
|
||||
expect(body).toContain('patterns-input/part-0001.json');
|
||||
expect(body).toContain('patternsArraySchema');
|
||||
expect(body).toContain('emit_historic_sql_evidence');
|
||||
expect(body).toContain('cross-table');
|
||||
expect(body).not.toMatch(forbiddenProductPattern());
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import { join } from 'node:path';
|
|||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import { initKtxProject, type KtxLocalProject, loadKtxProject } from '../project/index.js';
|
||||
import type { SqlAnalysisPort } from '../sql-analysis/index.js';
|
||||
import type { HistoricSqlReader } from './adapters/historic-sql/types.js';
|
||||
import { LocalLookerRuntimeStore } from './adapters/looker/local-runtime-store.js';
|
||||
import { createDefaultLocalIngestAdapters, localPullConfigForAdapter } from './local-adapters.js';
|
||||
|
||||
|
|
@ -92,6 +93,9 @@ describe('local ingest adapters', () => {
|
|||
literalSlots: [],
|
||||
};
|
||||
},
|
||||
async analyzeBatch() {
|
||||
return new Map();
|
||||
},
|
||||
};
|
||||
const adapters = createDefaultLocalIngestAdapters(project, {
|
||||
historicSql: {
|
||||
|
|
@ -107,6 +111,44 @@ describe('local ingest adapters', () => {
|
|||
|
||||
expect(adapters.map((adapter) => adapter.source)).toContain('historic-sql');
|
||||
expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.fetch).toBeTypeOf('function');
|
||||
expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([
|
||||
'historic_sql_table_digest',
|
||||
'historic_sql_patterns',
|
||||
]);
|
||||
});
|
||||
|
||||
it('registers historic-sql with an injected non-Postgres reader and query client', () => {
|
||||
const reader: HistoricSqlReader = {
|
||||
async probe() {
|
||||
return { warnings: [], info: [] };
|
||||
},
|
||||
async *fetchAggregated() {},
|
||||
};
|
||||
const queryClient = { executeQuery: async () => ({ headers: [], rows: [], totalRows: 0 }) };
|
||||
|
||||
const adapters = createDefaultLocalIngestAdapters(project, {
|
||||
historicSql: {
|
||||
sqlAnalysis: {
|
||||
async analyzeForFingerprint(sql) {
|
||||
return {
|
||||
fingerprint: 'fp',
|
||||
normalizedSql: sql,
|
||||
tablesTouched: [],
|
||||
literalSlots: [],
|
||||
};
|
||||
},
|
||||
async analyzeBatch() {
|
||||
return new Map();
|
||||
},
|
||||
},
|
||||
reader,
|
||||
queryClient,
|
||||
},
|
||||
});
|
||||
|
||||
const adapter = adapters.find((candidate) => candidate.source === 'historic-sql');
|
||||
expect(adapter).toBeDefined();
|
||||
expect(adapter?.fetch).toBeTypeOf('function');
|
||||
});
|
||||
|
||||
it('builds Postgres historic-sql pull config from a local connection', async () => {
|
||||
|
|
@ -121,6 +163,9 @@ describe('local ingest adapters', () => {
|
|||
literalSlots: [],
|
||||
};
|
||||
},
|
||||
async analyzeBatch() {
|
||||
return new Map();
|
||||
},
|
||||
},
|
||||
postgresQueryClient: {
|
||||
async executeQuery() {
|
||||
|
|
@ -146,11 +191,14 @@ describe('local ingest adapters', () => {
|
|||
await expect(localPullConfigForAdapter(postgresProject, historicSql!, 'warehouse')).resolves.toEqual({
|
||||
dialect: 'postgres',
|
||||
windowDays: 90,
|
||||
lastSuccessfulCursor: null,
|
||||
serviceAccountUserPatterns: ['^svc_'],
|
||||
minExecutions: 7,
|
||||
concurrency: 12,
|
||||
filters: {
|
||||
serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' },
|
||||
dropTrivialProbes: true,
|
||||
},
|
||||
redactionPatterns: [],
|
||||
maxTemplatesPerRun: 123,
|
||||
minCalls: 7,
|
||||
staleArchiveAfterDays: 90,
|
||||
});
|
||||
});
|
||||
|
||||
|
|
@ -166,6 +214,9 @@ describe('local ingest adapters', () => {
|
|||
literalSlots: [],
|
||||
};
|
||||
},
|
||||
async analyzeBatch() {
|
||||
return new Map();
|
||||
},
|
||||
},
|
||||
postgresQueryClient: {
|
||||
async executeQuery() {
|
||||
|
|
|
|||
|
|
@ -6,11 +6,11 @@ import type { SqlAnalysisPort } from '../sql-analysis/index.js';
|
|||
import { DbtSourceAdapter } from './adapters/dbt/dbt.adapter.js';
|
||||
import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js';
|
||||
import { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.adapter.js';
|
||||
import { PostgresPgssQueryHistoryReader } from './adapters/historic-sql/postgres-pgss-query-history-reader.js';
|
||||
import { SnowflakeHistoricSqlQueryHistoryReader } from './adapters/historic-sql/snowflake-query-history-reader.js';
|
||||
import { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js';
|
||||
import {
|
||||
HISTORIC_SQL_SOURCE_KEY,
|
||||
historicSqlPullConfigSchema,
|
||||
historicSqlUnifiedPullConfigSchema,
|
||||
type HistoricSqlReader,
|
||||
type KtxPostgresQueryClient,
|
||||
} from './adapters/historic-sql/types.js';
|
||||
import {
|
||||
|
|
@ -43,7 +43,9 @@ export interface DefaultLocalIngestAdaptersOptions {
|
|||
databaseIntrospection?: Omit<DaemonLiveDatabaseIntrospectionOptions, 'connections' | 'baseUrl'>;
|
||||
historicSql?: {
|
||||
sqlAnalysis: SqlAnalysisPort;
|
||||
postgresQueryClient: KtxPostgresQueryClient;
|
||||
reader?: HistoricSqlReader;
|
||||
queryClient?: unknown;
|
||||
postgresQueryClient?: KtxPostgresQueryClient;
|
||||
postgresBaselineRootDir?: string;
|
||||
now?: () => Date;
|
||||
};
|
||||
|
|
@ -91,18 +93,16 @@ export function createDefaultLocalIngestAdapters(
|
|||
];
|
||||
|
||||
if (options.historicSql) {
|
||||
const queryClient = options.historicSql.queryClient ?? options.historicSql.postgresQueryClient;
|
||||
if (!queryClient) {
|
||||
throw new Error('Historic SQL local adapter requires queryClient or postgresQueryClient');
|
||||
}
|
||||
adapters.push(
|
||||
new HistoricSqlSourceAdapter({
|
||||
sqlAnalysis: options.historicSql.sqlAnalysis,
|
||||
reader: new SnowflakeHistoricSqlQueryHistoryReader(),
|
||||
queryClient: {
|
||||
executeQuery: async () => {
|
||||
throw new Error('Local historic-SQL currently supports Postgres pg_stat_statements only');
|
||||
},
|
||||
},
|
||||
postgresReader: new PostgresPgssQueryHistoryReader(),
|
||||
postgresQueryClient: options.historicSql.postgresQueryClient,
|
||||
postgresBaselineRootDir: options.historicSql.postgresBaselineRootDir,
|
||||
reader: options.historicSql.reader ?? new PostgresPgssReader(),
|
||||
queryClient,
|
||||
legacyPostgresBaselineRootDir: options.historicSql.postgresBaselineRootDir,
|
||||
now: options.historicSql.now,
|
||||
}),
|
||||
);
|
||||
|
|
@ -180,9 +180,8 @@ export async function localPullConfigForAdapter(
|
|||
if (historicSql?.enabled !== true) {
|
||||
throw new Error(`Connection "${connectionId}" does not have historicSql.enabled: true`);
|
||||
}
|
||||
return historicSqlPullConfigSchema.parse({
|
||||
return historicSqlUnifiedPullConfigSchema.parse({
|
||||
...historicSql,
|
||||
lastSuccessfulCursor: stringField(historicSql.lastSuccessfulCursor),
|
||||
});
|
||||
}
|
||||
if (adapter.source === 'looker') {
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
|
|||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import Database from 'better-sqlite3';
|
||||
import YAML from 'yaml';
|
||||
import { AgentRunnerService } from '../agent/index.js';
|
||||
import { initKtxProject, type KtxLocalProject, loadKtxProject } from '../project/index.js';
|
||||
import { makeLocalGitRepo } from '../test/make-local-git-repo.js';
|
||||
|
|
@ -10,6 +11,7 @@ import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js';
|
|||
import { LocalLookerRuntimeStore } from './adapters/looker/local-runtime-store.js';
|
||||
import { createDefaultLocalIngestAdapters, localPullConfigForAdapter } from './local-adapters.js';
|
||||
import { getLocalIngestStatus, runLocalIngest } from './local-ingest.js';
|
||||
import type { ChunkResult, DiffSet, SourceAdapter } from './types.js';
|
||||
|
||||
class TestAgentRunner extends AgentRunnerService {
|
||||
override runLoop = vi.fn().mockResolvedValue({ stopReason: 'natural' as const });
|
||||
|
|
@ -86,6 +88,70 @@ class WikiWritingAgentRunner extends AgentRunnerService {
|
|||
}
|
||||
}
|
||||
|
||||
class HistoricSqlEvidenceAgentRunner extends AgentRunnerService {
|
||||
override runLoop = vi.fn(async (params: any) => {
|
||||
if (
|
||||
params.telemetryTags?.operationName === 'ingest-bundle-wu' &&
|
||||
params.telemetryTags?.unitKey === 'historic-sql-table-public-orders'
|
||||
) {
|
||||
const emitEvidence = params.toolSet.emit_historic_sql_evidence;
|
||||
if (!emitEvidence?.execute) {
|
||||
throw new Error('emit_historic_sql_evidence tool was not available to the historic-SQL WorkUnit');
|
||||
}
|
||||
const result = await emitEvidence.execute(
|
||||
{
|
||||
kind: 'table_usage',
|
||||
table: 'public.orders',
|
||||
rawPath: 'tables/public.orders.json',
|
||||
usage: {
|
||||
narrative: 'Orders are repeatedly queried by lifecycle status.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonJoins: [],
|
||||
staleSince: null,
|
||||
},
|
||||
},
|
||||
{ toolCallId: 'historic-sql-evidence' },
|
||||
);
|
||||
if (!String(result).includes('Recorded historic-SQL table_usage evidence')) {
|
||||
throw new Error(`Unexpected historic-SQL evidence result: ${String(result)}`);
|
||||
}
|
||||
}
|
||||
return { stopReason: 'natural' as const };
|
||||
});
|
||||
|
||||
constructor() {
|
||||
super({ llmProvider: { getModel: () => ({}) as never } as never });
|
||||
}
|
||||
}
|
||||
|
||||
class HistoricSqlEvidenceTestAdapter implements SourceAdapter {
|
||||
readonly source = 'historic-sql';
|
||||
readonly skillNames = ['historic_sql_table_digest'];
|
||||
readonly reconcileSkillNames: string[] = [];
|
||||
readonly triageSupported = false;
|
||||
|
||||
detect(): Promise<boolean> {
|
||||
return Promise.resolve(true);
|
||||
}
|
||||
|
||||
chunk(_stagedDir: string, _diffSet?: DiffSet): Promise<ChunkResult> {
|
||||
return Promise.resolve({
|
||||
workUnits: [
|
||||
{
|
||||
unitKey: 'historic-sql-table-public-orders',
|
||||
displayLabel: 'public.orders',
|
||||
rawFiles: ['tables/public.orders.json'],
|
||||
peerFileIndex: [],
|
||||
dependencyPaths: ['manifest.json'],
|
||||
notes:
|
||||
'Use historic_sql_table_digest. Read this table usage JSON and emit exactly one table_usage object with emit_historic_sql_evidence.',
|
||||
},
|
||||
],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function makeLookerRuntimeClient() {
|
||||
const lookerModels = {
|
||||
models: [{ name: 'ecommerce', label: 'Ecommerce', explores: [{ name: 'orders', label: 'Orders' }] }],
|
||||
|
|
@ -308,6 +374,90 @@ describe('canonical local ingest', () => {
|
|||
}
|
||||
});
|
||||
|
||||
it('runs historic-SQL evidence projection through the local bundle post-processor', async () => {
|
||||
const projectDir = join(tempDir, 'historic-sql-project');
|
||||
await initKtxProject({ projectDir, projectName: 'warehouse' });
|
||||
await writeFile(
|
||||
join(projectDir, 'ktx.yaml'),
|
||||
[
|
||||
'project: warehouse',
|
||||
'connections:',
|
||||
' warehouse:',
|
||||
' driver: postgres',
|
||||
'ingest:',
|
||||
' adapters:',
|
||||
' - historic-sql',
|
||||
' embeddings:',
|
||||
' backend: deterministic',
|
||||
'storage:',
|
||||
' state: sqlite',
|
||||
' search: sqlite-fts5',
|
||||
' git:',
|
||||
' auto_commit: false',
|
||||
' author: KTX Test <system@ktx.local>',
|
||||
'',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
);
|
||||
const historicProject = await loadKtxProject({ projectDir });
|
||||
await historicProject.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
YAML.stringify({ tables: { orders: { table: 'public.orders', columns: [{ name: 'id', type: 'string' }] } } }),
|
||||
'KTX Test',
|
||||
'system@ktx.local',
|
||||
'Seed schema shard',
|
||||
);
|
||||
|
||||
const sourceDir = join(tempDir, 'historic-sql-source');
|
||||
await mkdir(join(sourceDir, 'tables'), { recursive: true });
|
||||
await writeFile(
|
||||
join(sourceDir, 'manifest.json'),
|
||||
`${JSON.stringify(
|
||||
{
|
||||
source: 'historic-sql',
|
||||
connectionId: 'warehouse',
|
||||
dialect: 'postgres',
|
||||
fetchedAt: '2026-05-11T00:00:00.000Z',
|
||||
windowStart: '2026-02-10T00:00:00.000Z',
|
||||
windowEnd: '2026-05-11T00:00:00.000Z',
|
||||
snapshotRowCount: 1,
|
||||
touchedTableCount: 1,
|
||||
parseFailures: 0,
|
||||
warnings: [],
|
||||
probeWarnings: [],
|
||||
staleArchiveAfterDays: 90,
|
||||
},
|
||||
null,
|
||||
2,
|
||||
)}\n`,
|
||||
'utf-8',
|
||||
);
|
||||
await writeFile(join(sourceDir, 'tables/public.orders.json'), '{"table":"public.orders"}\n', 'utf-8');
|
||||
await writeFile(join(sourceDir, 'patterns-input.json'), '{"templates":[]}\n', 'utf-8');
|
||||
const agentRunner = new HistoricSqlEvidenceAgentRunner();
|
||||
|
||||
const result = await runLocalIngest({
|
||||
project: historicProject,
|
||||
adapters: [new HistoricSqlEvidenceTestAdapter()],
|
||||
adapter: 'historic-sql',
|
||||
connectionId: 'warehouse',
|
||||
sourceDir,
|
||||
jobId: 'historic-sql-local-projection',
|
||||
agentRunner,
|
||||
});
|
||||
|
||||
expect(result.result.failedWorkUnits).toEqual([]);
|
||||
expect(result.report.body.postProcessor).toMatchObject({
|
||||
sourceKey: 'historic-sql',
|
||||
status: 'success',
|
||||
result: { tableUsageMerged: 1 },
|
||||
touchedSources: [{ connectionId: 'warehouse', sourceName: 'orders' }],
|
||||
});
|
||||
await expect(readFile(join(projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves.toContain(
|
||||
'Orders are repeatedly queried by lifecycle status.',
|
||||
);
|
||||
});
|
||||
|
||||
it('rejects direct Metabase scheduled pulls before requiring a local ingest LLM provider', async () => {
|
||||
const projectDir = join(tempDir, 'metabase-project');
|
||||
await initKtxProject({ projectDir, projectName: 'warehouse' });
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import { mkdirSync } from 'node:fs';
|
|||
import { join } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import type { KtxLlmProvider } from '@ktx/llm';
|
||||
import type { Tool } from 'ai';
|
||||
import YAML from 'yaml';
|
||||
import type { AgentRunnerService } from '../agent/index.js';
|
||||
import { AgentRunnerService as DefaultAgentRunnerService } from '../agent/index.js';
|
||||
|
|
@ -70,6 +71,8 @@ import {
|
|||
ContextCandidateCarryforwardService,
|
||||
CuratorPaginationService,
|
||||
} from './context-candidates/index.js';
|
||||
import { createEmitHistoricSqlEvidenceTool } from './adapters/historic-sql/evidence-tool.js';
|
||||
import { HistoricSqlProjectionPostProcessor } from './adapters/historic-sql/post-processor.js';
|
||||
import { ContextEvidenceIndexService, SqliteContextEvidenceStore } from './context-evidence/index.js';
|
||||
import { DiffSetService } from './diff-set.service.js';
|
||||
import { IngestBundleRunner } from './ingest-bundle.runner.js';
|
||||
|
|
@ -439,10 +442,16 @@ class NoopKnowledgeEventPort implements KnowledgeEventPort {
|
|||
}
|
||||
|
||||
class LocalIngestToolSet implements IngestToolsetLike {
|
||||
constructor(private readonly tools: BaseTool[]) {}
|
||||
constructor(
|
||||
private readonly tools: BaseTool[],
|
||||
private readonly sourceTools: Record<string, Tool> = {},
|
||||
) {}
|
||||
|
||||
toAiSdkTools(context: ToolContext) {
|
||||
return Object.fromEntries(this.tools.map((tool) => [tool.name, tool.toAiSdkTool(context)]));
|
||||
return {
|
||||
...Object.fromEntries(this.tools.map((tool) => [tool.name, tool.toAiSdkTool(context)])),
|
||||
...this.sourceTools,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -510,9 +519,19 @@ class LocalIngestToolsetFactory implements IngestToolsetFactoryPort {
|
|||
];
|
||||
}
|
||||
|
||||
createIngestWuToolset(_session: ToolSession, options?: { includeContextEvidenceTools?: boolean }): IngestToolsetLike {
|
||||
createIngestWuToolset(session: ToolSession, options?: { includeContextEvidenceTools?: boolean }): IngestToolsetLike {
|
||||
const sourceTools: Record<string, Tool> =
|
||||
session.ingest?.sourceKey === 'historic-sql'
|
||||
? {
|
||||
emit_historic_sql_evidence: createEmitHistoricSqlEvidenceTool({
|
||||
connectionId: session.connectionId,
|
||||
session,
|
||||
}),
|
||||
}
|
||||
: {};
|
||||
return new LocalIngestToolSet(
|
||||
options?.includeContextEvidenceTools ? [...this.baseTools, ...this.contextTools] : this.baseTools,
|
||||
sourceTools,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -668,6 +687,9 @@ export function createLocalBundleIngestRuntime(
|
|||
settings: { batchSize: 8, maxPasses: 8, stepBudgetPerPass: 60 },
|
||||
logger,
|
||||
}),
|
||||
postProcessors: {
|
||||
'historic-sql': new HistoricSqlProjectionPostProcessor(),
|
||||
},
|
||||
logger,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
|
||||
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
|
@ -120,14 +120,6 @@ describe('PageTriageService', () => {
|
|||
await rm(stagedDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
function parseSignalsFromClassifierPrompt(prompt: string): unknown {
|
||||
const match = /<signals>\n([\s\S]*?)\n<\/signals>/.exec(prompt);
|
||||
if (!match) {
|
||||
throw new Error('classifier prompt did not include a <signals> block');
|
||||
}
|
||||
return JSON.parse(match[1]);
|
||||
}
|
||||
|
||||
it('writes light-lane candidates and keeps the page out of full WorkUnits', async () => {
|
||||
generateTextMock
|
||||
.mockResolvedValueOnce({ text: JSON.stringify({ lane: 'light', reason: 'short durable policy' }) } as any)
|
||||
|
|
@ -282,163 +274,6 @@ describe('PageTriageService', () => {
|
|||
expect(repository.setDocumentTriageLane).toHaveBeenCalledWith('run-1', 'pages/page-1/page.md', 'light');
|
||||
});
|
||||
|
||||
it.each([
|
||||
{
|
||||
name: 'skip low solo template',
|
||||
propertyHints: {
|
||||
executions_bucket: 'low',
|
||||
distinct_users_bucket: 'solo',
|
||||
error_rate_bucket: 'ok',
|
||||
recency_bucket: 'active',
|
||||
service_account_only: 'false',
|
||||
slot_summary: '1 constant, 1 runtime',
|
||||
},
|
||||
expectedLane: 'skip',
|
||||
expectedReport: { skip: 1, light: 0, full: 0 },
|
||||
},
|
||||
{
|
||||
name: 'light service-account-only template',
|
||||
propertyHints: {
|
||||
executions_bucket: 'high',
|
||||
distinct_users_bucket: 'solo',
|
||||
error_rate_bucket: 'ok',
|
||||
recency_bucket: 'active',
|
||||
service_account_only: 'true',
|
||||
slot_summary: '1 constant, 0 runtime',
|
||||
},
|
||||
expectedLane: 'light',
|
||||
expectedReport: { skip: 0, light: 1, full: 0 },
|
||||
},
|
||||
{
|
||||
name: 'full shared human template',
|
||||
propertyHints: {
|
||||
executions_bucket: 'high',
|
||||
distinct_users_bucket: 'team',
|
||||
error_rate_bucket: 'ok',
|
||||
recency_bucket: 'active',
|
||||
service_account_only: 'false',
|
||||
slot_summary: '2 constant, 1 runtime',
|
||||
},
|
||||
expectedLane: 'full',
|
||||
expectedReport: { skip: 0, light: 0, full: 1 },
|
||||
},
|
||||
] as const)('triages historic-SQL synthetic signal fixture as $expectedLane for $name', async ({
|
||||
name,
|
||||
propertyHints,
|
||||
expectedLane,
|
||||
expectedReport,
|
||||
}) => {
|
||||
const externalId = name.replace(/[^a-z0-9]+/g, '_');
|
||||
const templateDir = join(stagedDir, 'templates', externalId);
|
||||
await mkdir(templateDir, { recursive: true });
|
||||
await writeFile(
|
||||
join(templateDir, 'metadata.json'),
|
||||
JSON.stringify({
|
||||
id: externalId,
|
||||
title: `snowflake - analytics.orders [${externalId.slice(0, 6)}]`,
|
||||
path: `templates/${externalId}/page.md`,
|
||||
objectType: 'historic_sql_template',
|
||||
lastEditedAt: null,
|
||||
properties: {
|
||||
fingerprint: externalId,
|
||||
sub_cluster_id: null,
|
||||
dialect: 'snowflake',
|
||||
tables_touched: ['analytics.orders'],
|
||||
literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
|
||||
triage_signals: propertyHints,
|
||||
},
|
||||
}),
|
||||
'utf-8',
|
||||
);
|
||||
await writeFile(
|
||||
join(templateDir, 'page.md'),
|
||||
[
|
||||
`# ${externalId}`,
|
||||
'',
|
||||
'## Normalized SQL',
|
||||
'```sql',
|
||||
'SELECT count(*) FROM analytics.orders WHERE status = ?',
|
||||
'```',
|
||||
'',
|
||||
'## Tables touched',
|
||||
'- analytics.orders',
|
||||
].join('\n'),
|
||||
'utf-8',
|
||||
);
|
||||
|
||||
adapter.getTriageSignals.mockResolvedValueOnce({
|
||||
objectType: 'historic_sql_template',
|
||||
lastEditedAt: '2026-05-04T12:00:00.000Z',
|
||||
propertyHints,
|
||||
});
|
||||
promptService.loadPrompt.mockImplementation((promptName: string) => {
|
||||
if (promptName === 'skills/page_triage_classifier') {
|
||||
return readFile(new URL('../../../prompts/skills/page_triage_classifier.md', import.meta.url), 'utf-8');
|
||||
}
|
||||
return Promise.resolve(`prompt:${promptName}`);
|
||||
});
|
||||
generateTextMock.mockImplementationOnce((args: any) => {
|
||||
const prompt = args.messages[0].content as string;
|
||||
expect(prompt).toContain('signals.objectType === "historic_sql_template"');
|
||||
expect(prompt).toContain('executions_bucket=low AND distinct_users_bucket=solo');
|
||||
expect(prompt).toContain('service_account_only=true AND below the frequency floor');
|
||||
expect(prompt).toContain('shared human usage with mid or high execution volume');
|
||||
expect(parseSignalsFromClassifierPrompt(prompt)).toEqual({
|
||||
objectType: 'historic_sql_template',
|
||||
lastEditedAt: '2026-05-04T12:00:00.000Z',
|
||||
propertyHints,
|
||||
});
|
||||
return { text: JSON.stringify({ lane: expectedLane, reason: `${name} fixture` }) } as any;
|
||||
});
|
||||
if (expectedLane === 'light') {
|
||||
generateTextMock.mockResolvedValueOnce({
|
||||
text: JSON.stringify({
|
||||
candidates: [
|
||||
{
|
||||
candidateKey: 'historic-sql-service-account-template',
|
||||
topic: 'Historic SQL Service Account Template',
|
||||
assertion: 'A service-account-only historic SQL template can remain as light evidence.',
|
||||
rationale: 'The synthetic historic-SQL fixture is service-account-only and below the frequency floor.',
|
||||
evidenceChunkIds: ['00000000-0000-0000-0000-000000000101'],
|
||||
suggestedPageKey: 'historic-sql-service-account-template',
|
||||
actionHint: 'create',
|
||||
durabilityScore: 2,
|
||||
authorityScore: 1,
|
||||
reuseScore: 2,
|
||||
noveltyScore: 1,
|
||||
riskScore: 0,
|
||||
},
|
||||
],
|
||||
}),
|
||||
} as any);
|
||||
}
|
||||
|
||||
const result = await service.triageRun({
|
||||
stagedDir,
|
||||
runId: 'run-1',
|
||||
connectionId: 'conn-1',
|
||||
sourceKey: 'historic-sql',
|
||||
syncId: 'sync-1',
|
||||
jobId: 'job-1',
|
||||
diffSet: {
|
||||
added: [`templates/${externalId}/metadata.json`, `templates/${externalId}/page.md`],
|
||||
modified: [],
|
||||
deleted: [],
|
||||
unchanged: [],
|
||||
},
|
||||
adapter: adapter as any,
|
||||
});
|
||||
|
||||
expect(result.report).toMatchObject({ pageCount: 1, ...expectedReport });
|
||||
expect(repository.setDocumentTriageLane).toHaveBeenCalledWith(
|
||||
'run-1',
|
||||
`templates/${externalId}/page.md`,
|
||||
expectedLane,
|
||||
);
|
||||
expect(result.fullRawPaths.has(`templates/${externalId}/metadata.json`)).toBe(expectedLane === 'full');
|
||||
expect(result.fullRawPaths.has(`templates/${externalId}/page.md`)).toBe(expectedLane === 'full');
|
||||
});
|
||||
|
||||
it('triages Notion data-source row pages without reading data-source metadata as page markdown', async () => {
|
||||
triageSettings.lightExtractionEnabled = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -79,6 +79,50 @@ export interface IngestReportSnapshot {
|
|||
createdAt: string;
|
||||
}
|
||||
|
||||
/** Saved-memory tallies for an ingest run, split by target. */
export interface IngestSavedMemoryCounts {
  /** Count attributed to the `wiki` target. */
  wikiCount: number;
  /** Count attributed to the `sl` target. */
  slCount: number;
}
|
||||
|
||||
/**
 * Reads `field` from `result` as a count.
 *
 * @param result - Loosely typed result record.
 * @param field - Key to look up in `result`.
 * @returns The stored value when it is a finite number greater than zero; otherwise `0`
 *   (missing keys, non-numbers, `NaN`, infinities, zero and negatives all map to `0`).
 */
function numericResultField(result: Record<string, unknown>, field: string): number {
  const candidate = result[field];
  if (typeof candidate !== 'number') {
    return 0;
  }
  if (!Number.isFinite(candidate) || candidate <= 0) {
    return 0;
  }
  return candidate;
}
|
||||
|
||||
/**
 * Derives saved-memory counts from a post-processor outcome.
 *
 * Only outcomes whose `sourceKey` is `'historic-sql'` and whose `result` is a
 * plain (non-array) object contribute; every other input yields zero counts.
 *
 * @param postProcessor - Post-processor outcome from an ingest report, if any.
 * @returns Wiki and semantic-layer counts summed from the outcome's numeric result fields.
 */
export function postProcessorSavedMemoryCounts(
  postProcessor: IngestReportPostProcessorOutcome | undefined,
): IngestSavedMemoryCounts {
  // A fresh object per call, so callers never share a mutable zero value.
  const none = (): IngestSavedMemoryCounts => ({ wikiCount: 0, slCount: 0 });

  if (!postProcessor || postProcessor.sourceKey !== 'historic-sql') {
    return none();
  }

  const { result } = postProcessor;
  if (typeof result !== 'object' || result === null || Array.isArray(result)) {
    return none();
  }

  const record = result as Record<string, unknown>;
  const sumOf = (fields: readonly string[]): number =>
    fields.reduce((running, field) => running + numericResultField(record, field), 0);

  return {
    wikiCount: sumOf([
      'patternPagesWritten',
      'stalePatternPagesMarked',
      'archivedPatternPages',
      'legacyPagesDeleted',
    ]),
    slCount: sumOf(['tableUsageMerged', 'staleTablesMarked']),
  };
}
|
||||
|
||||
/**
 * Computes the total saved-memory counts for an ingest report.
 *
 * Counts every work-unit action targeting `'wiki'` or `'sl'`, then adds the
 * counts derived from the report's post-processor outcome.
 *
 * @param report - Ingest report snapshot to summarise.
 * @returns Combined wiki and semantic-layer counts.
 */
export function savedMemoryCountsForReport(report: IngestReportSnapshot): IngestSavedMemoryCounts {
  let wikiActions = 0;
  let slActions = 0;
  // Single pass over all actions across work units.
  for (const action of report.body.workUnits.flatMap((workUnit) => workUnit.actions)) {
    if (action.target === 'wiki') {
      wikiActions += 1;
    } else if (action.target === 'sl') {
      slActions += 1;
    }
  }

  const fromPostProcessor = postProcessorSavedMemoryCounts(report.body.postProcessor);

  return {
    wikiCount: wikiActions + fromPostProcessor.wikiCount,
    slCount: slActions + fromPostProcessor.slCount,
  };
}
|
||||
|
||||
export function buildStageIndexFromReportBody(jobId: string, connectionId: string, body: IngestReportBody): StageIndex {
|
||||
return {
|
||||
jobId,
|
||||
|
|
|
|||
|
|
@ -520,6 +520,54 @@ describe('createLocalProjectMcpContextPorts', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('returns historic SQL usage frequency and snippet through semantic-layer list search', async () => {
|
||||
const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' });
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
`tables:
|
||||
orders:
|
||||
table: public.orders
|
||||
usage:
|
||||
narrative: Analysts inspect paid order lifecycle by customer segment.
|
||||
frequencyTier: high
|
||||
commonFilters:
|
||||
- status
|
||||
commonGroupBys:
|
||||
- customer_segment
|
||||
commonJoins:
|
||||
- table: public.customers
|
||||
on:
|
||||
- customer_id
|
||||
columns:
|
||||
- name: order_id
|
||||
type: string
|
||||
- name: status
|
||||
type: string
|
||||
`,
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Seed usage-backed manifest shard',
|
||||
);
|
||||
|
||||
const ports = createLocalProjectMcpContextPorts(project);
|
||||
await expect(
|
||||
ports.semanticLayer?.listSources({ connectionId: 'warehouse', query: 'paid order lifecycle' }),
|
||||
).resolves.toEqual({
|
||||
sources: [
|
||||
expect.objectContaining({
|
||||
connectionId: 'warehouse',
|
||||
connectionName: 'warehouse',
|
||||
name: 'orders',
|
||||
frequencyTier: 'high',
|
||||
snippet: expect.stringContaining('<mark>'),
|
||||
score: expect.any(Number),
|
||||
matchReasons: expect.arrayContaining(['lexical']),
|
||||
}),
|
||||
],
|
||||
totalSources: 1,
|
||||
});
|
||||
});
|
||||
|
||||
it('uses configured local embeddings for semantic-layer search when available', async () => {
|
||||
const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' });
|
||||
project.config.ingest.embeddings = { backend: 'none', dimensions: 2 };
|
||||
|
|
|
|||
|
|
@ -479,6 +479,8 @@ export function createLocalProjectMcpContextPorts(
|
|||
columnCount: source.columnCount,
|
||||
measureCount: source.measureCount,
|
||||
joinCount: source.joinCount,
|
||||
...(hasSlSearchMetadata(source) && source.frequencyTier ? { frequencyTier: source.frequencyTier } : {}),
|
||||
...(hasSlSearchMetadata(source) && source.snippet ? { snippet: source.snippet } : {}),
|
||||
...(hasSlSearchMetadata(source) ? { score: source.score } : {}),
|
||||
...(hasSlSearchMetadata(source) && source.matchReasons ? { matchReasons: source.matchReasons } : {}),
|
||||
...(hasSlSearchMetadata(source) && source.dictionaryMatches
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import type { IngestReportSnapshot, MemoryFlowReplayInput } from '../ingest/index.js';
|
||||
import type { IngestReportSnapshot, MemoryFlowReplayInput, TableUsageOutput } from '../ingest/index.js';
|
||||
import type { MemoryCaptureService } from '../memory/index.js';
|
||||
import type { KtxScanMode, KtxScanReport } from '../scan/index.js';
|
||||
import type {
|
||||
|
|
@ -131,6 +131,8 @@ export interface KtxSemanticLayerSourceSummary {
|
|||
columnCount: number;
|
||||
measureCount: number;
|
||||
joinCount: number;
|
||||
frequencyTier?: TableUsageOutput['frequencyTier'];
|
||||
snippet?: string;
|
||||
score?: number;
|
||||
matchReasons?: SlSearchMatchReason[];
|
||||
dictionaryMatches?: SlDictionaryMatch[];
|
||||
|
|
|
|||
|
|
@ -15,7 +15,8 @@ const expectedSkillHeadings: Record<string, string> = {
|
|||
sl_capture: '# Semantic Layer',
|
||||
};
|
||||
const expectedAdapterSkillHeadings: Record<string, string> = {
|
||||
historic_sql_ingest: '# Historic SQL Ingest',
|
||||
historic_sql_patterns: '# Historic SQL Patterns',
|
||||
historic_sql_table_digest: '# Historic SQL Table Digest',
|
||||
live_database_ingest: '# Live Database Ingest',
|
||||
looker_ingest: '# Looker Runtime Ingest',
|
||||
lookml_ingest: '# LookML to KTX Semantic Layer',
|
||||
|
|
|
|||
|
|
@ -232,14 +232,17 @@ describe('@ktx/context package exports', () => {
|
|||
expect(ingest.HistoricSqlSourceAdapter).toBeTypeOf('function');
|
||||
expect(ingest.SnowflakeHistoricSqlQueryHistoryReader).toBeTypeOf('function');
|
||||
expect(ingest.BigQueryHistoricSqlQueryHistoryReader).toBeTypeOf('function');
|
||||
expect(ingest.PostgresPgssQueryHistoryReader).toBeTypeOf('function');
|
||||
expect(ingest.stagePgStatStatementsTemplates).toBeTypeOf('function');
|
||||
expect(ingest.pgssBaselinePath).toBeTypeOf('function');
|
||||
expect(ingest.readPgssBaseline).toBeTypeOf('function');
|
||||
expect(ingest.writePgssBaselineAtomic).toBeTypeOf('function');
|
||||
expect(ingest.PostgresPgssReader).toBeTypeOf('function');
|
||||
expect(ingest.HistoricSqlExtensionMissingError).toBeTypeOf('function');
|
||||
expect(ingest.HistoricSqlVersionUnsupportedError).toBeTypeOf('function');
|
||||
expect(ingest.HISTORIC_SQL_SOURCE_KEY).toBe('historic-sql');
|
||||
expect(ingest.historicSqlUnifiedPullConfigSchema).toBeDefined();
|
||||
expect(ingest.aggregatedTemplateSchema).toBeDefined();
|
||||
expect(ingest.stagedTableInputSchema).toBeDefined();
|
||||
expect(ingest.historicSqlEvidenceEnvelopeSchema).toBeDefined();
|
||||
expect(ingest.historicSqlEvidencePath).toBeTypeOf('function');
|
||||
expect(ingest.createEmitHistoricSqlEvidenceTool).toBeTypeOf('function');
|
||||
expect(ingest.HistoricSqlProjectionPostProcessor).toBeTypeOf('function');
|
||||
expect(ingest.SqliteContextEvidenceStore).toBeTypeOf('function');
|
||||
expect(ingest.SqliteBundleIngestStore).toBeTypeOf('function');
|
||||
expect(ingest.CuratorPaginationService).toBeTypeOf('function');
|
||||
|
|
|
|||
|
|
@ -742,6 +742,13 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
|
|||
orders: {
|
||||
table: 'public.orders',
|
||||
descriptions: { user: 'Pinned structural description', ai: 'Old generated text' },
|
||||
usage: {
|
||||
narrative: 'Orders are commonly filtered by lifecycle status.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
ownerNote: 'Preserve analyst note',
|
||||
},
|
||||
columns: [
|
||||
{
|
||||
name: 'id',
|
||||
|
|
@ -797,6 +804,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
|
|||
tables: {
|
||||
orders: {
|
||||
descriptions: Record<string, string>;
|
||||
usage?: Record<string, unknown>;
|
||||
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
|
||||
joins: Array<{ to: string; on: string; source: string }>;
|
||||
};
|
||||
|
|
@ -807,6 +815,13 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
|
|||
user: 'Pinned structural description',
|
||||
db: 'DB orders table',
|
||||
});
|
||||
expect(manifest.tables.orders.usage).toEqual({
|
||||
narrative: 'Orders are commonly filtered by lifecycle status.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
ownerNote: 'Preserve analyst note',
|
||||
});
|
||||
expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
|
||||
user: 'Pinned structural id',
|
||||
db: 'DB order id',
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import {
|
|||
type LiveDatabaseManifestJoinEntry,
|
||||
type LiveDatabaseManifestShard,
|
||||
type LiveDatabaseManifestTableData,
|
||||
type TableUsageOutput,
|
||||
} from '../ingest/index.js';
|
||||
import type { KtxScanRelationshipConfig } from '../project/config.js';
|
||||
import type { KtxLocalProject } from '../project/index.js';
|
||||
|
|
@ -56,6 +57,7 @@ export interface WriteLocalScanEnrichmentArtifactsResult extends WriteLocalScanM
|
|||
interface ExistingManifestState {
|
||||
descriptions: Map<string, LiveDatabaseManifestExistingDescriptions>;
|
||||
preservedJoins: Map<string, LiveDatabaseManifestJoinEntry[]>;
|
||||
usage: Map<string, TableUsageOutput>;
|
||||
}
|
||||
|
||||
type LocalDescriptionUpdates = KtxLocalScanEnrichmentResult['descriptionUpdates'];
|
||||
|
|
@ -196,6 +198,7 @@ async function loadExistingManifestState(
|
|||
): Promise<ExistingManifestState> {
|
||||
const descriptions = new Map<string, LiveDatabaseManifestExistingDescriptions>();
|
||||
const preservedJoins = new Map<string, LiveDatabaseManifestJoinEntry[]>();
|
||||
const usage = new Map<string, TableUsageOutput>();
|
||||
const validTableNames = new Set(snapshot.tables.map((table) => table.name));
|
||||
const columnsByTable = validColumns(snapshot);
|
||||
|
||||
|
|
@ -203,7 +206,7 @@ async function loadExistingManifestState(
|
|||
try {
|
||||
files = (await project.fileStore.listFiles(schemaDir(connectionId))).files.filter((file) => file.endsWith('.yaml'));
|
||||
} catch {
|
||||
return { descriptions, preservedJoins };
|
||||
return { descriptions, preservedJoins, usage };
|
||||
}
|
||||
|
||||
for (const file of files) {
|
||||
|
|
@ -225,6 +228,9 @@ async function loadExistingManifestState(
|
|||
),
|
||||
),
|
||||
});
|
||||
if (entry.usage) {
|
||||
usage.set(tableName, { ...entry.usage });
|
||||
}
|
||||
const joins = (entry.joins ?? []).filter((join) => {
|
||||
return (
|
||||
(join.source === 'manual' || join.source === 'inferred') &&
|
||||
|
|
@ -241,7 +247,7 @@ async function loadExistingManifestState(
|
|||
}
|
||||
}
|
||||
|
||||
return { descriptions, preservedJoins };
|
||||
return { descriptions, preservedJoins, usage };
|
||||
}
|
||||
|
||||
async function writeJsonArtifact(
|
||||
|
|
@ -276,6 +282,7 @@ export async function writeLocalScanManifestShards(
|
|||
joins: relationshipJoins(input.snapshot, input.relationshipUpdate),
|
||||
existingDescriptions: existing.descriptions,
|
||||
existingPreservedJoins: existing.preservedJoins,
|
||||
existingUsage: existing.usage,
|
||||
mapColumnType: (dimensionType) => dimensionType,
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -187,6 +187,53 @@ describe('local semantic-layer helpers', () => {
|
|||
await expect(access(join(project.projectDir, '.ktx/db.sqlite'))).resolves.toBeUndefined();
|
||||
});
|
||||
|
||||
it('searches historic SQL usage and returns frequency tier plus FTS snippet', async () => {
|
||||
await project.fileStore.writeFile(
|
||||
'semantic-layer/warehouse/_schema/public.yaml',
|
||||
`tables:
|
||||
orders:
|
||||
table: public.orders
|
||||
usage:
|
||||
narrative: Analysts inspect paid order lifecycle by customer segment.
|
||||
frequencyTier: high
|
||||
commonFilters:
|
||||
- status
|
||||
- created_at
|
||||
commonGroupBys:
|
||||
- customer_segment
|
||||
commonJoins:
|
||||
- table: public.customers
|
||||
on:
|
||||
- customer_id
|
||||
columns:
|
||||
- name: order_id
|
||||
type: string
|
||||
- name: status
|
||||
type: string
|
||||
`,
|
||||
'ktx',
|
||||
'ktx@example.com',
|
||||
'Add usage-backed manifest shard',
|
||||
);
|
||||
|
||||
const results = await searchLocalSlSources(project, {
|
||||
connectionId: 'warehouse',
|
||||
query: 'paid lifecycle customer segment',
|
||||
});
|
||||
|
||||
expect(results).toEqual([
|
||||
expect.objectContaining({
|
||||
connectionId: 'warehouse',
|
||||
name: 'orders',
|
||||
path: 'semantic-layer/warehouse/_schema/public.yaml#orders',
|
||||
frequencyTier: 'high',
|
||||
snippet: expect.stringContaining('<mark>'),
|
||||
matchReasons: expect.arrayContaining(['lexical']),
|
||||
}),
|
||||
]);
|
||||
expect(results[0]?.snippet).toContain('lifecycle');
|
||||
});
|
||||
|
||||
it('searches all connections with one global hybrid ranking pass', async () => {
|
||||
await writeLocalSlSource(project, {
|
||||
connectionId: 'warehouse',
|
||||
|
|
|
|||
|
|
@ -26,6 +26,8 @@ export interface LocalSlSourceSummary {
|
|||
|
||||
export interface LocalSlSourceSearchResult extends LocalSlSourceSummary {
|
||||
score: number;
|
||||
frequencyTier?: NonNullable<SemanticLayerSource['usage']>['frequencyTier'];
|
||||
snippet?: string;
|
||||
matchReasons?: SlSearchMatchReason[];
|
||||
dictionaryMatches?: SlDictionaryMatch[];
|
||||
lanes?: SlSearchLaneSummary[];
|
||||
|
|
@ -367,6 +369,10 @@ function candidateKey(summary: LocalSlSourceSummary): string {
|
|||
return `${summary.connectionId}/${summary.name}`;
|
||||
}
|
||||
|
||||
/**
 * Picks the usage-derived fields to spread into a search result.
 *
 * @param source - Semantic-layer source backing the search candidate.
 * @returns `{ frequencyTier }` when the source's usage carries a truthy tier, otherwise `{}`.
 */
function searchResultUsageFields(source: SemanticLayerSource): Pick<LocalSlSourceSearchResult, 'frequencyTier'> {
  const frequencyTier = source.usage?.frequencyTier;
  if (!frequencyTier) {
    return {};
  }
  return { frequencyTier };
}
|
||||
|
||||
function tokenLaneCandidates(candidates: LocalSlSearchCandidate[], terms: readonly string[]) {
|
||||
if (terms.length === 0) {
|
||||
return [];
|
||||
|
|
@ -483,6 +489,7 @@ export async function searchLocalSlSources(
|
|||
...result.candidate.summary,
|
||||
score: result.score,
|
||||
matchReasons: ['token'],
|
||||
...searchResultUsageFields(result.candidate.source),
|
||||
}))
|
||||
.sort(
|
||||
(left, right) =>
|
||||
|
|
@ -500,6 +507,7 @@ export async function searchLocalSlSources(
|
|||
const finalLimit = input.limit ?? candidates.length;
|
||||
const core = new HybridSearchCore();
|
||||
const dictionaryEvidence = new Map<string, SlDictionaryMatch[]>();
|
||||
const lexicalSnippets = new Map<string, string>();
|
||||
|
||||
const generators: SearchCandidateGenerator[] = [
|
||||
{
|
||||
|
|
@ -510,6 +518,11 @@ export async function searchLocalSlSources(
|
|||
queryText: args.queryText,
|
||||
limit: args.laneCandidatePoolLimit,
|
||||
});
|
||||
for (const row of rows) {
|
||||
if (row.snippet) {
|
||||
lexicalSnippets.set(row.id, row.snippet);
|
||||
}
|
||||
}
|
||||
return {
|
||||
candidates: rows.map((row) => ({ id: row.id, rank: row.rank, rawScore: row.rawScore })),
|
||||
};
|
||||
|
|
@ -584,9 +597,12 @@ export async function searchLocalSlSources(
|
|||
continue;
|
||||
}
|
||||
const dictionaryMatches = dictionaryEvidence.get(fused.id);
|
||||
const snippet = lexicalSnippets.get(fused.id);
|
||||
hydrated.push({
|
||||
...candidate.summary,
|
||||
score: fused.score,
|
||||
...searchResultUsageFields(candidate.source),
|
||||
...(snippet ? { snippet } : {}),
|
||||
matchReasons: fused.matchReasons as SlSearchMatchReason[],
|
||||
...(dictionaryMatches && dictionaryMatches.length > 0 ? { dictionaryMatches } : {}),
|
||||
lanes: result.lanes,
|
||||
|
|
|
|||
|
|
@ -554,9 +554,11 @@ export async function searchLocalSlSourcesWithPglitePrototype(
|
|||
continue;
|
||||
}
|
||||
const dictionaryMatches = dictionaryEvidence.get(result.id);
|
||||
const frequencyTier = candidate.source.usage?.frequencyTier;
|
||||
hydrated.push({
|
||||
...candidate.summary,
|
||||
score: result.score,
|
||||
...(frequencyTier ? { frequencyTier } : {}),
|
||||
matchReasons: result.matchReasons as SlSearchMatchReason[],
|
||||
...(dictionaryMatches && dictionaryMatches.length > 0 ? { dictionaryMatches } : {}),
|
||||
lanes: fused.lanes,
|
||||
|
|
|
|||
|
|
@ -49,5 +49,5 @@ export interface SlSourcesIndexPort {
|
|||
queryText: string,
|
||||
limit: number,
|
||||
minRrfScore?: number,
|
||||
): Promise<Array<{ sourceName: string; rrfScore: number }>>;
|
||||
): Promise<Array<{ sourceName: string; rrfScore: number; snippet?: string }>>;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import { z } from 'zod';
|
||||
import { tableUsageOutputSchema } from '../ingest/adapters/historic-sql/skill-schemas.js';
|
||||
|
||||
// Literal vocabularies — kept in lockstep with the Python Pydantic model at
|
||||
// python/ktx-sl/semantic_layer/models.py (SourceColumn / ColumnRole /
|
||||
|
|
@ -125,6 +126,7 @@ export const sourceDefinitionSchema = z
|
|||
default_time_dimension: defaultTimeDimensionDbtSchema.optional(),
|
||||
tags: sourceKeyedStringArraySchema.optional(),
|
||||
freshness: sourceFreshnessSchema.optional(),
|
||||
usage: tableUsageOutputSchema.optional(),
|
||||
})
|
||||
.strict()
|
||||
.refine((s) => (s.table || s.sql) && !(s.table && s.sql), {
|
||||
|
|
@ -145,6 +147,7 @@ export const sourceOverlaySchema = z
|
|||
exclude_columns: z.array(z.string()).optional(),
|
||||
disable_joins: z.array(z.string()).optional(),
|
||||
default_time_dimension: defaultTimeDimensionDbtSchema.optional(),
|
||||
usage: tableUsageOutputSchema.optional(),
|
||||
})
|
||||
.strict();
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import {
|
|||
composeOverlay,
|
||||
enrichColumnsFromManifest,
|
||||
findDanglingSegmentRefs,
|
||||
projectManifestEntry,
|
||||
SemanticLayerService,
|
||||
} from './semantic-layer.service.js';
|
||||
import { sourceDefinitionSchema } from './schemas.js';
|
||||
|
|
@ -129,6 +130,39 @@ describe('composeOverlay', () => {
|
|||
dbt: 'dbt description',
|
||||
});
|
||||
});
|
||||
|
||||
it('replaces manifest usage only when an overlay explicitly provides usage', () => {
|
||||
const baseWithUsage: SemanticLayerSource = {
|
||||
...baseTable,
|
||||
usage: {
|
||||
narrative: 'Orders are commonly queried by lifecycle status.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
},
|
||||
};
|
||||
|
||||
expect(composeOverlay(baseWithUsage, { name: 'fct_labs', measures: [] }).usage).toEqual(baseWithUsage.usage);
|
||||
|
||||
const composed = composeOverlay(baseWithUsage, {
|
||||
name: 'fct_labs',
|
||||
usage: {
|
||||
narrative: 'Overlay-curated usage note.',
|
||||
frequencyTier: 'mid',
|
||||
commonFilters: ['created_at'],
|
||||
commonGroupBys: ['created_at'],
|
||||
commonJoins: [],
|
||||
},
|
||||
});
|
||||
|
||||
expect(composed.usage).toEqual({
|
||||
narrative: 'Overlay-curated usage note.',
|
||||
frequencyTier: 'mid',
|
||||
commonFilters: ['created_at'],
|
||||
commonGroupBys: ['created_at'],
|
||||
commonJoins: [],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('enrichColumnsFromManifest', () => {
|
||||
|
|
@ -299,6 +333,61 @@ describe('sourceDefinitionSchema', () => {
|
|||
dbt: { loaded_at_field: 'updated_at', raw: { warn_after: { count: 12, period: 'hour' } } },
|
||||
});
|
||||
});
|
||||
|
||||
it('accepts historic SQL usage on standalone sources', () => {
|
||||
const result = sourceDefinitionSchema.safeParse({
|
||||
name: 'orders',
|
||||
table: 'public.orders',
|
||||
grain: ['id'],
|
||||
columns: [{ name: 'id', type: 'string' }],
|
||||
joins: [],
|
||||
measures: [],
|
||||
usage: {
|
||||
narrative: 'Orders are queried for fulfillment and revenue analysis.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status', 'created_at'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
externalOwner: 'analytics',
|
||||
},
|
||||
});
|
||||
|
||||
expect(result.success).toBe(true);
|
||||
if (!result.success) {
|
||||
return;
|
||||
}
|
||||
expect(result.data.usage).toMatchObject({
|
||||
narrative: 'Orders are queried for fulfillment and revenue analysis.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status', 'created_at'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
externalOwner: 'analytics',
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('projectManifestEntry', () => {
|
||||
it('projects manifest usage onto the semantic-layer source', () => {
|
||||
const source = projectManifestEntry('orders', {
|
||||
table: 'public.orders',
|
||||
usage: {
|
||||
narrative: 'Orders are frequently filtered by status.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
},
|
||||
columns: [
|
||||
{ name: 'id', type: 'string', pk: true },
|
||||
{ name: 'status', type: 'string' },
|
||||
],
|
||||
});
|
||||
|
||||
expect(source.usage).toEqual({
|
||||
narrative: 'Orders are frequently filtered by status.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('findManifestEntryByTableRef', () => {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import YAML from 'yaml';
|
||||
import type { KtxFileStorePort, KtxLogger } from '../core/index.js';
|
||||
import { noopLogger } from '../core/index.js';
|
||||
import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js';
|
||||
import type { SlConnectionCatalogPort, SlPythonPort } from './ports.js';
|
||||
import { normalizeSemanticLayerDescriptions } from './description-normalization.js';
|
||||
import { isOverlaySource, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js';
|
||||
|
|
@ -884,6 +885,7 @@ export interface ManifestTableEntry {
|
|||
joins?: ManifestJoinEntry[];
|
||||
tags?: { dbt?: string[] };
|
||||
freshness?: { dbt?: { raw?: unknown; loaded_at_field?: string | null } };
|
||||
usage?: TableUsageOutput;
|
||||
}
|
||||
|
||||
/** Migrate legacy flat description/db_description fields to a descriptions map. */
|
||||
|
|
@ -930,6 +932,7 @@ export function projectManifestEntry(name: string, entry: ManifestTableEntry): S
|
|||
measures: [],
|
||||
...(entry.tags?.dbt?.length ? { tags: entry.tags } : {}),
|
||||
...(entry.freshness?.dbt ? { freshness: entry.freshness } : {}),
|
||||
...(entry.usage ? { usage: entry.usage } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -1005,6 +1008,7 @@ const COMPOSE_KNOWN_KEYS = new Set([
|
|||
'exclude_columns',
|
||||
'disable_joins',
|
||||
'default_time_dimension',
|
||||
'usage',
|
||||
]);
|
||||
|
||||
export function composeOverlay(base: SemanticLayerSource, overlay: Record<string, unknown>): SemanticLayerSource {
|
||||
|
|
@ -1028,6 +1032,10 @@ export function composeOverlay(base: SemanticLayerSource, overlay: Record<string
|
|||
};
|
||||
}
|
||||
|
||||
if (normalizedOverlay.usage !== undefined) {
|
||||
result.usage = normalizedOverlay.usage as SemanticLayerSource['usage'];
|
||||
}
|
||||
|
||||
// Filter out excluded columns
|
||||
const excluded = new Set((normalizedOverlay.exclude_columns as string[] | undefined) ?? []);
|
||||
let columns = result.columns.filter((c) => !excluded.has(c.name));
|
||||
|
|
|
|||
|
|
@ -162,4 +162,65 @@ describe('SlSearchService', () => {
|
|||
expect(text).toContain('loaded_at=updated_at');
|
||||
expect(text).toContain('warn_after');
|
||||
});
|
||||
|
||||
it('includes historic SQL usage in semantic-layer search text', () => {
|
||||
const source: SemanticLayerSource = {
|
||||
name: 'orders',
|
||||
descriptions: { user: 'Customer orders' },
|
||||
table: 'public.orders',
|
||||
grain: ['order_id'],
|
||||
columns: [{ name: 'order_id', type: 'string' }],
|
||||
joins: [],
|
||||
measures: [],
|
||||
usage: {
|
||||
narrative: 'Analysts inspect paid and refunded order lifecycle trends by customer segment.',
|
||||
frequencyTier: 'high',
|
||||
commonFilters: ['status', 'created_at'],
|
||||
commonGroupBys: ['customer_segment'],
|
||||
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
|
||||
staleSince: '2026-05-01T00:00:00.000Z',
|
||||
},
|
||||
};
|
||||
|
||||
const text = buildSemanticLayerSourceSearchText(source);
|
||||
|
||||
expect(text).toContain('usage: Analysts inspect paid and refunded order lifecycle trends by customer segment.');
|
||||
expect(text).toContain('frequency: high');
|
||||
expect(text).toContain('commonly filtered by: status, created_at');
|
||||
expect(text).toContain('commonly grouped by: customer_segment');
|
||||
expect(text).toContain('commonly joined to public.customers on customer_id');
|
||||
expect(text).toContain('stale since 2026-05-01T00:00:00.000Z');
|
||||
});
|
||||
|
||||
it('preserves FTS snippets returned by the source index', async () => {
  // Embedding service stub: only computeEmbedding returns a value.
  const embeddingStub = {
    maxBatchSize: 16,
    computeEmbedding: vi.fn(async () => [1, 0]),
    computeEmbeddingsBulk: vi.fn(),
  };
  // Source index stub whose search() yields a single hit that carries a snippet.
  const indexStub = {
    upsertSources: vi.fn(),
    getExistingSearchTexts: vi.fn(),
    deleteStale: vi.fn(),
    deleteByConnection: vi.fn(),
    deleteByConnectionAndName: vi.fn(),
    search: vi.fn(async () => [
      {
        sourceName: 'orders',
        rrfScore: 0.75,
        snippet: 'usage: paid <mark>order</mark> lifecycle',
      },
    ]),
  };
  const service = new SlSearchService(embeddingStub, indexStub);

  const hits = await service.search('warehouse', 'order lifecycle', 10);

  // rrfScore is exposed as score and the snippet is passed through unchanged.
  expect(hits).toEqual([
    {
      sourceName: 'orders',
      score: 0.75,
      snippet: 'usage: paid <mark>order</mark> lifecycle',
    },
  ]);
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -71,6 +71,24 @@ export function buildSemanticLayerSourceSearchText(
|
|||
}
|
||||
}
|
||||
|
||||
if (source.usage) {
|
||||
const usage = source.usage;
|
||||
parts.push(`usage: ${usage.narrative}`);
|
||||
parts.push(`frequency: ${usage.frequencyTier}`);
|
||||
if (usage.commonFilters.length > 0) {
|
||||
parts.push(`commonly filtered by: ${usage.commonFilters.join(', ')}`);
|
||||
}
|
||||
if (usage.commonGroupBys?.length) {
|
||||
parts.push(`commonly grouped by: ${usage.commonGroupBys.join(', ')}`);
|
||||
}
|
||||
for (const join of usage.commonJoins) {
|
||||
parts.push(`commonly joined to ${join.table} on ${join.on.join(',')}`);
|
||||
}
|
||||
if (usage.staleSince) {
|
||||
parts.push(`stale since ${usage.staleSince}`);
|
||||
}
|
||||
}
|
||||
|
||||
return parts.join('. ');
|
||||
}
|
||||
|
||||
|
|
@ -150,7 +168,7 @@ export class SlSearchService {
|
|||
query: string,
|
||||
limit = 15,
|
||||
minRrfScore = 0,
|
||||
): Promise<Array<{ sourceName: string; score: number }>> {
|
||||
): Promise<Array<{ sourceName: string; score: number; snippet?: string }>> {
|
||||
let queryEmbedding: number[] | null = null;
|
||||
try {
|
||||
queryEmbedding = await this.embeddingService.computeEmbedding(query);
|
||||
|
|
@ -161,7 +179,11 @@ export class SlSearchService {
|
|||
}
|
||||
|
||||
const results = await this.slSourcesRepository.search(connectionId, queryEmbedding, query, limit, minRrfScore);
|
||||
return results.map((r) => ({ sourceName: r.sourceName, score: r.rrfScore }));
|
||||
return results.map((result) => ({
|
||||
sourceName: result.sourceName,
|
||||
score: result.rrfScore,
|
||||
...(result.snippet ? { snippet: result.snippet } : {}),
|
||||
}));
|
||||
}
|
||||
|
||||
buildSearchText(source: SemanticLayerSource, priority: string[] = DEFAULT_PRIORITY): string {
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ describe('SqliteSlSourcesIndex', () => {
|
|||
await rm(tempDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('creates SQLite tables and searches indexed source text', async () => {
|
||||
it('creates SQLite tables and searches indexed source text with FTS snippets', async () => {
|
||||
const index = new SqliteSlSourcesIndex({ dbPath });
|
||||
|
||||
await index.upsertSources('warehouse', [
|
||||
|
|
@ -34,10 +34,24 @@ describe('SqliteSlSourcesIndex', () => {
|
|||
]);
|
||||
|
||||
await expect(access(dbPath)).resolves.toBeUndefined();
|
||||
expect(await index.search('warehouse', null, 'gross revenue', 10)).toEqual([
|
||||
|
||||
const directResults = await index.search('warehouse', null, 'gross revenue', 10);
|
||||
expect(directResults).toEqual([
|
||||
expect.objectContaining({
|
||||
sourceName: 'orders',
|
||||
rrfScore: expect.any(Number),
|
||||
snippet: expect.stringContaining('<mark>'),
|
||||
}),
|
||||
]);
|
||||
expect(directResults[0]?.snippet).toContain('revenue');
|
||||
|
||||
const lexicalCandidates = await index.searchLexicalCandidates({ queryText: 'gross revenue', limit: 10 });
|
||||
expect(lexicalCandidates).toEqual([
|
||||
expect.objectContaining({
|
||||
id: 'warehouse/orders',
|
||||
connectionId: 'warehouse',
|
||||
sourceName: 'orders',
|
||||
snippet: expect.stringContaining('<mark>'),
|
||||
}),
|
||||
]);
|
||||
});
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ type SearchRow = {
|
|||
connection_id?: string;
|
||||
source_name: string;
|
||||
rank: number;
|
||||
snippet?: string | null;
|
||||
};
|
||||
|
||||
export interface SlSqliteLaneCandidate {
|
||||
|
|
@ -27,6 +28,7 @@ export interface SlSqliteLaneCandidate {
|
|||
sourceName: string;
|
||||
rank: number;
|
||||
rawScore: number;
|
||||
snippet?: string;
|
||||
}
|
||||
|
||||
export interface SlSqliteDictionaryCandidate extends SlSqliteLaneCandidate {
|
||||
|
|
@ -334,7 +336,11 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
|
|||
const rows = this.db
|
||||
.prepare(
|
||||
`
|
||||
SELECT connection_id, source_name, bm25(local_sl_sources_fts) AS rank
|
||||
SELECT
|
||||
connection_id,
|
||||
source_name,
|
||||
bm25(local_sl_sources_fts) AS rank,
|
||||
snippet(local_sl_sources_fts, 2, '<mark>', '</mark>', '...', 12) AS snippet
|
||||
FROM local_sl_sources_fts
|
||||
WHERE local_sl_sources_fts MATCH ?
|
||||
${connectionPredicate}
|
||||
|
|
@ -350,6 +356,7 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
|
|||
sourceName: row.source_name,
|
||||
rank: index + 1,
|
||||
rawScore: Number(row.rank),
|
||||
...(typeof row.snippet === 'string' && row.snippet.length > 0 ? { snippet: row.snippet } : {}),
|
||||
}));
|
||||
}
|
||||
|
||||
|
|
@ -499,7 +506,7 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
|
|||
queryText: string,
|
||||
limit: number,
|
||||
minRrfScore = 0,
|
||||
): Promise<Array<{ sourceName: string; rrfScore: number }>> {
|
||||
): Promise<Array<{ sourceName: string; rrfScore: number; snippet?: string }>> {
|
||||
const ftsQuery = normalizeFtsQuery(queryText);
|
||||
if (!ftsQuery) {
|
||||
return [];
|
||||
|
|
@ -508,7 +515,10 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
|
|||
const rows = this.db
|
||||
.prepare(
|
||||
`
|
||||
SELECT source_name, bm25(local_sl_sources_fts) AS rank
|
||||
SELECT
|
||||
source_name,
|
||||
bm25(local_sl_sources_fts) AS rank,
|
||||
snippet(local_sl_sources_fts, 2, '<mark>', '</mark>', '...', 12) AS snippet
|
||||
FROM local_sl_sources_fts
|
||||
WHERE connection_id = ?
|
||||
AND local_sl_sources_fts MATCH ?
|
||||
|
|
@ -519,7 +529,11 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
|
|||
.all(connectionId, ftsQuery, Math.max(1, limit)) as SearchRow[];
|
||||
|
||||
return rows
|
||||
.map((row) => ({ sourceName: row.source_name, rrfScore: scoreFromRank(row.rank) }))
|
||||
.map((row) => ({
|
||||
sourceName: row.source_name,
|
||||
rrfScore: scoreFromRank(row.rank),
|
||||
...(typeof row.snippet === 'string' && row.snippet.length > 0 ? { snippet: row.snippet } : {}),
|
||||
}))
|
||||
.filter((row) => row.rrfScore >= minRrfScore);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js';
|
||||
|
||||
export interface SemanticLayerSource {
|
||||
name: string;
|
||||
descriptions?: Record<string, string>;
|
||||
|
|
@ -42,6 +44,7 @@ export interface SemanticLayerSource {
|
|||
default_time_dimension?: { dbt?: string };
|
||||
tags?: { dbt?: string[] };
|
||||
freshness?: { dbt?: { raw?: unknown; loaded_at_field?: string | null } };
|
||||
usage?: TableUsageOutput;
|
||||
}
|
||||
|
||||
export interface SemanticLayerQueryInput {
|
||||
|
|
|
|||
|
|
@ -45,6 +45,85 @@ describe('createHttpSqlAnalysisPort', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('calls the SQL batch endpoint and maps snake_case response fields into a Map', async () => {
  // Raw endpoint payload: one successful analysis and one that reports an error.
  const response = {
    results: {
      orders: {
        tables_touched: ['public.orders', 'public.customers'],
        columns_by_clause: {
          select: ['status'],
          where: ['created_at'],
          join: ['customer_id', 'id'],
        },
        error: null,
      },
      broken: {
        tables_touched: [],
        columns_by_clause: {},
        error: 'Invalid expression / Unexpected token',
      },
    },
  };
  const requestJson = vi.fn(async () => response);
  const port = createHttpSqlAnalysisPort({ baseUrl: 'http://python.test', requestJson });

  const analysis = await port.analyzeBatch(
    [
      { id: 'orders', sql: 'select status from public.orders' },
      { id: 'broken', sql: 'select * from where' },
    ],
    'postgres',
  );

  // Same data, keyed by item id, with camelCase field names.
  const expected = new Map([
    [
      'orders',
      {
        tablesTouched: ['public.orders', 'public.customers'],
        columnsByClause: {
          select: ['status'],
          where: ['created_at'],
          join: ['customer_id', 'id'],
        },
        error: null,
      },
    ],
    [
      'broken',
      {
        tablesTouched: [],
        columnsByClause: {},
        error: 'Invalid expression / Unexpected token',
      },
    ],
  ]);
  expect(analysis).toEqual(expected);

  expect(requestJson).toHaveBeenCalledWith('/sql/analyze-batch', {
    dialect: 'postgres',
    items: [
      { id: 'orders', sql: 'select status from public.orders' },
      { id: 'broken', sql: 'select * from where' },
    ],
  });
});
|
||||
|
||||
it('rejects malformed SQL batch responses instead of inventing defaults', async () => {
  // `where` holds a number, so the clause is not a string[] and mapping must fail.
  const malformedResponse = {
    results: {
      orders: {
        tables_touched: ['public.orders'],
        columns_by_clause: { select: ['status'], where: [42] },
        error: null,
      },
    },
  };
  const requestJson = vi.fn(async () => malformedResponse);
  const port = createHttpSqlAnalysisPort({ baseUrl: 'http://python.test', requestJson });

  const pending = port.analyzeBatch([{ id: 'orders', sql: 'select status from public.orders' }], 'postgres');

  await expect(pending).rejects.toThrow(
    'sql analysis response is missing string[] field columns_by_clause.where',
  );
});
|
||||
|
||||
it('rejects malformed daemon responses instead of inventing defaults', async () => {
|
||||
const requestJson = vi.fn(async () => ({
|
||||
fingerprint: 'abc',
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@ import { request as httpRequest } from 'node:http';
|
|||
import { request as httpsRequest } from 'node:https';
|
||||
import { URL } from 'node:url';
|
||||
import type {
|
||||
SqlAnalysisBatchItem,
|
||||
SqlAnalysisBatchResult,
|
||||
SqlAnalysisDialect,
|
||||
SqlAnalysisFingerprintResult,
|
||||
SqlAnalysisLiteralSlot,
|
||||
|
|
@ -94,6 +96,14 @@ function requiredStringArray(raw: Record<string, unknown>, field: string): strin
|
|||
return value;
|
||||
}
|
||||
|
||||
/**
 * Reads `field` from `raw` and returns it as an object record.
 *
 * @param raw - Object to read from.
 * @param field - Name of the property that must hold an object.
 * @returns The property value, typed as a record.
 * @throws Error when the value is falsy, is not of type `object`, or is an array.
 */
function requiredObject(raw: Record<string, unknown>, field: string): Record<string, unknown> {
  const candidate = raw[field];
  // Accept only non-null, non-array objects; everything else is reported as missing.
  const isRecord = typeof candidate === 'object' && candidate !== null && !Array.isArray(candidate);
  if (isRecord) {
    return candidate as Record<string, unknown>;
  }
  throw new Error(`sql analysis response is missing object field ${field}`);
}
|
||||
|
||||
function isLiteralSlotType(value: unknown): value is SqlAnalysisLiteralSlotType {
|
||||
return (
|
||||
value === 'string' ||
|
||||
|
|
@ -144,6 +154,39 @@ function mapResult(raw: Record<string, unknown>): SqlAnalysisFingerprintResult {
|
|||
};
|
||||
}
|
||||
|
||||
/**
 * Validates and copies the `columns_by_clause` object found on `raw`.
 *
 * Every own enumerable entry must be an array of strings. Clause names are passed through
 * unchanged and each validated array is stored on the returned object by reference.
 *
 * @param raw - Object expected to carry a `columns_by_clause` property.
 * @returns A fresh object mapping each clause name to its column list.
 * @throws Error when `columns_by_clause` is not an object (raised by `requiredObject`), or when
 *   any clause value is not a `string[]`.
 */
function mapColumnsByClause(raw: Record<string, unknown>): SqlAnalysisBatchResult['columnsByClause'] {
  const value = requiredObject(raw, 'columns_by_clause');
  const result: SqlAnalysisBatchResult['columnsByClause'] = {};
  for (const [clause, columns] of Object.entries(value)) {
    if (!Array.isArray(columns) || columns.some((item) => typeof item !== 'string')) {
      throw new Error(`sql analysis response is missing string[] field columns_by_clause.${clause}`);
    }
    // Define an own data property rather than assigning `result[clause] = columns`.
    // Plain assignment of a key named `__proto__` (which `JSON.parse` yields as an own key)
    // would invoke the inherited setter and replace the result's prototype instead of
    // recording the clause. For every other key this is equivalent to assignment.
    Object.defineProperty(result, clause, {
      value: columns,
      enumerable: true,
      writable: true,
      configurable: true,
    });
  }
  return result;
}
|
||||
|
||||
/**
 * Converts one raw batch result (snake_case keys) into a `SqlAnalysisBatchResult`.
 *
 * `error` is read first, then `tables_touched`, then `columns_by_clause`; any helper may throw
 * on a malformed value. The `error` key is left off the result only when `optionalString`
 * returns `undefined`.
 *
 * @param raw - Raw result object for a single batch item.
 * @returns The mapped result with camelCase keys.
 */
function mapBatchResult(raw: Record<string, unknown>): SqlAnalysisBatchResult {
  const error = optionalString(raw, 'error');
  const tablesTouched = requiredStringArray(raw, 'tables_touched');
  const columnsByClause = mapColumnsByClause(raw);

  const mapped: SqlAnalysisBatchResult = { tablesTouched, columnsByClause };
  if (error !== undefined) {
    mapped.error = error;
  }
  return mapped;
}
|
||||
|
||||
/**
 * Converts the raw batch response into a Map from result id to mapped result.
 *
 * The ids are the own enumerable keys of `raw.results`, inserted in `Object.entries` order.
 *
 * @param raw - Raw response object expected to carry a `results` object.
 * @returns Map of id to `SqlAnalysisBatchResult`.
 * @throws Error when `results` is not an object, when an entry is not a non-array object, or
 *   when mapping an individual entry fails.
 */
function mapBatchResponse(raw: Record<string, unknown>): Map<string, SqlAnalysisBatchResult> {
  const byId = new Map<string, SqlAnalysisBatchResult>();
  for (const [id, entry] of Object.entries(requiredObject(raw, 'results'))) {
    const isRecord = typeof entry === 'object' && entry !== null && !Array.isArray(entry);
    if (!isRecord) {
      throw new Error(`sql analysis response contains invalid batch result ${id}`);
    }
    byId.set(id, mapBatchResult(entry as Record<string, unknown>));
  }
  return byId;
}
|
||||
|
||||
export function createHttpSqlAnalysisPort(options: HttpSqlAnalysisPortOptions): SqlAnalysisPort {
|
||||
const requestJson = options.requestJson ?? postJson(options.baseUrl);
|
||||
|
||||
|
|
@ -155,5 +198,12 @@ export function createHttpSqlAnalysisPort(options: HttpSqlAnalysisPortOptions):
|
|||
});
|
||||
return mapResult(raw);
|
||||
},
|
||||
// Sends the items and dialect to `/sql/analyze-batch` through `requestJson` and maps the
// raw response into a Map keyed by result id.
async analyzeBatch(items: SqlAnalysisBatchItem[], dialect: SqlAnalysisDialect) {
  const payload = { dialect, items };
  return mapBatchResponse(await requestJson('/sql/analyze-batch', payload));
},
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
export { createHttpSqlAnalysisPort } from './http-sql-analysis-port.js';
|
||||
export type { HttpSqlAnalysisPortOptions, KtxSqlAnalysisHttpJsonRunner } from './http-sql-analysis-port.js';
|
||||
export type {
|
||||
SqlAnalysisBatchItem,
|
||||
SqlAnalysisBatchResult,
|
||||
SqlAnalysisClause,
|
||||
SqlAnalysisDialect,
|
||||
SqlAnalysisFingerprintResult,
|
||||
SqlAnalysisLiteralSlot,
|
||||
|
|
|
|||
|
|
@ -25,6 +25,23 @@ export interface SqlAnalysisFingerprintResult {
|
|||
error?: string | null;
|
||||
}
|
||||
|
||||
/**
 * Clause name used as a key of `SqlAnalysisBatchResult.columnsByClause`.
 *
 * Lists the known clause names; the `(string & {})` member additionally admits any other
 * string without collapsing the union to plain `string`.
 */
export type SqlAnalysisClause = 'select' | 'where' | 'join' | 'groupBy' | 'having' | 'orderBy' | (string & {});
|
||||
|
||||
/** One SQL statement passed to `SqlAnalysisPort.analyzeBatch`. */
export interface SqlAnalysisBatchItem {
|
||||
// Identifier for this statement. NOTE(review): presumably the key under which its result is returned — confirm against the port implementation.
id: string;
|
||||
// SQL text to analyze.
sql: string;
|
||||
}
|
||||
|
||||
/** Analysis outcome for a single statement of a batch. */
export interface SqlAnalysisBatchResult {
|
||||
// Table names reported for the statement (the visible tests use schema-qualified names such as `public.orders`).
tablesTouched: string[];
|
||||
// Column names grouped by clause name; any clause may be absent.
columnsByClause: Partial<Record<SqlAnalysisClause, string[]>>;
|
||||
// Optional error text; may be omitted or `null`. NOTE(review): presumably set when the statement could not be analyzed — confirm.
error?: string | null;
|
||||
}
|
||||
|
||||
/** Port through which callers request SQL analysis. */
export interface SqlAnalysisPort {
|
||||
// Analyzes one SQL statement in the given dialect and resolves to a fingerprint result.
analyzeForFingerprint(sql: string, dialect: SqlAnalysisDialect): Promise<SqlAnalysisFingerprintResult>;
|
||||
// Analyzes several statements in the given dialect and resolves to a Map of string keys to per-statement results.
analyzeBatch(
|
||||
items: SqlAnalysisBatchItem[],
|
||||
dialect: SqlAnalysisDialect,
|
||||
): Promise<Map<string, SqlAnalysisBatchResult>>;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ export interface WikiFrontmatter {
|
|||
representative_sql?: string;
|
||||
usage?: HistoricSqlWikiUsageFrontmatter;
|
||||
fingerprints?: string[];
|
||||
stale_since?: string;
|
||||
}
|
||||
|
||||
export interface WikiPage {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue