Merge remote-tracking branch 'origin/main' into luca-martial/schema-select-ux-text

This commit is contained in:
Luca Martial 2026-05-11 14:45:12 -07:00
commit 523d6ab68a
130 changed files with 17386 additions and 5942 deletions

View file

@ -1,146 +0,0 @@
{
"name": "eviction-churn",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": [
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn"
]
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 3,
"rows": [
{
"queryid": "501",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 20,
"totalExecTime": 500,
"meanExecTime": 25,
"totalRows": 40
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": null,
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q501": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 20,
"totalExecTime": 500,
"totalRows": 40
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T08:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn",
"pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn",
"baseline_first_run:no_previous_pgss_baseline"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 3,
"templates": [
{
"id": "db5_q501",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q501/page.md"
}
]
}
},
"templates/db5_q501/metadata.json": {
"json": {
"id": "db5_q501",
"title": "postgres · analytics.orders [db5_q501]",
"path": "templates/db5_q501/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q501/page.md": {
"text": "# db5_q501\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q501/usage.json": {
"json": {
"stats": {
"executions": 20,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 25,
"error_rate": 0,
"rows_produced": 40
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -1,144 +0,0 @@
{
"name": "first-run",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "101",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 10,
"totalExecTime": 250,
"meanExecTime": 25,
"totalRows": 20
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [
"^svc_"
],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": null,
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q101": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 10,
"totalExecTime": 250,
"totalRows": 20
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T08:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_first_run:no_previous_pgss_baseline"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q101",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q101/page.md"
}
]
}
},
"templates/db5_q101/metadata.json": {
"json": {
"id": "db5_q101",
"title": "postgres · analytics.orders [db5_q101]",
"path": "templates/db5_q101/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q101/page.md": {
"text": "# db5_q101\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q101/usage.json": {
"json": {
"stats": {
"executions": 10,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 25,
"error_rate": 0,
"rows_produced": 20
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -1,181 +0,0 @@
{
"name": "normal-delta",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "201",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 12,
"totalExecTime": 160,
"meanExecTime": 13.333333333333334,
"totalRows": 58
},
{
"queryid": "201",
"userid": "12",
"username": "svc_loader",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 5,
"totalExecTime": 50,
"meanExecTime": 10,
"totalRows": 25
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [
"^svc_"
],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q201": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 10,
"totalExecTime": 100,
"totalRows": 50
},
"12": {
"calls": 5,
"totalExecTime": 50,
"totalRows": 25
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q201": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 12,
"totalExecTime": 160,
"totalRows": 58
},
"12": {
"calls": 5,
"totalExecTime": 50,
"totalRows": 25
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": false,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q201",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q201/page.md"
}
]
}
},
"templates/db5_q201/metadata.json": {
"json": {
"id": "db5_q201",
"title": "postgres · analytics.orders [db5_q201]",
"path": "templates/db5_q201/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "low",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q201/page.md": {
"text": "# db5_q201\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q201/usage.json": {
"json": {
"stats": {
"executions": 2,
"distinct_users": 1,
"first_seen": "2026-05-08T09:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 30,
"error_rate": 0,
"rows_produced": 8
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -1,159 +0,0 @@
{
"name": "reset-detected",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T11:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "301",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 3,
"totalExecTime": 90,
"meanExecTime": 30,
"totalRows": 9
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q301": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 100,
"totalExecTime": 1000,
"totalRows": 500
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T11:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q301": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 3,
"totalExecTime": 90,
"totalRows": 9
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z"
],
"degraded": true,
"statsResetAt": "2026-05-08T11:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q301",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q301/page.md"
}
]
}
},
"templates/db5_q301/metadata.json": {
"json": {
"id": "db5_q301",
"title": "postgres · analytics.orders [db5_q301]",
"path": "templates/db5_q301/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q301/page.md": {
"text": "# db5_q301\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q301/usage.json": {
"json": {
"stats": {
"executions": 3,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 30,
"error_rate": 0,
"rows_produced": 9
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -1,159 +0,0 @@
{
"name": "version-change",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "401",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 4,
"totalExecTime": 80,
"meanExecTime": 20,
"totalRows": 8
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 15.7",
"templates": {
"db5_q401": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 100,
"totalExecTime": 1000,
"totalRows": 500
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q401": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 4,
"totalExecTime": 80,
"totalRows": 8
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_reset:pg_server_major changed from 15 to 16"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q401",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q401/page.md"
}
]
}
},
"templates/db5_q401/metadata.json": {
"json": {
"id": "db5_q401",
"title": "postgres · analytics.orders [db5_q401]",
"path": "templates/db5_q401/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q401/page.md": {
"text": "# db5_q401\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q401/usage.json": {
"json": {
"stats": {
"executions": 4,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 20,
"error_rate": 0,
"rows_produced": 8
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -33,7 +33,7 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => {
const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(reader.probe(client)).resolves.toBeUndefined();
await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] });
expect(client.executeQuery).toHaveBeenCalledWith(
'SELECT 1 FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT` LIMIT 1',
@ -63,127 +63,85 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => {
await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
it('fetches BigQuery jobs with cursor and maps them into RawQueryRow shape without rowsProduced', async () => {
it('fetches aggregated BigQuery query templates', async () => {
const client = queryClient([
{
headers: [
'job_id',
'query',
'user_email',
'creation_time',
'end_time',
'runtime_ms',
'total_slot_ms',
'total_bytes_processed',
'state',
'error_reason',
'error_message',
'statement_type',
'template_id',
'canonical_sql',
'executions',
'distinct_users',
'first_seen',
'last_seen',
'p50_ms',
'p95_ms',
'error_rate',
'rows_produced',
'top_users',
],
rows: [
[
'bquxjob_1',
"SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
'analyst-a@example.test',
'2026-05-04T10:00:00.000Z',
'2026-05-04T10:00:01.250Z',
1250,
3106,
161164718,
'DONE',
'hash-1',
'select status from orders',
42,
3,
'2026-05-01T00:00:00.000Z',
'2026-05-11T00:00:00.000Z',
12,
40,
0.05,
null,
null,
'SELECT',
],
[
'bquxjob_2',
'SELECT * FROM `project-1.analytics.missing_table`',
'analyst-b@example.test',
new Date('2026-05-04T10:05:00.000Z'),
null,
null,
0,
0,
'DONE',
'notFound',
'Not found: Table project-1.analytics.missing_table',
'SELECT',
JSON.stringify([{ user: 'analyst@example.test', executions: 1 }]),
],
],
totalRows: 2,
totalRows: 1,
},
]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'demo', region: 'us' });
const rows = [];
for await (const row of reader.fetch(
for await (const row of reader.fetchAggregated(
client,
{
start: new Date('2026-05-01T00:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
},
'2026-05-03T00:00:00.000Z',
{ start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') },
{ dialect: 'bigquery', minExecutions: 5, windowDays: 90, concurrency: 12, filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 },
)) {
rows.push(row);
}
expect(client.executeQuery).toHaveBeenCalledTimes(1);
const sql = firstQuery(client);
expect(sql).toContain('FROM `project-1.region-us.INFORMATION_SCHEMA.JOBS_BY_PROJECT`');
expect(sql).toContain("creation_time >= TIMESTAMP('2026-05-03T00:00:00.000Z')");
expect(sql).toContain("creation_time < TIMESTAMP('2026-05-04T12:00:00.000Z')");
expect(sql).toContain("job_type = 'QUERY'");
expect(sql).toContain("(statement_type IS NULL OR statement_type != 'SCRIPT')");
expect(sql).toContain('ORDER BY creation_time ASC, job_id ASC');
expect(sql).toContain('total_slot_ms');
expect(sql).toContain('total_bytes_processed');
expect(sql).not.toMatch(/total_rows/i);
expect(rows).toEqual([
expect(sql).toContain('COUNT(*) AS executions');
expect(sql).toContain('COUNT(DISTINCT user_email) AS distinct_users');
expect(sql).toContain('GROUP BY query_hash');
expect(sql).toContain('HAVING COUNT(*) >= 5');
expect(rows).toMatchObject([
{
id: 'bquxjob_1',
sql: "SELECT COUNT(*) FROM `project-1.analytics.orders` WHERE status = 'paid'",
user: 'analyst-a@example.test',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: '2026-05-04T10:00:01.250Z',
runtimeMs: 1250,
success: true,
errorMessage: null,
},
{
id: 'bquxjob_2',
sql: 'SELECT * FROM `project-1.analytics.missing_table`',
user: 'analyst-b@example.test',
startedAt: '2026-05-04T10:05:00.000Z',
endedAt: null,
runtimeMs: null,
success: false,
errorMessage: 'notFound: Not found: Table project-1.analytics.missing_table',
templateId: 'hash-1',
stats: {
executions: 42,
errorRate: 0.05,
},
topUsers: [{ user: 'analyst@example.test', executions: 1 }],
},
]);
});
it('uses the window start when no cursor is available', async () => {
const client = queryClient([{ headers: ['job_id'], rows: [], totalRows: 0 }]);
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'EU' });
for await (const _row of reader.fetch(client, {
start: new Date('2026-02-03T12:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
})) {
throw new Error('empty result should not yield rows');
}
const sql = firstQuery(client);
expect(sql).toContain('FROM `project-1.region-eu.INFORMATION_SCHEMA.JOBS_BY_PROJECT`');
expect(sql).toContain("creation_time >= TIMESTAMP('2026-02-03T12:00:00.000Z')");
});
it('throws a clear error when the query client cannot execute SQL', async () => {
const reader = new BigQueryHistoricSqlQueryHistoryReader({ projectId: 'project-1', region: 'US' });
await expect(async () => {
for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) {
for await (const _row of reader.fetchAggregated(
{},
{ start: new Date(), end: new Date() },
{
dialect: 'bigquery',
minExecutions: 5,
windowDays: 90,
concurrency: 12,
filters: { dropTrivialProbes: true },
redactionPatterns: [],
staleArchiveAfterDays: 90,
},
)) {
throw new Error('unreachable');
}
}).rejects.toThrow('Historic SQL BigQuery reader requires a query client with executeQuery(query)');

View file

@ -1,5 +1,10 @@
import { HistoricSqlGrantsMissingError } from './errors.js';
import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js';
import {
aggregatedTemplateSchema,
type AggregatedTemplate,
type HistoricSqlTimeWindow,
type HistoricSqlUnifiedPullConfig,
} from './types.js';
interface QueryResultLike {
headers: string[];
@ -110,6 +115,23 @@ function nullableNumber(raw: unknown): number | null {
return Math.max(0, number);
}
/**
 * Reads a numeric cell that must be present.
 *
 * Delegates parsing to `nullableNumber` and rejects a `null` result.
 *
 * @param raw - Untyped cell value taken from a result row.
 * @param field - Column name, used only to build the error message.
 * @returns The number produced by `nullableNumber`.
 * @throws Error when `nullableNumber` yields `null` for `raw`.
 */
function requiredNumber(raw: unknown, field: string): number {
  const parsed = nullableNumber(raw);
  if (parsed !== null) {
    return parsed;
  }
  throw new Error(`BigQuery JOBS_BY_PROJECT row has invalid ${field}: ${String(raw)}`);
}
/**
 * Reads a mandatory numeric cell and drops any fractional part.
 *
 * @param raw - Untyped cell value taken from a result row.
 * @param field - Column name, forwarded to `requiredNumber` for its error message.
 * @returns The value from `requiredNumber`, truncated toward zero.
 * @throws Error propagated from `requiredNumber` when the cell is not a usable number.
 */
function requiredInteger(raw: unknown, field: string): number {
  const parsed = requiredNumber(raw, field);
  return Math.trunc(parsed);
}
/**
 * Reads an optional numeric cell and drops any fractional part.
 *
 * @param raw - Untyped cell value taken from a result row.
 * @returns `null` when `nullableNumber` yields `null`; otherwise that number
 *   truncated toward zero.
 */
function nullableInteger(raw: unknown): number | null {
  const parsed = nullableNumber(raw);
  if (parsed === null) {
    return null;
  }
  return Math.trunc(parsed);
}
function isoTimestamp(raw: unknown, field: string): string {
if (raw instanceof Date) {
return raw.toISOString();
@ -122,43 +144,49 @@ function isoTimestamp(raw: unknown, field: string): string {
return date.toISOString();
}
function nullableIsoTimestamp(raw: unknown): string | null {
if (raw === null || raw === undefined || raw === '') {
return null;
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
const text = nullableString(raw);
if (!text) {
return [];
}
return isoTimestamp(raw, 'end_time');
}
function executionSucceeded(state: string | null, errorReason: string | null, errorMessage: string | null): boolean {
if (errorReason || errorMessage) {
return false;
try {
const parsed = JSON.parse(text) as unknown;
if (!Array.isArray(parsed)) {
return [];
}
return parsed.flatMap((entry) => {
if (!entry || typeof entry !== 'object') {
return [];
}
const user = nullableString((entry as { user?: unknown }).user);
const executions = nullableInteger((entry as { executions?: unknown }).executions);
return executions === null ? [] : [{ user, executions }];
});
} catch {
return [];
}
return state === null || state.toUpperCase() === 'DONE';
}
function combinedErrorMessage(errorReason: string | null, errorMessage: string | null): string | null {
if (errorReason && errorMessage) {
return `${errorReason}: ${errorMessage}`;
}
return errorMessage ?? errorReason;
function mapAggregatedRow(row: unknown[], indexes: Map<string, number>): AggregatedTemplate {
return aggregatedTemplateSchema.parse({
templateId: requiredString(value(row, indexes, 'template_id'), 'template_id'),
canonicalSql: requiredString(value(row, indexes, 'canonical_sql'), 'canonical_sql'),
dialect: 'bigquery',
stats: {
executions: requiredInteger(value(row, indexes, 'executions'), 'executions'),
distinctUsers: requiredInteger(value(row, indexes, 'distinct_users'), 'distinct_users'),
firstSeen: isoTimestamp(value(row, indexes, 'first_seen'), 'first_seen'),
lastSeen: isoTimestamp(value(row, indexes, 'last_seen'), 'last_seen'),
p50RuntimeMs: nullableNumber(value(row, indexes, 'p50_ms')),
p95RuntimeMs: nullableNumber(value(row, indexes, 'p95_ms')),
errorRate: requiredNumber(value(row, indexes, 'error_rate'), 'error_rate'),
rowsProduced: nullableInteger(value(row, indexes, 'rows_produced')),
},
topUsers: parseTopUsers(value(row, indexes, 'top_users')),
});
}
function mapRow(row: unknown[], indexes: Map<string, number>): HistoricSqlRawQueryRow {
const errorReason = nullableString(value(row, indexes, 'error_reason'));
const errorMessage = nullableString(value(row, indexes, 'error_message'));
return {
id: requiredString(value(row, indexes, 'job_id'), 'job_id'),
sql: requiredString(value(row, indexes, 'query'), 'query'),
user: nullableString(value(row, indexes, 'user_email')),
startedAt: isoTimestamp(value(row, indexes, 'creation_time'), 'creation_time'),
endedAt: nullableIsoTimestamp(value(row, indexes, 'end_time')),
runtimeMs: nullableNumber(value(row, indexes, 'runtime_ms')),
success: executionSucceeded(nullableString(value(row, indexes, 'state')), errorReason, errorMessage),
errorMessage: combinedErrorMessage(errorReason, errorMessage),
};
}
export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader {
export class BigQueryHistoricSqlQueryHistoryReader {
private readonly viewPath: string;
constructor(options: BigQueryHistoricSqlQueryHistoryReaderOptions) {
@ -167,7 +195,7 @@ export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHi
this.viewPath = `\`${projectId}.region-${region}.INFORMATION_SCHEMA.JOBS_BY_PROJECT\``;
}
async probe(client: unknown): Promise<void> {
async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> {
let result: QueryResultLike;
try {
result = await queryClient(client).executeQuery(`SELECT 1 FROM ${this.viewPath} LIMIT 1`);
@ -177,43 +205,43 @@ export class BigQueryHistoricSqlQueryHistoryReader implements HistoricSqlQueryHi
if (result.error) {
throw grantsError(result.error);
}
return { warnings: [], info: [] };
}
async *fetch(
async *fetchAggregated(
client: unknown,
window: HistoricSqlTimeWindow,
cursor?: string | null,
): AsyncIterable<HistoricSqlRawQueryRow> {
const start = timestampExpression(cursor ?? window.start);
const end = timestampExpression(window.end);
config: HistoricSqlUnifiedPullConfig,
): AsyncIterable<AggregatedTemplate> {
const sql = `
SELECT
job_id,
query,
user_email,
creation_time,
end_time,
TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND) AS runtime_ms,
total_slot_ms,
total_bytes_processed,
state,
error_result.reason AS error_reason,
error_result.message AS error_message,
statement_type
query_hash AS template_id,
MIN(query) AS canonical_sql,
COUNT(*) AS executions,
COUNT(DISTINCT user_email) AS distinct_users,
MIN(creation_time) AS first_seen,
MAX(creation_time) AS last_seen,
APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(50)] AS p50_ms,
APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(95)] AS p95_ms,
SAFE_DIVIDE(COUNTIF(error_result IS NOT NULL), COUNT(*)) AS error_rate,
CAST(NULL AS INT64) AS rows_produced,
TO_JSON_STRING(ARRAY_AGG(STRUCT(user_email AS user, 1 AS executions) ORDER BY creation_time DESC LIMIT 5)) AS top_users
FROM ${this.viewPath}
WHERE creation_time >= ${start}
AND creation_time < ${end}
AND job_type = 'QUERY'
WHERE job_type = 'QUERY'
AND statement_type IN ('SELECT', 'MERGE')
AND creation_time >= ${timestampExpression(window.start)}
AND creation_time < ${timestampExpression(window.end)}
AND query IS NOT NULL
AND (statement_type IS NULL OR statement_type != 'SCRIPT')
ORDER BY creation_time ASC, job_id ASC`.trim();
GROUP BY query_hash
HAVING COUNT(*) >= ${config.minExecutions}
ORDER BY executions DESC`.trim();
const result = await queryClient(client).executeQuery(sql);
if (result.error) {
throw grantsError(result.error);
}
const indexes = indexByHeader(result.headers);
for (const row of result.rows) {
yield mapRow(row, indexes);
yield mapAggregatedRow(row, indexes);
}
}
}

View file

@ -0,0 +1,59 @@
import { describe, expect, it } from 'vitest';
import {
bucketDistinctUsers,
bucketErrorRate,
bucketExecutions,
bucketFrequency,
bucketP95Runtime,
bucketRecency,
} from './buckets.js';
// Pins the exact label every bucket helper returns at, and just below, each threshold.
describe('historic-sql bucket helpers', () => {
  it('uses stable execution buckets', () => {
    // Inputs come in pairs around each boundary: 10, 100, 1000, 5000 and 50000.
    const executionCounts = [0, 9, 10, 99, 100, 999, 1000, 4999, 5000, 49999, 50000];
    const executionLabels = executionCounts.map(bucketExecutions);
    expect(executionLabels).toEqual([
      '<10',
      '<10',
      '10-100',
      '10-100',
      '100-1k',
      '100-1k',
      '1k-5k',
      '1k-5k',
      '5k-50k',
      '5k-50k',
      '>50k',
    ]);
  });

  it('uses stable distinct-user, error-rate, runtime, and recency buckets', () => {
    // Distinct users: 5 still belongs to '2-5'; 6 is the first value labelled '5-10'.
    const userCounts = [0, 1, 2, 5, 6, 10, 11];
    expect(userCounts.map(bucketDistinctUsers)).toEqual([
      '0',
      '1',
      '2-5',
      '2-5',
      '5-10',
      '5-10',
      '>10',
    ]);

    const errorRates = [0, 0.01, 0.05, 0.2];
    expect(errorRates.map(bucketErrorRate)).toEqual(['none', 'low', 'low', 'high']);

    // A null p95 means no runtime was measured and maps to 'unknown'.
    const p95Runtimes = [null, 99, 100, 999, 1000, 9999, 10000];
    expect(p95Runtimes.map(bucketP95Runtime)).toEqual([
      'unknown',
      '<100ms',
      '100ms-1s',
      '100ms-1s',
      '1s-10s',
      '1s-10s',
      '>10s',
    ]);

    // Recency is always measured against the same reference instant.
    const recencyCases: Array<[string, string]> = [
      ['2026-05-11T00:00:00.000Z', 'current'],
      ['2026-04-20T00:00:00.000Z', 'recent'],
      ['2026-01-01T00:00:00.000Z', 'stale'],
    ];
    for (const [lastSeen, expectedLabel] of recencyCases) {
      expect(bucketRecency(lastSeen, new Date('2026-05-11T12:00:00.000Z'))).toBe(expectedLabel);
    }
  });

  it('maps frequency counts to high, mid, and low labels', () => {
    const frequencyCases: Array<[number, number, string]> = [
      [80, 100, 'high'],
      [20, 100, 'mid'],
      [1, 100, 'low'],
      // A zero total must not divide by zero; it is reported as 'low'.
      [0, 0, 'low'],
    ];
    for (const [count, total, expectedLabel] of frequencyCases) {
      expect(bucketFrequency(count, total)).toBe(expectedLabel);
    }
  });
});

View file

@ -0,0 +1,49 @@
/**
 * Maps an execution count onto a coarse range label.
 *
 * Each bound is exclusive: a value equal to a bound falls into the next
 * bucket (for example 10 is `'10-100'`, not `'<10'`).
 *
 * @param value - Number of executions.
 * @returns One of `'<10'`, `'10-100'`, `'100-1k'`, `'1k-5k'`, `'5k-50k'`, `'>50k'`.
 */
export function bucketExecutions(value: number): string {
  // Ascending exclusive upper bounds paired with the label for values below them.
  const upperBounds: ReadonlyArray<readonly [number, string]> = [
    [10, '<10'],
    [100, '10-100'],
    [1000, '100-1k'],
    [5000, '1k-5k'],
    [50000, '5k-50k'],
  ];
  for (const [limit, label] of upperBounds) {
    if (value < limit) {
      return label;
    }
  }
  // Anything that is not below the last bound lands in the open-ended bucket.
  return '>50k';
}
/**
 * Maps a distinct-user count onto a coarse range label.
 *
 * Upper bounds are inclusive, so 5 is labelled `'2-5'` and the `'5-10'`
 * label effectively covers 6 through 10.
 *
 * @param value - Number of distinct users.
 * @returns One of `'0'`, `'1'`, `'2-5'`, `'5-10'`, `'>10'`.
 */
export function bucketDistinctUsers(value: number): string {
  // Evaluated top to bottom; the first matching condition wins.
  return value <= 0
    ? '0'
    : value === 1
      ? '1'
      : value <= 5
        ? '2-5'
        : value <= 10
          ? '5-10'
          : '>10';
}
/**
 * Maps an error rate onto a qualitative label.
 *
 * @param value - Fraction of executions that failed (0.1 is the `'high'` threshold).
 * @returns `'none'` for values at or below 0, `'low'` for values below 0.1,
 *   otherwise `'high'`.
 */
export function bucketErrorRate(value: number): string {
  const isErrorFree = value <= 0;
  if (isErrorFree) {
    return 'none';
  }
  return value < 0.1 ? 'low' : 'high';
}
/**
 * Maps a p95 runtime onto a bucket label.
 *
 * The labels express the thresholds as 100ms / 1s / 10s, so `value` is
 * presumably in milliseconds (confirm against the caller). Upper bounds are
 * exclusive: 99 is '<100ms' while 100 is '100ms-1s'.
 *
 * @param value - The p95 runtime, or null when it is not available.
 * @returns 'unknown' for null or NaN, otherwise one of '<100ms', '100ms-1s',
 *   '1s-10s' or '>10s'.
 */
export function bucketP95Runtime(value: number | null): string {
  // NaN fails every `<` comparison below and would otherwise be reported as
  // the slowest bucket; treat it like a missing measurement instead.
  if (value === null || Number.isNaN(value)) return 'unknown';
  if (value < 100) return '<100ms';
  if (value < 1000) return '100ms-1s';
  if (value < 10000) return '1s-10s';
  return '>10s';
}
/**
 * Classifies how recently something was last seen, relative to `now`.
 *
 * @param lastSeen - Timestamp string understood by the `Date` constructor.
 * @param now - Reference instant the age is measured against.
 * @returns 'unknown' when `lastSeen` cannot be parsed, 'current' for an age
 *   of at most 7 days, 'recent' for at most 45 days, otherwise 'stale'.
 */
export function bucketRecency(lastSeen: string, now: Date): string {
  const lastSeenMs = new Date(lastSeen).getTime();
  if (Number.isNaN(lastSeenMs)) {
    return 'unknown';
  }

  const msPerDay = 24 * 60 * 60 * 1000;
  const ageDays = (now.getTime() - lastSeenMs) / msPerDay;

  // Both limits are inclusive; a timestamp in the future has a negative age and is 'current'.
  if (ageDays <= 7) {
    return 'current';
  }
  return ageDays <= 45 ? 'recent' : 'stale';
}
/**
 * Labels how large a share `count` is of `total`.
 *
 * @param count - Occurrences of the item being labelled.
 * @param total - Overall number of occurrences.
 * @returns 'high' when the share is at least 50%, 'mid' when it is at least
 *   10%, otherwise 'low'. A non-positive `count` or `total` is always 'low'.
 */
export function bucketFrequency(count: number, total: number): 'high' | 'mid' | 'low' {
  // Collapse the degenerate cases to a zero share so no division by zero happens.
  const share = total <= 0 || count <= 0 ? 0 : count / total;
  if (share >= 0.5) {
    return 'high';
  }
  return share >= 0.1 ? 'mid' : 'low';
}

View file

@ -0,0 +1,182 @@
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js';
/** Creates a fresh, uniquely named temporary directory for one test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-unified-chunk-');
  const created = await mkdtemp(prefix);
  return created;
}
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline)
 * to `relPath` under `root`, creating missing parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  const parentDir = join(target, '..');
  await mkdir(parentDir, { recursive: true });
  const body = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(target, body, 'utf-8');
}
/**
 * Stages a minimal unified historic-SQL directory under `root`: a manifest,
 * one table usage file, the aggregate patterns-input.json and one patterns
 * shard that carries the same payload as the aggregate file.
 */
async function writeUnifiedStagedDir(root: string): Promise<void> {
  const manifest = {
    source: 'historic-sql',
    connectionId: 'warehouse',
    dialect: 'postgres',
    fetchedAt: '2026-05-11T00:00:00.000Z',
    windowStart: '2026-02-10T00:00:00.000Z',
    windowEnd: '2026-05-11T00:00:00.000Z',
    snapshotRowCount: 1,
    touchedTableCount: 1,
    parseFailures: 0,
    warnings: [],
    probeWarnings: [],
  };
  const ordersTable = {
    table: 'public.orders',
    stats: {
      executionsBucket: '10-100',
      distinctUsersBucket: '2-5',
      errorRateBucket: 'none',
      p95RuntimeBucket: '<100ms',
      recencyBucket: 'current',
    },
    columnsByClause: { select: [['status', 'high']] },
    observedJoins: [],
    topTemplates: [{ id: 'orders', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }],
  };
  // The aggregate file and the first shard intentionally hold identical content.
  const patternsPayload = {
    templates: [
      {
        id: 'orders',
        canonicalSql: 'select * from public.orders join public.customers on true',
        tablesTouched: ['public.orders', 'public.customers'],
        executionsBucket: '10-100',
        distinctUsersBucket: '2-5',
        dialect: 'postgres',
      },
    ],
  };

  const stagedFiles: Array<[string, unknown]> = [
    ['manifest.json', manifest],
    ['tables/public.orders.json', ordersTable],
    ['patterns-input.json', patternsPayload],
    ['patterns-input/part-0001.json', patternsPayload],
  ];
  // Written one after another, in the listed order.
  for (const [relPath, content] of stagedFiles) {
    await writeJson(root, relPath, content);
  }
}
// Exercises chunking and scope description against a staged directory written to a temp folder.
describe('chunkHistoricSqlUnifiedStagedDir', () => {
// Without a diff set every table file and every patterns shard becomes a WorkUnit;
// the aggregate patterns-input.json is expected to produce no unit of its own.
it('emits one table WorkUnit plus one patterns WorkUnit', async () => {
const stagedDir = await tempDir();
await writeUnifiedStagedDir(stagedDir);
const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir);
expect(result.workUnits).toEqual([
expect.objectContaining({
unitKey: 'historic-sql-table-public-orders',
displayLabel: 'Historic SQL usage: public.orders',
rawFiles: ['tables/public.orders.json'],
dependencyPaths: ['manifest.json'],
notes: expect.stringContaining('historic_sql_table_digest'),
}),
expect.objectContaining({
unitKey: 'historic-sql-patterns-part-0001',
displayLabel: 'Historic SQL cross-table patterns: part-0001',
rawFiles: ['patterns-input/part-0001.json'],
dependencyPaths: ['manifest.json'],
notes: expect.stringContaining('patterns-input/part-0001.json'),
}),
]);
// Both kinds of unit must direct the agent to the evidence tool.
expect(result.workUnits[0]?.notes).toContain('emit_historic_sql_evidence');
expect(result.workUnits[1]?.notes).toContain('emit_historic_sql_evidence');
expect(result.reconcileNotes).toEqual(['Historic-SQL touched tables=1 parseFailures=0']);
});
// With a diff set only added/modified paths produce WorkUnits.
it('respects diff sets for unchanged table and patterns files', async () => {
const stagedDir = await tempDir();
await writeUnifiedStagedDir(stagedDir);
// Only the table file changed.
await expect(
chunkHistoricSqlUnifiedStagedDir(stagedDir, {
added: [],
modified: ['tables/public.orders.json'],
deleted: [],
unchanged: ['manifest.json', 'patterns-input.json', 'patterns-input/part-0001.json'],
}),
).resolves.toMatchObject({
workUnits: [expect.objectContaining({ unitKey: 'historic-sql-table-public-orders' })],
});
// Only the patterns shard changed.
await expect(
chunkHistoricSqlUnifiedStagedDir(stagedDir, {
added: [],
modified: ['patterns-input/part-0001.json'],
deleted: [],
unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'],
}),
).resolves.toMatchObject({
workUnits: [expect.objectContaining({ unitKey: 'historic-sql-patterns-part-0001' })],
});
// A change to the aggregate patterns-input.json alone yields no WorkUnits.
await expect(
chunkHistoricSqlUnifiedStagedDir(stagedDir, {
added: [],
modified: ['patterns-input.json'],
deleted: [],
unchanged: ['manifest.json', 'patterns-input/part-0001.json', 'tables/public.orders.json'],
}),
).resolves.toMatchObject({
workUnits: [],
});
});
it('describes unified staged scope', async () => {
const stagedDir = await tempDir();
await writeUnifiedStagedDir(stagedDir);
const scope = await describeHistoricSqlUnifiedScope(stagedDir);
expect(scope.isPathInScope('manifest.json')).toBe(true);
expect(scope.isPathInScope('patterns-input.json')).toBe(true);
expect(scope.isPathInScope('patterns-input/part-0001.json')).toBe(true);
// part-1.json is expected to be out of scope even though part-0001.json is in scope.
expect(scope.isPathInScope('patterns-input/part-1.json')).toBe(false);
expect(scope.isPathInScope('tables/public.orders.json')).toBe(true);
// Paths from the earlier templates/ layout are not part of the unified scope.
expect(scope.isPathInScope('templates/old/page.md')).toBe(false);
});
// One added and one modified shard: each gets its own unit, listed in path order.
it('emits one patterns WorkUnit per changed shard', async () => {
const stagedDir = await tempDir();
await writeUnifiedStagedDir(stagedDir);
await writeJson(stagedDir, 'patterns-input/part-0002.json', {
templates: [
{
id: 'line-items',
canonicalSql: 'select * from public.orders join public.line_items on true',
tablesTouched: ['public.orders', 'public.line_items'],
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
dialect: 'postgres',
},
],
});
const result = await chunkHistoricSqlUnifiedStagedDir(stagedDir, {
added: ['patterns-input/part-0002.json'],
modified: ['patterns-input/part-0001.json'],
deleted: [],
unchanged: ['manifest.json', 'patterns-input.json', 'tables/public.orders.json'],
});
expect(result.workUnits.map((unit) => unit.unitKey)).toEqual([
'historic-sql-patterns-part-0001',
'historic-sql-patterns-part-0002',
]);
expect(result.workUnits.map((unit) => unit.rawFiles)).toEqual([
['patterns-input/part-0001.json'],
['patterns-input/part-0002.json'],
]);
});
});

View file

@ -0,0 +1,99 @@
import { createHash } from 'node:crypto';
import { readFile, readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
import { isHistoricSqlPatternInputShardPath } from './pattern-inputs.js';
import { stagedManifestSchema, stagedPatternsInputSchema, stagedTableInputSchema } from './types.js';
/**
 * Recursively lists every regular file below `root`.
 *
 * @returns Paths relative to `root`, with backslashes replaced by forward
 *   slashes, in default string sort order.
 */
async function walk(root: string): Promise<string[]> {
  const entries = await readdir(root, { withFileTypes: true, recursive: true });
  const relPaths: string[] = [];
  for (const entry of entries) {
    if (!entry.isFile()) {
      continue;
    }
    const absolute = join(entry.parentPath, entry.name);
    relPaths.push(relative(root, absolute).replace(/\\/g, '/'));
  }
  return relPaths.sort();
}
/**
 * Reads `relPath` under `stagedDir` as UTF-8 and parses it as JSON.
 *
 * The result is only cast to `T`, not validated; callers are expected to run
 * it through a schema.
 */
async function readJson<T>(stagedDir: string, relPath: string): Promise<T> {
  const raw = await readFile(join(stagedDir, relPath), 'utf-8');
  const parsed: unknown = JSON.parse(raw);
  return parsed as T;
}
/**
 * Turns an arbitrary label into a key fragment made of ASCII letters, digits
 * and single dashes, with no leading or trailing dash.
 */
function safeUnitKey(value: string): string {
  // Collapse each run of other characters into one dash...
  const dashed = value.replace(/[^a-zA-Z0-9]+/g, '-');
  // ...then trim dashes left at either end.
  return dashed.replace(/^-+|-+$/g, '');
}
/**
 * Reports whether `path` should be processed.
 *
 * @param touched - Set of changed paths, or null when there is no diff
 *   information, in which case every path counts as touched.
 */
function touchedPath(path: string, touched: Set<string> | null): boolean {
  if (touched) {
    return touched.has(path);
  }
  return true;
}
/**
 * Splits a unified historic-SQL staged directory into WorkUnits.
 *
 * One WorkUnit is produced per `tables/*.json` file and one per patterns
 * input shard. When `diffSet` is given, only added or modified files produce
 * units, and deleted table/shard paths are reported for eviction.
 *
 * @param stagedDir - Root of the staged directory; must contain manifest.json.
 * @param diffSet - Optional change set; omitted means every file is processed.
 * @returns The WorkUnits, an optional eviction list, a reconcile note built
 *   from the manifest counters, and the manifest's warnings.
 * @throws If the manifest or any processed staged file fails to read, parse
 *   or validate against its schema.
 */
export async function chunkHistoricSqlUnifiedStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const files = await walk(stagedDir);
  const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
  const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;

  const isTableFile = (file: string): boolean => /^tables\/.+\.json$/.test(file);
  // Every staged file except the unit's own file and the manifest.
  const peersOf = (self: string): string[] =>
    files.filter((file) => file !== self && file !== 'manifest.json').sort();

  const workUnits: WorkUnit[] = [];

  for (const tablePath of files.filter(isTableFile)) {
    if (touchedPath(tablePath, touched)) {
      const table = stagedTableInputSchema.parse(await readJson(stagedDir, tablePath));
      workUnits.push({
        unitKey: `historic-sql-table-${safeUnitKey(table.table)}`,
        displayLabel: `Historic SQL usage: ${table.table}`,
        rawFiles: [tablePath],
        dependencyPaths: ['manifest.json'],
        peerFileIndex: peersOf(tablePath),
        notes:
          'Use historic_sql_table_digest. Read this table usage JSON and emit exactly one table_usage object with emit_historic_sql_evidence. Do not call wiki_write or sl_write_source.',
      });
    }
  }

  for (const path of files.filter(isHistoricSqlPatternInputShardPath)) {
    if (touchedPath(path, touched)) {
      // Parsed for validation only; the unit itself needs just the shard's path.
      stagedPatternsInputSchema.parse(await readJson(stagedDir, path));
      const shardLabel = path.replace(/^patterns-input\//, '').replace(/\.json$/, '');
      workUnits.push({
        unitKey: `historic-sql-patterns-${safeUnitKey(shardLabel)}`,
        displayLabel: `Historic SQL cross-table patterns: ${shardLabel}`,
        rawFiles: [path],
        dependencyPaths: ['manifest.json'],
        peerFileIndex: peersOf(path),
        notes:
          `Use historic_sql_patterns. Read ${path} and emit pattern objects with emit_historic_sql_evidence using rawPath "${path}". Do not call wiki_write or sl_write_source.`,
      });
    }
  }

  // Only deleted shard and table files are evicted.
  const deleted = diffSet?.deleted
    .filter((path) => isHistoricSqlPatternInputShardPath(path) || isTableFile(path))
    .sort();
  const eviction = deleted?.length ? { deletedRawPaths: deleted } : undefined;
  const warnings = [...manifest.probeWarnings, ...manifest.warnings];

  return {
    workUnits,
    eviction,
    reconcileNotes: [`Historic-SQL touched tables=${manifest.touchedTableCount} parseFailures=${manifest.parseFailures}`],
    contextReport: {
      capped: false,
      warnings,
    },
  };
}
/**
 * Describes the scope of a unified historic-SQL staged directory.
 *
 * The fingerprint is the hex SHA-256 of the manifest's connectionId, dialect,
 * windowStart and windowEnd serialized as JSON. A path is in scope when it is
 * manifest.json, patterns-input.json, a patterns input shard, or a
 * `tables/*.json` file.
 *
 * @throws If manifest.json cannot be read, parsed or validated.
 */
export async function describeHistoricSqlUnifiedScope(stagedDir: string): Promise<ScopeDescriptor> {
  const manifest = stagedManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
  const { connectionId, dialect, windowStart, windowEnd } = manifest;
  const scopeKey = JSON.stringify({ connectionId, dialect, windowStart, windowEnd });
  const fingerprint = createHash('sha256').update(scopeKey).digest('hex');

  return {
    fingerprint,
    isPathInScope: (rawPath) => {
      if (rawPath === 'manifest.json' || rawPath === 'patterns-input.json') {
        return true;
      }
      return isHistoricSqlPatternInputShardPath(rawPath) || /^tables\/.+\.json$/.test(rawPath);
    },
  };
}

View file

@ -1,251 +0,0 @@
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
/** Creates a fresh, uniquely named temporary directory for one test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-chunk-');
  const created = await mkdtemp(prefix);
  return created;
}
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline)
 * to `relPath` under `root`, creating missing parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const target = join(root, relPath);
  const parentDir = join(target, '..');
  await mkdir(parentDir, { recursive: true });
  const body = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(target, body, 'utf-8');
}
/**
 * Stages a single-template historic-SQL directory under `root`: manifest.json
 * plus metadata.json, page.md and usage.json for template fp_1.
 */
async function writeTemplate(root: string): Promise<void> {
  const manifest = {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 1,
    capped: false,
    warnings: ['source warning'],
    templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }],
  };
  const metadata = {
    id: 'fp_1',
    title: 'snowflake · analytics.orders [fp_1]',
    path: 'templates/fp_1/page.md',
    objectType: 'historic_sql_template',
    lastEditedAt: null,
    properties: {
      fingerprint: 'fp_1',
      sub_cluster_id: null,
      dialect: 'snowflake',
      tables_touched: ['analytics.orders'],
      literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
      triage_signals: {
        executions_bucket: 'high',
        distinct_users_bucket: 'team',
        error_rate_bucket: 'ok',
        recency_bucket: 'active',
        service_account_only: 'false',
        slot_summary: '1 constant, 0 runtime',
      },
    },
  };
  const usage = {
    stats: {
      executions: 20,
      distinct_users: 3,
      first_seen: '2026-05-01T00:00:00.000Z',
      last_seen: '2026-05-04T11:55:00.000Z',
      p50_runtime_ms: 100,
      p95_runtime_ms: 200,
      error_rate: 0,
      rows_produced: 20,
    },
    literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
    samples: [],
  };

  await writeJson(root, 'manifest.json', manifest);
  // Writing metadata.json first creates templates/fp_1, which the plain writeFile below relies on.
  await writeJson(root, 'templates/fp_1/metadata.json', metadata);
  await writeFile(join(root, 'templates/fp_1/page.md'), '# fp_1\n', 'utf-8');
  await writeJson(root, 'templates/fp_1/usage.json', usage);
}
/**
 * Stages a directory with two categorical sub-clusters of the same
 * fingerprint (fp_order_status). Each gets its own metadata.json, page.md and
 * usage.json, and both are listed in manifest.json.
 */
async function writeSubclusterTemplates(root: string): Promise<void> {
  const subclusters = [
    { id: 'fp_order_status__cat_2b2ff2318877', subClusterId: 'cat_2b2ff2318877' },
    { id: 'fp_order_status__cat_34f037ddcbfa', subClusterId: 'cat_34f037ddcbfa' },
  ];

  await writeJson(root, 'manifest.json', {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 2,
    capped: false,
    warnings: [],
    templates: subclusters.map(({ id, subClusterId }) => ({
      id,
      fingerprint: 'fp_order_status',
      subClusterId,
      path: `templates/${id}/page.md`,
    })),
  });

  for (const { id, subClusterId } of subclusters) {
    const dir = `templates/${id}`;
    const metadata = {
      id,
      // The title suffix is the last six characters of the sub-cluster id.
      title: `snowflake · analytics.orders [fp_ord:${subClusterId.slice(-6)}]`,
      path: `${dir}/page.md`,
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint: 'fp_order_status',
        sub_cluster_id: subClusterId,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
        triage_signals: {
          executions_bucket: 'mid',
          distinct_users_bucket: 'team',
          error_rate_bucket: 'ok',
          recency_bucket: 'active',
          service_account_only: 'false',
          slot_summary: '0 constant, 0 runtime',
        },
      },
    };
    const usage = {
      stats: {
        executions: 3,
        distinct_users: 3,
        first_seen: '2026-05-04T10:00:00.000Z',
        last_seen: '2026-05-04T10:05:00.000Z',
        p50_runtime_ms: 120,
        p95_runtime_ms: 150,
        error_rate: 0,
        rows_produced: 36,
      },
      literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }],
      samples: [],
    };

    // metadata.json goes first so the template directory exists for page.md.
    await writeJson(root, `${dir}/metadata.json`, metadata);
    await writeFile(join(root, `${dir}/page.md`), `# ${id}\n`, 'utf-8');
    await writeJson(root, `${dir}/usage.json`, usage);
  }
}
// Exercises template-based chunking, eviction and scope description against temp staged directories.
describe('chunkHistoricSqlStagedDir', () => {
// Only metadata.json is in the diff: it is the sole raw file, usage.json and the manifest
// are dependencies, and the untouched page.md is listed as a peer.
it('emits one WorkUnit per changed template and keeps usage as dependency', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: ['templates/fp_1/metadata.json'],
modified: [],
deleted: [],
unchanged: ['templates/fp_1/page.md', 'templates/fp_1/usage.json', 'manifest.json'],
});
expect(result.workUnits).toEqual([
{
unitKey: 'historic-sql-fp-1',
displayLabel: 'snowflake · analytics.orders [fp_1]',
rawFiles: ['templates/fp_1/metadata.json'],
dependencyPaths: ['manifest.json', 'templates/fp_1/usage.json'],
peerFileIndex: ['templates/fp_1/page.md'],
notes:
'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
},
]);
// The manifest's capped flag and warnings are surfaced unchanged.
expect(result.contextReport).toEqual({ capped: false, warnings: ['source warning'] });
});
// Two sub-clusters of the same fingerprint must stay separate WorkUnits.
it('emits one WorkUnit per changed categorical sub-cluster', async () => {
const stagedDir = await tempDir();
await writeSubclusterTemplates(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: [
'templates/fp_order_status__cat_2b2ff2318877/metadata.json',
'templates/fp_order_status__cat_34f037ddcbfa/metadata.json',
],
modified: [],
deleted: [],
unchanged: [
'manifest.json',
'templates/fp_order_status__cat_2b2ff2318877/page.md',
'templates/fp_order_status__cat_2b2ff2318877/usage.json',
'templates/fp_order_status__cat_34f037ddcbfa/page.md',
'templates/fp_order_status__cat_34f037ddcbfa/usage.json',
],
});
// peerFileIndex and notes are projected away; only the identifying fields are compared.
expect(
result.workUnits.map((unit) => ({
unitKey: unit.unitKey,
displayLabel: unit.displayLabel,
rawFiles: unit.rawFiles,
dependencyPaths: unit.dependencyPaths,
})),
).toEqual([
{
unitKey: 'historic-sql-fp-order-status-cat-2b2ff2318877',
displayLabel: 'snowflake · analytics.orders [fp_ord:318877]',
rawFiles: ['templates/fp_order_status__cat_2b2ff2318877/metadata.json'],
dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_2b2ff2318877/usage.json'],
},
{
unitKey: 'historic-sql-fp-order-status-cat-34f037ddcbfa',
displayLabel: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
rawFiles: ['templates/fp_order_status__cat_34f037ddcbfa/metadata.json'],
dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'],
},
]);
});
// usage.json is a dependency, not a primary file: changing it alone triggers no work.
it('emits zero WorkUnits for usage-only diffs', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: [],
modified: ['templates/fp_1/usage.json'],
deleted: [],
unchanged: ['templates/fp_1/metadata.json', 'templates/fp_1/page.md', 'manifest.json'],
});
expect(result.workUnits).toEqual([]);
expect(result.eviction).toBeUndefined();
});
// A deleted usage.json is ignored; a deleted page.md is reported for eviction.
it('emits eviction only for deleted metadata or page files', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const result = await chunkHistoricSqlStagedDir(stagedDir, {
added: [],
modified: [],
deleted: ['templates/fp_1/usage.json', 'templates/fp_2/page.md'],
unchanged: [],
});
expect(result.eviction).toEqual({ deletedRawPaths: ['templates/fp_2/page.md'] });
});
it('describes historic-sql scope without including unrelated paths', async () => {
const stagedDir = await tempDir();
await writeTemplate(stagedDir);
const scope = await describeHistoricSqlScope(stagedDir);
// 64 hex characters, i.e. a SHA-256 digest.
expect(scope.fingerprint).toHaveLength(64);
expect(scope.isPathInScope('manifest.json')).toBe(true);
expect(scope.isPathInScope('templates/fp_1/usage.json')).toBe(true);
expect(scope.isPathInScope('pages/notion/page.md')).toBe(false);
});
});

View file

@ -1,86 +0,0 @@
import { createHash } from 'node:crypto';
import { readFile, readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
import { historicSqlManifestSchema, historicSqlMetadataSchema } from './types.js';
/**
 * Recursively lists every regular file below `root`.
 *
 * @returns Paths relative to `root`, with backslashes replaced by forward
 *   slashes, in default string sort order.
 */
async function walk(root: string): Promise<string[]> {
  const entries = await readdir(root, { withFileTypes: true, recursive: true });
  const relPaths: string[] = [];
  for (const entry of entries) {
    if (!entry.isFile()) {
      continue;
    }
    const absolute = join(entry.parentPath, entry.name);
    relPaths.push(relative(root, absolute).replace(/\\/g, '/'));
  }
  return relPaths.sort();
}
/**
 * Builds a WorkUnit key from a template id: the id reduced to ASCII letters,
 * digits and single dashes (no leading/trailing dash), prefixed with
 * `historic-sql-`.
 */
function safeUnitKey(id: string): string {
  const dashed = id.replace(/[^a-zA-Z0-9]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return `historic-sql-${trimmed}`;
}
/**
 * Reads and validates manifest.json from `stagedDir`.
 *
 * @throws Error with an `Invalid historic-SQL manifest: ` prefix when the
 *   file cannot be read, is not valid JSON, or fails schema validation.
 */
async function readManifest(stagedDir: string) {
  try {
    const raw = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    return historicSqlManifestSchema.parse(JSON.parse(raw));
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid historic-SQL manifest: ${reason}`);
  }
}
/**
 * Splits a template-based historic-SQL staged directory into WorkUnits, one
 * per `templates/<id>/page.md` found on disk.
 *
 * metadata.json and page.md are a template's primary files. With a `diffSet`,
 * a template is emitted only when at least one primary file was added or
 * modified, and only those changed primary files become `rawFiles`.
 * manifest.json and, when present, the template's usage.json are
 * dependencies; every other staged file is listed as a peer.
 *
 * @param stagedDir - Root of the staged directory; must contain manifest.json.
 * @param diffSet - Optional change set; omitted means every template is processed.
 * @returns WorkUnits, an optional eviction list of deleted primary files, a
 *   reconcile note, and the manifest's capped flag and warnings.
 * @throws If the manifest is invalid, or a processed template's metadata.json
 *   cannot be read, parsed or validated.
 */
export async function chunkHistoricSqlStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const files = await walk(stagedDir);
  const manifest = await readManifest(stagedDir);
  const touched = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;
  const present = new Set(files);
  // Without a diff set every path counts as touched.
  const isTouched = (path: string): boolean => touched === null || touched.has(path);

  const workUnits: WorkUnit[] = [];
  const pagePaths = files.filter((path) => /^templates\/[^/]+\/page\.md$/.test(path));

  for (const pagePath of pagePaths) {
    const metadataPath = pagePath.replace(/\/page\.md$/, '/metadata.json');
    const usagePath = pagePath.replace(/\/page\.md$/, '/usage.json');
    const primary = [metadataPath, pagePath].filter((path) => present.has(path));

    const rawFiles = primary.filter((path) => isTouched(path)).sort();
    if (rawFiles.length === 0) {
      // Neither primary file changed, so there is nothing to do for this template.
      continue;
    }

    const metadata = historicSqlMetadataSchema.parse(JSON.parse(await readFile(join(stagedDir, metadataPath), 'utf-8')));

    const dependencyCandidates = present.has(usagePath) ? ['manifest.json', usagePath] : ['manifest.json'];
    const dependencyPaths = dependencyCandidates.filter((path) => !rawFiles.includes(path)).sort();

    const excluded = new Set([...rawFiles, ...dependencyPaths]);
    const peerFileIndex = files.filter((path) => !excluded.has(path)).sort();

    workUnits.push({
      unitKey: safeUnitKey(metadata.id),
      displayLabel: metadata.title,
      rawFiles,
      dependencyPaths,
      peerFileIndex,
      notes:
        'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
    });
  }

  // Only deleted primary files are evicted; a deleted usage.json is ignored.
  const deletedPrimary = diffSet?.deleted
    .filter((path) => /^templates\/[^/]+\/(metadata\.json|page\.md)$/.test(path))
    .sort();
  const eviction = deletedPrimary?.length ? { deletedRawPaths: deletedPrimary } : undefined;

  return {
    workUnits,
    eviction,
    reconcileNotes: [`Historic-SQL staged templates=${manifest.templateCount}`],
    contextReport: {
      capped: manifest.capped,
      warnings: manifest.warnings,
    },
  };
}
/**
 * Describes the scope of a template-based historic-SQL staged directory.
 *
 * The fingerprint is the hex SHA-256 of the manifest's connectionId, dialect,
 * windowStart and windowEnd serialized as JSON. A path is in scope when it is
 * manifest.json or starts with `templates/`.
 *
 * @throws If the manifest is missing or invalid.
 */
export async function describeHistoricSqlScope(stagedDir: string): Promise<ScopeDescriptor> {
  const { connectionId, dialect, windowStart, windowEnd } = await readManifest(stagedDir);
  const hasher = createHash('sha256');
  hasher.update(JSON.stringify({ connectionId, dialect, windowStart, windowEnd }));

  return {
    fingerprint: hasher.digest('hex'),
    isPathInScope: (rawPath) => {
      if (rawPath === 'manifest.json') {
        return true;
      }
      return rawPath.startsWith('templates/');
    },
  };
}

View file

@ -3,13 +3,7 @@ import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { detectHistoricSqlStagedDir } from './detect.js';
import {
HISTORIC_SQL_SOURCE_KEY,
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlPullConfigSchema,
historicSqlUsageSchema,
} from './types.js';
import { HISTORIC_SQL_SOURCE_KEY, stagedManifestSchema } from './types.js';
async function tempDir(): Promise<string> {
return mkdtemp(join(tmpdir(), 'historic-sql-detect-'));
@ -21,32 +15,35 @@ async function writeJson(root: string, relPath: string, value: unknown): Promise
await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8');
}
/**
 * Builds an empty postgres staged manifest (all counters zero, no warnings)
 * and runs it through `stagedManifestSchema`, so the fixture is known to
 * satisfy the schema before it is written.
 */
function manifest() {
  const raw = {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: 'conn_1',
    dialect: 'postgres',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    snapshotRowCount: 0,
    touchedTableCount: 0,
    parseFailures: 0,
    warnings: [],
    probeWarnings: [],
  };
  const validated = stagedManifestSchema.parse(raw);
  return validated;
}
describe('historic-sql staged dir detection', () => {
it('detects manifest source', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', {
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 0,
capped: false,
warnings: [],
templates: [],
});
await writeJson(stagedDir, 'manifest.json', manifest());
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
it('detects document-shaped template structure without manifest', async () => {
it('detects unified table and patterns structure without manifest', async () => {
const stagedDir = await tempDir();
await writeFile(join(stagedDir, 'not-a-match.txt'), 'x', 'utf-8');
await mkdir(join(stagedDir, 'templates', 'fp_1'), { recursive: true });
await writeFile(join(stagedDir, 'templates', 'fp_1', 'metadata.json'), '{}', 'utf-8');
await writeFile(join(stagedDir, 'templates', 'fp_1', 'page.md'), '# fp_1\n', 'utf-8');
await writeJson(stagedDir, 'patterns-input.json', { templates: [] });
await writeJson(stagedDir, 'tables/public.orders.json', { table: 'public.orders' });
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
@ -58,140 +55,3 @@ describe('historic-sql staged dir detection', () => {
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(false);
});
});
// Pins the parsing defaults and accepted shapes of the historic-SQL zod schemas.
describe('historic-sql schemas', () => {
// Supplying only the dialect must fill in every other pull-config field with its default.
it('defaults disabled optional pull-config fields through the parser', () => {
expect(
historicSqlPullConfigSchema.parse({
dialect: 'bigquery',
}),
).toEqual({
dialect: 'bigquery',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
});
});
// An explicit minCalls overrides the default of 5; the other defaults still apply.
it('accepts postgres pull config with a minCalls floor', () => {
expect(
historicSqlPullConfigSchema.parse({
dialect: 'postgres',
minCalls: 12,
}),
).toEqual({
dialect: 'postgres',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 12,
});
});
it('accepts postgres manifest fields with defaults for older dialects', () => {
// Postgres manifest: the extra fields are supplied and must be preserved.
expect(
historicSqlManifestSchema.parse({
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: 'conn_pg',
dialect: 'postgres',
fetchedAt: '2026-05-08T12:00:00.000Z',
windowStart: '2026-05-08T11:00:00.000Z',
windowEnd: '2026-05-08T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
templateCount: 0,
capped: false,
warnings: [],
templates: [],
degraded: true,
statsResetAt: '2026-05-01T00:00:00.000Z',
baselineFirstRun: true,
pgServerVersion: 'PostgreSQL 16.4',
deallocCount: 3,
}),
).toMatchObject({
dialect: 'postgres',
degraded: true,
statsResetAt: '2026-05-01T00:00:00.000Z',
baselineFirstRun: true,
pgServerVersion: 'PostgreSQL 16.4',
deallocCount: 3,
});
// Snowflake manifest without those fields: they must default to false/null.
expect(
historicSqlManifestSchema.parse({
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: 'conn_sf',
dialect: 'snowflake',
fetchedAt: '2026-05-08T12:00:00.000Z',
windowStart: '2026-05-01T12:00:00.000Z',
windowEnd: '2026-05-08T12:00:00.000Z',
nextSuccessfulCursor: null,
templateCount: 0,
capped: false,
warnings: [],
templates: [],
}),
).toMatchObject({
degraded: false,
statsResetAt: null,
baselineFirstRun: false,
pgServerVersion: null,
deallocCount: null,
});
});
// Null percentiles, a mean runtime and an empty samples list must all be accepted.
it('accepts postgres usage stats with mean_runtime_ms and empty samples', () => {
const parsed = historicSqlUsageSchema.parse({
stats: {
executions: 25,
distinct_users: 2,
first_seen: '2026-05-08T10:00:00.000Z',
last_seen: '2026-05-08T12:00:00.000Z',
p50_runtime_ms: null,
p95_runtime_ms: null,
mean_runtime_ms: 32.5,
error_rate: 0,
rows_produced: 1042,
},
literal_slots: [],
samples: [],
});
expect(parsed.stats.mean_runtime_ms).toBe(32.5);
expect(parsed.samples).toEqual([]);
});
it('pins the Notion-compatible metadata envelope', () => {
const parsed = historicSqlMetadataSchema.parse({
id: 'fp_1',
title: 'snowflake · analytics.orders [fp_1]',
path: 'templates/fp_1/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_1',
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
triage_signals: {
executions_bucket: 'high',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 0 runtime',
},
},
});
expect(parsed.objectType).toBe('historic_sql_template');
expect(parsed.lastEditedAt).toBeNull();
// service_account_only is carried as the string 'false', not a boolean.
expect(parsed.properties.triage_signals.service_account_only).toBe('false');
});
});

View file

@ -16,21 +16,9 @@ export async function detectHistoricSqlStagedDir(stagedDir: string): Promise<boo
}
try {
const entries = await readdir(join(stagedDir, 'templates'), { withFileTypes: true, recursive: true });
const metadataDirs = new Set<string>();
const pageDirs = new Set<string>();
for (const entry of entries) {
if (!entry.isFile()) {
continue;
}
if (entry.name === 'metadata.json') {
metadataDirs.add(entry.parentPath);
}
if (entry.name === 'page.md') {
pageDirs.add(entry.parentPath);
}
}
return [...metadataDirs].some((dir) => pageDirs.has(dir));
await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8');
const entries = await readdir(join(stagedDir, 'tables'), { withFileTypes: true });
return entries.some((entry) => entry.isFile() && entry.name.endsWith('.json'));
} catch {
return false;
}

View file

@ -0,0 +1,89 @@
import { describe, expect, it, vi } from 'vitest';
import { asSchema } from 'ai';
import { createEmitHistoricSqlEvidenceTool } from './evidence-tool.js';
// Covers the tool's input schema shape, the evidence file it writes, and its source-key guard.
describe('emit_historic_sql_evidence tool', () => {
// The JSON schema derived from the zod input schema must be an object at the top level.
it('exposes an AI SDK v6 tool input schema with top-level object type', async () => {
const tool = createEmitHistoricSqlEvidenceTool();
expect(await asSchema(tool.inputSchema).jsonSchema).toMatchObject({
type: 'object',
});
});
it('writes table usage evidence to the ignored run evidence directory', async () => {
// Stub for configService.writeFile so the call arguments can be inspected.
const writeFile = vi.fn(async () => ({ success: true, commitHash: null }));
const tool = createEmitHistoricSqlEvidenceTool();
const result = await tool.execute!(
{
kind: 'table_usage',
table: 'public.orders',
rawPath: 'tables/public.orders.json',
usage: {
narrative: 'Orders are repeatedly queried by paid status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [],
staleSince: null,
},
},
// Minimal tool-call options; the session is supplied through experimental_context.
{
toolCallId: 'call-1',
messages: [],
abortSignal: new AbortController().signal,
experimental_context: {
connectionId: 'warehouse',
session: {
ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'historic-sql' },
configService: { writeFile },
},
},
} as never,
);
expect(result).toBe('Recorded historic-SQL table_usage evidence for public.orders.');
// Expected path contains the run id and a file name derived from the table name;
// the write is made as the system author with skipLock set.
expect(writeFile).toHaveBeenCalledWith(
'.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json',
expect.stringContaining('"kind": "table_usage"'),
'System User',
'system@example.com',
'Record historic-SQL evidence: historic-sql-table-public-orders',
{ skipLock: true },
);
});
// A session whose sourceKey is not 'historic-sql' resolves to an error string rather than rejecting.
it('rejects non-historic ingest sessions', async () => {
const tool = createEmitHistoricSqlEvidenceTool();
await expect(
tool.execute!(
{
kind: 'pattern',
rawPath: 'patterns-input.json',
pattern: {
slug: 'orders',
title: 'Orders',
narrative: 'Orders pattern.',
definitionSql: 'select * from public.orders',
tablesInvolved: ['public.orders'],
slRefs: ['orders'],
constituentTemplateIds: ['pg:1'],
},
},
{
toolCallId: 'call-1',
messages: [],
abortSignal: new AbortController().signal,
experimental_context: {
connectionId: 'warehouse',
session: {
ingest: { runId: 'run-1', jobId: 'job-1', syncId: 'sync-1', sourceKey: 'notion' },
configService: { writeFile: vi.fn() },
},
},
} as never,
),
).resolves.toContain('Error: emit_historic_sql_evidence is only available during historic-sql ingest');
});
});

View file

@ -0,0 +1,121 @@
import { tool } from 'ai';
import { z } from 'zod';
import { historicSqlEvidencePath, serializeHistoricSqlEvidence } from './evidence.js';
import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js';
// Author identity for evidence writes; presumably passed as the author and
// authorEmail arguments of configService.writeFile (the accompanying test
// expects these exact values there) — confirm in createEmitHistoricSqlEvidenceTool.
const SYSTEM_AUTHOR = 'System User';
const SYSTEM_EMAIL = 'system@example.com';
/**
 * Input schema of the emit_historic_sql_evidence tool.
 *
 * It is a single flat object rather than a union, with the per-kind
 * requirements enforced in `superRefine`: `table_usage` needs both `table`
 * and `usage`, `pattern` needs `pattern`. Each missing field is reported as
 * its own issue on that field's path.
 */
const emitHistoricSqlEvidenceInputSchema = z
  .object({
    kind: z.enum(['table_usage', 'pattern']),
    table: z.string().min(1).optional(),
    rawPath: z.string().min(1),
    usage: tableUsageOutputSchema.optional(),
    pattern: patternOutputSchema.optional(),
  })
  .superRefine((input, ctx) => {
    // Message reported for each conditionally required field.
    const missingMessages = {
      table: 'table is required when kind is table_usage',
      usage: 'usage is required when kind is table_usage',
      pattern: 'pattern is required when kind is pattern',
    } as const;
    // `kind` is a two-value enum, so anything that is not table_usage is pattern.
    const requiredFields: ReadonlyArray<keyof typeof missingMessages> =
      input.kind === 'table_usage' ? ['table', 'usage'] : ['pattern'];

    for (const field of requiredFields) {
      if (!input[field]) {
        ctx.addIssue({
          code: 'custom',
          path: [field],
          message: missingMessages[field],
        });
      }
    }
  });
/** Tool input type, inferred from emitHistoricSqlEvidenceInputSchema. */
type EmitHistoricSqlEvidenceInput = z.infer<typeof emitHistoricSqlEvidenceInputSchema>;
/**
 * The slice of the execution context that the evidence tool reads.
 * Every level is optional; the tool returns an error string when a required part is absent.
 */
interface EmitHistoricSqlEvidenceToolContext {
// Connection the evidence is recorded for; copied into the evidence envelope.
connectionId?: string | null;
session?: {
// runId selects the evidence directory; sourceKey must be 'historic-sql' for the tool to run.
ingest?: { runId: string; sourceKey: string };
// Store used to persist the serialized evidence file.
configService?: {
writeFile(
path: string,
content: string,
author: string,
authorEmail: string,
commitMessage: string,
options?: { skipLock?: boolean },
): Promise<unknown>;
};
};
}
/**
 * Stringifies `raw`, collapses every run of characters outside [a-zA-Z0-9] into a
 * single '-', and trims '-' from both ends.
 */
function evidenceKeySlug(raw: unknown): string {
  return String(raw)
    .replace(/[^a-zA-Z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '');
}

/**
 * Derives the evidence unit key for a tool input.
 *
 * Table usage evidence is keyed by the sanitized table name, anything else by the
 * sanitized pattern slug. A missing table or slug is stringified as-is (i.e. "undefined").
 */
function unitKeyForEvidence(input: EmitHistoricSqlEvidenceInput): string {
  const isTableUsage = input.kind === 'table_usage';
  const prefix = isTableUsage ? 'historic-sql-table-' : 'historic-sql-pattern-';
  const subject = isTableUsage ? input.table : input.pattern?.slug;
  return `${prefix}${evidenceKeySlug(subject)}`;
}
/**
 * Builds the evidence envelope for a tool input.
 *
 * @param input - Tool input; `table_usage` inputs need `table` and `usage`, all other inputs need `pattern`.
 * @param connectionId - Connection the evidence belongs to.
 * @returns A `table_usage` or `pattern` envelope carrying the connection id and raw path.
 * @throws Error when the fields required for the input's kind are missing.
 */
function evidenceEnvelope(input: EmitHistoricSqlEvidenceInput, connectionId: string) {
  const { rawPath } = input;

  switch (input.kind) {
    case 'table_usage': {
      const { table, usage } = input;
      if (table && usage) {
        return { kind: 'table_usage' as const, connectionId, table, rawPath, usage };
      }
      throw new Error('Invalid historic-SQL table usage evidence input.');
    }
    default: {
      const { pattern } = input;
      if (pattern) {
        return { kind: 'pattern' as const, connectionId, rawPath, pattern };
      }
      throw new Error('Invalid historic-SQL pattern evidence input.');
    }
  }
}
/**
 * Creates the emit_historic_sql_evidence tool.
 *
 * On execution the tool resolves its context from the per-call `experimental_context`,
 * falling back to `defaultContext` when none is supplied. It only records evidence when the
 * session is a historic-sql ingest with a config service and a connection id; otherwise it
 * resolves with an error string. Evidence is serialized and written via
 * `configService.writeFile` with `skipLock: true`.
 *
 * @param defaultContext - Context used when a call carries no `experimental_context`.
 * @returns The tool; `execute` resolves with a human-readable status string.
 */
export function createEmitHistoricSqlEvidenceTool(defaultContext?: EmitHistoricSqlEvidenceToolContext) {
  return tool({
    description:
      'Record typed historic-SQL evidence for deterministic projection. Use this instead of wiki_write, sl_write_source, sl_edit_source, or context_candidate_write during historic-SQL WorkUnits.',
    inputSchema: emitHistoricSqlEvidenceInputSchema,
    execute: async (input, options): Promise<string> => {
      const unavailable = 'Error: emit_historic_sql_evidence is only available during historic-sql ingest.';

      const perCallContext = options.experimental_context as EmitHistoricSqlEvidenceToolContext | undefined;
      const context = perCallContext ?? defaultContext;
      const session = context?.session;
      const ingest = session?.ingest;
      const configService = session?.configService;
      const connectionId = context?.connectionId;

      if (!ingest || ingest.sourceKey !== 'historic-sql') {
        return unavailable;
      }
      if (!configService || !connectionId) {
        return unavailable;
      }

      // Same evaluation order as before: key, envelope, serialized content, then target path.
      const unitKey = unitKeyForEvidence(input);
      const evidence = evidenceEnvelope(input, connectionId);
      const content = serializeHistoricSqlEvidence(evidence);
      const targetPath = historicSqlEvidencePath(ingest.runId, unitKey);

      await configService.writeFile(
        targetPath,
        content,
        SYSTEM_AUTHOR,
        SYSTEM_EMAIL,
        `Record historic-SQL evidence: ${unitKey}`,
        { skipLock: true },
      );

      const subject = evidence.kind === 'pattern' ? evidence.pattern.slug : evidence.table;
      return `Recorded historic-SQL ${input.kind} evidence for ${subject}.`;
    },
  });
}

View file

@ -0,0 +1,57 @@
import { describe, expect, it } from 'vitest';
import {
historicSqlEvidenceEnvelopeSchema,
historicSqlEvidencePath,
historicSqlPatternEvidenceSchema,
historicSqlTableUsageEvidenceSchema,
} from './evidence.js';
// Contract tests for the evidence schemas and the evidence file path helper.
describe('historic-sql evidence contracts', () => {
// A complete table_usage envelope must parse, and the parsed value must expose its fields.
it('validates table usage evidence emitted by table digest WorkUnits', () => {
const parsed = historicSqlTableUsageEvidenceSchema.parse({
kind: 'table_usage',
connectionId: 'warehouse',
table: 'public.orders',
rawPath: 'tables/public.orders.json',
usage: {
narrative: 'Orders are repeatedly queried for paid/refunded lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
},
});
expect(parsed.table).toBe('public.orders');
expect(parsed.usage.frequencyTier).toBe('high');
});
// The input goes through the discriminated-union envelope schema first; its output is then
// parsed again by the pattern-specific schema, so both must accept the same shape.
it('validates pattern evidence emitted by the patterns WorkUnit', () => {
const parsed = historicSqlPatternEvidenceSchema.parse(
historicSqlEvidenceEnvelopeSchema.parse({
kind: 'pattern',
connectionId: 'warehouse',
rawPath: 'patterns-input.json',
pattern: {
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Analysts compare order status changes by customer segment.',
definitionSql: 'select status, count(*) from public.orders group by status',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['pg:1', 'pg:2'],
},
}),
);
expect(parsed.kind).toBe('pattern');
expect(parsed.pattern.slug).toBe('order-lifecycle-analysis');
});
// Pins the layout: .ktx/ingest-evidence/historic-sql/<runId>/<unitKey>.json
it('builds a stable ignored evidence path from run and WorkUnit identity', () => {
expect(historicSqlEvidencePath('run-1', 'historic-sql-table-public-orders')).toBe(
'.ktx/ingest-evidence/historic-sql/run-1/historic-sql-table-public-orders.json',
);
});
});

View file

@ -0,0 +1,41 @@
import { z } from 'zod';
import { patternOutputSchema, tableUsageOutputSchema } from './skill-schemas.js';
/**
 * Turns an arbitrary identifier into a single safe path segment.
 *
 * Every run of characters outside `[a-zA-Z0-9._-]` becomes one '-', and '-' is trimmed
 * from both ends.
 *
 * @param value - Raw identifier (for example a run id or unit key).
 * @returns The sanitized segment.
 * @throws Error when nothing usable remains, or when the result is `.` or `..`.
 */
function safeEvidenceSegment(value: string): string {
  const segment = value.replace(/[^a-zA-Z0-9._-]+/g, '-').replace(/^-+|-+$/g, '');
  // '.' is an allowed character, so '.' and '..' survive sanitizing unchanged. Used as a
  // path segment they would point at the current or parent directory, so refuse them.
  const isDotSegment = segment === '.' || segment === '..';
  if (!segment || isDotSegment) {
    throw new Error(`Invalid historic-SQL evidence path segment: ${value}`);
  }
  return segment;
}
/** Evidence envelope for one table's usage: the table name, its raw input path, and the usage payload. */
export const historicSqlTableUsageEvidenceSchema = z.object({
kind: z.literal('table_usage'),
connectionId: z.string().min(1),
table: z.string().min(1),
rawPath: z.string().min(1),
usage: tableUsageOutputSchema,
});
/** Parsed shape of {@link historicSqlTableUsageEvidenceSchema}. */
export type HistoricSqlTableUsageEvidence = z.infer<typeof historicSqlTableUsageEvidenceSchema>;
/** Evidence envelope for one query pattern: its raw input path and the pattern payload. */
export const historicSqlPatternEvidenceSchema = z.object({
kind: z.literal('pattern'),
connectionId: z.string().min(1),
rawPath: z.string().min(1),
pattern: patternOutputSchema,
});
/** Parsed shape of {@link historicSqlPatternEvidenceSchema}. */
export type HistoricSqlPatternEvidence = z.infer<typeof historicSqlPatternEvidenceSchema>;
/** Either evidence envelope, discriminated on `kind`. */
export const historicSqlEvidenceEnvelopeSchema = z.discriminatedUnion('kind', [
historicSqlTableUsageEvidenceSchema,
historicSqlPatternEvidenceSchema,
]);
/** Parsed shape of {@link historicSqlEvidenceEnvelopeSchema}. */
export type HistoricSqlEvidenceEnvelope = z.infer<typeof historicSqlEvidenceEnvelopeSchema>;
/**
 * Builds the evidence file path for a unit of an ingest run.
 *
 * Both parts are sanitized into single path segments before being placed under
 * `.ktx/ingest-evidence/historic-sql/`.
 *
 * @param runId - Ingest run identifier.
 * @param unitKey - Evidence unit key; becomes the file name (with a `.json` suffix).
 * @throws Error when either part sanitizes to an invalid segment.
 */
export function historicSqlEvidencePath(runId: string, unitKey: string): string {
  const runSegment = safeEvidenceSegment(runId);
  const unitSegment = safeEvidenceSegment(unitKey);
  return `.ktx/ingest-evidence/historic-sql/${runSegment}/${unitSegment}.json`;
}
/**
 * Validates an evidence envelope and renders it as 2-space-indented JSON followed by a newline.
 *
 * @throws When the envelope does not satisfy historicSqlEvidenceEnvelopeSchema.
 */
export function serializeHistoricSqlEvidence(evidence: HistoricSqlEvidenceEnvelope): string {
  const validated = historicSqlEvidenceEnvelopeSchema.parse(evidence);
  const prettyJson = JSON.stringify(validated, null, 2);
  return `${prettyJson}\n`;
}

View file

@ -1,48 +1,30 @@
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
import { mkdtemp } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it, vi } from 'vitest';
import { describe, expect, it } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import type { SourceAdapter } from '../../types.js';
import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js';
import { pgssBaselinePath } from './stage-pgss.js';
import type { HistoricSqlQueryHistoryReader, PostgresPgssReader } from './types.js';
import type { HistoricSqlReader } from './types.js';
/** Creates a fresh, uniquely named directory under the OS temp dir and resolves with its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-adapter-');
  return mkdtemp(prefix);
}
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
const target = join(root, relPath);
await mkdir(join(target, '..'), { recursive: true });
await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8');
}
const sqlAnalysis: SqlAnalysisPort = {
async analyzeForFingerprint() {
return {
fingerprint: 'fp_1',
normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
tablesTouched: ['analytics.orders'],
literalSlots: [{ position: 1, type: 'string', exampleValue: 'paid' }],
};
throw new Error('legacy analyzeForFingerprint must not be used');
},
async analyzeBatch() {
return new Map();
},
};
const reader: HistoricSqlQueryHistoryReader = {
async probe() {},
async *fetch() {
yield {
id: 'q1',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'",
user: 'analyst',
startedAt: '2026-05-04T11:00:00.000Z',
endedAt: null,
runtimeMs: 10,
rowsProduced: 1,
success: true,
errorMessage: null,
};
const reader: HistoricSqlReader = {
async probe() {
return { warnings: [], info: [] };
},
async *fetchAggregated() {},
};
describe('HistoricSqlSourceAdapter', () => {
@ -50,255 +32,73 @@ describe('HistoricSqlSourceAdapter', () => {
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
expect(adapter.source).toBe('historic-sql');
expect(adapter.skillNames).toEqual(['historic_sql_ingest']);
expect(adapter.reconcileSkillNames).toEqual(['historic_sql_curator']);
expect(adapter.evidenceIndexing).toBe('documents');
expect(adapter.triageSupported).toBe(true);
expect(adapter.skillNames).toEqual(['historic_sql_table_digest', 'historic_sql_patterns']);
expect(adapter.reconcileSkillNames).toEqual([]);
expect((adapter as SourceAdapter).evidenceIndexing).toBeUndefined();
expect(adapter.triageSupported).toBe(false);
});
it('fetches staged templates through injected reader and SqlAnalysisPort', async () => {
it('fetches a unified aggregate snapshot and emits unified WorkUnits', async () => {
const stagedDir = await tempDir();
const adapter = new HistoricSqlSourceAdapter({
sqlAnalysis,
reader,
queryClient: {},
now: () => new Date('2026-05-04T12:00:00.000Z'),
});
await adapter.fetch(
{
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
},
stagedDir,
{ connectionId: 'conn_1', sourceKey: 'historic-sql' },
);
await expect(adapter.detect(stagedDir)).resolves.toBe(true);
});
it('reads triage signals from usage.json and metadata properties', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', {
source: 'historic-sql',
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 1,
capped: false,
warnings: [],
templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }],
});
await writeJson(stagedDir, 'templates/fp_1/metadata.json', {
id: 'fp_1',
title: 'snowflake · analytics.orders [fp_1]',
path: 'templates/fp_1/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_1',
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
triage_signals: {
executions_bucket: 'high',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 0 runtime',
},
},
});
await writeFile(join(stagedDir, 'templates/fp_1/page.md'), '# fp_1\n', 'utf-8');
await writeJson(stagedDir, 'templates/fp_1/usage.json', {
stats: {
executions: 20,
distinct_users: 3,
first_seen: '2026-05-01T00:00:00.000Z',
last_seen: '2026-05-04T11:55:00.000Z',
p50_runtime_ms: 100,
p95_runtime_ms: 200,
error_rate: 0,
},
literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
samples: [],
});
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
await expect(adapter.getTriageSignals(stagedDir, 'fp_1')).resolves.toEqual({
objectType: 'historic_sql_template',
lastEditedAt: '2026-05-04T11:55:00.000Z',
propertyHints: {
executions_bucket: 'high',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 0 runtime',
},
});
});
it('dispatches postgres fetches through PGSS staging and writes the baseline only after pull success', async () => {
const stagedDir = await tempDir();
const baselineRootDir = await tempDir();
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
const unusedPerExecutionReader: HistoricSqlQueryHistoryReader = {
const aggregateReader: HistoricSqlReader = {
async probe() {
throw new Error('per-execution reader must not be used for postgres');
return { warnings: [], info: [] };
},
async *fetch() {
throw new Error('per-execution reader must not be used for postgres');
},
};
const postgresReader: PostgresPgssReader = {
async probe() {
return { pgServerVersion: 'PostgreSQL 16.4', warnings: [] };
},
async readSnapshot() {
return {
statsResetAt: '2026-05-08T08:00:00.000Z',
deallocCount: 0,
rows: [
{
queryid: '901',
userid: '11',
username: 'analyst',
dbid: '5',
database: 'warehouse',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 9,
totalExecTime: 90,
meanExecTime: 10,
totalRows: 18,
},
],
async *fetchAggregated() {
yield {
templateId: 'pg:1',
canonicalSql:
'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status',
dialect: 'postgres',
stats: {
executions: 25,
distinctUsers: 3,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 10,
p95RuntimeMs: 20,
errorRate: 0,
rowsProduced: 10,
},
topUsers: [{ user: 'analyst', executions: 25 }],
};
},
};
const batchSqlAnalysis: SqlAnalysisPort = {
async analyzeForFingerprint() {
throw new Error('legacy analyzeForFingerprint must not be used');
},
async analyzeBatch() {
return new Map([
[
'pg:1',
{
tablesTouched: ['public.orders', 'public.customers'],
columnsByClause: { select: ['status'], join: ['customer_id', 'id'], groupBy: ['status'] },
},
],
]);
},
};
const adapter = new HistoricSqlSourceAdapter({
sqlAnalysis,
reader: unusedPerExecutionReader,
sqlAnalysis: batchSqlAnalysis,
reader: aggregateReader,
queryClient: {},
postgresReader,
postgresQueryClient: {
async executeQuery() {
return { headers: [], rows: [] };
},
},
postgresBaselineRootDir: baselineRootDir,
now: () => new Date('2026-05-08T12:00:00.000Z'),
now: () => new Date('2026-05-11T00:00:00.000Z'),
});
await adapter.fetch(
{
dialect: 'postgres',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
stagedDir,
{ connectionId: 'conn_pg', sourceKey: 'historic-sql' },
);
const manifest = JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8')) as {
dialect: string;
baselineFirstRun: boolean;
templates: Array<{ id: string }>;
};
expect(manifest.dialect).toBe('postgres');
expect(manifest.baselineFirstRun).toBe(true);
expect(manifest.templates).toEqual([
{ id: 'db5_q901', fingerprint: 'fp_1', subClusterId: null, path: 'templates/db5_q901/page.md' },
]);
await expect(readFile(baselinePath, 'utf-8')).rejects.toMatchObject({ code: 'ENOENT' });
await adapter.onPullSucceeded({
connectionId: 'conn_pg',
await adapter.fetch({ dialect: 'postgres', minExecutions: 5 }, stagedDir, {
connectionId: 'warehouse',
sourceKey: 'historic-sql',
syncId: 'sync_pg',
trigger: 'scheduled_pull',
completedAt: new Date('2026-05-08T12:01:00.000Z'),
stagedDir,
});
const baseline = JSON.parse(await readFile(baselinePath, 'utf-8')) as {
fetchedAt: string;
templates: Record<string, { perUser: Record<string, { calls: number }> }>;
};
expect(baseline.fetchedAt).toBe('2026-05-08T12:00:00.000Z');
expect(baseline.templates.db5_q901.perUser['11'].calls).toBe(9);
});
it('fails postgres fetches clearly when no PGSS reader is configured', async () => {
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {} });
await expect(
adapter.fetch(
{
dialect: 'postgres',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
await tempDir(),
{ connectionId: 'conn_pg', sourceKey: 'historic-sql' },
),
).rejects.toThrow('Historic SQL Postgres fetch requires deps.postgresReader');
});
it('forwards manifest cursor through onPullSucceeded without changing the SourceAdapter signature', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', {
source: 'historic-sql',
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 0,
capped: false,
warnings: [],
templates: [],
});
const onPullSucceeded = vi.fn(async () => {});
const adapter = new HistoricSqlSourceAdapter({ sqlAnalysis, reader, queryClient: {}, onPullSucceeded });
const completedAt = new Date('2026-05-04T12:01:00.000Z');
await adapter.onPullSucceeded({
connectionId: 'conn_1',
sourceKey: 'historic-sql',
syncId: 'sync_1',
trigger: 'scheduled_pull',
completedAt,
stagedDir,
});
expect(onPullSucceeded).toHaveBeenCalledWith({
connectionId: 'conn_1',
sourceKey: 'historic-sql',
syncId: 'sync_1',
trigger: 'scheduled_pull',
completedAt,
stagedDir,
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
await expect(adapter.detect(stagedDir)).resolves.toBe(true);
await expect(adapter.chunk(stagedDir)).resolves.toMatchObject({
workUnits: [
{ unitKey: 'historic-sql-table-public-customers' },
{ unitKey: 'historic-sql-table-public-orders' },
{ unitKey: 'historic-sql-patterns-part-0001' },
],
});
});
});

View file

@ -1,39 +1,16 @@
import { readFile } from 'node:fs/promises';
import { rm } from 'node:fs/promises';
import { join } from 'node:path';
import type {
ChunkResult,
DiffSet,
FetchContext,
IngestTrigger,
ScopeDescriptor,
SourceAdapter,
TriageSignals,
} from '../../types.js';
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
import type { ChunkResult, DiffSet, FetchContext, ScopeDescriptor, SourceAdapter } from '../../types.js';
import { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './chunk-unified.js';
import { detectHistoricSqlStagedDir } from './detect.js';
import { stageHistoricSqlTemplates } from './stage.js';
import {
pgssBaselinePath,
stagePgStatStatementsTemplates,
writePgssBaselineAtomic,
type StagePgStatStatementsTemplatesResult,
} from './stage-pgss.js';
import {
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlPullConfigSchema,
historicSqlUsageSchema,
type HistoricSqlSourceAdapterDeps,
} from './types.js';
import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js';
import { type HistoricSqlSourceAdapterDeps } from './types.js';
export class HistoricSqlSourceAdapter implements SourceAdapter {
readonly source = 'historic-sql';
readonly skillNames = ['historic_sql_ingest'];
readonly reconcileSkillNames = ['historic_sql_curator'];
readonly evidenceIndexing = 'documents' as const;
readonly triageSupported = true;
private readonly pendingPgssBaselines = new Map<string, StagePgStatStatementsTemplatesResult>();
readonly skillNames = ['historic_sql_table_digest', 'historic_sql_patterns'];
readonly reconcileSkillNames: string[] = [];
readonly triageSupported = false;
constructor(private readonly deps: HistoricSqlSourceAdapterDeps) {}
@ -42,94 +19,27 @@ export class HistoricSqlSourceAdapter implements SourceAdapter {
}
async fetch(pullConfig: unknown, stagedDir: string, ctx: FetchContext): Promise<void> {
const config = historicSqlPullConfigSchema.parse(pullConfig);
if (config.dialect === 'postgres') {
if (!this.deps.postgresReader) {
throw new Error('Historic SQL Postgres fetch requires deps.postgresReader');
}
const postgresQueryClient = this.deps.postgresQueryClient ?? this.deps.queryClient;
if (
!postgresQueryClient ||
typeof postgresQueryClient !== 'object' ||
!('executeQuery' in postgresQueryClient) ||
typeof (postgresQueryClient as { executeQuery?: unknown }).executeQuery !== 'function'
) {
throw new Error('Historic SQL Postgres fetch requires deps.postgresQueryClient with executeQuery(sql, params?)');
}
const result = await stagePgStatStatementsTemplates({
stagedDir,
connectionId: ctx.connectionId,
queryClient: postgresQueryClient as NonNullable<HistoricSqlSourceAdapterDeps['postgresQueryClient']>,
reader: this.deps.postgresReader,
sqlAnalysis: this.deps.sqlAnalysis,
pullConfig: config,
baselinePath: pgssBaselinePath(this.deps.postgresBaselineRootDir, ctx.connectionId),
now: this.deps.now?.(),
});
this.pendingPgssBaselines.set(stagedDir, result);
return;
}
await stageHistoricSqlTemplates({
await stageHistoricSqlAggregatedSnapshot({
stagedDir,
connectionId: ctx.connectionId,
queryClient: this.deps.queryClient,
reader: this.deps.reader,
sqlAnalysis: this.deps.sqlAnalysis,
pullConfig: config,
pullConfig,
now: this.deps.now?.(),
});
if (this.deps.legacyPostgresBaselineRootDir) {
await rm(join(this.deps.legacyPostgresBaselineRootDir, ctx.connectionId, ['pgss', 'baseline.json'].join('-')), {
force: true,
});
}
}
chunk(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
return chunkHistoricSqlStagedDir(stagedDir, diffSet);
return chunkHistoricSqlUnifiedStagedDir(stagedDir, diffSet);
}
describeScope(stagedDir: string): Promise<ScopeDescriptor> {
return describeHistoricSqlScope(stagedDir);
}
async getTriageSignals(stagedDir: string, externalId: string): Promise<TriageSignals> {
const manifest = historicSqlManifestSchema.parse(
JSON.parse(await readFile(join(stagedDir, 'manifest.json'), 'utf-8')),
);
const template = manifest.templates.find((entry) => entry.id === externalId);
if (!template) {
return {};
}
const templateDir = template.path.replace(/\/page\.md$/, '');
const metadata = historicSqlMetadataSchema.parse(
JSON.parse(await readFile(join(stagedDir, templateDir, 'metadata.json'), 'utf-8')),
);
const usage = historicSqlUsageSchema.parse(
JSON.parse(await readFile(join(stagedDir, templateDir, 'usage.json'), 'utf-8')),
);
return {
objectType: metadata.objectType,
lastEditedAt: usage.stats.last_seen,
propertyHints: metadata.properties.triage_signals,
};
}
async onPullSucceeded(ctx: {
connectionId: string;
sourceKey: string;
syncId: string;
trigger: IngestTrigger;
completedAt: Date;
stagedDir: string;
}): Promise<void> {
const manifest = historicSqlManifestSchema.parse(
JSON.parse(await readFile(join(ctx.stagedDir, 'manifest.json'), 'utf-8')),
);
if (manifest.dialect === 'postgres') {
const pending = this.pendingPgssBaselines.get(ctx.stagedDir);
if (pending) {
await writePgssBaselineAtomic(pending.baselinePath, pending.baseline);
this.pendingPgssBaselines.delete(ctx.stagedDir);
}
}
await this.deps.onPullSucceeded?.({ ...ctx, nextSuccessfulCursor: manifest.nextSuccessfulCursor });
return describeHistoricSqlUnifiedScope(stagedDir);
}
}

View file

@ -0,0 +1,304 @@
import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import YAML from 'yaml';
import { AgentRunnerService } from '../../../agent/index.js';
import { initKtxProject, loadKtxProject, type KtxLocalProject } from '../../../project/index.js';
import {
type SqlAnalysisBatchItem,
type SqlAnalysisBatchResult,
type SqlAnalysisDialect,
type SqlAnalysisPort,
} from '../../../sql-analysis/index.js';
import { searchLocalSlSources } from '../../../sl/local-sl.js';
import { searchLocalKnowledgePages } from '../../../wiki/local-knowledge.js';
import { runLocalIngest } from '../../local-ingest.js';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { HistoricSqlSourceAdapter } from './historic-sql.adapter.js';
import type { AggregatedTemplate, HistoricSqlReader, HistoricSqlUnifiedPullConfig } from './types.js';
/** Builds a fresh copy of the single aggregated template served by the acceptance reader. */
function ordersLifecycleTemplate(): AggregatedTemplate {
  return {
    templateId: 'pg:orders-lifecycle',
    canonicalSql:
      'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.status = $1 group by o.status, c.segment',
    dialect: 'postgres',
    stats: {
      executions: 42,
      distinctUsers: 4,
      firstSeen: '2026-05-01T00:00:00.000Z',
      lastSeen: '2026-05-11T00:00:00.000Z',
      p50RuntimeMs: 18,
      p95RuntimeMs: 84,
      errorRate: 0,
      rowsProduced: 420,
    },
    topUsers: [{ user: 'analyst@example.test', executions: 42 }],
  };
}

/**
 * Reader double for the acceptance test: probing reports no warnings or info, and
 * fetching yields exactly one orders/customers template regardless of its arguments.
 */
class AcceptanceHistoricSqlReader implements HistoricSqlReader {
  async probe() {
    return { warnings: [], info: [] };
  }

  async *fetchAggregated(
    _client: unknown,
    _window: { start: Date; end: Date },
    _config: HistoricSqlUnifiedPullConfig,
  ): AsyncIterable<AggregatedTemplate> {
    yield ordersLifecycleTemplate();
  }
}
/**
 * Returns the scripted emit_historic_sql_evidence call for a WorkUnit key, or `undefined`
 * when the key has no script. Each script carries the tool input, the tool call id, the
 * substring the tool result must contain, and the prefix of the failure message.
 */
function scriptedHistoricSqlEvidence(unitKey: unknown) {
  switch (unitKey) {
    case 'historic-sql-table-public-orders':
      return {
        toolCallId: 'historic-sql-orders-usage',
        expectedResult: 'Recorded historic-SQL table_usage evidence',
        unexpectedResultPrefix: 'Unexpected orders evidence result: ',
        input: {
          kind: 'table_usage',
          table: 'public.orders',
          rawPath: 'tables/public.orders.json',
          usage: {
            narrative: 'Analysts repeatedly inspect paid order lifecycle by customer segment.',
            frequencyTier: 'high',
            commonFilters: ['status'],
            commonGroupBys: ['status', 'segment'],
            commonJoins: [{ table: 'public.customers', on: ['customer_id', 'id'] }],
            staleSince: null,
          },
        },
      };
    case 'historic-sql-table-public-customers':
      return {
        toolCallId: 'historic-sql-customers-usage',
        expectedResult: 'Recorded historic-SQL table_usage evidence',
        unexpectedResultPrefix: 'Unexpected customers evidence result: ',
        input: {
          kind: 'table_usage',
          table: 'public.customers',
          rawPath: 'tables/public.customers.json',
          usage: {
            narrative: 'Customers provide segment context for paid order lifecycle analysis.',
            frequencyTier: 'mid',
            commonFilters: [],
            commonGroupBys: ['segment'],
            commonJoins: [{ table: 'public.orders', on: ['id', 'customer_id'] }],
            staleSince: null,
          },
        },
      };
    case 'historic-sql-patterns-part-0001':
      return {
        toolCallId: 'historic-sql-pattern',
        expectedResult: 'Recorded historic-SQL pattern evidence',
        unexpectedResultPrefix: 'Unexpected pattern evidence result: ',
        input: {
          kind: 'pattern',
          rawPath: 'patterns-input/part-0001.json',
          pattern: {
            slug: 'paid-order-lifecycle',
            title: 'Paid Order Lifecycle',
            narrative: 'Analysts join orders and customers to compare paid order lifecycle by segment.',
            definitionSql:
              'select o.status, c.segment, count(*) from public.orders o join public.customers c on c.id = o.customer_id group by o.status, c.segment',
            tablesInvolved: ['public.orders', 'public.customers'],
            slRefs: ['orders', 'customers'],
            constituentTemplateIds: ['pg:orders-lifecycle'],
          },
        },
      };
    default:
      return undefined;
  }
}

/**
 * Agent runner double for the acceptance test.
 *
 * Runs whose operation name is not 'ingest-bundle-wu' finish immediately. WorkUnit runs
 * require the emit_historic_sql_evidence tool, invoke it with the script matching the
 * unit key (if any), and fail when the tool result lacks the expected confirmation text.
 */
class HistoricSqlAcceptanceAgentRunner extends AgentRunnerService {
  override runLoop = vi.fn(async (params: any) => {
    const isWorkUnitRun = params.telemetryTags?.operationName === 'ingest-bundle-wu';

    if (isWorkUnitRun) {
      const emitEvidence = params.toolSet.emit_historic_sql_evidence;
      if (!emitEvidence?.execute) {
        throw new Error('emit_historic_sql_evidence tool was not available to the historic-SQL WorkUnit');
      }

      const script = scriptedHistoricSqlEvidence(params.telemetryTags.unitKey);
      if (script) {
        const result = await emitEvidence.execute(script.input, { toolCallId: script.toolCallId });
        if (!String(result).includes(script.expectedResult)) {
          throw new Error(`${script.unexpectedResultPrefix}${String(result)}`);
        }
      }
    }

    return { stopReason: 'natural' as const };
  });

  constructor() {
    // The stub provider only has to exist; the overridden runLoop never asks it for a real model.
    super({ llmProvider: { getModel: () => ({}) as never } as never });
  }
}
/**
 * SQL analysis double for the acceptance test.
 *
 * `analyzeForFingerprint` always rejects. `analyzeBatch` is a spy that maps every item id to
 * the same orders/customers analysis result, ignoring the dialect.
 */
function acceptanceSqlAnalysis(): SqlAnalysisPort {
  const analyzeBatch = vi.fn(
    async (
      items: SqlAnalysisBatchItem[],
      _dialect: SqlAnalysisDialect,
    ): Promise<Map<string, SqlAnalysisBatchResult>> => {
      const resultsById = new Map<string, SqlAnalysisBatchResult>();
      for (const { id } of items) {
        resultsById.set(id, {
          tablesTouched: ['public.orders', 'public.customers'],
          columnsByClause: {
            select: ['status', 'segment'],
            where: ['status'],
            join: ['customer_id', 'id'],
            groupBy: ['status', 'segment'],
          },
        });
      }
      return resultsById;
    },
  );

  return {
    async analyzeForFingerprint() {
      throw new Error('analyzeForFingerprint should not be used by unified historic-SQL ingest');
    },
    analyzeBatch,
  };
}
/**
 * Configures an initialized project for the historic-SQL acceptance test.
 *
 * Overwrites `ktx.yaml` with a config that enables historic-SQL ingest on the `warehouse`
 * Postgres connection, reloads the project from disk, and seeds a schema shard describing
 * `public.orders` and `public.customers`.
 *
 * @param project - Project whose `projectDir` receives the new `ktx.yaml`.
 * @returns The project as reloaded after the config was written.
 */
async function writeHistoricSqlProject(project: KtxLocalProject): Promise<KtxLocalProject> {
  // Built from a structured object rather than hand-indented lines, so the nesting of
  // connection, ingest and storage settings cannot be broken by a whitespace slip.
  const ktxConfig = {
    project: 'warehouse',
    connections: {
      warehouse: {
        driver: 'postgres',
        historicSql: {
          enabled: true,
          dialect: 'postgres',
          minExecutions: 2,
        },
      },
    },
    ingest: {
      adapters: ['historic-sql'],
      embeddings: {
        backend: 'deterministic',
      },
    },
    storage: {
      state: 'sqlite',
      search: 'sqlite-fts5',
      git: {
        auto_commit: false,
        author: 'KTX Test <system@ktx.local>',
      },
    },
  };
  await writeFile(join(project.projectDir, 'ktx.yaml'), YAML.stringify(ktxConfig), 'utf-8');

  // Reload so the returned project reflects the config that was just written.
  const loaded = await loadKtxProject({ projectDir: project.projectDir });

  const schemaShard = {
    tables: {
      orders: {
        table: 'public.orders',
        columns: [
          { name: 'id', type: 'string' },
          { name: 'status', type: 'string' },
          { name: 'customer_id', type: 'string' },
        ],
      },
      customers: {
        table: 'public.customers',
        columns: [
          { name: 'id', type: 'string' },
          { name: 'segment', type: 'string' },
        ],
      },
    },
  };
  await loaded.fileStore.writeFile(
    'semantic-layer/warehouse/_schema/public.yaml',
    YAML.stringify(schemaShard),
    'KTX Test',
    'system@ktx.local',
    'Seed schema shard',
  );
  return loaded;
}
// End-to-end acceptance: runs a local historic-SQL ingest against a temp project with test
// doubles for the reader, SQL analysis and agent runner, then checks the written files and
// both search surfaces.
describe('historic-SQL local ingest retrieval acceptance', () => {
let tempDir: string;
// Each test gets its own scratch directory, removed afterwards.
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'ktx-historic-sql-acceptance-'));
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
it('projects table and pattern evidence into semantic-layer and wiki retrieval surfaces', async () => {
// Arrange: project on disk plus the three test doubles.
const initialized = await initKtxProject({ projectDir: join(tempDir, 'project'), projectName: 'warehouse' });
const project = await writeHistoricSqlProject(initialized);
const sqlAnalysis = acceptanceSqlAnalysis();
const agentRunner = new HistoricSqlAcceptanceAgentRunner();
const adapter = new HistoricSqlSourceAdapter({
reader: new AcceptanceHistoricSqlReader(),
queryClient: {},
sqlAnalysis,
now: () => new Date('2026-05-11T00:00:00.000Z'),
});
// Act: one ingest run for the warehouse connection.
const result = await runLocalIngest({
project,
adapters: [adapter],
adapter: 'historic-sql',
connectionId: 'warehouse',
jobId: 'historic-sql-retrieval-acceptance',
agentRunner,
});
// One batched analysis call, three WorkUnits, one agent loop per WorkUnit, no failures.
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1);
expect(result.result.failedWorkUnits).toEqual([]);
expect(result.result.workUnitCount).toBe(3);
expect(agentRunner.runLoop).toHaveBeenCalledTimes(3);
const postProcessor = result.report.body.postProcessor;
expect(postProcessor).toBeDefined();
// Explicit guard so the accesses below are type-safe after the expectation above.
if (!postProcessor) {
throw new Error('Expected historic-SQL post-processor result');
}
expect(postProcessor).toMatchObject({
sourceKey: 'historic-sql',
status: 'success',
result: {
tableUsageMerged: 2,
patternPagesWritten: 1,
},
});
expect(postProcessor.touchedSources).toEqual(
expect.arrayContaining([
{ connectionId: 'warehouse', sourceName: 'customers' },
{ connectionId: 'warehouse', sourceName: 'orders' },
]),
);
// Files on disk: the usage narrative lands in the schema shard, the pattern in a knowledge page.
await expect(readFile(join(project.projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves
.toContain('Analysts repeatedly inspect paid order lifecycle by customer segment.');
await expect(readFile(join(project.projectDir, 'knowledge/global/historic-sql/paid-order-lifecycle.md'), 'utf-8'))
.resolves.toContain('Paid Order Lifecycle');
// Retrieval: reload the project from disk and query both search surfaces.
const reloaded = await loadKtxProject({ projectDir: project.projectDir });
await expect(
searchLocalSlSources(reloaded, { connectionId: 'warehouse', query: 'paid order lifecycle', limit: 5 }),
).resolves.toEqual(expect.arrayContaining([
expect.objectContaining({
name: 'orders',
frequencyTier: 'high',
snippet: expect.stringContaining('<mark>'),
matchReasons: expect.arrayContaining(['lexical']),
}),
]));
// The knowledge search must return exactly one page (toEqual on the whole array).
await expect(
searchLocalKnowledgePages(reloaded, { query: 'paid order lifecycle', userId: 'local', limit: 5 }),
).resolves.toEqual([
expect.objectContaining({
key: 'historic-sql/paid-order-lifecycle',
summary: 'Paid Order Lifecycle',
matchReasons: expect.arrayContaining(['lexical']),
}),
]);
});
});

View file

@ -0,0 +1,89 @@
import { describe, expect, it } from 'vitest';
import {
HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES,
isHistoricSqlPatternInputShardPath,
serializedStagedPatternsInputByteLength,
splitHistoricSqlPatternInputs,
} from './pattern-inputs.js';
import type { StagedPatternsInput } from './types.js';
type PatternTemplate = StagedPatternsInput['templates'][number];
/**
 * Builds a staged pattern template fixture.
 *
 * Only the id, touched tables and SQL vary; bucket labels and dialect are fixed. Property
 * order is kept stable because the sharding tests depend on the serialized size.
 *
 * @param id - Template id.
 * @param tablesTouched - Tables referenced by the template.
 * @param canonicalSql - SQL text; defaults to `select 1`.
 */
function template(id: string, tablesTouched: string[], canonicalSql = 'select 1'): PatternTemplate {
  const fixture: PatternTemplate = {
    id,
    canonicalSql,
    tablesTouched,
    executionsBucket: '10-100',
    distinctUsersBucket: '2-5',
    dialect: 'postgres',
  };
  return fixture;
}
// Spec for splitHistoricSqlPatternInputs and its companions: the audit input keeps
// every template, while shards only receive templates touching two or more tables.
describe('historic-SQL pattern input sharding', () => {
it('keeps the audit input complete while sharding only cross-table pattern candidates', () => {
// A ~260-character literal pads each cross-table template; the 760-byte budget used
// below is expected to produce three shards for the three cross-table templates.
const largeSql = `select * from public.orders join public.customers on true where marker = '${'x'.repeat(260)}'`;
const input: StagedPatternsInput = {
templates: [
template('single-table-orders', ['public.orders']),
template('orders-customers-2', ['public.orders', 'public.customers'], largeSql),
template('orders-customers-1', ['public.customers', 'public.orders'], largeSql),
template('orders-customers-payments', ['public.orders', 'public.customers', 'public.payments'], largeSql),
],
};
const result = splitHistoricSqlPatternInputs(input, { maxBytes: 760 });
// The audit input lists every template, ordered by id.
expect(result.auditInput.templates.map((entry) => entry.id)).toEqual([
'orders-customers-1',
'orders-customers-2',
'orders-customers-payments',
'single-table-orders',
]);
expect(result.shards.length).toBeGreaterThan(1);
// Shard paths are numbered from 1 and zero-padded to four digits.
expect(result.shards.map((shard) => shard.path)).toEqual([
'patterns-input/part-0001.json',
'patterns-input/part-0002.json',
'patterns-input/part-0003.json',
]);
// Shard order: most tables touched first, then by table signature, then by id.
expect(result.shards.flatMap((shard) => shard.input.templates.map((entry) => entry.id))).toEqual([
'orders-customers-payments',
'orders-customers-1',
'orders-customers-2',
]);
expect(result.shards.every((shard) => shard.byteLength <= 760)).toBe(true);
// Single-table templates are never pattern candidates.
expect(result.shards.flatMap((shard) => shard.input.templates).some((entry) => entry.id === 'single-table-orders')).toBe(false);
expect(result.warnings).toEqual([]);
});
it('omits a single oversized template from shards and reports a manifest warning', () => {
// One template whose serialized form alone exceeds the 240-byte budget used below.
const input: StagedPatternsInput = {
templates: [
template(
'oversized-cross-table',
['public.orders', 'public.customers'],
`select * from public.orders join public.customers on true where payload = '${'x'.repeat(500)}'`,
),
],
};
const result = splitHistoricSqlPatternInputs(input, { maxBytes: 240 });
// The template still appears in the audit input but is dropped from shards with a warning.
expect(result.auditInput.templates.map((entry) => entry.id)).toEqual(['oversized-cross-table']);
expect(result.shards).toEqual([]);
expect(result.warnings).toEqual(['patterns_input_template_too_large:oversized-cross-table']);
});
it('recognizes only generated pattern shard paths', () => {
expect(isHistoricSqlPatternInputShardPath('patterns-input/part-0001.json')).toBe(true);
expect(isHistoricSqlPatternInputShardPath('patterns-input/part-0012.json')).toBe(true);
// Rejected: the unsharded file name, an unpadded shard index, and a non-JSON sibling.
expect(isHistoricSqlPatternInputShardPath('patterns-input.json')).toBe(false);
expect(isHistoricSqlPatternInputShardPath('patterns-input/part-1.json')).toBe(false);
expect(isHistoricSqlPatternInputShardPath('patterns-input/readme.md')).toBe(false);
});
it('uses a production byte budget below read_raw_file maximum size', () => {
// NOTE(review): 120_000 is presumably the read_raw_file size limit named in the test title - confirm against that tool.
expect(HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES).toBeLessThan(120_000);
expect(serializedStagedPatternsInputByteLength({ templates: [] })).toBeGreaterThan(0);
});
});

View file

@ -0,0 +1,99 @@
import { Buffer } from 'node:buffer';
import type { StagedPatternsInput } from './types.js';
/** Directory prefix used for every generated pattern-input shard path. */
export const HISTORIC_SQL_PATTERN_WORKUNIT_DIR = 'patterns-input';
/** Default per-shard budget, in UTF-8 bytes of the serialized shard. */
export const HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES = 110_000;
/**
 * Matches generated shard paths: `patterns-input/part-NNNN.json`.
 *
 * The index is zero-padded to at least four digits. Padding never truncates, so
 * shard 10000 and above produce five or more digits; `{4,}` keeps those paths
 * recognizable while still rejecting unpadded names such as `part-1.json`.
 */
export const HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE = /^patterns-input\/part-\d{4,}\.json$/;
type PatternTemplate = StagedPatternsInput['templates'][number];
/** One generated pattern-input shard: its path, its templates and its serialized size. */
export interface HistoricSqlPatternInputShard {
/** Shard path of the form `patterns-input/part-NNNN.json`, numbered from 1. */
path: string;
/** The templates assigned to this shard. */
input: StagedPatternsInput;
/** UTF-8 byte length of `input` as serialized by serializeStagedPatternsInput. */
byteLength: number;
}
/** Result of splitHistoricSqlPatternInputs. */
export interface HistoricSqlPatternInputSplitResult {
/** Every input template, sorted by id; nothing is filtered out. */
auditInput: StagedPatternsInput;
/** Shards holding the templates that touch two or more tables. */
shards: HistoricSqlPatternInputShard[];
/** One `patterns_input_template_too_large:<id>` entry per template omitted from the shards. */
warnings: string[];
}
/** Options for splitHistoricSqlPatternInputs. */
export interface HistoricSqlPatternInputSplitOptions {
/** Per-shard byte budget; defaults to HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES. */
maxBytes?: number;
}
/**
 * Tells whether `path` is a generated pattern-input shard path, i.e. whether it
 * matches HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE.
 */
export function isHistoricSqlPatternInputShardPath(path: string): boolean {
  const match = HISTORIC_SQL_PATTERN_WORKUNIT_PATH_RE.exec(path);
  return match !== null;
}
/**
 * Serializes a staged patterns input as pretty-printed JSON (two-space indent)
 * followed by a single trailing newline.
 */
export function serializeStagedPatternsInput(input: StagedPatternsInput): string {
  const prettyJson = JSON.stringify(input, null, 2);
  return prettyJson + '\n';
}
/**
 * Returns the UTF-8 byte length of `input` once serialized by
 * serializeStagedPatternsInput (trailing newline included).
 */
export function serializedStagedPatternsInputByteLength(input: StagedPatternsInput): number {
  const serialized = serializeStagedPatternsInput(input);
  return Buffer.byteLength(serialized, 'utf-8');
}
/** Returns a copy of `templates` ordered by id (`localeCompare`); the input array is left untouched. */
function sortedAuditTemplates(templates: readonly PatternTemplate[]): PatternTemplate[] {
  const byId = (first: PatternTemplate, second: PatternTemplate): number => first.id.localeCompare(second.id);
  return templates.slice().sort(byId);
}
/**
 * Selects the templates that touch two or more tables and returns them in a
 * deterministic order for sharding.
 *
 * Each returned template is a shallow copy whose `tablesTouched` is sorted; the
 * input templates are not mutated. Ordering is:
 *   1. number of tables touched, descending;
 *   2. table signature (sorted table names joined with NUL), by UTF-16 code unit;
 *   3. template id (`localeCompare`).
 *
 * The signature is deliberately NOT compared with `localeCompare`: locale-aware
 * collation may treat the NUL separator as ignorable, which would make distinct
 * table sets such as ['ab', 'c'] and ['a', 'bc'] compare equal and interleave
 * their templates by id. A code-unit comparison keeps the separator significant.
 */
function sortedPatternCandidates(templates: readonly PatternTemplate[]): PatternTemplate[] {
  const compareCodeUnits = (left: string, right: string): number => {
    if (left < right) return -1;
    return left > right ? 1 : 0;
  };
  return templates
    .filter((template) => template.tablesTouched.length >= 2)
    .map((template) => {
      const tablesTouched = [...template.tablesTouched].sort();
      // Compute the signature once per template rather than on every comparison.
      return { candidate: { ...template, tablesTouched }, signature: tablesTouched.join('\0') };
    })
    .sort((left, right) => {
      const cardinality = right.candidate.tablesTouched.length - left.candidate.tablesTouched.length;
      if (cardinality !== 0) return cardinality;
      const tableSignature = compareCodeUnits(left.signature, right.signature);
      if (tableSignature !== 0) return tableSignature;
      return left.candidate.id.localeCompare(right.candidate.id);
    })
    .map((entry) => entry.candidate);
}
/** Builds the path for shard number `index`, zero-padding the number to four digits. */
function shardPath(index: number): string {
  const paddedIndex = String(index).padStart(4, '0');
  return [HISTORIC_SQL_PATTERN_WORKUNIT_DIR, `part-${paddedIndex}.json`].join('/');
}
/**
 * Splits a staged patterns input into an audit copy plus size-bounded shards.
 *
 * - `auditInput` keeps every template, sorted by id.
 * - `shards` hold only the candidates returned by sortedPatternCandidates
 *   (templates touching two or more tables), packed greedily in that order so
 *   each serialized shard stays within the byte budget.
 * - A candidate that exceeds the budget on its own is left out of the shards and
 *   reported as `patterns_input_template_too_large:<id>` in `warnings`.
 *
 * @param input   Staged templates to split.
 * @param options `maxBytes` overrides the default HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES budget.
 * @returns The audit input, the generated shards (numbered from 1) and any warnings.
 */
export function splitHistoricSqlPatternInputs(
  input: StagedPatternsInput,
  options: HistoricSqlPatternInputSplitOptions = {},
): HistoricSqlPatternInputSplitResult {
  const byteBudget = options.maxBytes ?? HISTORIC_SQL_PATTERN_WORKUNIT_MAX_BYTES;
  const auditInput: StagedPatternsInput = { templates: sortedAuditTemplates(input.templates) };
  const shards: HistoricSqlPatternInputShard[] = [];
  const warnings: string[] = [];
  const pending: PatternTemplate[] = [];

  const exceedsBudget = (templates: PatternTemplate[]): boolean =>
    serializedStagedPatternsInputByteLength({ templates }) > byteBudget;

  // Turns the pending templates into the next shard; a no-op when nothing is pending.
  const sealPending = (): void => {
    if (pending.length > 0) {
      const shardInput: StagedPatternsInput = { templates: pending.splice(0) };
      shards.push({
        path: shardPath(shards.length + 1),
        input: shardInput,
        byteLength: serializedStagedPatternsInputByteLength(shardInput),
      });
    }
  };

  for (const candidate of sortedPatternCandidates(input.templates)) {
    if (exceedsBudget([candidate])) {
      // Cannot fit in any shard, even alone.
      warnings.push(`patterns_input_template_too_large:${candidate.id}`);
      continue;
    }
    const overflowsCurrentShard = pending.length > 0 && exceedsBudget([...pending, candidate]);
    if (overflowsCurrentShard) {
      sealPending();
    }
    pending.push(candidate);
  }
  sealPending();

  return { auditInput, shards, warnings };
}

View file

@ -0,0 +1,74 @@
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import YAML from 'yaml';
import { describe, expect, it } from 'vitest';
import { HistoricSqlProjectionPostProcessor } from './post-processor.js';
/** Creates a fresh temporary directory for one test and resolves to its path. */
async function tempWorkdir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-post-processor-');
  return mkdtemp(prefix);
}
/**
 * Writes `value` as pretty-printed JSON (plus a trailing newline) to `relPath`
 * under `root`, creating any missing parent directories first.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const filePath = join(root, relPath);
  const parentDir = join(filePath, '..');
  await mkdir(parentDir, { recursive: true });
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(filePath, payload, 'utf-8');
}
// End-to-end spec for the post-processor: seeds a workdir with a schema shard, a raw
// sync manifest and one table-usage evidence file, then checks the usage narrative is
// projected into the schema shard.
describe('HistoricSqlProjectionPostProcessor', () => {
it('projects current run evidence before the ingest squash commit', async () => {
const workdir = await tempWorkdir();
// Existing semantic-layer shard that the projection is expected to update.
await mkdir(join(workdir, 'semantic-layer/warehouse/_schema'), { recursive: true });
await writeFile(
join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'),
YAML.stringify({ tables: { orders: { table: 'public.orders', columns: [{ name: 'id', type: 'string' }] } } }),
'utf-8',
);
// Raw-source manifest for sync-1.
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 1,
touchedTableCount: 1,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 90,
});
// Raw table file referenced by the evidence's rawPath below.
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
// Table-usage evidence recorded for run-1.
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/orders.json', {
kind: 'table_usage',
connectionId: 'warehouse',
table: 'public.orders',
rawPath: 'tables/public.orders.json',
usage: {
narrative: 'Orders are repeatedly queried by lifecycle status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [],
staleSince: null,
},
});
// The temp workdir is never initialised as a git repository in this test.
const result = await new HistoricSqlProjectionPostProcessor().run({
connectionId: 'warehouse',
sourceKey: 'historic-sql',
syncId: 'sync-1',
jobId: 'job-1',
runId: 'run-1',
workdir,
parseArtifacts: null,
});
expect(result.errors).toEqual([]);
expect(result.warnings).toEqual([]);
expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]);
expect(result.result).toMatchObject({ tableUsageMerged: 1 });
// The evidence narrative must now appear in the schema shard on disk.
await expect(readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves.toContain(
'Orders are repeatedly queried by lifecycle status.',
);
});
});

View file

@ -0,0 +1,41 @@
import type { IngestBundlePostProcessorInput, IngestBundlePostProcessorPort, IngestBundlePostProcessorResult } from '../../ports.js';
import { createSimpleGit } from '../../../core/git-env.js';
import { projectHistoricSqlEvidence } from './projection.js';
/**
 * Commits projection output found in `workdir`, if there is any.
 *
 * Does nothing when `workdir` is not a git repository (a failing repository
 * check counts as "not a repository"), when no changed path lies under
 * `semantic-layer/` or `knowledge/global/historic-sql/`, or when staging those
 * paths leaves the index empty. Otherwise the paths are staged and committed
 * with a fixed message and system author.
 */
async function commitProjectionChanges(workdir: string): Promise<void> {
  const repo = createSimpleGit(workdir);
  const insideRepo = await repo.checkIsRepo().catch(() => false);
  if (!insideRepo) {
    return;
  }

  const projectionPrefixes = ['semantic-layer/', 'knowledge/global/historic-sql/'];
  const { files } = await repo.status();
  const projectedPaths: string[] = [];
  for (const entry of files) {
    if (projectionPrefixes.some((prefix) => entry.path.startsWith(prefix))) {
      projectedPaths.push(entry.path);
    }
  }
  if (projectedPaths.length === 0) {
    return;
  }

  await repo.add(projectedPaths);
  // Skip the commit when staging produced no index changes.
  const stagedNames = await repo.diff(['--cached', '--name-only']);
  if (stagedNames.trim().length === 0) {
    return;
  }
  await repo.commit('Project historic SQL evidence', { '--author': 'System User <system@example.com>' });
}
/**
 * Ingest post-processor that projects historic-SQL evidence for the current run
 * into the workdir and then commits the resulting projection changes.
 */
export class HistoricSqlProjectionPostProcessor implements IngestBundlePostProcessorPort {
  /**
   * Runs the projection for `input`, commits its output, and reports the outcome.
   *
   * @returns The projection as `result`, with its warnings and touched sources;
   *          `errors` is always an empty array.
   */
  async run(input: IngestBundlePostProcessorInput): Promise<IngestBundlePostProcessorResult> {
    const { workdir, connectionId, syncId, runId } = input;
    const projection = await projectHistoricSqlEvidence({ workdir, connectionId, syncId, runId });
    await commitProjectionChanges(workdir);
    const { warnings, touchedSources } = projection;
    return { result: projection, warnings, errors: [], touchedSources };
  }
}

View file

@ -4,7 +4,7 @@ import {
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import { PostgresPgssQueryHistoryReader } from './postgres-pgss-query-history-reader.js';
import { PostgresPgssReader } from './postgres-pgss-reader.js';
interface FakeQueryResult {
headers: string[];
@ -35,7 +35,7 @@ function executedSql(client: ReturnType<typeof queryClient>, index: number): str
return call[0];
}
describe('PostgresPgssQueryHistoryReader', () => {
describe('PostgresPgssReader aggregate path', () => {
it('probes version, extension presence, grants, and tracking state', async () => {
const client = queryClient([
{
@ -47,11 +47,12 @@ describe('PostgresPgssQueryHistoryReader', () => {
{ headers: ['track'], rows: [['top']] },
{ headers: ['max'], rows: [['5000']] },
]);
const reader = new PostgresPgssQueryHistoryReader();
const reader = new PostgresPgssReader();
await expect(reader.probe(client)).resolves.toEqual({
pgServerVersion: 'PostgreSQL 16.4 on x86_64-apple-darwin',
warnings: [],
info: [],
});
expect(executedSql(client, 0)).toContain("current_setting('server_version_num')::int");
@ -69,12 +70,8 @@ describe('PostgresPgssQueryHistoryReader', () => {
headers: ['server_version_num', 'server_version'],
rows: [[130012, 'PostgreSQL 13.12']],
},
{
headers: ['stats_reset', 'dealloc'],
rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]],
},
]);
const reader = new PostgresPgssQueryHistoryReader();
const reader = new PostgresPgssReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
@ -95,7 +92,7 @@ describe('PostgresPgssQueryHistoryReader', () => {
},
new Error('relation "pg_stat_statements" does not exist'),
]);
const reader = new PostgresPgssQueryHistoryReader();
const reader = new PostgresPgssReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
@ -113,7 +110,7 @@ describe('PostgresPgssQueryHistoryReader', () => {
},
new Error('pg_stat_statements must be loaded via shared_preload_libraries'),
]);
const reader = new PostgresPgssQueryHistoryReader();
const reader = new PostgresPgssReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
@ -134,7 +131,7 @@ describe('PostgresPgssQueryHistoryReader', () => {
{ headers: ['?column?'], rows: [[1]] },
{ headers: ['has_role'], rows: [[false]] },
]);
const reader = new PostgresPgssQueryHistoryReader();
const reader = new PostgresPgssReader();
const promise = reader.probe(client);
await expect(promise).rejects.toMatchObject({
@ -156,17 +153,18 @@ describe('PostgresPgssQueryHistoryReader', () => {
{ headers: ['track'], rows: [['none']] },
{ headers: ['max'], rows: [['5000']] },
]);
const reader = new PostgresPgssQueryHistoryReader();
const reader = new PostgresPgssReader();
await expect(reader.probe(client)).resolves.toEqual({
pgServerVersion: 'PostgreSQL 16.4',
warnings: [
"pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config",
],
info: [],
});
});
it('warns when pg_stat_statements.max is below the recommended floor', async () => {
it('returns an info note when pg_stat_statements.max is below the recommended floor', async () => {
const client = queryClient([
{
headers: ['server_version_num', 'server_version'],
@ -177,105 +175,68 @@ describe('PostgresPgssQueryHistoryReader', () => {
{ headers: ['track'], rows: [['top']] },
{ headers: ['max'], rows: [['1000']] },
]);
const reader = new PostgresPgssQueryHistoryReader();
const reader = new PostgresPgssReader();
await expect(reader.probe(client)).resolves.toEqual({
pgServerVersion: 'PostgreSQL 16.4',
warnings: [
warnings: [],
info: [
'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn',
],
});
});
it('reads a parameterized pg_stat_statements snapshot and stats info', async () => {
const client = queryClient([
{
headers: [
'queryid',
'userid',
'username',
'dbid',
'database',
'query',
'calls',
'total_exec_time',
'mean_exec_time',
'total_rows',
],
it('aggregates pg_stat_statements rows by queryid and query', async () => {
const executeQuery = vi.fn(async (sql: string, params?: unknown[]) => {
if (sql.includes('pg_stat_statements_info')) {
return { headers: ['stats_reset', 'dealloc'], rows: [['2026-05-01T00:00:00.000Z', 1]] };
}
expect(sql).toContain('GROUP BY queryid, query');
expect(sql).toContain('HAVING SUM(calls) >= $1');
expect(params).toEqual([5]);
return {
headers: ['template_id', 'canonical_sql', 'executions', 'distinct_users', 'mean_ms', 'rows_produced', 'top_users'],
rows: [
[
'922337203685477580',
'16384',
'analyst',
'16385',
'warehouse',
'SELECT count(*) FROM public.orders WHERE status = $1',
'123',
'select status from public.orders',
'42',
'2100.5',
'50.0119',
'9001',
],
[
'922337203685477581',
'16386',
'unknown',
'16385',
'warehouse',
'SELECT * FROM public.customers WHERE id = $1',
5,
30,
6,
5,
'3',
'11.5',
'100',
JSON.stringify([{ user: 'analyst', executions: 40 }]),
],
],
},
{
headers: ['stats_reset', 'dealloc'],
rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]],
},
]);
const reader = new PostgresPgssQueryHistoryReader();
await expect(reader.readSnapshot(client, { minCalls: 5, maxTemplates: 500 })).resolves.toEqual({
statsResetAt: '2026-05-01T00:00:00.000Z',
deallocCount: 7,
rows: [
{
queryid: '922337203685477580',
userid: '16384',
username: 'analyst',
dbid: '16385',
database: 'warehouse',
query: 'SELECT count(*) FROM public.orders WHERE status = $1',
calls: 42,
totalExecTime: 2100.5,
meanExecTime: 50.0119,
totalRows: 9001,
},
{
queryid: '922337203685477581',
userid: '16386',
username: 'unknown',
dbid: '16385',
database: 'warehouse',
query: 'SELECT * FROM public.customers WHERE id = $1',
calls: 5,
totalExecTime: 30,
meanExecTime: 6,
totalRows: 5,
},
],
};
});
const snapshotSql = executedSql(client, 0);
expect(snapshotSql).toContain('FROM pg_stat_statements s');
expect(snapshotSql).toContain('LEFT JOIN pg_roles');
expect(snapshotSql).toContain('LEFT JOIN pg_database');
expect(snapshotSql).toContain('WHERE s.toplevel = true');
expect(snapshotSql).toContain('AND s.calls >= $1');
expect(snapshotSql).toContain('ORDER BY s.total_exec_time DESC');
expect(snapshotSql).toContain('LIMIT $2');
expect(client.executeQuery.mock.calls[0]?.[1]).toEqual([5, 500]);
expect(executedSql(client, 1)).toBe('SELECT stats_reset, dealloc FROM pg_stat_statements_info');
const reader = new PostgresPgssReader();
const rows = [];
for await (const row of reader.fetchAggregated(
{ executeQuery },
{ start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') },
{ dialect: 'postgres', minExecutions: 5, windowDays: 90, concurrency: 12, filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 },
)) {
rows.push(row);
}
expect(rows).toEqual([
{
templateId: '123',
canonicalSql: 'select status from public.orders',
dialect: 'postgres',
stats: {
executions: 42,
distinctUsers: 3,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 11.5,
p95RuntimeMs: 11.5,
errorRate: 0,
rowsProduced: 100,
},
topUsers: [{ user: 'analyst', executions: 40 }],
},
]);
});
});

View file

@ -3,12 +3,13 @@ import {
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import type {
KtxPostgresQueryClient,
PostgresPgssProbeResult,
PostgresPgssReader,
PostgresPgssRow,
PostgresPgssSnapshot,
import {
aggregatedTemplateSchema,
type AggregatedTemplate,
type HistoricSqlTimeWindow,
type HistoricSqlUnifiedPullConfig,
type KtxPostgresQueryClient,
type PostgresPgssProbeResult,
} from './types.js';
interface QueryResultLike {
@ -18,37 +19,35 @@ interface QueryResultLike {
error?: string;
}
const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info';
const VERSION_SQL = `
SELECT current_setting('server_version_num')::int AS server_version_num,
version() AS server_version
`.trim();
const EXTENSION_PROBE_SQL = 'SELECT 1 FROM pg_stat_statements LIMIT 1';
const GRANTS_PROBE_SQL = "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role";
const TRACKING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.track') AS track";
const MAX_SETTING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.max') AS max";
const RECOMMENDED_PGSS_MAX = 5000;
const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info';
const SNAPSHOT_SQL = `
SELECT
s.queryid::text AS queryid,
s.userid::text AS userid,
COALESCE(r.rolname, 'unknown') AS username,
s.dbid::text AS dbid,
d.datname AS database,
s.query,
s.calls,
s.total_exec_time,
s.mean_exec_time,
s.rows AS total_rows
FROM pg_stat_statements s
LEFT JOIN pg_roles r ON s.userid = r.oid
LEFT JOIN pg_database d ON s.dbid = d.oid
WHERE s.toplevel = true
AND s.calls >= $1
ORDER BY s.total_exec_time DESC
LIMIT $2
const AGGREGATE_SQL = `
SELECT queryid::text AS template_id,
query AS canonical_sql,
SUM(calls)::bigint AS executions,
COUNT(DISTINCT userid) AS distinct_users,
SUM(total_exec_time) / NULLIF(SUM(calls), 0) AS mean_ms,
SUM(rows)::bigint AS rows_produced,
COALESCE(
json_agg(json_build_object('user', rolname, 'executions', calls) ORDER BY calls DESC)
FILTER (WHERE userid IS NOT NULL),
'[]'::json
)::text AS top_users
FROM pg_stat_statements
LEFT JOIN pg_roles ON pg_roles.oid = pg_stat_statements.userid
WHERE toplevel = true
GROUP BY queryid, query
HAVING SUM(calls) >= $1
ORDER BY SUM(total_exec_time) DESC
`.trim();
const POSTGRES_EXTENSION_REMEDIATION = [
@ -78,7 +77,7 @@ async function execute(client: KtxPostgresQueryClient, sql: string, params?: unk
return result;
}
function indexes(headers: string[]): Map<string, number> {
function indexByHeader(headers: string[]): Map<string, number> {
const out = new Map<string, number>();
headers.forEach((header, index) => out.set(header.toLowerCase(), index));
return out;
@ -113,12 +112,21 @@ function requiredFiniteNumber(raw: unknown, field: string): number {
return number;
}
function nullableInteger(raw: unknown): number | null {
function requiredInteger(raw: unknown, field: string): number {
return Math.trunc(requiredFiniteNumber(raw, field));
}
function nullableNumber(raw: unknown): number | null {
if (raw === null || raw === undefined || raw === '') {
return null;
}
const number = typeof raw === 'number' ? raw : Number(raw);
return Number.isFinite(number) ? Math.trunc(number) : null;
return Number.isFinite(number) ? number : null;
}
function nullableInteger(raw: unknown): number | null {
const number = nullableNumber(raw);
return number === null ? null : Math.trunc(number);
}
function nullableIsoTimestamp(raw: unknown): string | null {
@ -137,7 +145,7 @@ function firstRow(result: QueryResultLike, context: string): { row: unknown[]; h
if (!row) {
throw new Error(`Postgres historic-SQL ${context} query returned no rows`);
}
return { row, headers: indexes(result.headers) };
return { row, headers: indexByHeader(result.headers) };
}
function isMissingPgssRelation(error: unknown): boolean {
@ -167,22 +175,30 @@ function grantsMissingError(): HistoricSqlGrantsMissingError {
});
}
function mapSnapshotRow(row: unknown[], headerIndexes: Map<string, number>): PostgresPgssRow {
return {
queryid: requiredString(value(row, headerIndexes, 'queryid'), 'queryid'),
userid: requiredString(value(row, headerIndexes, 'userid'), 'userid'),
username: nullableString(value(row, headerIndexes, 'username')),
dbid: requiredString(value(row, headerIndexes, 'dbid'), 'dbid'),
database: nullableString(value(row, headerIndexes, 'database')),
query: requiredString(value(row, headerIndexes, 'query'), 'query'),
calls: Math.trunc(requiredFiniteNumber(value(row, headerIndexes, 'calls'), 'calls')),
totalExecTime: requiredFiniteNumber(value(row, headerIndexes, 'total_exec_time'), 'total_exec_time'),
meanExecTime: requiredFiniteNumber(value(row, headerIndexes, 'mean_exec_time'), 'mean_exec_time'),
totalRows: Math.trunc(requiredFiniteNumber(value(row, headerIndexes, 'total_rows'), 'total_rows')),
};
/**
 * Best-effort decoder for the `top_users` column, a JSON array serialized as text.
 *
 * Returns an empty list when the value is missing, is not valid JSON, or is not
 * an array. Entries that are not objects, or whose `executions` cannot be read
 * as an integer, are dropped.
 */
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
  const serialized = nullableString(raw);
  if (!serialized) {
    return [];
  }
  try {
    const decoded = JSON.parse(serialized) as unknown;
    if (!Array.isArray(decoded)) {
      return [];
    }
    const topUsers: Array<{ user: string | null; executions: number }> = [];
    for (const candidate of decoded) {
      if (!candidate || typeof candidate !== 'object') {
        continue;
      }
      const user = nullableString((candidate as { user?: unknown }).user);
      const executions = nullableInteger((candidate as { executions?: unknown }).executions);
      if (executions !== null) {
        topUsers.push({ user, executions });
      }
    }
    return topUsers;
  } catch {
    // Malformed JSON (or a failing field conversion) yields no top users.
    return [];
  }
}
export class PostgresPgssQueryHistoryReader implements PostgresPgssReader {
export class PostgresPgssReader {
async probe(client: unknown): Promise<PostgresPgssProbeResult> {
const pgClient = queryClient(client);
const versionResult = await execute(pgClient, VERSION_SQL);
@ -231,32 +247,47 @@ export class PostgresPgssQueryHistoryReader implements PostgresPgssReader {
const pgssMax = nullableInteger(value(maxRow, maxHeaders, 'max'));
const warnings: string[] = [];
const info: string[] = [];
if (track === 'none') {
warnings.push('pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config');
}
if (pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX) {
warnings.push(
info.push(
`pg_stat_statements.max is ${pgssMax}; set it to at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`,
);
}
return { pgServerVersion, warnings };
return { pgServerVersion, warnings, info };
}
async readSnapshot(
async *fetchAggregated(
client: unknown,
options: { minCalls: number; maxTemplates: number },
): Promise<PostgresPgssSnapshot> {
window: HistoricSqlTimeWindow,
config: HistoricSqlUnifiedPullConfig,
): AsyncIterable<AggregatedTemplate> {
const pgClient = queryClient(client);
const snapshotResult = await execute(pgClient, SNAPSHOT_SQL, [options.minCalls, options.maxTemplates]);
const snapshotHeaders = indexes(snapshotResult.headers);
const statsResult = await execute(pgClient, STATS_INFO_SQL);
const { row: statsRow, headers: statsHeaders } = firstRow(statsResult, 'stats-info');
return {
statsResetAt: nullableIsoTimestamp(value(statsRow, statsHeaders, 'stats_reset')),
deallocCount: nullableInteger(value(statsRow, statsHeaders, 'dealloc')),
rows: snapshotResult.rows.map((row) => mapSnapshotRow(row, snapshotHeaders)),
};
const firstSeen = nullableIsoTimestamp(value(statsRow, statsHeaders, 'stats_reset')) ?? window.start.toISOString();
const result = await execute(pgClient, AGGREGATE_SQL, [config.minExecutions]);
const indexes = indexByHeader(result.headers);
for (const row of result.rows) {
yield aggregatedTemplateSchema.parse({
templateId: requiredString(value(row, indexes, 'template_id'), 'template_id'),
canonicalSql: requiredString(value(row, indexes, 'canonical_sql'), 'canonical_sql'),
dialect: 'postgres',
stats: {
executions: requiredInteger(value(row, indexes, 'executions'), 'executions'),
distinctUsers: requiredInteger(value(row, indexes, 'distinct_users'), 'distinct_users'),
firstSeen,
lastSeen: window.end.toISOString(),
p50RuntimeMs: nullableNumber(value(row, indexes, 'mean_ms')),
p95RuntimeMs: nullableNumber(value(row, indexes, 'mean_ms')),
errorRate: 0,
rowsProduced: nullableInteger(value(row, indexes, 'rows_produced')),
},
topUsers: parseTopUsers(value(row, indexes, 'top_users')),
});
}
}
}

View file

@ -0,0 +1,372 @@
import { mkdir, mkdtemp, readFile, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import YAML from 'yaml';
import { describe, expect, it } from 'vitest';
import { projectHistoricSqlEvidence } from './projection.js';
/** Creates a fresh temporary directory for one test and resolves to its path. */
async function tempWorkdir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-projection-');
  return mkdtemp(prefix);
}
/** Writes `content` as UTF-8 to `relPath` under `root`, creating any missing parent directories first. */
async function writeText(root: string, relPath: string, content: string): Promise<void> {
  const filePath = join(root, relPath);
  const parentDir = join(filePath, '..');
  await mkdir(parentDir, { recursive: true });
  await writeFile(filePath, content, 'utf-8');
}
/** Writes `value` as pretty-printed JSON (plus a trailing newline) to `relPath` under `root`, via writeText. */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const payload = `${JSON.stringify(value, null, 2)}\n`;
  await writeText(root, relPath, payload);
}
describe('projectHistoricSqlEvidence', () => {
it('merges table usage into matching _schema shards and preserves external usage keys', async () => {
const workdir = await tempWorkdir();
await writeText(
workdir,
'semantic-layer/warehouse/_schema/public.yaml',
YAML.stringify({
tables: {
orders: {
table: 'public.orders',
usage: {
narrative: 'Old generated usage.',
frequencyTier: 'low',
commonFilters: ['old_status'],
commonJoins: [],
ownerNote: 'keep me',
},
columns: [{ name: 'id', type: 'string' }],
},
},
}),
);
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 1,
touchedTableCount: 1,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 90,
});
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/orders.json', {
kind: 'table_usage',
connectionId: 'warehouse',
table: 'public.orders',
rawPath: 'tables/public.orders.json',
usage: {
narrative: 'Orders are repeatedly queried for lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
},
});
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]);
const shard = YAML.parse(await readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8'));
expect(shard.tables.orders.usage).toEqual({
ownerNote: 'keep me',
narrative: 'Orders are repeatedly queried for lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
});
});
it('writes pattern pages, reuses similar slugs, and marks missing old pattern pages stale', async () => {
const workdir = await tempWorkdir();
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 2,
touchedTableCount: 2,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 90,
});
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
await writeText(
workdir,
'knowledge/global/historic-sql/old-order-lifecycle.md',
[
'---',
YAML.stringify({
summary: 'Old order lifecycle page',
tags: ['historic-sql', 'pattern'],
refs: [],
sl_refs: ['orders'],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.orders', 'public.customers'],
fingerprints: ['pg:1'],
}).trimEnd(),
'---',
'',
'Old body',
'',
].join('\n'),
);
await writeText(
workdir,
'knowledge/global/historic-sql/retired-pattern.md',
[
'---',
YAML.stringify({
summary: 'Retired pattern',
tags: ['historic-sql', 'pattern'],
refs: [],
sl_refs: [],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.tickets'],
fingerprints: ['pg:9'],
}).trimEnd(),
'---',
'',
'Retired body',
'',
].join('\n'),
);
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', {
kind: 'pattern',
connectionId: 'warehouse',
rawPath: 'patterns-input.json',
pattern: {
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Analysts compare order status with customer segment.',
definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['pg:1', 'pg:2'],
},
});
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.patternPagesWritten).toBe(1);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/old-order-lifecycle.md'), 'utf-8')).resolves.toContain(
'Order Lifecycle Analysis',
);
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/retired-pattern.md'), 'utf-8')).resolves.toContain(
'stale_since: "2026-05-11T00:00:00.000Z"',
);
});
// Regression guard: a pattern that reappears after its page was archived must be written
// to its own active slug; the archived copy stays where it is and is never re-nested.
it('writes a reappearing pattern to the active slug instead of reusing an archived page key', async () => {
const workdir = await tempWorkdir();
// Staged manifest for sync-1 with a 30-day archive threshold.
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 2,
touchedTableCount: 2,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 30,
});
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.orders.json', { table: 'public.orders' });
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/tables/public.customers.json', { table: 'public.customers' });
// Pre-existing archived page with the same slug; its tables and the pg:1 fingerprint
// overlap with the incoming pattern written below.
await writeText(
workdir,
'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md',
[
'---',
YAML.stringify({
summary: 'Archived order lifecycle page',
tags: ['historic-sql', 'pattern', 'archived'],
refs: [],
sl_refs: ['orders'],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.orders', 'public.customers'],
fingerprints: ['pg:1'],
stale_since: '2026-01-01T00:00:00.000Z',
}).trimEnd(),
'---',
'',
'Archived body',
'',
].join('\n'),
);
// Pattern evidence for run-1 that uses the same slug as the archived page.
await writeJson(workdir, '.ktx/ingest-evidence/historic-sql/run-1/pattern.json', {
kind: 'pattern',
connectionId: 'warehouse',
rawPath: 'patterns-input.json',
pattern: {
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Analysts compare order status with customer segment again.',
definitionSql: 'select * from public.orders join public.customers on customers.id = orders.customer_id',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['pg:1', 'pg:2'],
},
});
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.patternPagesWritten).toBe(1);
// The new page lands at the active (non-archived) path.
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain(
'Order Lifecycle Analysis',
);
// The archived copy keeps its original body.
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/order-lifecycle-analysis.md'), 'utf-8')).resolves.toContain(
'Archived body',
);
// No doubly nested archive directory may be created.
await expect(
readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/order-lifecycle-analysis.md'), 'utf-8'),
).rejects.toMatchObject({ code: 'ENOENT' });
});
// An archived page whose pattern is still absent must be left alone: it is neither
// re-archived nor re-marked stale, even though its stale_since is far older than the
// 30-day threshold in the manifest.
it('leaves already archived pattern pages stable when they are still absent', async () => {
const workdir = await tempWorkdir();
// Empty snapshot: no staged tables and no evidence files are written for run-1.
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 0,
touchedTableCount: 0,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 30,
});
await writeText(
workdir,
'knowledge/global/historic-sql/_archived/retired-pattern.md',
[
'---',
YAML.stringify({
summary: 'Retired pattern',
tags: ['historic-sql', 'pattern', 'archived'],
refs: [],
sl_refs: [],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.tickets'],
fingerprints: ['pg:9'],
stale_since: '2026-01-01T00:00:00.000Z',
}).trimEnd(),
'---',
'',
'Archived retired body',
'',
].join('\n'),
);
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.archivedPatternPages).toBe(0);
expect(result.stalePatternPagesMarked).toBe(0);
// The archived file is still present with its original body ...
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/retired-pattern.md'), 'utf-8')).resolves.toContain(
'Archived retired body',
);
// ... and was not moved one level deeper.
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/_archived/_archived/retired-pattern.md'), 'utf-8')).rejects.toMatchObject({
code: 'ENOENT',
});
});
// Covers two clean-up paths of the projection in one run:
//  - a schema table with existing usage but no evidence and no staged table file is marked stale;
//  - a legacy page tagged 'query-pattern' (without 'pattern') is deleted.
it('marks missing table usage stale and deletes legacy historic SQL query pages', async () => {
const workdir = await tempWorkdir();
// Schema shard with prior usage plus an extra analyst-owned key (ownerNote).
await writeText(
workdir,
'semantic-layer/warehouse/_schema/public.yaml',
YAML.stringify({
tables: {
orders: {
table: 'public.orders',
usage: {
narrative: 'Orders were active before.',
frequencyTier: 'high',
commonFilters: ['status'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
ownerNote: 'keep analyst annotation',
},
columns: [{ name: 'id', type: 'string' }],
},
},
}),
);
// Empty snapshot: no tables/*.json files are staged, so public.orders counts as missing.
await writeJson(workdir, 'raw-sources/warehouse/historic-sql/sync-1/manifest.json', {
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 0,
touchedTableCount: 0,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 90,
});
// Legacy per-template page: tagged 'query-pattern' and not 'pattern'.
await writeText(
workdir,
'knowledge/global/historic-sql/legacy-template.md',
[
'---',
YAML.stringify({
summary: 'Legacy template page',
tags: ['historic-sql', 'query-pattern'],
refs: [],
sl_refs: ['orders'],
usage_mode: 'auto',
source: 'historic-sql',
tables: ['public.orders'],
fingerprints: ['legacy:1'],
}).trimEnd(),
'---',
'',
'Legacy body',
'',
].join('\n'),
);
const result = await projectHistoricSqlEvidence({ workdir, connectionId: 'warehouse', syncId: 'sync-1', runId: 'run-1' });
expect(result.staleTablesMarked).toBe(1);
expect(result.legacyPagesDeleted).toBe(1);
expect(result.touchedSources).toEqual([{ connectionId: 'warehouse', sourceName: 'orders' }]);
const shard = YAML.parse(await readFile(join(workdir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8'));
// Usage is reset to the stale placeholder while the analyst-owned ownerNote survives.
expect(shard.tables.orders.usage).toEqual({
ownerNote: 'keep analyst annotation',
narrative: 'No recent historic SQL usage was observed in the latest snapshot.',
frequencyTier: 'unused',
commonFilters: [],
commonGroupBys: [],
commonJoins: [],
staleSince: '2026-05-11T00:00:00.000Z',
});
await expect(readFile(join(workdir, 'knowledge/global/historic-sql/legacy-template.md'), 'utf-8')).rejects.toMatchObject({
code: 'ENOENT',
});
});
});

View file

@ -0,0 +1,334 @@
import { access, mkdir, readdir, readFile, rename, rm, writeFile } from 'node:fs/promises';
import { dirname, join, relative } from 'node:path';
import YAML from 'yaml';
import { rawSourcesDirForSync } from '../../raw-sources-paths.js';
import { mergeUsagePreservingExternal } from '../live-database/manifest.js';
import { historicSqlEvidenceEnvelopeSchema, type HistoricSqlEvidenceEnvelope } from './evidence.js';
import type { TableUsageOutput } from './skill-schemas.js';
import { stagedManifestSchema } from './types.js';
/** Identifies the staged sync and evidence run that `projectHistoricSqlEvidence` should project. */
export interface HistoricSqlProjectionInput {
/** Root directory under which raw-sources, semantic-layer, knowledge and .ktx paths are resolved. */
workdir: string;
/** Connection whose raw-sources directory and `semantic-layer/<connectionId>/_schema` shards are used. */
connectionId: string;
/** Sync identifier used to locate the staged raw-sources directory (manifest.json and tables/). */
syncId: string;
/** Run identifier selecting `.ktx/ingest-evidence/historic-sql/<runId>`. */
runId: string;
}
/** Counters and side-channel data reported by `projectHistoricSqlEvidence`. */
export interface HistoricSqlProjectionResult {
/** Schema tables whose `usage` changed after merging matching table-usage evidence. */
tableUsageMerged: number;
/** Schema tables with prior usage, no evidence and no staged table file whose usage was replaced by the stale placeholder. */
staleTablesMarked: number;
/** Pattern pages written, one per pattern evidence envelope. */
patternPagesWritten: number;
/** Active pattern pages not written in this run that were tagged `stale`. */
stalePatternPagesMarked: number;
/** Active pattern pages moved under `_archived/`. */
archivedPatternPages: number;
/** Legacy query pages removed from the wiki directory. */
legacyPagesDeleted: number;
/** De-duplicated connection/source pairs whose table usage was changed. */
touchedSources: Array<{ connectionId: string; sourceName: string }>;
/** Initialised empty; NOTE(review): nothing in the visible projection code appends to it. */
warnings: string[];
}
/**
 * Shape assumed for a parsed `_schema` YAML shard. Tables are keyed by table name;
 * an entry's optional `table` holds the table reference and the key is used when it
 * is absent. Unknown keys on an entry are allowed.
 */
interface ManifestShard {
tables?: Record<string, { table?: string; usage?: Record<string, unknown>; columns?: unknown[]; [key: string]: unknown }>;
}
/** A markdown page from the historic-sql wiki directory, split into frontmatter and body. */
interface HistoricSqlPatternPage {
/** Path relative to the wiki root with the `.md` suffix removed (e.g. `_archived/foo`). */
key: string;
/** Path of the file on disk (wiki root joined with the relative file name). */
path: string;
/** Parsed YAML frontmatter; an empty object when the frontmatter parses to null/undefined. */
frontmatter: Record<string, unknown>;
/** Everything after the closing `---` of the frontmatter. */
content: string;
}
/**
 * Turns an arbitrary string into a wiki page key: lower-cased, with every run of
 * characters outside `a-z`, `0-9`, `/` and `-` collapsed to a single `-`, and
 * leading/trailing dashes stripped.
 */
function safeKnowledgeSlug(value: string): string {
  const lowered = value.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9/-]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return trimmed;
}
/**
 * Resolves to `true` when `access(path)` succeeds and to `false` when it rejects
 * for any reason.
 */
async function pathExists(path: string): Promise<boolean> {
  return access(path).then(
    () => true,
    () => false,
  );
}
/**
 * Recursively lists the regular files below `root`.
 *
 * @returns Paths relative to `root`, using `/` as separator, sorted with the default
 *   array sort. An empty array when `root` does not exist.
 */
async function walkFiles(root: string): Promise<string[]> {
  const rootPresent = await pathExists(root);
  if (!rootPresent) {
    return [];
  }

  // Depth-first collection; each directory returns the files found beneath it.
  const collect = async (dir: string): Promise<string[]> => {
    const found: string[] = [];
    for (const dirent of await readdir(dir, { withFileTypes: true })) {
      const fullPath = join(dir, dirent.name);
      if (dirent.isDirectory()) {
        for (const nested of await collect(fullPath)) {
          found.push(nested);
        }
      } else if (dirent.isFile()) {
        // Normalise Windows separators so keys are stable across platforms.
        found.push(relative(root, fullPath).replace(/\\/g, '/'));
      }
    }
    return found;
  };

  const files = await collect(root);
  return files.sort();
}
/** Reads `path` as UTF-8 text and returns the parsed JSON as an unvalidated `unknown`. */
async function readJson(path: string): Promise<unknown> {
  const text = await readFile(path, 'utf-8');
  const parsed: unknown = JSON.parse(text);
  return parsed;
}
/**
 * Serialises `value` as YAML (2-space indent, no line wrapping) and writes it to `path`
 * by first writing `<path>.tmp` and then renaming it over the target. The parent
 * directory is created when missing.
 */
async function writeYamlAtomic(path: string, value: unknown): Promise<void> {
  const parentDir = dirname(path);
  await mkdir(parentDir, { recursive: true });

  const stagingPath = `${path}.tmp`;
  const serialized = YAML.stringify(value, { indent: 2, lineWidth: 0 });
  await writeFile(stagingPath, serialized, 'utf-8');
  await rename(stagingPath, path);
}
/**
 * Returns the last non-empty dot-separated segment of a table reference
 * (e.g. `public.orders` -> `orders`). Falls back to the input when no
 * non-empty segment exists.
 */
function tableSourceName(tableRef: string): string {
  const segments = tableRef.split('.').filter((segment) => segment.length > 0);
  if (segments.length === 0) {
    return tableRef;
  }
  return segments[segments.length - 1] ?? tableRef;
}
/**
 * Builds the placeholder usage block for a table that was not observed in the
 * latest snapshot: tier `unused`, empty filter/group-by/join lists, and
 * `staleSince` set to the given fetch timestamp.
 */
function staleUsage(fetchedAt: string) {
  const frequencyTier = 'unused' as const;
  const narrative = 'No recent historic SQL usage was observed in the latest snapshot.';
  return {
    narrative,
    frequencyTier,
    commonFilters: [],
    commonGroupBys: [],
    commonJoins: [],
    staleSince: fetchedAt,
  };
}
/**
 * Loads every `.json` file below `.ktx/ingest-evidence/historic-sql/<runId>` and
 * validates each one with `historicSqlEvidenceEnvelopeSchema`.
 *
 * @throws When a file is not valid JSON or fails schema validation.
 */
async function loadEvidence(workdir: string, runId: string): Promise<HistoricSqlEvidenceEnvelope[]> {
  const evidenceRoot = join(workdir, '.ktx/ingest-evidence/historic-sql', runId);
  const jsonFiles = (await walkFiles(evidenceRoot)).filter((name) => name.endsWith('.json'));

  const envelopes: HistoricSqlEvidenceEnvelope[] = [];
  for (const name of jsonFiles) {
    const raw = await readJson(join(evidenceRoot, name));
    envelopes.push(historicSqlEvidenceEnvelopeSchema.parse(raw));
  }
  return envelopes;
}
/**
 * Renders the markdown body of a pattern page: title heading, narrative, the
 * representative SQL in a fenced `sql` block, then bullet lists of the involved
 * tables and the constituent template ids. The result ends with a newline.
 */
function renderPatternMarkdown(pattern: HistoricSqlEvidenceEnvelope & { kind: 'pattern' }): string {
  const { title, narrative, definitionSql, tablesInvolved, constituentTemplateIds } = pattern.pattern;
  const bullets = (items: readonly string[]): string[] => items.map((item) => `- ${item}`);

  const lines: string[] = [
    `# ${title}`,
    '',
    narrative,
    '',
    '## Representative SQL',
    '',
    '```sql',
    definitionSql,
    '```',
    '',
    '## Tables',
    '',
    ...bullets(tablesInvolved),
    '',
    '## Constituent Templates',
    '',
    ...bullets(constituentTemplateIds),
    '',
  ];
  return lines.join('\n');
}
/**
 * Fraction of entries in `left` that also occur in `right` (duplicates in `left`
 * are counted individually). Returns 0 when `left` is empty.
 */
function overlapRatio(left: string[], right: string[]): number {
  if (left.length === 0) {
    return 0;
  }
  const lookup = new Set(right);
  let shared = 0;
  for (const candidate of left) {
    if (lookup.has(candidate)) {
      shared += 1;
    }
  }
  return shared / left.length;
}
/**
 * Splits a markdown file into YAML frontmatter and body.
 *
 * @returns `null` when `raw` does not start with a `---` delimited frontmatter
 *   block; otherwise the page with parsed frontmatter (an empty object when the
 *   YAML parses to null/undefined) and the remaining text as `content`.
 */
function parseMarkdownPage(key: string, path: string, raw: string): HistoricSqlPatternPage | null {
  const parts = /^---\n([\s\S]*?)\n---\n?([\s\S]*)$/.exec(raw);
  if (parts === null) {
    return null;
  }
  const [, yamlText = '', body = ''] = parts;
  const frontmatter = (YAML.parse(yamlText) ?? {}) as Record<string, unknown>;
  return { key, path, frontmatter, content: body };
}
/**
 * True when the page's frontmatter has `source: historic-sql` and a `tags` array
 * containing both `historic-sql` and `pattern`.
 */
function isHistoricPatternPage(page: HistoricSqlPatternPage): boolean {
  const { source, tags } = page.frontmatter;
  if (source !== 'historic-sql' || !Array.isArray(tags)) {
    return false;
  }
  return tags.includes('historic-sql') && tags.includes('pattern');
}
/**
 * True for historic-sql pages tagged `query-pattern` that do not also carry the
 * `pattern` tag.
 */
function isLegacyQueryPage(page: HistoricSqlPatternPage): boolean {
  const { source, tags } = page.frontmatter;
  if (source !== 'historic-sql') {
    return false;
  }
  const tagList: unknown[] = Array.isArray(tags) ? tags : [];
  if (tagList.includes('pattern')) {
    return false;
  }
  return tagList.includes('query-pattern');
}
/**
 * True when the page lives under the `_archived/` key prefix or its `tags` array
 * contains `archived`.
 */
function isArchivedPatternPage(page: HistoricSqlPatternPage): boolean {
  if (page.key.startsWith('_archived/')) {
    return true;
  }
  const { tags } = page.frontmatter;
  return Array.isArray(tags) && tags.includes('archived');
}
/** Returns the string entries of `value` when it is an array, otherwise an empty array. */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const strings: string[] = [];
  for (const entry of value) {
    if (typeof entry === 'string') {
      strings.push(entry);
    }
  }
  return strings;
}
/**
 * Renders a page as `---`-delimited YAML frontmatter followed by a blank line and
 * the trimmed content, ending with a single newline.
 *
 * When `stale_since` is a string, the first occurrence of its serialised
 * `stale_since: <value>` text is rewritten with the value in double quotes.
 */
function renderMarkdownPage(frontmatter: Record<string, unknown>, content: string): string {
  const serialized = YAML.stringify(frontmatter, { indent: 2, lineWidth: 0 }).trimEnd();
  const staleSince = frontmatter.stale_since;
  const yaml =
    typeof staleSince === 'string'
      ? serialized.replace(`stale_since: ${staleSince}`, `stale_since: "${staleSince}"`)
      : serialized;
  return ['---', yaml, '---', '', content.trim(), ''].join('\n');
}
/**
 * Collects the matching signals stored on a page: the string entries of its
 * `tables` frontmatter followed by those of its `fingerprints` frontmatter.
 */
function existingPageSignals(page: HistoricSqlPatternPage): string[] {
  const { tables, fingerprints } = page.frontmatter;
  return stringArray(tables).concat(stringArray(fingerprints));
}
/**
 * Decides whether a stale page is old enough to archive.
 *
 * @param staleSince Frontmatter value; anything but a parseable date string yields `false`.
 * @param fetchedAt Timestamp of the current snapshot; unparseable values yield `false`.
 * @param days Threshold; the page qualifies only when strictly more than this many
 *   days lie between `staleSince` and `fetchedAt`.
 */
function shouldArchive(staleSince: unknown, fetchedAt: string, days: number): boolean {
  if (typeof staleSince !== 'string') {
    return false;
  }
  const staleMs = Date.parse(staleSince);
  const fetchedMs = Date.parse(fetchedAt);
  const comparable = Number.isFinite(staleMs) && Number.isFinite(fetchedMs);
  if (!comparable) {
    return false;
  }
  const thresholdMs = days * 24 * 60 * 60 * 1000;
  const elapsedMs = fetchedMs - staleMs;
  return elapsedMs > thresholdMs;
}
/**
 * Reads every `.md` file below `root` and returns those that parse as frontmatter
 * pages. A page's key is its path relative to `root` without the `.md` suffix.
 */
async function loadPatternPages(root: string): Promise<HistoricSqlPatternPage[]> {
  const markdownFiles = (await walkFiles(root)).filter((name) => name.endsWith('.md'));

  const parsedPages: HistoricSqlPatternPage[] = [];
  for (const name of markdownFiles) {
    const absolute = join(root, name);
    const text = await readFile(absolute, 'utf-8');
    const page = parseMarkdownPage(name.replace(/\.md$/, ''), absolute, text);
    if (page !== null) {
      parsedPages.push(page);
    }
  }
  return parsedPages;
}
/**
 * Collects the `table` string of every `.json` file below `<rawDir>/tables`.
 * Files whose JSON is not an object with a string `table` property are ignored.
 */
async function currentStagedTables(rawDir: string): Promise<Set<string>> {
  const tablesRoot = join(rawDir, 'tables');
  const jsonFiles = (await walkFiles(tablesRoot)).filter((name) => name.endsWith('.json'));

  const staged = new Set<string>();
  for (const name of jsonFiles) {
    const parsed = await readJson(join(tablesRoot, name));
    if (typeof parsed !== 'object' || parsed === null) continue;
    if (!('table' in parsed)) continue;
    const { table } = parsed;
    if (typeof table === 'string') {
      staged.add(table);
    }
  }
  return staged;
}
/**
 * Projects one historic-SQL ingest run onto the workdir.
 *
 * Steps, in order:
 *  1. Read the staged manifest, the staged table files and the run's evidence envelopes.
 *  2. For every `_schema` shard of the connection, merge matching table-usage evidence
 *     into each table; tables with prior usage, no evidence and no staged table file
 *     receive the stale placeholder usage. A shard is rewritten only when one of its
 *     tables actually changed.
 *  3. Write one wiki page per pattern evidence envelope, reusing the key of an active
 *     pattern page when at least 60% of the incoming tables/template ids overlap with it.
 *  4. Active pattern pages not written in this run are archived once their
 *     `stale_since` is older than `manifest.staleArchiveAfterDays`, otherwise tagged
 *     `stale`. An existing valid `stale_since` is kept so that the archive window is
 *     measured from the first run in which the page went stale.
 *  5. Delete legacy query pages.
 *
 * @param input Workdir plus the connection, sync and run identifiers to project.
 * @returns Counters for each kind of change and the list of touched sources.
 * @throws When the manifest or an evidence file is missing, is not valid JSON or fails
 *   schema validation, or when a file operation fails.
 */
export async function projectHistoricSqlEvidence(input: HistoricSqlProjectionInput): Promise<HistoricSqlProjectionResult> {
  const result: HistoricSqlProjectionResult = {
    tableUsageMerged: 0,
    staleTablesMarked: 0,
    patternPagesWritten: 0,
    stalePatternPagesMarked: 0,
    archivedPatternPages: 0,
    legacyPagesDeleted: 0,
    touchedSources: [],
    warnings: [],
  };

  // Records a source once per connection/source pair, preserving first-seen order.
  const touchedKeys = new Set<string>();
  const recordTouchedSource = (sourceName: string): void => {
    const key = `${input.connectionId}:${sourceName}`;
    if (touchedKeys.has(key)) return;
    touchedKeys.add(key);
    result.touchedSources.push({ connectionId: input.connectionId, sourceName });
  };

  const rawDir = join(input.workdir, rawSourcesDirForSync(input.connectionId, 'historic-sql', input.syncId));
  const manifest = stagedManifestSchema.parse(await readJson(join(rawDir, 'manifest.json')));
  const currentTables = await currentStagedTables(rawDir);
  const evidence = await loadEvidence(input.workdir, input.runId);
  const tableEvidence = evidence.filter(
    (entry): entry is HistoricSqlEvidenceEnvelope & { kind: 'table_usage' } => entry.kind === 'table_usage',
  );
  const patternEvidence = evidence.filter(
    (entry): entry is HistoricSqlEvidenceEnvelope & { kind: 'pattern' } => entry.kind === 'pattern',
  );

  // --- Semantic-layer schema shards -------------------------------------------------
  const schemaRoot = join(input.workdir, 'semantic-layer', input.connectionId, '_schema');
  const shardFiles = (await walkFiles(schemaRoot)).filter(
    (candidate) => candidate.endsWith('.yaml') || candidate.endsWith('.yml'),
  );
  for (const file of shardFiles) {
    const path = join(schemaRoot, file);
    const shard = (YAML.parse(await readFile(path, 'utf-8')) ?? {}) as ManifestShard;
    if (!shard.tables) continue;

    // Track real mutations instead of diffing re-serialised YAML against the raw file
    // text: a shard whose formatting differs from YAML.stringify output would otherwise
    // be rewritten (and lose its comments) on every run even when nothing changed.
    let shardChanged = false;
    for (const [tableName, entry] of Object.entries(shard.tables)) {
      const tableRef = entry.table ?? tableName;
      const matchingEvidence = tableEvidence.find(
        (candidate) => candidate.table === tableRef || tableSourceName(candidate.table) === tableName,
      );
      if (matchingEvidence) {
        const merged = mergeUsagePreservingExternal(entry.usage as TableUsageOutput | undefined, matchingEvidence.usage);
        if (JSON.stringify(entry.usage ?? null) !== JSON.stringify(merged ?? null)) {
          entry.usage = merged as Record<string, unknown>;
          shardChanged = true;
          result.tableUsageMerged += 1;
          recordTouchedSource(tableSourceName(matchingEvidence.table));
        }
      } else if (entry.usage && !currentTables.has(tableRef)) {
        // Table had usage before but is absent from the latest snapshot.
        const merged = mergeUsagePreservingExternal(entry.usage as TableUsageOutput | undefined, staleUsage(manifest.fetchedAt));
        if (JSON.stringify(entry.usage ?? null) !== JSON.stringify(merged ?? null)) {
          entry.usage = merged as Record<string, unknown>;
          shardChanged = true;
          result.staleTablesMarked += 1;
          recordTouchedSource(tableSourceName(tableRef));
        }
      }
    }
    if (shardChanged) {
      await writeYamlAtomic(path, shard);
    }
  }

  // --- Wiki pattern pages -----------------------------------------------------------
  const wikiRoot = join(input.workdir, 'knowledge/global/historic-sql');
  await mkdir(wikiRoot, { recursive: true });
  const allPages = await loadPatternPages(wikiRoot);
  // Archived pages are never reused, re-marked or re-archived.
  const activePages = allPages.filter((page) => !isArchivedPatternPage(page));
  const patternPages = activePages.filter(isHistoricPatternPage);
  const writtenKeys = new Set<string>();

  for (const pattern of patternEvidence) {
    const incomingSignals = [...pattern.pattern.tablesInvolved, ...pattern.pattern.constituentTemplateIds];
    const reusable = patternPages.find((page) => overlapRatio(incomingSignals, existingPageSignals(page)) >= 0.6);
    const key = reusable?.key ?? safeKnowledgeSlug(pattern.pattern.slug);
    const pagePath = join(wikiRoot, `${key}.md`);
    // Fresh frontmatter: rewriting a page drops any earlier `stale` tag / `stale_since`.
    const frontmatter = {
      summary: pattern.pattern.title,
      tags: ['historic-sql', 'pattern'],
      refs: [],
      sl_refs: pattern.pattern.slRefs,
      usage_mode: 'auto',
      source: 'historic-sql',
      tables: pattern.pattern.tablesInvolved,
      representative_sql: pattern.pattern.definitionSql,
      fingerprints: pattern.pattern.constituentTemplateIds,
    };
    await mkdir(dirname(pagePath), { recursive: true });
    await writeFile(pagePath, renderMarkdownPage(frontmatter, renderPatternMarkdown(pattern)), 'utf-8');
    writtenKeys.add(key);
    result.patternPagesWritten += 1;
  }

  for (const page of patternPages) {
    if (writtenKeys.has(page.key)) continue;

    const previousStaleSince = page.frontmatter.stale_since;
    if (shouldArchive(previousStaleSince, manifest.fetchedAt, manifest.staleArchiveAfterDays)) {
      const archivePath = join(wikiRoot, '_archived', `${page.key}.md`);
      const tags = [...new Set([...stringArray(page.frontmatter.tags), 'archived'])];
      await mkdir(dirname(archivePath), { recursive: true });
      await writeFile(archivePath, renderMarkdownPage({ ...page.frontmatter, tags }, page.content), 'utf-8');
      await rm(page.path, { force: true });
      result.archivedPatternPages += 1;
      continue;
    }

    // Keep the timestamp of the run in which the page first went stale. Resetting it to
    // the current fetch time on every run would restart the archive clock each sync and
    // the page would never reach `staleArchiveAfterDays`.
    const staleSince =
      typeof previousStaleSince === 'string' && Number.isFinite(Date.parse(previousStaleSince))
        ? previousStaleSince
        : manifest.fetchedAt;
    const tags = [...new Set([...stringArray(page.frontmatter.tags), 'stale'])];
    await writeFile(
      page.path,
      renderMarkdownPage({ ...page.frontmatter, tags, stale_since: staleSince }, page.content),
      'utf-8',
    );
    result.stalePatternPagesMarked += 1;
  }

  // --- Legacy per-template pages ----------------------------------------------------
  for (const page of allPages.filter(isLegacyQueryPage)) {
    await rm(page.path, { force: true });
    result.legacyPagesDeleted += 1;
  }

  return result;
}

View file

@ -0,0 +1,36 @@
import { describe, expect, it } from 'vitest';
import { compileHistoricSqlRedactionPatterns, redactHistoricSqlText } from './redaction.js';
// Unit tests for the redaction helpers: pattern compilation (including the `(?i)`
// prefix and invalid/empty entries) and text replacement with `[REDACTED]`.
describe('historic-SQL redaction', () => {
it('redacts regex matches and supports the (?i) case-insensitive prefix', () => {
const redactors = compileHistoricSqlRedactionPatterns([
'sk_live_[A-Za-z0-9]+',
'(?i)secret_token_[a-z0-9]+',
]);
// 'Secret_Token_9f' only matches the second pattern because of the (?i) prefix.
const sql =
"select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'";
expect(redactHistoricSqlText(sql, redactors)).toBe(
"select * from public.api_events where api_key = '[REDACTED]' and note = '[REDACTED]'",
);
});
it('returns the original SQL text when no redaction patterns are configured', () => {
const sql = "select * from public.orders where status = 'paid'";
expect(redactHistoricSqlText(sql, compileHistoricSqlRedactionPatterns([]))).toBe(sql);
});
it('throws a config-focused error for invalid redaction regex patterns', () => {
// The error message names the config key and echoes the offending entry.
expect(() => compileHistoricSqlRedactionPatterns(['[broken'])).toThrow(
'Invalid historicSql.redactionPatterns entry "[broken"',
);
});
it('throws a config-focused error for empty redaction regex patterns', () => {
// Whitespace-only entries are rejected; the message echoes the entry as supplied.
expect(() => compileHistoricSqlRedactionPatterns([' '])).toThrow(
'Invalid historicSql.redactionPatterns entry " "',
);
});
});

View file

@ -0,0 +1,37 @@
/** A compiled redaction rule produced by `compileHistoricSqlRedactionPatterns`. */
export interface HistoricSqlRedactionPattern {
/** The configuration entry exactly as supplied (not trimmed, `(?i)` prefix included). */
pattern: string;
/** Global regular expression compiled from the entry; also case-insensitive when the entry used the `(?i)` prefix. */
expression: RegExp;
}
// Leading marker on a configured pattern that requests case-insensitive matching;
// it is stripped before the pattern is compiled.
const CASE_INSENSITIVE_PREFIX = '(?i)';
// Text substituted for every redaction match.
const REDACTION_TOKEN = '[REDACTED]';
/**
 * Compiles configured redaction patterns into global regular expressions.
 *
 * Each entry is trimmed; a leading `(?i)` is removed and turns on the `i` flag.
 *
 * @param patterns Raw `historicSql.redactionPatterns` entries.
 * @returns One compiled rule per entry, in input order, each keeping the original entry text.
 * @throws Error naming the offending entry when it is empty after trimming/prefix
 *   removal or is not a valid regular expression.
 */
export function compileHistoricSqlRedactionPatterns(patterns: readonly string[]): HistoricSqlRedactionPattern[] {
  const compiled: HistoricSqlRedactionPattern[] = [];
  for (const pattern of patterns) {
    const normalized = pattern.trim();
    let flags = 'g';
    let body = normalized;
    if (normalized.startsWith(CASE_INSENSITIVE_PREFIX)) {
      flags = 'gi';
      body = normalized.slice(CASE_INSENSITIVE_PREFIX.length);
    }
    if (body.length === 0) {
      throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": pattern must not be empty`);
    }

    let expression: RegExp;
    try {
      expression = new RegExp(body, flags);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Invalid historicSql.redactionPatterns entry "${pattern}": ${reason}`);
    }
    compiled.push({ pattern, expression });
  }
  return compiled;
}
/**
 * Applies every redactor to `text` in order, replacing each match with the
 * redaction token. Returns `text` unchanged when `redactors` is empty.
 */
export function redactHistoricSqlText(text: string, redactors: readonly HistoricSqlRedactionPattern[]): string {
  return redactors.reduce((current, { expression }) => {
    // Reset the shared global regex before use.
    expression.lastIndex = 0;
    return current.replace(expression, REDACTION_TOKEN);
  }, text);
}

View file

@ -0,0 +1,74 @@
import { describe, expect, it } from 'vitest';
import { z } from 'zod';
import {
patternOutputSchema,
patternsArraySchema,
tableUsageOutputSchema,
} from './skill-schemas.js';
// Contract tests for the zod schemas that validate skill output: table usage blocks
// and pattern definitions, plus their convertibility to JSON schema.
describe('historic-sql skill schemas', () => {
it('accepts table usage output and preserves future keys', () => {
// `analystNote` is not declared on the schema; it must survive parsing.
const parsed = tableUsageOutputSchema.parse({
narrative: 'Orders are queried for paid/refunded lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
analystNote: 'preserve me',
});
expect(parsed).toMatchObject({
narrative: 'Orders are queried for paid/refunded lifecycle analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: null,
analystNote: 'preserve me',
});
});
it('rejects invalid frequency tiers', () => {
// 'sometimes' is not one of the allowed tiers.
const result = tableUsageOutputSchema.safeParse({
narrative: 'Orders are queried often.',
frequencyTier: 'sometimes',
commonFilters: [],
commonJoins: [],
});
expect(result.success).toBe(false);
});
it('accepts pattern outputs used for wiki projection', () => {
const parsed = patternsArraySchema.parse([
{
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Teams inspect order status by customer and month.',
definitionSql: 'select status, count(*) from public.orders group by status',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['template_1', 'template_2'],
},
]);
// Strict equality: the parsed value carries exactly the declared fields.
expect(parsed[0]).toEqual({
slug: 'order-lifecycle-analysis',
title: 'Order Lifecycle Analysis',
narrative: 'Teams inspect order status by customer and month.',
definitionSql: 'select status, count(*) from public.orders group by status',
tablesInvolved: ['public.orders', 'public.customers'],
slRefs: ['orders', 'customers'],
constituentTemplateIds: ['template_1', 'template_2'],
});
});
it('exports zod schemas that can produce JSON schema for prompt prefixes', () => {
const tableUsageJsonSchema = z.toJSONSchema(tableUsageOutputSchema);
const patternJsonSchema = z.toJSONSchema(patternOutputSchema);
expect(tableUsageJsonSchema).toMatchObject({ type: 'object' });
expect(patternJsonSchema).toMatchObject({ type: 'object' });
});
});

View file

@ -0,0 +1,31 @@
import { z } from 'zod';
/**
 * Schema for the per-table usage block.
 *
 * `narrative`, `frequencyTier`, `commonFilters` and `commonJoins` are required;
 * `commonGroupBys` and `staleSince` are optional. Keys not declared here are kept on
 * the parsed value (`passthrough`).
 */
export const tableUsageOutputSchema = z
.object({
narrative: z.string(),
frequencyTier: z.enum(['high', 'mid', 'low', 'unused']),
commonFilters: z.array(z.string()),
commonGroupBys: z.array(z.string()).optional(),
commonJoins: z.array(
z.object({
table: z.string(),
on: z.array(z.string()),
}),
),
// ISO datetime string, `null`, or absent.
staleSince: z.iso.datetime().nullable().optional(),
})
.passthrough();
/** Parsed shape of `tableUsageOutputSchema`. */
export type TableUsageOutput = z.infer<typeof tableUsageOutputSchema>;
/** Schema for a single query pattern; every field is required. */
export const patternOutputSchema = z.object({
slug: z.string(),
title: z.string(),
narrative: z.string(),
definitionSql: z.string(),
tablesInvolved: z.array(z.string()),
slRefs: z.array(z.string()),
constituentTemplateIds: z.array(z.string()),
});
/** Parsed shape of `patternOutputSchema`. */
export type PatternOutput = z.infer<typeof patternOutputSchema>;
/** Schema for a list of patterns. */
export const patternsArraySchema = z.array(patternOutputSchema);

View file

@ -33,7 +33,7 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => {
const client = queryClient([{ headers: ['1'], rows: [[1]], totalRows: 1 }]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(reader.probe(client)).resolves.toBeUndefined();
await expect(reader.probe(client)).resolves.toEqual({ warnings: [], info: [] });
expect(client.executeQuery).toHaveBeenCalledWith(
'SELECT 1 FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY LIMIT 1',
@ -62,130 +62,85 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => {
await expect(reader.probe(client)).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
it('fetches query-history rows with cursor and maps them into RawQueryRow shape', async () => {
it('fetches aggregated Snowflake query templates', async () => {
const client = queryClient([
{
headers: [
'QUERY_ID',
'QUERY_TEXT',
'USER_NAME',
'ROLE_NAME',
'WAREHOUSE_NAME',
'DATABASE_NAME',
'SCHEMA_NAME',
'START_TIME',
'END_TIME',
'TOTAL_ELAPSED_TIME',
'ROWS_PRODUCED',
'EXECUTION_STATUS',
'ERROR_CODE',
'ERROR_MESSAGE',
'template_id',
'canonical_sql',
'executions',
'distinct_users',
'first_seen',
'last_seen',
'p50_ms',
'p95_ms',
'error_rate',
'rows_produced',
'top_users',
],
rows: [
[
'01a',
"SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'",
'ANALYST_A',
'ANALYST_ROLE',
'WH_XS',
'ANALYTICS',
'PUBLIC',
'2026-05-04T10:00:00.000Z',
'2026-05-04T10:00:01.250Z',
1250,
'hash-1',
'select status from orders',
42,
3,
'2026-05-01T00:00:00.000Z',
'2026-05-11T00:00:00.000Z',
12,
'SUCCESS',
null,
null,
],
[
'01b',
'SELECT * FROM MISSING_TABLE',
'ANALYST_B',
'ANALYST_ROLE',
'WH_XS',
'ANALYTICS',
'PUBLIC',
new Date('2026-05-04T10:05:00.000Z'),
null,
null,
null,
'FAILED_WITH_ERROR',
'002003',
'SQL compilation error',
40,
0.05,
100,
JSON.stringify([{ user: 'ANALYST', executions: 1 }]),
],
],
totalRows: 2,
totalRows: 1,
},
]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
const rows = [];
for await (const row of reader.fetch(
for await (const row of reader.fetchAggregated(
client,
{
start: new Date('2026-05-01T00:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
},
'2026-05-03T00:00:00.000Z',
{ start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') },
{ dialect: 'snowflake', minExecutions: 5, windowDays: 90, concurrency: 12, filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 },
)) {
rows.push(row);
}
expect(client.executeQuery).toHaveBeenCalledTimes(1);
const sql = firstQuery(client);
expect(sql).toContain('FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY');
expect(sql).toContain("START_TIME >= '2026-05-03T00:00:00.000Z'::TIMESTAMP_TZ");
expect(sql).toContain("START_TIME < '2026-05-04T12:00:00.000Z'::TIMESTAMP_TZ");
expect(sql).toContain('ORDER BY START_TIME ASC, QUERY_ID ASC');
expect(sql).toContain('ROWS_PRODUCED');
expect(rows).toEqual([
expect(sql).toContain('SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY');
expect(sql).toContain('COUNT(*) AS executions');
expect(sql).toContain('GROUP BY query_hash');
expect(sql).toContain('HAVING COUNT(*) >= 5');
expect(rows).toMatchObject([
{
id: '01a',
sql: "SELECT count(*) FROM ANALYTICS.ORDERS WHERE STATUS = 'paid'",
user: 'ANALYST_A',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: '2026-05-04T10:00:01.250Z',
runtimeMs: 1250,
rowsProduced: 12,
success: true,
errorMessage: null,
},
{
id: '01b',
sql: 'SELECT * FROM MISSING_TABLE',
user: 'ANALYST_B',
startedAt: '2026-05-04T10:05:00.000Z',
endedAt: null,
runtimeMs: null,
rowsProduced: null,
success: false,
errorMessage: '002003: SQL compilation error',
templateId: 'hash-1',
stats: {
executions: 42,
errorRate: 0.05,
},
topUsers: [{ user: 'ANALYST', executions: 1 }],
},
]);
});
it('uses the window start when no cursor is available', async () => {
const client = queryClient([{ headers: ['QUERY_ID'], rows: [], totalRows: 0 }]);
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
for await (const _row of reader.fetch(client, {
start: new Date('2026-02-03T12:00:00.000Z'),
end: new Date('2026-05-04T12:00:00.000Z'),
})) {
throw new Error('empty result should not yield rows');
}
const sql = firstQuery(client);
expect(sql).toContain("START_TIME >= '2026-02-03T12:00:00.000Z'::TIMESTAMP_TZ");
});
it('throws a clear error when the query client cannot execute SQL', async () => {
const reader = new SnowflakeHistoricSqlQueryHistoryReader();
await expect(async () => {
for await (const _row of reader.fetch({}, { start: new Date(), end: new Date() })) {
for await (const _row of reader.fetchAggregated(
{},
{ start: new Date(), end: new Date() },
{
dialect: 'snowflake',
minExecutions: 5,
windowDays: 90,
concurrency: 12,
filters: { dropTrivialProbes: true },
redactionPatterns: [],
staleArchiveAfterDays: 90,
},
)) {
throw new Error('unreachable');
}
}).rejects.toThrow('Historic SQL Snowflake reader requires a query client with executeQuery(query)');

View file

@ -1,5 +1,10 @@
import { HistoricSqlGrantsMissingError } from './errors.js';
import type { HistoricSqlQueryHistoryReader, HistoricSqlRawQueryRow, HistoricSqlTimeWindow } from './types.js';
import {
aggregatedTemplateSchema,
type AggregatedTemplate,
type HistoricSqlTimeWindow,
type HistoricSqlUnifiedPullConfig,
} from './types.js';
interface QueryResultLike {
headers: string[];
@ -52,32 +57,6 @@ function timestampLiteral(value: Date | string): string {
return `'${date.toISOString().replace(/'/g, "''")}'::TIMESTAMP_TZ`;
}
function queryHistorySql(window: HistoricSqlTimeWindow, cursor?: string | null): string {
const start = timestampLiteral(cursor ?? window.start);
const end = timestampLiteral(window.end);
return `
SELECT
QUERY_ID,
QUERY_TEXT,
USER_NAME,
ROLE_NAME,
WAREHOUSE_NAME,
DATABASE_NAME,
SCHEMA_NAME,
START_TIME,
END_TIME,
TOTAL_ELAPSED_TIME,
ROWS_PRODUCED,
EXECUTION_STATUS,
ERROR_CODE,
ERROR_MESSAGE
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
WHERE START_TIME >= ${start}
AND START_TIME < ${end}
AND QUERY_TEXT IS NOT NULL
ORDER BY START_TIME ASC, QUERY_ID ASC`.trim();
}
function indexByHeader(headers: string[]): Map<string, number> {
const out = new Map<string, number>();
headers.forEach((header, index) => {
@ -87,7 +66,7 @@ function indexByHeader(headers: string[]): Map<string, number> {
}
function value(row: unknown[], indexes: Map<string, number>, name: string): unknown {
const index = indexes.get(name);
const index = indexes.get(name.toUpperCase());
return index === undefined ? null : row[index];
}
@ -118,6 +97,18 @@ function nullableNumber(raw: unknown): number | null {
return number;
}
/**
 * Converts a result cell to a number via `nullableNumber`.
 *
 * @throws Error naming `field` and the raw value when the conversion yields `null`.
 */
function requiredNumber(raw: unknown, field: string): number {
  const parsed = nullableNumber(raw);
  if (parsed !== null) {
    return parsed;
  }
  throw new Error(`Snowflake QUERY_HISTORY row has invalid ${field}: ${String(raw)}`);
}
/**
 * Like `requiredNumber`, with the fractional part truncated toward zero.
 *
 * @throws Error from `requiredNumber` when the cell is not a valid number.
 */
function requiredInteger(raw: unknown, field: string): number {
  const parsed = requiredNumber(raw, field);
  return Math.trunc(parsed);
}
function nullableInteger(raw: unknown): number | null {
const number = nullableNumber(raw);
return number === null ? null : Math.trunc(number);
@ -135,46 +126,50 @@ function isoTimestamp(raw: unknown, field: string): string {
return date.toISOString();
}
function nullableIsoTimestamp(raw: unknown): string | null {
if (raw === null || raw === undefined || raw === '') {
return null;
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
const text = nullableString(raw);
if (!text) {
return [];
}
return isoTimestamp(raw, 'END_TIME');
}
function executionSucceeded(status: string | null, errorCode: string | null, errorMessage: string | null): boolean {
if (errorCode || errorMessage) {
return false;
try {
const parsed = JSON.parse(text) as unknown;
if (!Array.isArray(parsed)) {
return [];
}
return parsed.flatMap((entry) => {
if (!entry || typeof entry !== 'object') {
return [];
}
const user = nullableString((entry as { user?: unknown }).user);
const executions = nullableInteger((entry as { executions?: unknown }).executions);
return executions === null ? [] : [{ user, executions }];
});
} catch {
return [];
}
return status === null || status.toUpperCase().startsWith('SUCCESS');
}
function combinedErrorMessage(errorCode: string | null, errorMessage: string | null): string | null {
if (errorCode && errorMessage) {
return `${errorCode}: ${errorMessage}`;
}
return errorMessage ?? errorCode;
/**
 * Maps one aggregated query-history result row onto an `AggregatedTemplate` and
 * validates it with `aggregatedTemplateSchema`.
 *
 * @param row Cell values of the result row.
 * @param indexes Header-name to column-position lookup for `row`.
 * @throws When a required cell is missing/invalid or schema validation fails.
 */
function mapAggregatedRow(row: unknown[], indexes: Map<string, number>): AggregatedTemplate {
  // Shorthand for reading a named column of this row.
  const cell = (column: string): unknown => value(row, indexes, column);

  return aggregatedTemplateSchema.parse({
    templateId: requiredString(cell('template_id'), 'template_id'),
    canonicalSql: requiredString(cell('canonical_sql'), 'canonical_sql'),
    dialect: 'snowflake',
    stats: {
      executions: requiredInteger(cell('executions'), 'executions'),
      distinctUsers: requiredInteger(cell('distinct_users'), 'distinct_users'),
      firstSeen: isoTimestamp(cell('first_seen'), 'first_seen'),
      lastSeen: isoTimestamp(cell('last_seen'), 'last_seen'),
      p50RuntimeMs: nullableNumber(cell('p50_ms')),
      p95RuntimeMs: nullableNumber(cell('p95_ms')),
      errorRate: requiredNumber(cell('error_rate'), 'error_rate'),
      rowsProduced: nullableInteger(cell('rows_produced')),
    },
    topUsers: parseTopUsers(cell('top_users')),
  });
}
/**
 * Converts one raw query-history row into a `HistoricSqlRawQueryRow`.
 *
 * Cells are looked up by their upper-case header names. The error code and
 * message are read once and reused for both the `success` flag and the
 * combined `errorMessage`.
 *
 * @param row - One result row.
 * @param indexes - Header name to column position lookup.
 */
function mapRow(row: unknown[], indexes: Map<string, number>): HistoricSqlRawQueryRow {
  // Resolve a cell by header name.
  const cell = (column: string): unknown => value(row, indexes, column);

  const errorCode = nullableString(cell('ERROR_CODE'));
  const errorMessage = nullableString(cell('ERROR_MESSAGE'));
  const rowsProduced = nullableInteger(cell('ROWS_PRODUCED'));

  const id = requiredString(cell('QUERY_ID'), 'QUERY_ID');
  const sql = requiredString(cell('QUERY_TEXT'), 'QUERY_TEXT');
  const user = nullableString(cell('USER_NAME'));
  const startedAt = isoTimestamp(cell('START_TIME'), 'START_TIME');
  const endedAt = nullableIsoTimestamp(cell('END_TIME'));
  const runtimeMs = nullableNumber(cell('TOTAL_ELAPSED_TIME'));
  const status = nullableString(cell('EXECUTION_STATUS'));

  return {
    id,
    sql,
    user,
    startedAt,
    endedAt,
    runtimeMs,
    rowsProduced,
    success: executionSucceeded(status, errorCode, errorMessage),
    errorMessage: combinedErrorMessage(errorCode, errorMessage),
  };
}
export class SnowflakeHistoricSqlQueryHistoryReader implements HistoricSqlQueryHistoryReader {
async probe(client: unknown): Promise<void> {
export class SnowflakeHistoricSqlQueryHistoryReader {
async probe(client: unknown): Promise<{ warnings: string[]; info: string[] }> {
let result: QueryResultLike;
try {
result = await queryClient(client).executeQuery(PROBE_SQL);
@ -184,20 +179,42 @@ export class SnowflakeHistoricSqlQueryHistoryReader implements HistoricSqlQueryH
if (result.error) {
throw grantsError(result.error);
}
return { warnings: [], info: [] };
}
async *fetch(
async *fetchAggregated(
client: unknown,
window: HistoricSqlTimeWindow,
cursor?: string | null,
): AsyncIterable<HistoricSqlRawQueryRow> {
const result = await queryClient(client).executeQuery(queryHistorySql(window, cursor));
config: HistoricSqlUnifiedPullConfig,
): AsyncIterable<AggregatedTemplate> {
const sql = `
SELECT
query_hash AS template_id,
MIN(query_text) AS canonical_sql,
COUNT(*) AS executions,
COUNT(DISTINCT user_name) AS distinct_users,
MIN(start_time) AS first_seen,
MAX(start_time) AS last_seen,
APPROX_PERCENTILE(total_elapsed_time, 0.50) AS p50_ms,
APPROX_PERCENTILE(total_elapsed_time, 0.95) AS p95_ms,
DIV0(COUNT_IF(execution_status != 'SUCCESS'), COUNT(*)) AS error_rate,
SUM(rows_produced) AS rows_produced,
ARRAY_AGG(OBJECT_CONSTRUCT('user', user_name, 'executions', 1)) WITHIN GROUP (ORDER BY start_time DESC)::string AS top_users
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
WHERE query_text IS NOT NULL
AND query_type IN ('SELECT', 'MERGE')
AND start_time >= ${timestampLiteral(window.start)}
AND start_time < ${timestampLiteral(window.end)}
GROUP BY query_hash
HAVING COUNT(*) >= ${config.minExecutions}
ORDER BY executions DESC`.trim();
const result = await queryClient(client).executeQuery(sql);
if (result.error) {
throw grantsError(result.error);
}
const indexes = indexByHeader(result.headers);
for (const row of result.rows) {
yield mapRow(row, indexes);
yield mapAggregatedRow(row, indexes);
}
}
}

View file

@ -1,152 +0,0 @@
import { mkdir, mkdtemp, readdir, readFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { dirname, join, relative } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { stagePgStatStatementsTemplates, writePgssBaselineAtomic, type PgssBaseline } from './stage-pgss.js';
import type { HistoricSqlPullConfig, KtxPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
// Directory holding one sub-directory per golden fixture; each is read via `<name>/input.json`.
const FIXTURE_ROOT = join(__dirname, '__fixtures__/postgres');
/**
 * Shape of a golden fixture's `input.json`: the canned inputs fed to
 * `stagePgStatStatementsTemplates` plus the outputs the run must produce.
 */
interface GoldenFixture {
name: string;
/** Timestamp string turned into the `now` Date passed to the staging call. */
now: string;
/** Connection id passed to the staging call and used in the baseline file path. */
connectionId: string;
/** Value returned as-is by the fixture reader's `probe()`. */
probe: {
pgServerVersion: string;
warnings: string[];
};
/** Data returned by the fixture reader's `readSnapshot()` (rows are truncated to `maxTemplates`). */
snapshot: {
statsResetAt: string | null;
deallocCount: number | null;
rows: PostgresPgssRow[];
};
/** Pull configuration passed to the staging call. */
pullConfig: HistoricSqlPullConfig & { dialect: 'postgres' };
/** Canned SQL-analysis results keyed by the exact SQL text being analyzed. */
analysisBySql: Record<
string,
{
fingerprint: string;
normalizedSql: string;
tablesTouched: string[];
literalSlots: [];
error?: string;
}
>;
/** Baseline written to disk before the run; `null` means no baseline file is created. */
baseline: PgssBaseline | null;
/** Baseline the staging call is expected to return. */
expectedBaseline: PgssBaseline;
/** Expected staged files keyed by path relative to the staged dir; compared as parsed JSON (`json`) or raw text (`text`). */
expectedFiles: Record<string, { json?: unknown; text?: string }>;
}
/**
 * Loads and parses `<FIXTURE_ROOT>/<name>/input.json`.
 *
 * The parsed JSON is cast to `GoldenFixture` without runtime validation.
 */
async function readFixture(name: string): Promise<GoldenFixture> {
  const inputPath = join(FIXTURE_ROOT, name, 'input.json');
  const contents = await readFile(inputPath, 'utf-8');
  return JSON.parse(contents) as GoldenFixture;
}
/**
 * Creates a fresh, uniquely named directory under the OS temp dir.
 *
 * @param prefix - Leading part of the directory name; `mkdtemp` appends random characters.
 * @returns The path of the created directory.
 */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return await mkdtemp(template);
}
/**
 * Stub query client whose `executeQuery` always resolves to an empty result
 * (no headers, no rows).
 */
function fakePgClient(): KtxPostgresQueryClient {
  const executeQuery = async () => ({ headers: [], rows: [] });
  return { executeQuery };
}
/**
 * Builds a reader that replays the fixture instead of querying Postgres.
 *
 * `probe()` resolves to the fixture's probe object itself; `readSnapshot()`
 * resolves to the fixture snapshot with `rows` limited to the first
 * `options.maxTemplates` entries.
 */
function fixtureReader(fixture: GoldenFixture): PostgresPgssReader {
  return {
    probe: async () => fixture.probe,
    readSnapshot: async (_client, options) => {
      const { statsResetAt, deallocCount, rows } = fixture.snapshot;
      return {
        statsResetAt,
        deallocCount,
        rows: rows.slice(0, options.maxTemplates),
      };
    },
  };
}
/**
 * Builds a SQL-analysis port backed by the fixture's `analysisBySql` table.
 *
 * A SQL text with no entry yields an empty analysis whose `error` names the
 * missing SQL, rather than throwing.
 */
function fixtureSqlAnalysis(fixture: GoldenFixture): SqlAnalysisPort {
  return {
    analyzeForFingerprint: async (sql) => {
      const canned = fixture.analysisBySql[sql];
      return canned
        ? canned
        : {
            fingerprint: '',
            normalizedSql: '',
            tablesTouched: [],
            literalSlots: [],
            error: `missing fixture analysis for ${sql}`,
          };
    },
  };
}
/**
 * Writes the fixture's starting baseline to `path`; does nothing when the
 * fixture has no baseline.
 */
async function writeFixtureBaseline(path: string, baseline: PgssBaseline | null): Promise<void> {
  if (baseline) {
    await writePgssBaselineAtomic(path, baseline);
  }
}
/**
 * Recursively lists every non-directory entry below `current`.
 *
 * @param root - Directory the returned paths are made relative to.
 * @param current - Directory being scanned; defaults to `root`.
 * @returns Paths relative to `root`, in directory-listing order.
 */
async function listFiles(root: string, current = root): Promise<string[]> {
  const collected: string[] = [];
  for (const dirent of await readdir(current, { withFileTypes: true })) {
    const absolute = join(current, dirent.name);
    const found = dirent.isDirectory() ? await listFiles(root, absolute) : [relative(root, absolute)];
    collected.push(...found);
  }
  return collected;
}
/**
 * Asserts that `stagedDir` contains exactly the expected files and that each
 * one matches its golden content.
 *
 * An entry with a `json` key is compared after parsing the file as JSON;
 * any other entry is compared as raw text against `text`.
 */
async function expectGoldenFiles(stagedDir: string, expectedFiles: GoldenFixture['expectedFiles']): Promise<void> {
  const stagedPaths = (await listFiles(stagedDir)).sort();
  const goldenPaths = Object.keys(expectedFiles).sort();
  expect(stagedPaths).toEqual(goldenPaths);

  for (const relPath of goldenPaths) {
    const golden = expectedFiles[relPath];
    const contents = await readFile(join(stagedDir, relPath), 'utf-8');
    if (!('json' in golden)) {
      expect(contents).toBe(golden.text);
      continue;
    }
    expect(JSON.parse(contents)).toEqual(golden.json);
  }
}
// Golden-file suite: each named fixture is replayed through stagePgStatStatementsTemplates
// and both the staged files and the returned baseline are compared with the fixture's expectations.
describe('stagePgStatStatementsTemplates golden fixtures', () => {
it.each(['first-run', 'normal-delta', 'reset-detected', 'version-change', 'eviction-churn'] as const)(
'matches the committed %s golden output',
async (fixtureName) => {
const fixture = await readFixture(fixtureName);
// Every run gets its own temp root holding both the staged output and the baseline cache.
const root = await tempDir(`pgss-golden-${fixtureName}-`);
const stagedDir = join(root, 'staged');
const baselinePath = join(root, 'cache', fixture.connectionId, 'pgss-baseline.json');
await mkdir(dirname(baselinePath), { recursive: true });
// Seed the previous baseline (skipped when the fixture's baseline is null).
await writeFixtureBaseline(baselinePath, fixture.baseline);
const result = await stagePgStatStatementsTemplates({
stagedDir,
connectionId: fixture.connectionId,
queryClient: fakePgClient(),
reader: fixtureReader(fixture),
sqlAnalysis: fixtureSqlAnalysis(fixture),
pullConfig: fixture.pullConfig,
baselinePath,
now: new Date(fixture.now),
});
await expectGoldenFiles(stagedDir, fixture.expectedFiles);
expect(result.baseline).toEqual(fixture.expectedBaseline);
},
);
});

View file

@ -1,652 +0,0 @@
import { mkdtemp, readFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it, vi } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import {
pgssBaselinePath,
readPgssBaseline,
stagePgStatStatementsTemplates,
writePgssBaselineAtomic,
type PgssBaseline,
} from './stage-pgss.js';
import { historicSqlManifestSchema, historicSqlMetadataSchema, historicSqlUsageSchema } from './types.js';
import type { KtxPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
/**
 * Creates a fresh, uniquely named directory under the OS temp dir.
 *
 * @param prefix - Leading part of the directory name; `mkdtemp` appends random characters.
 * @returns The path of the created directory.
 */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return await mkdtemp(template);
}
/**
 * Reads `<root>/<relPath>` as UTF-8 and parses it as JSON.
 *
 * The result is cast to `T` without runtime validation.
 */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const contents = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(contents) as T;
}
/**
 * Stub query client whose `executeQuery` always resolves to an empty result
 * (no headers, no rows).
 */
function fakePgClient(): KtxPostgresQueryClient {
  const executeQuery = async () => ({ headers: [], rows: [] });
  return { executeQuery };
}
/**
 * Builds a `PostgresPgssRow` for tests.
 *
 * `queryid` and `query` must be supplied; every other field falls back to the
 * defaults below unless present in `overrides`.
 */
function row(overrides: Partial<PostgresPgssRow> & Pick<PostgresPgssRow, 'queryid' | 'query'>): PostgresPgssRow {
  const defaults = {
    userid: '11',
    username: 'analyst',
    dbid: '5',
    database: 'warehouse',
    calls: 10,
    totalExecTime: 250,
    meanExecTime: 25,
    totalRows: 20,
  };
  return { ...defaults, ...overrides };
}
/**
 * Builds a mocked `PostgresPgssReader` (both methods are `vi.fn` spies).
 *
 * `probe()` resolves to the given server version and warnings, defaulting to
 * `'PostgreSQL 16.4'` and no warnings. `readSnapshot()` resolves to the given
 * rows, limited to the first `options.maxTemplates`.
 *
 * `statsResetAt` and `deallocCount` are nullable in the snapshot shape, so a
 * default is substituted only when the field is omitted (`undefined`). An
 * explicit `null` is passed through, which lets tests simulate a server that
 * reports no reset time or no dealloc counter.
 */
function fakeReader(input: {
  pgServerVersion?: string;
  warnings?: string[];
  statsResetAt?: string | null;
  deallocCount?: number | null;
  rows: PostgresPgssRow[];
}): PostgresPgssReader {
  return {
    probe: vi.fn(async () => ({
      pgServerVersion: input.pgServerVersion ?? 'PostgreSQL 16.4',
      warnings: input.warnings ?? [],
    })),
    readSnapshot: vi.fn(async (_client, options) => ({
      // `??` would also replace an explicit null, so test for undefined instead.
      statsResetAt: input.statsResetAt === undefined ? '2026-05-08T08:00:00.000Z' : input.statsResetAt,
      deallocCount: input.deallocCount === undefined ? 0 : input.deallocCount,
      rows: input.rows.slice(0, options.maxTemplates),
    })),
  };
}
/**
 * Canned SQL-analysis port for tests, keyed on substrings of the SQL text:
 * - contains `broken`: empty analysis carrying `error: 'parse failed'`;
 * - contains `customers`: the `fp_customers` analysis;
 * - anything else: the `fp_orders` analysis.
 * `broken` is checked first, so it wins when both substrings are present.
 */
const sqlAnalysis: SqlAnalysisPort = {
  analyzeForFingerprint: async (sql) => {
    const failsToParse = sql.includes('broken');
    if (!failsToParse) {
      const touchesCustomers = sql.includes('customers');
      return touchesCustomers
        ? {
            fingerprint: 'fp_customers',
            normalizedSql: 'SELECT count(*) FROM analytics.customers',
            tablesTouched: ['analytics.customers'],
            literalSlots: [],
          }
        : {
            fingerprint: 'fp_orders',
            normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
            tablesTouched: ['analytics.orders'],
            literalSlots: [],
          };
    }
    return {
      fingerprint: '',
      normalizedSql: '',
      tablesTouched: [],
      literalSlots: [],
      error: 'parse failed',
    };
  },
};
/**
 * Pull configuration shared by the tests in this file.
 *
 * @param maxTemplatesPerRun - Cap on staged templates; defaults to 5000.
 */
function postgresPullConfig(maxTemplatesPerRun = 5000) {
  const dialect = 'postgres' as const;
  const serviceAccountUserPatterns = ['^svc_'];
  const redactionPatterns = ['secret'];
  return {
    dialect,
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns,
    redactionPatterns,
    maxTemplatesPerRun,
    minCalls: 5,
  };
}
// Behavioural suite for stagePgStatStatementsTemplates. Every test drives the function with a
// mocked reader and the canned `sqlAnalysis` port, then inspects the staged files and/or the
// returned baseline.
describe('stagePgStatStatementsTemplates', () => {
// First run (no baseline file): of the four rows only q101 is expected to be staged; the
// pg_catalog query and BEGIN do not appear in the manifest and q104 is reported as analysis_failed.
it('stages first-run PGSS templates as degraded aggregate templates and builds a next baseline', async () => {
const stagedDir = await tempDir('pgss-stage-first-');
const baselineRootDir = await tempDir('pgss-baseline-first-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
const result = await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
warnings: ['pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config'],
deallocCount: 2,
rows: [
row({
queryid: '101',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 10,
totalExecTime: 250,
totalRows: 20,
}),
row({
queryid: '102',
query: 'SELECT * FROM pg_catalog.pg_class',
calls: 50,
totalExecTime: 500,
}),
row({
queryid: '103',
query: 'BEGIN',
calls: 75,
totalExecTime: 75,
}),
row({
queryid: '104',
query: 'SELECT broken FROM analytics.orders',
calls: 8,
totalExecTime: 80,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest).toMatchObject({
source: 'historic-sql',
connectionId: 'conn_pg',
dialect: 'postgres',
fetchedAt: '2026-05-08T12:00:00.000Z',
windowEnd: '2026-05-08T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
templateCount: 1,
capped: false,
degraded: true,
statsResetAt: '2026-05-08T08:00:00.000Z',
baselineFirstRun: true,
pgServerVersion: 'PostgreSQL 16.4',
deallocCount: 2,
});
// Warning order is part of the contract: probe warnings, dealloc, baseline status, analysis failures.
expect(manifest.warnings).toEqual([
'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config',
'pgss_dealloc_count:2; pg_stat_statements.max may be too low, causing template eviction churn',
'baseline_first_run:no_previous_pgss_baseline',
'analysis_failed:db5_q104',
]);
expect(manifest.templates).toEqual([
{
id: 'db5_q101',
fingerprint: 'fp_orders',
subClusterId: null,
path: 'templates/db5_q101/page.md',
},
]);
const metadata = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q101/metadata.json'));
expect(metadata).toMatchObject({
id: 'db5_q101',
title: 'postgres · analytics.orders [db5_q101]',
path: 'templates/db5_q101/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_orders',
sub_cluster_id: null,
dialect: 'postgres',
tables_touched: ['analytics.orders'],
literal_slots: [],
},
});
expect(metadata.properties.triage_signals).toEqual({
executions_bucket: 'mid',
distinct_users_bucket: 'solo',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
runtime_bucket: 'fast',
});
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q101/usage.json'));
expect(usage).toEqual({
stats: {
executions: 10,
distinct_users: 1,
first_seen: '2026-05-08T12:00:00.000Z',
last_seen: '2026-05-08T12:00:00.000Z',
p50_runtime_ms: null,
p95_runtime_ms: null,
mean_runtime_ms: 25,
error_rate: 0,
rows_produced: 20,
},
literal_slots: [],
samples: [],
});
expect(await readFile(join(stagedDir, 'templates/db5_q101/page.md'), 'utf-8')).toContain(
'SELECT count(*) FROM analytics.orders WHERE status = $1',
);
expect(result.baselinePath).toBe(baselinePath);
expect(result.baseline.templates.db5_q101.perUser['11']).toEqual({
calls: 10,
totalExecTime: 250,
totalRows: 20,
});
// The next baseline is only returned; no baseline file exists on disk after staging.
await expect(readPgssBaseline(baselinePath)).resolves.toBeNull();
});
// A positive dealloc count must be echoed in the manifest and produce the eviction-churn warning.
it('warns when pg_stat_statements reports dealloc churn', async () => {
const root = await tempDir('pgss-churn-');
const stagedDir = join(root, 'staged');
const baselinePath = join(root, 'cache', 'warehouse', 'pgss-baseline.json');
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'warehouse',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
row({
queryid: '901',
query: 'SELECT COUNT(*) FROM public.orders WHERE status = $1',
calls: 20,
totalExecTime: 500,
meanExecTime: 25,
}),
],
deallocCount: 3,
}),
sqlAnalysis,
pullConfig: postgresPullConfig(50),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = await readJson<{ warnings: string[]; deallocCount: number }>(stagedDir, 'manifest.json');
expect(manifest.deallocCount).toBe(3);
expect(manifest.warnings).toContain(
'pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn',
);
});
// With a stored baseline, usage is the per-user difference between the snapshot and the baseline:
// user 11 of q201 advanced by 2 calls, user 12 did not move, and q202 is new.
it('uses the saved cumulative baseline to stage only positive deltas on later runs', async () => {
const stagedDir = await tempDir('pgss-stage-delta-');
const baselineRootDir = await tempDir('pgss-baseline-delta-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
const baseline: PgssBaseline = {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q201: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 10, totalExecTime: 100, totalRows: 50 },
'12': { calls: 5, totalExecTime: 50, totalRows: 25 },
},
},
},
};
await writePgssBaselineAtomic(baselinePath, baseline);
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
row({
queryid: '201',
userid: '11',
username: 'analyst',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 12,
totalExecTime: 160,
totalRows: 58,
}),
row({
queryid: '201',
userid: '12',
username: 'svc_loader',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 5,
totalExecTime: 50,
totalRows: 25,
}),
row({
queryid: '202',
userid: '13',
username: 'analyst_2',
query: 'SELECT count(*) FROM analytics.customers',
calls: 7,
totalExecTime: 210,
totalRows: 7,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.baselineFirstRun).toBe(false);
expect(manifest.windowStart).toBe('2026-05-08T10:00:00.000Z');
expect(manifest.templateCount).toBe(2);
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q202', 'db5_q201']);
const usage201 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q201/usage.json'));
expect(usage201.stats).toMatchObject({
executions: 2,
distinct_users: 1,
first_seen: '2026-05-08T09:00:00.000Z',
last_seen: '2026-05-08T12:00:00.000Z',
mean_runtime_ms: 30,
rows_produced: 8,
});
const metadata201 = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q201/metadata.json'));
expect(metadata201.properties.triage_signals.service_account_only).toBe('false');
const usage202 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q202/usage.json'));
expect(usage202.stats).toMatchObject({
executions: 7,
distinct_users: 1,
first_seen: '2026-05-08T12:00:00.000Z',
mean_runtime_ms: 30,
rows_produced: 7,
});
});
// Template ids combine dbid and queryid, so the same queryid in two databases must keep
// separate deltas, first-seen times and baseline entries.
it('keeps matching queryid values from different databases as distinct templates and baseline entries', async () => {
const stagedDir = await tempDir('pgss-stage-db-key-');
const baselineRootDir = await tempDir('pgss-baseline-db-key-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(baselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q701: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 10, totalExecTime: 100, totalRows: 50 },
},
},
db6_q701: {
firstObservedAt: '2026-05-08T09:30:00.000Z',
perUser: {
'11': { calls: 4, totalExecTime: 40, totalRows: 20 },
},
},
},
});
const result = await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
row({
queryid: '701',
dbid: '5',
database: 'warehouse',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 12,
totalExecTime: 160,
totalRows: 58,
}),
row({
queryid: '701',
dbid: '6',
database: 'app',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 9,
totalExecTime: 130,
totalRows: 35,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templates.map((template) => template.id).sort()).toEqual(['db5_q701', 'db6_q701']);
const warehouseUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q701/usage.json'));
expect(warehouseUsage.stats).toMatchObject({
executions: 2,
rows_produced: 8,
first_seen: '2026-05-08T09:00:00.000Z',
});
const appUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db6_q701/usage.json'));
expect(appUsage.stats).toMatchObject({
executions: 5,
rows_produced: 15,
first_seen: '2026-05-08T09:30:00.000Z',
});
// The returned baseline stores the cumulative snapshot counters, not the deltas.
expect(result.baseline.templates.db5_q701.perUser['11']).toEqual({
calls: 12,
totalExecTime: 160,
totalRows: 58,
});
expect(result.baseline.templates.db6_q701.perUser['11']).toEqual({
calls: 9,
totalExecTime: 130,
totalRows: 35,
});
});
// Two independent scenarios: (1) stats_reset moved forward, (2) the server major version changed.
// Both must be reported as baselineFirstRun with a baseline_reset warning.
it('treats stats_reset advancement and major-version changes as fresh baselines', async () => {
const resetStagedDir = await tempDir('pgss-stage-reset-');
const resetBaselineRootDir = await tempDir('pgss-baseline-reset-');
const resetBaselinePath = pgssBaselinePath(resetBaselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(resetBaselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q301: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
},
},
},
});
await stagePgStatStatementsTemplates({
stagedDir: resetStagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
statsResetAt: '2026-05-08T11:00:00.000Z',
rows: [
row({
queryid: '301',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 3,
totalExecTime: 90,
totalRows: 9,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath: resetBaselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const resetManifest = historicSqlManifestSchema.parse(await readJson(resetStagedDir, 'manifest.json'));
expect(resetManifest.baselineFirstRun).toBe(true);
expect(resetManifest.warnings).toContain(
'baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z',
);
const resetUsage = historicSqlUsageSchema.parse(await readJson(resetStagedDir, 'templates/db5_q301/usage.json'));
// After a reset the raw counter (3) is used instead of a delta against the old baseline (100).
expect(resetUsage.stats.executions).toBe(3);
const versionStagedDir = await tempDir('pgss-stage-version-');
const versionBaselineRootDir = await tempDir('pgss-baseline-version-');
const versionBaselinePath = pgssBaselinePath(versionBaselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(versionBaselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 15.7',
templates: {
db5_q302: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
},
},
},
});
await stagePgStatStatementsTemplates({
stagedDir: versionStagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
pgServerVersion: 'PostgreSQL 16.4',
rows: [
row({
queryid: '302',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 4,
totalExecTime: 80,
totalRows: 8,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath: versionBaselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const versionManifest = historicSqlManifestSchema.parse(await readJson(versionStagedDir, 'manifest.json'));
expect(versionManifest.baselineFirstRun).toBe(true);
expect(versionManifest.warnings).toContain('baseline_reset:pg_server_major changed from 15 to 16');
});
// One user's counters went backwards (100 -> 2) while stats_reset is unchanged: only that
// (dbid, queryid, userid) scope is reported as reset; the other user still contributes a delta of 5.
it('handles scoped counter regressions without forcing a global first-run baseline', async () => {
const stagedDir = await tempDir('pgss-stage-scoped-');
const baselineRootDir = await tempDir('pgss-baseline-scoped-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(baselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q401: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
'12': { calls: 50, totalExecTime: 500, totalRows: 250 },
},
},
},
});
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
statsResetAt: '2026-05-08T08:00:00.000Z',
rows: [
row({
queryid: '401',
userid: '11',
username: 'analyst',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 2,
totalExecTime: 30,
totalRows: 6,
}),
row({
queryid: '401',
userid: '12',
username: 'svc_loader',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 55,
totalExecTime: 650,
totalRows: 275,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.baselineFirstRun).toBe(false);
expect(manifest.warnings).toContain('scoped_reset:dbid=5 queryid=401 userid=11');
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q401/usage.json'));
// 2 (user 11, counted from zero) + 5 (user 12 delta) executions; 180 ms / 7 calls mean runtime.
expect(usage.stats).toMatchObject({
executions: 7,
distinct_users: 2,
mean_runtime_ms: 25.714285714285715,
rows_produced: 31,
});
});
// With maxTemplatesPerRun = 2 and three analyzable templates, the two with the most calls are kept.
it('ranks and caps selected PGSS templates after skip and analysis filtering', async () => {
const stagedDir = await tempDir('pgss-stage-cap-');
const baselineRootDir = await tempDir('pgss-baseline-cap-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
row({
queryid: '501',
username: 'analyst_a',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 2,
totalExecTime: 20,
}),
row({
queryid: '502',
username: 'analyst_b',
query: 'SELECT count(*) FROM analytics.customers',
calls: 20,
totalExecTime: 200,
}),
row({
queryid: '503',
username: 'analyst_c',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 10,
totalExecTime: 100,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(2),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.capped).toBe(true);
expect(manifest.warnings).toContain('templates_truncated: kept 2 of 3 templates');
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q502', 'db5_q503']);
});
});

View file

@ -1,508 +0,0 @@
import { mkdir, readFile, rename, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import { z } from 'zod';
import type { SqlAnalysisFingerprintResult, SqlAnalysisPort } from '../../../sql-analysis/index.js';
import {
HISTORIC_SQL_OBJECT_TYPE,
HISTORIC_SQL_SOURCE_KEY,
historicSqlPullConfigSchema,
type HistoricSqlManifest,
type HistoricSqlMetadata,
type HistoricSqlPullConfig,
type HistoricSqlUsage,
type KtxPostgresQueryClient,
type PostgresPgssAggregateRow,
type PostgresPgssReader,
type PostgresPgssRow,
} from './types.js';
// Version tag of the baseline file format; the schema below accepts only this exact value.
const PGSS_BASELINE_VERSION = 1 as const;
// Cumulative counters stored per user of a template; all values must be non-negative,
// and `calls` / `totalRows` must be integers.
const pgssCounterSchema = z.object({
calls: z.number().int().nonnegative(),
totalExecTime: z.number().nonnegative(),
totalRows: z.number().int().nonnegative(),
});
// On-disk baseline: when it was fetched, the stats_reset time and server version seen then,
// and per-template counters. `templates` is keyed by template id and `perUser` by the row's userid.
const pgssBaselineSchema = z.object({
version: z.literal(PGSS_BASELINE_VERSION),
fetchedAt: z.string().datetime(),
statsResetAt: z.string().datetime().nullable(),
pgServerVersion: z.string(),
templates: z.record(
z.string(),
z.object({
firstObservedAt: z.string().datetime(),
perUser: z.record(z.string(), pgssCounterSchema),
}),
),
});
/** Validated baseline shape, derived from `pgssBaselineSchema`. */
export type PgssBaseline = z.infer<typeof pgssBaselineSchema>;
/** Arguments of `stagePgStatStatementsTemplates`. */
export interface StagePgStatStatementsTemplatesInput {
/** Directory that receives `manifest.json` and the `templates/<id>/…` files. */
stagedDir: string;
/** Connection id recorded in the manifest. */
connectionId: string;
/** Client handed to the reader's `probe` and `readSnapshot` calls. */
queryClient: KtxPostgresQueryClient;
/** Source of the probe result and the pg_stat_statements snapshot. */
reader: PostgresPgssReader;
/** Analyzer used to fingerprint each candidate query. */
sqlAnalysis: SqlAnalysisPort;
/** Pull configuration; validated at the start of staging and must have dialect `postgres`. */
pullConfig: HistoricSqlPullConfig;
/** File the previous baseline is read from; echoed back in the result. */
baselinePath: string;
/** Clock override; defaults to the current time. */
now?: Date;
}
/** Result of `stagePgStatStatementsTemplates`. */
export interface StagePgStatStatementsTemplatesResult {
/** The `baselinePath` that was passed in. */
baselinePath: string;
/** Baseline built from the snapshot just read; it is returned to the caller, not written by the staging function. */
baseline: PgssBaseline;
}
/** Cumulative counters for one user of one template (same fields as `pgssCounterSchema`). */
interface PgssBaselineCounter {
calls: number;
totalExecTime: number;
totalRows: number;
}
/** Accumulator used while folding per-user rows into one per-template aggregate. */
interface PgssAggregateMutable {
/** Template id (`db<dbid>_q<queryid>`). */
id: string;
queryid: string;
dbid: string;
database: string | null;
query: string;
/** Sums of the non-negative per-user deltas since the baseline. */
deltaCalls: number;
deltaExecTime: number;
deltaRows: number;
/** Names of users whose call count increased (`'unknown'` when the row has no username). */
users: Set<string>;
/** First-observed time from the baseline entry, or the current fetch time for a template with no usable baseline entry. */
firstObservedAt: string;
}
/** An aggregate paired with its successful SQL analysis. */
interface AnalyzedPgssTemplate {
aggregate: PostgresPgssAggregateRow;
analysis: SqlAnalysisFingerprintResult;
}
// All-zero counter. NOTE(review): presumably the "previous" value for rows with no usable
// baseline entry; its use is outside this excerpt, so confirm.
const ZERO_COUNTER: PgssBaselineCounter = {
calls: 0,
totalExecTime: 0,
totalRows: 0,
};
// Passed as `maxTemplates` when reading the snapshot, independent of `maxTemplatesPerRun`.
const PGSS_SNAPSHOT_READ_LIMIT = 5000;
// Matches statements whose first keyword (after optional whitespace, case-insensitive) is one of
// the listed session/utility commands. NOTE(review): presumably used to skip non-analytical SQL; confirm at the call site.
const PGSS_HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET|BEGIN|COMMIT|ROLLBACK|VACUUM|ANALYZE)\b/i;
// Matches SQL mentioning INFORMATION_SCHEMA, pg_catalog., pg_toast. or a pg_stat_ name (case-insensitive).
// NOTE(review): presumably used to skip system-catalog queries; confirm at the call site.
const PGSS_HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|pg_catalog\.|pg_toast\.|pg_stat_)/i;
/**
 * Derives the template id for a row as `db<dbid>_q<queryid>`.
 *
 * Including the database id keeps equal `queryid` values from different
 * databases apart.
 */
function pgssTemplateId(row: Pick<PostgresPgssRow, 'dbid' | 'queryid'>): string {
  const { dbid, queryid } = row;
  return `db${dbid}_q${queryid}`;
}
/**
 * Resolves the baseline file location for a connection.
 *
 * @param rootDir - Cache root; when omitted, `.ktx/cache/historic-sql` under the current working directory is used.
 * @param connectionId - Connection the baseline belongs to; becomes a sub-directory of the root.
 * @returns `<root>/<connectionId>/pgss-baseline.json`.
 */
export function pgssBaselinePath(rootDir: string | undefined, connectionId: string): string {
  const cacheRoot = rootDir ?? join(process.cwd(), '.ktx/cache/historic-sql');
  return join(cacheRoot, connectionId, 'pgss-baseline.json');
}
/**
 * Loads and validates the stored baseline.
 *
 * @param path - Baseline file location.
 * @returns The validated baseline, or `null` when the failure carries the code `ENOENT` (file not found).
 * @throws Any other failure: read errors, malformed JSON, or a schema violation.
 */
export async function readPgssBaseline(path: string): Promise<PgssBaseline | null> {
  try {
    const contents = await readFile(path, 'utf-8');
    const decoded: unknown = JSON.parse(contents);
    return pgssBaselineSchema.parse(decoded);
  } catch (error) {
    const isMissingFile = Boolean(error) && typeof error === 'object' && 'code' in error && error.code === 'ENOENT';
    if (!isMissingFile) {
      throw error;
    }
    return null;
  }
}
/**
 * Validates and persists a baseline.
 *
 * The JSON is first written to `<path>.tmp` and then renamed over `path`, so
 * the destination is replaced in a single rename step rather than being
 * written in place. Parent directories are created as needed.
 *
 * @throws When the baseline does not satisfy `pgssBaselineSchema`, or on any filesystem error.
 */
export async function writePgssBaselineAtomic(path: string, baseline: PgssBaseline): Promise<void> {
  const validated = pgssBaselineSchema.parse(baseline);
  const stagingPath = `${path}.tmp`;
  await mkdir(dirname(path), { recursive: true });
  const serialized = `${JSON.stringify(validated, null, 2)}\n`;
  await writeFile(stagingPath, serialized, 'utf-8');
  await rename(stagingPath, path);
}
/**
 * Stages query templates derived from a pg_stat_statements snapshot.
 *
 * Steps, in order:
 * 1. Validate the pull config (dialect must be `postgres`).
 * 2. Probe the server, read the stored baseline, and read a snapshot.
 * 3. Collect warnings (probe warnings, dealloc churn, baseline reset, analysis failures, truncation).
 * 4. Aggregate rows into per-template deltas, drop skippable SQL, and analyze the rest.
 * 5. Select up to `maxTemplatesPerRun` templates and write `metadata.json`, `page.md`
 *    and `usage.json` for each, followed by `manifest.json`.
 *
 * The next baseline is built from the full snapshot and returned; this
 * function does not write it to `baselinePath`.
 *
 * @throws When the config is invalid or its dialect is not `postgres`, and on
 *   any reader, analysis or filesystem failure.
 */
export async function stagePgStatStatementsTemplates(
  input: StagePgStatStatementsTemplatesInput,
): Promise<StagePgStatStatementsTemplatesResult> {
  const config = historicSqlPullConfigSchema.parse(input.pullConfig);
  if (config.dialect !== 'postgres') {
    throw new Error(`stagePgStatStatementsTemplates requires dialect postgres, got ${config.dialect}`);
  }

  const { stagedDir, queryClient, reader, sqlAnalysis, baselinePath } = input;
  const now = input.now ?? new Date();
  const fetchedAt = now.toISOString();

  // I/O order: probe, stored baseline, snapshot.
  const probe = await reader.probe(queryClient);
  const warnings = [...probe.warnings];
  const baseline = await readPgssBaseline(baselinePath);
  const snapshot = await reader.readSnapshot(queryClient, {
    minCalls: config.minCalls,
    maxTemplates: PGSS_SNAPSHOT_READ_LIMIT,
  });

  const { statsResetAt, deallocCount } = snapshot;
  if (deallocCount !== null && deallocCount > 0) {
    warnings.push(
      `pgss_dealloc_count:${deallocCount}; pg_stat_statements.max may be too low, causing template eviction churn`,
    );
  }

  const { baselineFirstRun, warnings: resetWarnings } = detectBaselineReset({
    baseline,
    snapshotStatsResetAt: statsResetAt,
    currentPgServerVersion: probe.pgServerVersion,
  });
  warnings.push(...resetWarnings);

  // Aggregation receives the shared warnings array so it can append its own entries.
  const candidates = aggregatePgssRows({
    rows: snapshot.rows,
    baseline,
    baselineFirstRun,
    fetchedAt,
    warnings,
  }).filter((candidate) => !shouldSkipPgssSql(candidate.query));

  // Analyze sequentially so failure warnings keep the aggregate order.
  const analyzed: AnalyzedPgssTemplate[] = [];
  for (const aggregate of candidates) {
    const analysis = await sqlAnalysis.analyzeForFingerprint(aggregate.query, 'postgres');
    const usable = !analysis.error && Boolean(analysis.fingerprint) && Boolean(analysis.normalizedSql);
    if (usable) {
      analyzed.push({ aggregate, analysis });
    } else {
      warnings.push(`analysis_failed:${aggregate.id}`);
    }
  }

  const selected = selectPgssTemplates(analyzed, config.maxTemplatesPerRun);
  const capped = selected.length < analyzed.length;
  if (capped) {
    warnings.push(`templates_truncated: kept ${selected.length} of ${analyzed.length} templates`);
  }

  await mkdir(stagedDir, { recursive: true });
  const templates: HistoricSqlManifest['templates'] = [];
  for (const template of selected) {
    const { metadata, pageMarkdown, usage } = buildPgssStagedTemplate(template, config, now);
    const templateDir = `templates/${metadata.id}`;
    await writeJson(stagedDir, `${templateDir}/metadata.json`, metadata);
    await writeText(stagedDir, `${templateDir}/page.md`, pageMarkdown);
    await writeJson(stagedDir, `${templateDir}/usage.json`, usage);
    templates.push({
      id: metadata.id,
      fingerprint: metadata.properties.fingerprint,
      subClusterId: metadata.properties.sub_cluster_id,
      path: metadata.path,
    });
  }

  const manifest = {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: input.connectionId,
    dialect: 'postgres',
    fetchedAt,
    windowStart: baseline?.fetchedAt ?? statsResetAt ?? fetchedAt,
    windowEnd: fetchedAt,
    nextSuccessfulCursor: fetchedAt,
    templateCount: selected.length,
    capped,
    warnings,
    degraded: true,
    statsResetAt,
    baselineFirstRun,
    pgServerVersion: probe.pgServerVersion,
    deallocCount,
    templates,
  } satisfies HistoricSqlManifest;
  await writeJson(stagedDir, 'manifest.json', manifest);

  // After a detected reset the old baseline is not carried into the next one.
  const nextBaseline = buildNextBaseline({
    rows: snapshot.rows,
    fetchedAt,
    statsResetAt,
    pgServerVersion: probe.pgServerVersion,
    previousBaseline: baselineFirstRun ? null : baseline,
  });
  return { baselinePath, baseline: nextBaseline };
}
/**
 * Decides whether the stored baseline may be diffed against the current snapshot.
 *
 * With no stored baseline the run is reported as a first run with a single
 * `baseline_first_run:` warning. Otherwise a `baseline_reset:` warning is collected when
 * the snapshot's stats-reset timestamp is later than the baseline's (string comparison),
 * and another when the PostgreSQL major version parsed from the two version strings
 * differs. Any collected warning makes the run behave as a first run.
 *
 * @returns `baselineFirstRun` plus the warnings that explain it.
 */
function detectBaselineReset(input: {
  baseline: PgssBaseline | null;
  snapshotStatsResetAt: string | null;
  currentPgServerVersion: string;
}): { baselineFirstRun: boolean; warnings: string[] } {
  const { baseline, snapshotStatsResetAt, currentPgServerVersion } = input;
  if (!baseline) {
    return { baselineFirstRun: true, warnings: ['baseline_first_run:no_previous_pgss_baseline'] };
  }

  const reasons: string[] = [];

  // Only compared when both timestamps are present.
  const previousResetAt = baseline.statsResetAt;
  if (previousResetAt && snapshotStatsResetAt && previousResetAt < snapshotStatsResetAt) {
    reasons.push(`baseline_reset:stats_reset advanced from ${previousResetAt} to ${snapshotStatsResetAt}`);
  }

  // Only compared when both version strings yield a major number.
  const majorBefore = postgresMajor(baseline.pgServerVersion);
  const majorNow = postgresMajor(currentPgServerVersion);
  if (majorBefore && majorNow && majorBefore !== majorNow) {
    reasons.push(`baseline_reset:pg_server_major changed from ${majorBefore} to ${majorNow}`);
  }

  return { baselineFirstRun: reasons.length > 0, warnings: reasons };
}
/**
 * Extracts the leading major version number from a PostgreSQL version string.
 *
 * Accepts a banner containing `PostgreSQL <digits>` (case-insensitive) or a string that
 * starts with digits followed by a dot or the end of the string.
 *
 * @returns the digits as a string, or `null` when neither form matches.
 */
function postgresMajor(version: string): string | null {
  const bannerMatch = /PostgreSQL\s+(\d+)/i.exec(version);
  const fromBanner = bannerMatch?.[1];
  if (fromBanner !== undefined) {
    return fromBanner;
  }
  const bareMatch = /^(\d+)(?:\.|$)/.exec(version);
  return bareMatch?.[1] ?? null;
}
/**
 * Collapses per-user snapshot rows into one aggregate per template id.
 *
 * For each row the matching per-user baseline counter (if any) is subtracted to get the
 * calls / exec-time / rows delta since the baseline. Deltas for rows sharing a template id
 * are summed, and the usernames that contributed new calls are collected.
 *
 * @param input.rows current snapshot rows.
 * @param input.baseline previous baseline, ignored when `baselineFirstRun` is true.
 * @param input.baselineFirstRun when true, every row is measured against a zero baseline.
 * @param input.fetchedAt used as `firstObservedAt` for templates missing from the baseline.
 * @param input.warnings passed through to `scopedCounterBaseline`, which may append to it.
 * @returns aggregates with at least one new call; `users` is sorted.
 */
function aggregatePgssRows(input: {
rows: PostgresPgssRow[];
baseline: PgssBaseline | null;
baselineFirstRun: boolean;
fetchedAt: string;
warnings: string[];
}): PostgresPgssAggregateRow[] {
const aggregates = new Map<string, PgssAggregateMutable>();
for (const row of input.rows) {
const templateId = pgssTemplateId(row);
// On a first run the stored baseline is deliberately not consulted.
const baselineTemplate = input.baselineFirstRun ? undefined : input.baseline?.templates[templateId];
const baselineCounter = baselineTemplate?.perUser[row.userid];
const previous = scopedCounterBaseline(row, baselineCounter, input.baselineFirstRun, input.warnings);
const deltaCalls = row.calls - previous.calls;
const deltaExecTime = row.totalExecTime - previous.totalExecTime;
const deltaRows = row.totalRows - previous.totalRows;
// Rows with no new calls are skipped, except on a first run.
if (deltaCalls === 0 && !input.baselineFirstRun) {
continue;
}
// Identity fields (queryid, dbid, database, query) come from the first row seen for the template.
const existing =
aggregates.get(templateId) ??
({
id: templateId,
queryid: row.queryid,
dbid: row.dbid,
database: row.database,
query: row.query,
deltaCalls: 0,
deltaExecTime: 0,
deltaRows: 0,
users: new Set<string>(),
firstObservedAt: baselineTemplate?.firstObservedAt ?? input.fetchedAt,
} satisfies PgssAggregateMutable);
// Negative deltas are clamped to zero before being added.
existing.deltaCalls += Math.max(0, deltaCalls);
existing.deltaExecTime += Math.max(0, deltaExecTime);
existing.deltaRows += Math.max(0, deltaRows);
// Rows without a username are all recorded under the single label 'unknown'.
if (deltaCalls > 0) {
existing.users.add(row.username ?? 'unknown');
}
aggregates.set(templateId, existing);
}
// Templates that accumulated no new calls are dropped from the result.
return [...aggregates.values()]
.filter((aggregate) => aggregate.deltaCalls > 0)
.map((aggregate) => ({
id: aggregate.id,
queryid: aggregate.queryid,
dbid: aggregate.dbid,
database: aggregate.database,
query: aggregate.query,
deltaCalls: aggregate.deltaCalls,
deltaExecTime: aggregate.deltaExecTime,
deltaRows: aggregate.deltaRows,
meanExecTime: aggregate.deltaExecTime / Math.max(aggregate.deltaCalls, 1),
distinctUsersDelta: aggregate.users.size,
users: [...aggregate.users].sort(),
firstObservedAt: aggregate.firstObservedAt,
}));
}
/**
 * Picks the counter values a snapshot row should be diffed against.
 *
 * Returns `ZERO_COUNTER` on a first run or when the row has no stored counter. It also
 * returns `ZERO_COUNTER`, after appending a `scoped_reset:` warning, when any stored value
 * exceeds the row's current value. Otherwise the stored counter is returned.
 */
function scopedCounterBaseline(
  row: PostgresPgssRow,
  baselineCounter: PgssBaselineCounter | undefined,
  baselineFirstRun: boolean,
  warnings: string[],
): PgssBaselineCounter {
  if (baselineFirstRun || !baselineCounter) {
    return ZERO_COUNTER;
  }

  const wentBackwards =
    baselineCounter.calls > row.calls ||
    baselineCounter.totalExecTime > row.totalExecTime ||
    baselineCounter.totalRows > row.totalRows;
  if (!wentBackwards) {
    return baselineCounter;
  }

  warnings.push(`scoped_reset:dbid=${row.dbid} queryid=${row.queryid} userid=${row.userid}`);
  return ZERO_COUNTER;
}
/** True when the SQL text matches either hard-skip pattern (prefix checked first, then table). */
function shouldSkipPgssSql(sql: string): boolean {
  if (PGSS_HARD_SKIP_PREFIX_RE.test(sql)) {
    return true;
  }
  return PGSS_HARD_SKIP_TABLE_RE.test(sql);
}
/**
 * Keeps at most `maxTemplatesPerRun` templates, highest score first.
 *
 * Score is `users.length * log1p(deltaCalls)`; ties are broken by ascending aggregate id
 * (`localeCompare`). The input array is not mutated.
 */
function selectPgssTemplates(templates: AnalyzedPgssTemplate[], maxTemplatesPerRun: number): AnalyzedPgssTemplate[] {
  const scored = templates.map((template) => {
    const { users, deltaCalls } = template.aggregate;
    return { template, score: users.length * Math.log1p(deltaCalls) };
  });

  // `scored` is a fresh array, so sorting it in place is safe.
  scored.sort((left, right) => {
    const scoreGap = right.score - left.score;
    return scoreGap || left.template.aggregate.id.localeCompare(right.template.aggregate.id);
  });

  const kept: AnalyzedPgssTemplate[] = [];
  for (const entry of scored.slice(0, maxTemplatesPerRun)) {
    kept.push(entry.template);
  }
  return kept;
}
/**
 * Builds the three staged artifacts for one analyzed template: metadata, the markdown
 * page, and usage statistics.
 *
 * `now` supplies `last_seen` in both metadata and usage. Percentile runtimes are emitted
 * as `null`, `error_rate` as 0, and literal slots / samples as empty arrays.
 */
function buildPgssStagedTemplate(
  template: AnalyzedPgssTemplate,
  config: HistoricSqlPullConfig,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const { aggregate, analysis } = template;
  const id = aggregate.id;
  const tablesTouched = [...analysis.tablesTouched].sort();
  // The title names the alphabetically first table, or 'query' when there is none.
  const headlineTable = tablesTouched[0] ?? 'query';
  const observedAt = now.toISOString();

  const triageSignals = buildPgssTriageSignals({
    executions: aggregate.deltaCalls,
    distinctUsers: aggregate.distinctUsersDelta,
    firstSeen: aggregate.firstObservedAt,
    lastSeen: observedAt,
    meanRuntimeMs: aggregate.meanExecTime,
    serviceAccountOnly: isServiceAccountOnly(aggregate.users, config.serviceAccountUserPatterns),
    now,
  });

  const metadata: HistoricSqlMetadata = {
    id,
    title: `postgres · ${headlineTable} [${id.slice(0, 12)}]`,
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint: analysis.fingerprint,
      sub_cluster_id: null,
      dialect: 'postgres',
      tables_touched: tablesTouched,
      literal_slots: [],
      triage_signals: triageSignals,
    },
  };

  const usage: HistoricSqlUsage = {
    stats: {
      executions: aggregate.deltaCalls,
      distinct_users: aggregate.distinctUsersDelta,
      first_seen: aggregate.firstObservedAt,
      last_seen: observedAt,
      p50_runtime_ms: null,
      p95_runtime_ms: null,
      mean_runtime_ms: aggregate.meanExecTime,
      error_rate: 0,
      rows_produced: aggregate.deltaRows,
    },
    literal_slots: [],
    samples: [],
  };

  const pageMarkdown = renderTemplatePage(id, analysis.normalizedSql, tablesTouched);
  return { metadata, pageMarkdown, usage };
}
/**
 * Converts raw template statistics into coarse string buckets.
 *
 * Executions: below 3 is `low`, below 50 is `mid`, otherwise `high`. Distinct users: up to
 * 1 is `solo`, up to 5 is `team`, otherwise `broad`. The error-rate bucket is always `ok`.
 * `firstSeen` is accepted but not read.
 */
function buildPgssTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  firstSeen: string;
  lastSeen: string;
  meanRuntimeMs: number;
  serviceAccountOnly: boolean;
  now: Date;
}): Record<string, string> {
  const { executions, distinctUsers, lastSeen, meanRuntimeMs, serviceAccountOnly, now } = input;

  let executionsBucket = 'high';
  if (executions < 3) {
    executionsBucket = 'low';
  } else if (executions < 50) {
    executionsBucket = 'mid';
  }

  let distinctUsersBucket = 'broad';
  if (distinctUsers <= 1) {
    distinctUsersBucket = 'solo';
  } else if (distinctUsers <= 5) {
    distinctUsersBucket = 'team';
  }

  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: distinctUsersBucket,
    error_rate_bucket: 'ok',
    recency_bucket: recencyBucket(lastSeen, now),
    service_account_only: String(serviceAccountOnly),
    runtime_bucket: runtimeBucket(meanRuntimeMs),
  };
}
/** Buckets a mean runtime in milliseconds: below 100 is `fast`, below 1000 is `moderate`, otherwise `slow`. */
function runtimeBucket(meanRuntimeMs: number): string {
  return meanRuntimeMs < 100 ? 'fast' : meanRuntimeMs < 1000 ? 'moderate' : 'slow';
}
/**
 * Buckets how long ago `lastSeen` was relative to `now`: up to 14 days is `active`,
 * up to 60 days is `warm`, otherwise `cold`. A `lastSeen` in the future counts as age 0.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const msPerDay = 86400000;
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / msPerDay);
  return ageDays <= 14 ? 'active' : ageDays <= 60 ? 'warm' : 'cold';
}
/**
 * True when every user name matches at least one of the regular-expression patterns.
 * Returns false when either list is empty. Each pattern is compiled with `new RegExp`.
 */
function isServiceAccountOnly(users: string[], patterns: string[]): boolean {
  if (users.length === 0 || patterns.length === 0) {
    return false;
  }
  // Compile everything up front so an invalid pattern throws before any matching happens.
  const matchers = patterns.map((source) => new RegExp(source));
  for (const user of users) {
    const isServiceAccount = matchers.some((matcher) => matcher.test(user));
    if (!isServiceAccount) {
      return false;
    }
  }
  return true;
}
/**
 * Renders the markdown page for a template: an `# id` heading, the normalized SQL in a
 * fenced `sql` block, and a bullet list of touched tables. The result ends with a newline.
 */
function renderTemplatePage(id: string, normalizedSql: string, tablesTouched: string[]): string {
  const lines: string[] = [];
  lines.push(`# ${id}`, '');
  lines.push('## Normalized SQL', '```sql', normalizedSql, '```', '');
  lines.push('## Tables touched');
  for (const table of tablesTouched) {
    lines.push(`- ${table}`);
  }
  // The trailing empty entry yields the final newline after joining.
  lines.push('');
  return lines.join('\n');
}
/**
 * Builds the baseline to persist after this run from the raw snapshot rows.
 *
 * Each row's counters are stored under its template id and user id. A template's
 * `firstObservedAt` is carried over from `previousBaseline` when present, otherwise it is
 * set to `fetchedAt`.
 */
function buildNextBaseline(input: {
  rows: PostgresPgssRow[];
  fetchedAt: string;
  statsResetAt: string | null;
  pgServerVersion: string;
  previousBaseline: PgssBaseline | null;
}): PgssBaseline {
  const { rows, fetchedAt, statsResetAt, pgServerVersion, previousBaseline } = input;
  const templates: PgssBaseline['templates'] = {};

  for (const row of rows) {
    const templateId = pgssTemplateId(row);
    const carriedOver = previousBaseline?.templates[templateId];
    let entry = templates[templateId];
    if (!entry) {
      entry = {
        firstObservedAt: carriedOver?.firstObservedAt ?? fetchedAt,
        perUser: {},
      };
      templates[templateId] = entry;
    }
    entry.perUser[row.userid] = {
      calls: row.calls,
      totalExecTime: row.totalExecTime,
      totalRows: row.totalRows,
    };
  }

  return {
    version: PGSS_BASELINE_VERSION,
    fetchedAt,
    statsResetAt,
    pgServerVersion,
    templates,
  };
}
/** Serializes `value` as 2-space-indented JSON with a trailing newline and writes it via `writeText`. */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const serialized = JSON.stringify(value, null, 2);
  await writeText(root, relPath, `${serialized}\n`);
}
/** Writes `value` as UTF-8 to `relPath` under `root`, creating parent directories as needed. */
async function writeText(root: string, relPath: string, value: string): Promise<void> {
  const destination = join(root, relPath);
  const parentDir = dirname(destination);
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, value, 'utf-8');
}

View file

@ -0,0 +1,358 @@
import { mkdtemp, readFile, readdir } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it, vi } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { stageHistoricSqlAggregatedSnapshot } from './stage-unified.js';
import type { AggregatedTemplate, HistoricSqlReader } from './types.js';
/** Creates a fresh temporary directory for one test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-unified-stage-');
  return mkdtemp(prefix);
}
/** Reads `relPath` under `root` as UTF-8 and parses it as JSON; the result is cast to `T` without validation. */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const raw = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(raw) as T;
}
/**
 * Test fixture factory for an aggregated template. `templateId` and `canonicalSql` are
 * required; `dialect`, `stats` and `topUsers` fall back to fixed defaults when omitted.
 */
function aggregate(overrides: Partial<AggregatedTemplate> & { templateId: string; canonicalSql: string }): AggregatedTemplate {
  const { templateId, canonicalSql, dialect, stats, topUsers } = overrides;
  return {
    templateId,
    canonicalSql,
    dialect: dialect ?? 'postgres',
    stats: stats ?? {
      executions: 42,
      distinctUsers: 3,
      firstSeen: '2026-05-01T00:00:00.000Z',
      lastSeen: '2026-05-11T00:00:00.000Z',
      p50RuntimeMs: 20,
      p95RuntimeMs: 80,
      errorRate: 0,
      rowsProduced: 100,
    },
    topUsers: topUsers ?? [{ user: 'analyst', executions: 40 }],
  };
}
// End-to-end tests for stageHistoricSqlAggregatedSnapshot. Each test stages into a fresh
// temporary directory using a stub reader and a mocked SQL analysis port, then inspects
// the JSON files that were written.
describe('stageHistoricSqlAggregatedSnapshot', () => {
// Covers the main path: filtering, a single analyzeBatch call, per-table files, the
// manifest counters, and the patterns input.
it('batch parses templates and writes stable table and patterns artifacts', async () => {
const stagedDir = await tempDir();
const reader: HistoricSqlReader = {
async probe() {
return { warnings: ['pg_stat_statements.track is none; aggregation still proceeds'], info: [] };
},
async *fetchAggregated() {
yield aggregate({
templateId: 'orders-by-status',
canonicalSql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status',
});
// Only user is 'svc_loader', which matches the '^svc_' exclude pattern configured below;
// the analyzeBatch assertion further down expects this template to be absent.
yield aggregate({
templateId: 'service-account-only',
canonicalSql: 'select * from public.orders where id = $1',
stats: {
executions: 20,
distinctUsers: 1,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 5,
p95RuntimeMs: 10,
errorRate: 0,
rowsProduced: 1,
},
topUsers: [{ user: 'svc_loader', executions: 20 }],
});
// The mocked analysis returns an error for this id, which should surface as a parse_failed warning.
yield aggregate({
templateId: 'bad-parse',
canonicalSql: 'select broken from',
});
},
};
const sqlAnalysis: SqlAnalysisPort = {
analyzeForFingerprint: vi.fn(),
analyzeBatch: vi.fn(async () => new Map([
[
'orders-by-status',
{
tablesTouched: ['public.orders', 'public.customers'],
columnsByClause: {
select: ['status'],
where: ['created_at'],
join: ['customer_id'],
groupBy: ['status'],
},
},
],
['bad-parse', { tablesTouched: [], columnsByClause: {}, error: 'parse failed' }],
])),
};
await stageHistoricSqlAggregatedSnapshot({
stagedDir,
connectionId: 'warehouse',
queryClient: {},
reader,
sqlAnalysis,
pullConfig: {
dialect: 'postgres',
filters: {
serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' },
},
},
now: new Date('2026-05-11T12:00:00.000Z'),
});
// All surviving templates must be analyzed in one batch call.
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(1);
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith(
[
{
id: 'orders-by-status',
sql: 'select o.status, count(*) from public.orders o join public.customers c on c.id = o.customer_id where o.created_at >= $1 group by o.status',
},
{ id: 'bad-parse', sql: 'select broken from' },
],
'postgres',
);
expect(await readdir(join(stagedDir, 'tables'))).toEqual(['public.customers.json', 'public.orders.json']);
const manifest = await readJson<Record<string, unknown>>(stagedDir, 'manifest.json');
// snapshotRowCount is 3: it counts every row the reader yielded, including the filtered one.
expect(manifest).toMatchObject({
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
snapshotRowCount: 3,
touchedTableCount: 2,
parseFailures: 1,
warnings: ['parse_failed:bad-parse'],
probeWarnings: ['pg_stat_statements.track is none; aggregation still proceeds'],
staleArchiveAfterDays: 90,
});
const orders = await readJson<Record<string, any>>(stagedDir, 'tables/public.orders.json');
expect(orders).toMatchObject({
table: 'public.orders',
stats: {
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
errorRateBucket: 'none',
p95RuntimeBucket: '<100ms',
recencyBucket: 'current',
},
columnsByClause: {
select: [['status', 'high']],
where: [['created_at', 'high']],
join: [['customer_id', 'high']],
groupBy: [['status', 'high']],
},
observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }],
topTemplates: [
{
id: 'orders-by-status',
topUsers: [{ user: 'analyst' }],
},
],
});
expect(orders.topTemplates[0].canonicalSql).toContain('group by o.status');
const patterns = await readJson<Record<string, any>>(stagedDir, 'patterns-input.json');
expect(patterns.templates).toEqual([
{
id: 'orders-by-status',
canonicalSql: expect.stringContaining('public.orders'),
tablesTouched: ['public.customers', 'public.orders'],
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
dialect: 'postgres',
},
]);
});
// Redaction must apply to what is written to disk, while the analysis port still
// receives the unredacted SQL.
it('redacts configured SQL substrings in staged artifacts while analyzing original SQL', async () => {
const stagedDir = await tempDir();
const originalSql =
"select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'";
const reader: HistoricSqlReader = {
async probe() {
return { warnings: [], info: [] };
},
async *fetchAggregated() {
yield aggregate({
templateId: 'api-events-with-secret',
canonicalSql: originalSql,
stats: {
executions: 15,
distinctUsers: 2,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 12,
p95RuntimeMs: 25,
errorRate: 0,
rowsProduced: 15,
},
});
},
};
const sqlAnalysis: SqlAnalysisPort = {
analyzeForFingerprint: vi.fn(),
analyzeBatch: vi.fn(async () => new Map([
[
'api-events-with-secret',
{
tablesTouched: ['public.api_events'],
columnsByClause: {
select: [],
where: ['api_key', 'note'],
join: [],
groupBy: [],
},
},
],
])),
};
await stageHistoricSqlAggregatedSnapshot({
stagedDir,
connectionId: 'warehouse',
queryClient: {},
reader,
sqlAnalysis,
pullConfig: {
dialect: 'postgres',
// The second pattern uses a leading (?i); the mixed-case 'Secret_Token_9f' literal above
// only matches it case-insensitively.
redactionPatterns: ['sk_live_[A-Za-z0-9]+', '(?i)secret_token_[a-z0-9]+'],
},
now: new Date('2026-05-11T12:00:00.000Z'),
});
expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith(
[{ id: 'api-events-with-secret', sql: originalSql }],
'postgres',
);
// Raw file text is checked so the secrets cannot hide anywhere in the serialized output.
const tableJson = await readFile(join(stagedDir, 'tables/public.api_events.json'), 'utf-8');
const patternsJson = await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8');
expect(tableJson).not.toContain('sk_live_abc123');
expect(tableJson).not.toContain('Secret_Token_9f');
expect(patternsJson).not.toContain('sk_live_abc123');
expect(patternsJson).not.toContain('Secret_Token_9f');
expect(tableJson).toContain('[REDACTED]');
expect(patternsJson).toContain('[REDACTED]');
});
// The audit file keeps every template; the first shard is expected to hold only the
// two templates that touch both tables.
it('preserves full patterns audit input and writes bounded cross-table pattern shards', async () => {
const stagedDir = await tempDir();
// An 8000-character literal makes each joined template's SQL large.
const largeSql = `select * from public.orders o join public.customers c on c.id = o.customer_id where payload = '${'x'.repeat(8000)}'`;
const reader: HistoricSqlReader = {
async probe() {
return { warnings: [], info: [] };
},
async *fetchAggregated() {
yield aggregate({
templateId: 'orders-customers-a',
canonicalSql: largeSql,
stats: {
executions: 25,
distinctUsers: 4,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 15,
p95RuntimeMs: 90,
errorRate: 0,
rowsProduced: 250,
},
});
yield aggregate({
templateId: 'orders-customers-b',
canonicalSql: largeSql.replace('payload', 'payload_b'),
stats: {
executions: 22,
distinctUsers: 3,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 20,
p95RuntimeMs: 95,
errorRate: 0,
rowsProduced: 220,
},
});
yield aggregate({
templateId: 'orders-single-table',
canonicalSql: 'select count(*) from public.orders',
stats: {
executions: 30,
distinctUsers: 2,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 10,
p95RuntimeMs: 20,
errorRate: 0,
rowsProduced: 30,
},
});
},
};
const sqlAnalysis: SqlAnalysisPort = {
analyzeForFingerprint: vi.fn(),
analyzeBatch: vi.fn(async () => new Map([
[
'orders-customers-a',
{
tablesTouched: ['public.orders', 'public.customers'],
columnsByClause: {
select: [],
where: ['payload'],
join: ['customer_id', 'id'],
groupBy: [],
},
},
],
[
'orders-customers-b',
{
tablesTouched: ['public.orders', 'public.customers'],
columnsByClause: {
select: [],
where: ['payload_b'],
join: ['customer_id', 'id'],
groupBy: [],
},
},
],
[
'orders-single-table',
{
tablesTouched: ['public.orders'],
columnsByClause: {
select: [],
where: [],
join: [],
groupBy: [],
},
},
],
])),
};
await stageHistoricSqlAggregatedSnapshot({
stagedDir,
connectionId: 'warehouse',
queryClient: {},
reader,
sqlAnalysis,
pullConfig: { dialect: 'postgres' },
now: new Date('2026-05-11T12:00:00.000Z'),
});
const audit = await readJson<Record<string, any>>(stagedDir, 'patterns-input.json');
expect(audit.templates.map((entry: any) => entry.id)).toEqual([
'orders-customers-a',
'orders-customers-b',
'orders-single-table',
]);
const firstShard = await readJson<Record<string, any>>(stagedDir, 'patterns-input/part-0001.json');
expect(firstShard.templates.map((entry: any) => entry.id)).toEqual(['orders-customers-a', 'orders-customers-b']);
expect(firstShard.templates.some((entry: any) => entry.id === 'orders-single-table')).toBe(false);
const manifest = await readJson<Record<string, any>>(stagedDir, 'manifest.json');
expect(manifest.warnings).toEqual([]);
});
});

View file

@ -0,0 +1,308 @@
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import {
bucketDistinctUsers,
bucketErrorRate,
bucketExecutions,
bucketFrequency,
bucketP95Runtime,
bucketRecency,
} from './buckets.js';
import { splitHistoricSqlPatternInputs } from './pattern-inputs.js';
import {
compileHistoricSqlRedactionPatterns,
redactHistoricSqlText,
type HistoricSqlRedactionPattern,
} from './redaction.js';
import {
HISTORIC_SQL_SOURCE_KEY,
aggregatedTemplateSchema,
historicSqlUnifiedPullConfigSchema,
type AggregatedTemplate,
type HistoricSqlReader,
type HistoricSqlUnifiedPullConfig,
type StagedPatternsInput,
type StagedTableInput,
} from './types.js';
/** Arguments accepted by `stageHistoricSqlAggregatedSnapshot`. */
interface StageHistoricSqlAggregatedSnapshotInput {
// Root directory under which every staged JSON artifact is written.
stagedDir: string;
// Copied verbatim into the manifest.
connectionId: string;
// Opaque handle forwarded to `reader.probe` and `reader.fetchAggregated`.
queryClient: unknown;
// Source of the probe result and the aggregated template rows.
reader: HistoricSqlReader;
// Used once per run via `analyzeBatch` to resolve tables and columns per template.
sqlAnalysis: SqlAnalysisPort;
// Unvalidated config; parsed with `historicSqlUnifiedPullConfigSchema` before use.
pullConfig: unknown;
// Reference time for the fetch window and recency buckets; defaults to `new Date()`.
now?: Date;
}
/** A template that passed SQL analysis, paired with the analysis results used for staging. */
interface ParsedTemplate {
// The aggregated template, with `canonicalSql` redacted when redaction patterns are configured.
template: AggregatedTemplate;
// De-duplicated, sorted, non-empty table names reported by the analysis.
tablesTouched: string[];
// Clause name -> de-duplicated, sorted column names reported by the analysis.
columnsByClause: Record<string, string[]>;
}
/** Mutable running totals for one table, built up by `addTemplate` and rendered by `toStagedTable`. */
interface TableAccumulator {
// Table name this accumulator belongs to.
table: string;
// Sum of executions over every template added.
executions: number;
// Largest per-template distinct-user count seen (a maximum, not a union across templates).
distinctUsers: number;
// Sum of errorRate * executions; divided by `executions` to get the weighted error rate.
errorRateNumerator: number;
// Largest non-null per-template p95 runtime seen, or null if none was reported.
p95RuntimeMs: number | null;
// Greatest `lastSeen` string seen (plain string comparison); starts at the Unix epoch.
lastSeen: string;
// clause -> column -> executions summed over templates that reference the column.
columnsByClause: Map<string, Map<string, number>>;
// other table -> comma-joined sorted join columns -> summed executions.
observedJoins: Map<string, Map<string, number>>;
// Every template added, in insertion order; ranked and truncated when staged.
topTemplates: AggregatedTemplate[];
}
// Matches a whole statement that is only `SELECT 1`, `SELECT NOW()`, `SELECT CURRENT_TIMESTAMP`
// or `SELECT VERSION()` (case-insensitive, optional trailing semicolon).
const TRIVIAL_SQL_RE = /^\s*SELECT\s+(1|NOW\(\)|CURRENT_TIMESTAMP|VERSION\(\))\s*;?\s*$/i;
// Matches statements that begin with SHOW, DESCRIBE, DESC, EXPLAIN, USE or SET.
const NOISE_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i;
// Matches SQL text mentioning INFORMATION_SCHEMA, SNOWFLAKE.ACCOUNT_USAGE, any word starting
// with `pg_`, or a `system.` qualifier. NOTE(review): this is a plain text match, so it also
// hits user objects whose names start with `pg_` and occurrences inside string literals.
const SYSTEM_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i;
/**
 * Writes `value` as 2-space-indented JSON (with trailing newline) to `relPath` under `root`,
 * creating parent directories first. Serialization happens only after the directory exists.
 */
function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const destination = join(root, relPath);
  const parentDir = dirname(destination);
  return mkdir(parentDir, { recursive: true }).then(() => {
    const payload = `${JSON.stringify(value, null, 2)}\n`;
    return writeFile(destination, payload, 'utf-8');
  });
}
/** Compiles each pattern source into a flag-less `RegExp`, preserving order. Throws on an invalid pattern. */
function compilePatterns(patterns: string[]): RegExp[] {
  const compiled: RegExp[] = [];
  for (const source of patterns) {
    compiled.push(new RegExp(source));
  }
  return compiled;
}
/** True when `value` is a non-empty string matched by at least one pattern; null and '' never match. */
function matchesAny(value: string | null, patterns: RegExp[]): boolean {
  if (!value) {
    return false;
  }
  for (const pattern of patterns) {
    if (pattern.test(value)) {
      return true;
    }
  }
  return false;
}
/**
 * True when the SQL text alone disqualifies a template: it matches the noise-prefix or
 * system-table pattern, or it is a trivial probe and `filters.dropTrivialProbes` is not
 * explicitly `false`.
 */
function shouldDropBySql(sql: string, config: HistoricSqlUnifiedPullConfig): boolean {
  const isNoise = NOISE_PREFIX_RE.test(sql) || SYSTEM_TABLE_RE.test(sql);
  if (isNoise) {
    return true;
  }
  return config.filters.dropTrivialProbes !== false && TRIVIAL_SQL_RE.test(sql);
}
/**
 * Applies the service-account filter using the template's `topUsers` execution counts.
 *
 * Nothing is dropped when the filter is absent, in `mark-only` mode, or has no patterns.
 * A template is "service only" when its `topUsers` have executions and all of them belong
 * to users matching a pattern. Mode `exclude` drops service-only templates; any other
 * mode drops the rest.
 */
function shouldDropByUsers(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean {
  const service = config.filters.serviceAccounts;
  if (!service || service.mode === 'mark-only' || service.patterns.length === 0) {
    return false;
  }

  const patterns = compilePatterns(service.patterns);
  let allExecutions = 0;
  let matchingExecutions = 0;
  for (const entry of template.topUsers) {
    allExecutions += entry.executions;
    if (matchesAny(entry.user, patterns)) {
      matchingExecutions += entry.executions;
    }
  }

  const serviceOnly = allExecutions > 0 && matchingExecutions >= allExecutions;
  if (service.mode === 'exclude') {
    return serviceOnly;
  }
  return !serviceOnly;
}
/**
 * True when `filters.dropFailedBelow` is set and the template's error rate is above its
 * `errorRate` while its executions are below its `executions`.
 */
function shouldDropByFailure(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean {
  const threshold = config.filters.dropFailedBelow;
  if (!threshold) {
    return false;
  }
  const { errorRate, executions } = template.stats;
  return errorRate > threshold.errorRate && executions < threshold.executions;
}
/** True when any filter (SQL text, then users, then failure rate) rejects the template. */
function shouldDropTemplate(template: AggregatedTemplate, config: HistoricSqlUnifiedPullConfig): boolean {
  return (
    shouldDropBySql(template.canonicalSql, config) ||
    shouldDropByUsers(template, config) ||
    shouldDropByFailure(template, config)
  );
}
/**
 * Returns the template with `canonicalSql` passed through the redactors. With no
 * redactors the same template object is returned; otherwise a shallow copy is made.
 */
function redactTemplateSql(
  template: AggregatedTemplate,
  redactors: readonly HistoricSqlRedactionPattern[],
): AggregatedTemplate {
  return redactors.length === 0
    ? template
    : { ...template, canonicalSql: redactHistoricSqlText(template.canonicalSql, redactors) };
}
/** Adds `executions` to the running count for `column` under `clause`, creating the clause map on first use. */
function recordColumn(acc: TableAccumulator, clause: string, column: string, executions: number): void {
  let counts = acc.columnsByClause.get(clause);
  if (counts === undefined) {
    counts = new Map<string, number>();
    acc.columnsByClause.set(clause, counts);
  }
  const previous = counts.get(column) ?? 0;
  counts.set(column, previous + executions);
}
/**
 * Adds `executions` to the join count between this table and `otherTable`, keyed by the
 * de-duplicated, sorted, comma-joined column list. Nothing is recorded when that key is empty.
 */
function recordJoin(acc: TableAccumulator, otherTable: string, columns: string[], executions: number): void {
  const key = Array.from(new Set(columns)).sort().join(',');
  if (key.length === 0) {
    return;
  }
  const counts = acc.observedJoins.get(otherTable) ?? new Map<string, number>();
  const previous = counts.get(key) ?? 0;
  counts.set(key, previous + executions);
  acc.observedJoins.set(otherTable, counts);
}
/** Creates an empty accumulator for `table`: zero counts, no p95, `lastSeen` at the Unix epoch. */
function accumulatorFor(table: string): TableAccumulator {
  const empty: TableAccumulator = {
    table,
    executions: 0,
    distinctUsers: 0,
    errorRateNumerator: 0,
    p95RuntimeMs: null,
    lastSeen: '1970-01-01T00:00:00.000Z',
    columnsByClause: new Map<string, Map<string, number>>(),
    observedJoins: new Map<string, Map<string, number>>(),
    topTemplates: [],
  };
  return empty;
}
/**
 * Folds one parsed template into a table's accumulator.
 *
 * Executions and the weighted error numerator are summed; distinct users and p95 runtime
 * keep the maximum seen; `lastSeen` keeps the greatest string. Every column is counted
 * under its clause, and the template's join columns are recorded against each other
 * table it touches.
 */
function addTemplate(acc: TableAccumulator, parsed: ParsedTemplate): void {
  const { template, tablesTouched, columnsByClause } = parsed;
  const { stats } = template;
  const executions = stats.executions;

  acc.executions += executions;
  acc.distinctUsers = Math.max(acc.distinctUsers, stats.distinctUsers);
  acc.errorRateNumerator += stats.errorRate * executions;

  // A null p95 leaves the accumulated value unchanged.
  const templateP95 = stats.p95RuntimeMs;
  if (templateP95 !== null) {
    acc.p95RuntimeMs = acc.p95RuntimeMs === null ? templateP95 : Math.max(acc.p95RuntimeMs, templateP95);
  }

  if (stats.lastSeen > acc.lastSeen) {
    acc.lastSeen = stats.lastSeen;
  }

  for (const [clause, columns] of Object.entries(columnsByClause)) {
    for (const column of columns) {
      recordColumn(acc, clause, column, executions);
    }
  }

  // The same join-column list is attributed to every other table in the template.
  const joinColumns = columnsByClause.join ?? [];
  for (const otherTable of tablesTouched) {
    if (otherTable === acc.table) {
      continue;
    }
    recordJoin(acc, otherTable, joinColumns, executions);
  }

  acc.topTemplates.push(template);
}
/**
 * Renders a table accumulator into the staged per-table artifact.
 *
 * Raw counts are converted to buckets via the bucket helpers, and every list is sorted so
 * the output does not depend on insertion order.
 *
 * @param acc running totals for one table.
 * @param now reference time passed to `bucketRecency`.
 */
function toStagedTable(acc: TableAccumulator, now: Date): StagedTableInput {
// Execution-weighted error rate; 0 when the table has no executions.
const errorRate = acc.executions > 0 ? acc.errorRateNumerator / acc.executions : 0;
// Clauses sorted by name; columns within a clause by descending count, then by name.
const columnsByClause: Record<string, Array<[string, string]>> = Object.fromEntries(
[...acc.columnsByClause.entries()]
.sort(([left], [right]) => left.localeCompare(right))
.map(([clause, counts]) => [
clause,
[...counts.entries()]
.sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]))
.map(([column, count]) => [column, bucketFrequency(count, acc.executions)] as [string, string]),
]),
);
// One entry per (other table, join-column set), sorted by table name and then by the joined column list.
const observedJoins = [...acc.observedJoins.entries()]
.flatMap(([withTable, byColumns]) =>
[...byColumns.entries()].map(([columns, count]) => ({
withTable,
on: columns.split(',').filter(Boolean),
freq: bucketFrequency(count, acc.executions),
})),
)
.sort((left, right) => left.withTable.localeCompare(right.withTable) || left.on.join(',').localeCompare(right.on.join(',')));
// Top 5 templates by executions (ties by template id). Only user names are kept for the
// first 5 top users; their execution counts are omitted.
const topTemplates = [...acc.topTemplates]
.sort((left, right) => right.stats.executions - left.stats.executions || left.templateId.localeCompare(right.templateId))
.slice(0, 5)
.map((template) => ({
id: template.templateId,
canonicalSql: template.canonicalSql,
topUsers: template.topUsers.slice(0, 5).map((entry) => ({ user: entry.user })),
}));
return {
table: acc.table,
stats: {
executionsBucket: bucketExecutions(acc.executions),
distinctUsersBucket: bucketDistinctUsers(acc.distinctUsers),
errorRateBucket: bucketErrorRate(errorRate),
p95RuntimeBucket: bucketP95Runtime(acc.p95RuntimeMs),
recencyBucket: bucketRecency(acc.lastSeen, now),
},
columnsByClause,
observedJoins,
topTemplates,
};
}
/**
 * Builds the patterns input: one entry per parsed template with its SQL, sorted table
 * list, execution and distinct-user buckets, and dialect. Entries are sorted by id.
 */
function toPatternsInput(parsedTemplates: ParsedTemplate[]): StagedPatternsInput {
  const entries = parsedTemplates.map(({ template, tablesTouched }) => {
    const { stats } = template;
    return {
      id: template.templateId,
      canonicalSql: template.canonicalSql,
      tablesTouched: [...tablesTouched].sort(),
      executionsBucket: bucketExecutions(stats.executions),
      distinctUsersBucket: bucketDistinctUsers(stats.distinctUsers),
      dialect: template.dialect,
    };
  });
  entries.sort((left, right) => left.id.localeCompare(right.id));
  return { templates: entries };
}
/**
 * True when `table` can be used as a single file-name segment in `tables/<table>.json`.
 *
 * Table names come from parsed query-history SQL. A name containing a path separator
 * (either style) or a NUL byte is rejected so it cannot create nested directories or
 * resolve outside the staged directory.
 */
function isSafeTableFileName(table: string): boolean {
  return !/[\\\/\0]/.test(table);
}

/**
 * Stages an aggregated historic-SQL snapshot under `input.stagedDir`.
 *
 * Steps, in order:
 * 1. Parse `pullConfig`, compile redaction patterns, and derive the fetch window
 *    (`windowDays` back from `now`).
 * 2. Probe the reader, then stream aggregated templates, validating each row and
 *    discarding those rejected by the drop filters. Every streamed row is counted.
 * 3. Analyze the surviving templates in a single `analyzeBatch` call using the original
 *    (unredacted) SQL. Missing or errored results become `parse_failed:<id>` warnings;
 *    templates that touch no tables are dropped without a warning.
 * 4. Accumulate per-table statistics and write `tables/<table>.json`. A table whose name
 *    is not a safe file-name segment is skipped with an `unsafe_table_name:` warning.
 * 5. Write `patterns-input.json`, the pattern shards, and finally `manifest.json`.
 *
 * @param input see `StageHistoricSqlAggregatedSnapshotInput`; `now` defaults to the current time.
 * @throws whatever config/row schema parsing, the reader, the analysis port, or the
 *   file-system calls throw; nothing is caught here.
 */
export async function stageHistoricSqlAggregatedSnapshot(input: StageHistoricSqlAggregatedSnapshotInput): Promise<void> {
  const config = historicSqlUnifiedPullConfigSchema.parse(input.pullConfig);
  const redactors = compileHistoricSqlRedactionPatterns(config.redactionPatterns);
  const now = input.now ?? new Date();
  const windowStart = new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000);
  const probe = await input.reader.probe(input.queryClient);

  // snapshotRowCount counts every row the reader yields, including rows filtered out below.
  const snapshot: AggregatedTemplate[] = [];
  let snapshotRowCount = 0;
  for await (const row of input.reader.fetchAggregated(input.queryClient, { start: windowStart, end: now }, config)) {
    snapshotRowCount += 1;
    const parsed = aggregatedTemplateSchema.parse(row);
    if (!shouldDropTemplate(parsed, config)) {
      snapshot.push(parsed);
    }
  }

  // Analysis sees the original SQL; redaction is applied only to what gets staged.
  const analysis = await input.sqlAnalysis.analyzeBatch(
    snapshot.map((template) => ({ id: template.templateId, sql: template.canonicalSql })),
    config.dialect,
  );

  const warnings: string[] = [];
  const parsedTemplates: ParsedTemplate[] = [];
  for (const template of snapshot) {
    const parsed = analysis.get(template.templateId);
    if (!parsed || parsed.error) {
      warnings.push(`parse_failed:${template.templateId}`);
      continue;
    }
    const tablesTouched = [...new Set(parsed.tablesTouched)].filter((table) => table.length > 0).sort();
    if (tablesTouched.length === 0) {
      continue;
    }
    parsedTemplates.push({
      template: redactTemplateSql(template, redactors),
      tablesTouched,
      columnsByClause: Object.fromEntries(
        Object.entries(parsed.columnsByClause).map(([clause, columns]) => [clause, [...new Set(columns)].sort()]),
      ),
    });
  }

  const byTable = new Map<string, TableAccumulator>();
  for (const parsed of parsedTemplates) {
    for (const table of parsed.tablesTouched) {
      const acc = byTable.get(table) ?? accumulatorFor(table);
      addTemplate(acc, parsed);
      byTable.set(table, acc);
    }
  }

  await mkdir(input.stagedDir, { recursive: true });

  // Count only the files actually written so the manifest matches the `tables/` directory.
  let touchedTableCount = 0;
  for (const [table, acc] of [...byTable.entries()].sort(([left], [right]) => left.localeCompare(right))) {
    if (!isSafeTableFileName(table)) {
      // JSON.stringify keeps odd characters in the rejected name visible in the warning.
      warnings.push(`unsafe_table_name:${JSON.stringify(table)}`);
      continue;
    }
    await writeJson(input.stagedDir, `tables/${table}.json`, toStagedTable(acc, now));
    touchedTableCount += 1;
  }

  const patternsInput = toPatternsInput(parsedTemplates);
  const patternInputSplit = splitHistoricSqlPatternInputs(patternsInput);
  const allWarnings = [...warnings, ...patternInputSplit.warnings];
  await writeJson(input.stagedDir, 'patterns-input.json', patternInputSplit.auditInput);
  for (const shard of patternInputSplit.shards) {
    await writeJson(input.stagedDir, shard.path, shard.input);
  }

  // The manifest is written last, after every other artifact.
  await writeJson(input.stagedDir, 'manifest.json', {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: input.connectionId,
    dialect: config.dialect,
    fetchedAt: now.toISOString(),
    windowStart: windowStart.toISOString(),
    windowEnd: now.toISOString(),
    snapshotRowCount,
    touchedTableCount,
    parseFailures: allWarnings.filter((warning) => warning.startsWith('parse_failed:')).length,
    warnings: allWarnings,
    probeWarnings: probe.warnings,
    staleArchiveAfterDays: config.staleArchiveAfterDays,
  });
}

View file

@ -1,798 +0,0 @@
import { mkdtemp, readFile, readdir } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { stageHistoricSqlTemplates } from './stage.js';
import {
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlUsageSchema,
type HistoricSqlQueryHistoryReader,
type HistoricSqlRawQueryRow,
} from './types.js';
/** Creates a fresh temporary directory for one test and resolves to its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-stage-');
  return mkdtemp(prefix);
}
/** Reads `relPath` under `root` as UTF-8 and parses it as JSON; the result is cast to `T` without validation. */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const raw = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(raw) as T;
}
/** Stub query-history reader: `probe` does nothing and `fetch` yields the given rows in order. */
function fakeReader(rows: HistoricSqlRawQueryRow[]): HistoricSqlQueryHistoryReader {
  return {
    async probe() {},
    async *fetch() {
      yield* rows;
    },
  };
}
// Stub analysis port with two canned results: SQL containing 'paid' maps to the
// paid-orders fingerprint; everything else maps to the refunds fingerprint.
const fakeSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const isPaidOrders = sql.includes('paid');
    if (!isPaidOrders) {
      return {
        fingerprint: 'fp_refunds',
        normalizedSql: 'SELECT count(*) FROM analytics.refunds WHERE state = ?',
        tablesTouched: ['analytics.refunds'],
        literalSlots: [{ position: 1, type: 'string', exampleValue: 'complete' }],
      };
    }
    return {
      fingerprint: 'fp_paid_orders',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [
        { position: 1, type: 'string', exampleValue: 'paid' },
        { position: 2, type: 'date', exampleValue: '2026-04-01' },
      ],
    };
  },
};
// Stub analysis port that maps every query to one fingerprint. The single literal slot's
// example value is 'refunded' when the SQL contains that quoted literal, else 'paid'.
const categoricalSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    let status = 'paid';
    if (sql.includes("'refunded'")) {
      status = 'refunded';
    }
    return {
      fingerprint: 'fp_order_status',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue: status }],
    };
  },
};
/**
 * Fixture: six successful query rows against analytics.orders, three with status 'paid'
 * and three with 'refunded', one per analyst (a, b, c) at one-minute intervals.
 */
function categoricalRows(): HistoricSqlRawQueryRow[] {
  // [id, status literal, user, startedAt, runtimeMs, rowsProduced]
  const fixtures: Array<[string, string, string, string, number, number]> = [
    ['paid-1', 'paid', 'analyst-a', '2026-05-04T10:00:00.000Z', 100, 11],
    ['paid-2', 'paid', 'analyst-b', '2026-05-04T10:01:00.000Z', 110, 12],
    ['paid-3', 'paid', 'analyst-c', '2026-05-04T10:02:00.000Z', 120, 13],
    ['refunded-1', 'refunded', 'analyst-a', '2026-05-04T10:03:00.000Z', 130, 21],
    ['refunded-2', 'refunded', 'analyst-b', '2026-05-04T10:04:00.000Z', 140, 22],
    ['refunded-3', 'refunded', 'analyst-c', '2026-05-04T10:05:00.000Z', 150, 23],
  ];
  return fixtures.map(([id, status, user, startedAt, runtimeMs, rowsProduced]) => ({
    id,
    sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
    user,
    startedAt,
    endedAt: null,
    runtimeMs,
    rowsProduced,
    success: true,
    errorMessage: null,
  }));
}
// Stub analysis port that maps every query to one fingerprint and reports the quoted
// value after `status = ` as the literal slot's example, or 'unknown' when absent.
const diverseSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const statusMatch = /status = '([^']+)'/.exec(sql);
    const value = statusMatch?.[1] ?? 'unknown';
    return {
      fingerprint: 'fp_diverse_samples',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue: value }],
    };
  },
};
/**
 * Fake analysis port for the classification-matrix test.
 *
 * Statements mentioning `stale_orders` get a single fixed date slot; all other
 * statements get five slots (region, plan, status, amount, created_at) whose
 * example values are pulled out of the SQL text with regular expressions.
 */
const classificationMatrixSqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    if (sql.includes('stale_orders')) {
      return {
        fingerprint: 'fp_stale_date',
        normalizedSql: 'SELECT count(*) FROM analytics.stale_orders WHERE created_at >= ?',
        tablesTouched: ['analytics.stale_orders'],
        literalSlots: [{ position: 1, type: 'date', exampleValue: '2026-04-01' }],
      };
    }

    // First capture group of `pattern` in the SQL, or `fallback` when absent.
    const capture = (pattern: RegExp, fallback: string): string => pattern.exec(sql)?.[1] ?? fallback;
    const quoted = (field: string): string => capture(new RegExp(`${field} = '([^']+)'`), 'unknown');

    const amount = capture(/amount >= (\d+)/, '0');
    const asOf = capture(/created_at >= '([^']+)'/, '2026-05-01');
    const literalSlots = [
      { position: 1, type: 'string', exampleValue: quoted('region') },
      { position: 2, type: 'string', exampleValue: quoted('plan') },
      { position: 3, type: 'string', exampleValue: quoted('status') },
      { position: 4, type: 'number', exampleValue: amount },
      { position: 5, type: 'date', exampleValue: asOf },
    ];
    return {
      fingerprint: 'fp_classification_matrix',
      normalizedSql:
        'SELECT count(*) FROM analytics.orders WHERE region = ? AND plan = ? AND status = ? AND amount >= ? AND created_at >= ?',
      tablesTouched: ['analytics.orders'],
      literalSlots,
    };
  },
};
/**
 * Builds the 21-row fixture for the classification-matrix test: twenty
 * `analytics.orders` queries whose literals vary in controlled ways (status
 * splits 10/10, plan differs only on the last row, amount is unique per row,
 * created_at advances every five rows) plus one `stale_orders` query.
 */
function classificationMatrixRows(): HistoricSqlRawQueryRow[] {
  const result: HistoricSqlRawQueryRow[] = [];
  for (let index = 0; index < 20; index += 1) {
    const status = index < 10 ? 'paid' : 'refunded';
    const plan = index === 19 ? 'self_serve' : 'enterprise';
    const amount = 100 + index;
    const day = String(1 + Math.floor(index / 5)).padStart(2, '0');
    const asOf = `2026-05-${day}`;
    const minute = String(index).padStart(2, '0');
    result.push({
      id: `matrix-${index + 1}`,
      sql: `SELECT count(*) FROM analytics.orders WHERE region = 'us' AND plan = '${plan}' AND status = '${status}' AND amount >= ${amount} AND created_at >= '${asOf}'`,
      user: `analyst-${(index % 4) + 1}`,
      startedAt: `2026-05-04T10:${minute}:00.000Z`,
      endedAt: null,
      runtimeMs: 100 + index,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    });
  }
  result.push({
    id: 'stale-date-1',
    sql: "SELECT count(*) FROM analytics.stale_orders WHERE created_at >= '2026-04-01'",
    user: 'analyst-1',
    startedAt: '2026-05-04T11:00:00.000Z',
    endedAt: null,
    runtimeMs: 75,
    rowsProduced: 1,
    success: true,
    errorMessage: null,
  });
  return result;
}
describe('stageHistoricSqlTemplates', () => {
// Two executions of the same fingerprint must collapse into one staged template
// directory (metadata.json, page.md, usage.json) plus a manifest entry. Also
// checks that the e-mail redaction pattern keeps both addresses out of the
// persisted sample SQL.
it('compresses rows by fingerprint into document-shaped staged templates', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'q1',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01' AND email = 'analyst@example.com'",
user: 'analyst@example.com',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: '2026-05-04T10:00:01.000Z',
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
{
id: 'q2',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-05-01' AND email = 'analyst-2@example.com'",
user: 'analyst-2@example.com',
startedAt: '2026-05-04T11:00:00.000Z',
endedAt: '2026-05-04T11:00:01.000Z',
runtimeMs: 300,
rowsProduced: 1,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: fakeSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: ['^svc_'],
redactionPatterns: ['[\\w.+-]+@[\\w-]+\\.[\\w.-]+'],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
// The cursor advances to the latest startedAt seen among the fetched rows.
expect(manifest).toMatchObject({
source: 'historic-sql',
connectionId: 'conn_1',
dialect: 'snowflake',
nextSuccessfulCursor: '2026-05-04T11:00:00.000Z',
templateCount: 1,
capped: false,
});
const files = (await readdir(join(stagedDir, 'templates', 'fp_paid_orders'))).sort();
expect(files).toEqual(['metadata.json', 'page.md', 'usage.json']);
const metadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, 'templates/fp_paid_orders/metadata.json'),
);
expect(metadata).toEqual({
id: 'fp_paid_orders',
title: 'snowflake · analytics.orders [fp_pai]',
path: 'templates/fp_paid_orders/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_paid_orders',
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [
{ position: 1, type: 'string', classification: 'constant' },
{ position: 2, type: 'date', classification: 'runtime' },
],
triage_signals: {
executions_bucket: 'low',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 1 runtime',
},
},
});
const page = await readFile(join(stagedDir, 'templates/fp_paid_orders/page.md'), 'utf-8');
expect(page).toContain('## Normalized SQL');
expect(page).toContain('SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?');
expect(page).toContain('- analytics.orders');
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_paid_orders/usage.json'));
expect(usage.stats).toMatchObject({
executions: 2,
distinct_users: 2,
first_seen: '2026-05-04T10:00:00.000Z',
last_seen: '2026-05-04T11:00:00.000Z',
p50_runtime_ms: 100,
p95_runtime_ms: 300,
error_rate: 0,
});
expect(usage.samples).toHaveLength(1);
expect(usage.samples[0].bound_sql).toContain('<redacted>');
expect(usage.samples[0].bound_sql).not.toContain('analyst@example.com');
expect(usage.samples[0].bound_sql).not.toContain('analyst-2@example.com');
});
// `SHOW TABLES` must be dropped before analysis, leaving two templates; with
// maxTemplatesPerRun = 1 only one survives and the manifest reports the
// truncation.
it('skips hard-noise SQL and caps templates deterministically', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'show-1',
sql: 'SHOW TABLES',
user: 'analyst',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: null,
success: true,
errorMessage: null,
},
{
id: 'q3',
sql: "SELECT count(*) FROM analytics.refunds WHERE state = 'complete'",
user: 'analyst',
startedAt: '2026-05-04T11:00:00.000Z',
endedAt: null,
runtimeMs: 50,
success: true,
errorMessage: null,
},
{
id: 'q4',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01'",
user: 'analyst',
startedAt: '2026-05-04T11:30:00.000Z',
endedAt: null,
runtimeMs: 40,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: fakeSqlAnalysis,
pullConfig: {
dialect: 'bigquery',
windowDays: 7,
lastSuccessfulCursor: '2026-05-01T00:00:00.000Z',
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 1,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templateCount).toBe(1);
expect(manifest.capped).toBe(true);
expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
expect(manifest.templates.map((template) => template.id)).toEqual(['fp_paid_orders']);
});
// One fingerprint whose single slot takes two evenly used values ('paid' and
// 'refunded') must be staged as two sub-cluster templates, each with its own
// id, directory, and usage statistics restricted to the matching rows.
it('splits categorical fingerprints into one document directory per dominant value', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(categoricalRows()),
sqlAnalysis: categoricalSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
// Sort locally so the assertion does not depend on manifest ordering.
const templates = manifest.templates
.map((template) => ({
id: template.id,
fingerprint: template.fingerprint,
subClusterId: template.subClusterId,
path: template.path,
}))
.sort((left, right) => left.id.localeCompare(right.id));
expect(manifest.templateCount).toBe(2);
expect(templates).toEqual([
{
id: 'fp_order_status__cat_2b2ff2318877',
fingerprint: 'fp_order_status',
subClusterId: 'cat_2b2ff2318877',
path: 'templates/fp_order_status__cat_2b2ff2318877/page.md',
},
{
id: 'fp_order_status__cat_34f037ddcbfa',
fingerprint: 'fp_order_status',
subClusterId: 'cat_34f037ddcbfa',
path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
},
]);
const paidMetadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/metadata.json'),
);
expect(paidMetadata).toMatchObject({
id: 'fp_order_status__cat_34f037ddcbfa',
title: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
properties: {
fingerprint: 'fp_order_status',
sub_cluster_id: 'cat_34f037ddcbfa',
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
},
});
const paidUsage = historicSqlUsageSchema.parse(
await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'),
);
expect(paidUsage.stats).toMatchObject({
executions: 3,
distinct_users: 3,
first_seen: '2026-05-04T10:00:00.000Z',
last_seen: '2026-05-04T10:02:00.000Z',
rows_produced: 36,
});
expect(paidUsage.literal_slots).toEqual([{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }]);
const refundedUsage = historicSqlUsageSchema.parse(
await readJson(stagedDir, 'templates/fp_order_status__cat_2b2ff2318877/usage.json'),
);
expect(refundedUsage.stats).toMatchObject({
executions: 3,
distinct_users: 3,
first_seen: '2026-05-04T10:03:00.000Z',
last_seen: '2026-05-04T10:05:00.000Z',
rows_produced: 66,
});
expect(refundedUsage.literal_slots).toEqual([
{ position: 1, distinct_values: 1, top_values: [['refunded', 3]] },
]);
});
// Exercises each classification outcome in one fingerprint: a single-valued
// slot and a 95%-dominant slot are 'constant', an evenly split slot is
// 'categorical', and a unique-per-row number and an advancing date are
// 'runtime'. A separate single-valued date older than the template's first
// execution is demoted to 'runtime' rather than 'constant'.
it('classifies literal slots across the spec matrix and stale-date demotion', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(classificationMatrixRows()),
sqlAnalysis: classificationMatrixSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
const matrixTemplates = manifest.templates.filter((template) => template.fingerprint === 'fp_classification_matrix');
// The categorical status slot splits the fingerprint into two sub-clusters.
expect(matrixTemplates).toHaveLength(2);
expect(matrixTemplates.every((template) => template.subClusterId?.startsWith('cat_'))).toBe(true);
const matrixTemplate = matrixTemplates[0];
if (!matrixTemplate) {
throw new Error('expected classification matrix template');
}
const matrixMetadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, matrixTemplate.path.replace('/page.md', '/metadata.json')),
);
expect(matrixMetadata.properties.literal_slots).toMatchInlineSnapshot(`
[
{
"classification": "constant",
"position": 1,
"type": "string",
},
{
"classification": "constant",
"position": 2,
"type": "string",
},
{
"classification": "categorical",
"position": 3,
"type": "string",
},
{
"classification": "runtime",
"position": 4,
"type": "number",
},
{
"classification": "runtime",
"position": 5,
"type": "date",
},
]
`);
// slot_summary only counts constant and runtime slots, not categorical ones.
expect(matrixMetadata.properties.triage_signals.slot_summary).toBe('2 constant, 2 runtime');
const staleMetadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, 'templates/fp_stale_date/metadata.json'),
);
expect(staleMetadata.properties.literal_slots).toMatchInlineSnapshot(`
[
{
"classification": "runtime",
"position": 1,
"type": "date",
},
]
`);
expect(staleMetadata.properties.triage_signals.slot_summary).toBe('0 constant, 1 runtime');
});
// The cap counts expanded sub-cluster templates, not raw fingerprints: one
// fingerprint that expands to two variants is truncated to one when
// maxTemplatesPerRun = 1.
it('applies the templates-per-run cap after categorical expansion', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(categoricalRows()),
sqlAnalysis: categoricalSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 1,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templateCount).toBe(1);
expect(manifest.capped).toBe(true);
expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
expect(manifest.templates).toHaveLength(1);
expect(manifest.templates[0].id).toMatch(/^fp_order_status__cat_/);
});
// When the reader row carries no rowsProduced field, the staged usage file
// must leave rows_produced out of both the stats and the samples instead of
// writing a placeholder.
it('omits rows_produced for BigQuery templates when reader rows have no row counts', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_bq',
queryClient: {},
reader: fakeReader([
{
id: 'bq-1',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'",
user: 'analyst-a@example.com',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: fakeSqlAnalysis,
pullConfig: {
dialect: 'bigquery',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_paid_orders/usage.json'));
expect(usage.stats).not.toHaveProperty('rows_produced');
expect(usage.samples[0]).not.toHaveProperty('rows_produced');
});
// Eleven distinct status values, each with an older failed run and a newer
// successful run. Expect exactly five samples, all successful, all with
// different literal values, ordered newest first.
it('keeps at most five diverse samples, preferring recent successful representatives per literal tuple', async () => {
const stagedDir = await tempDir();
const statuses = [
'paid',
'refunded',
'pending',
'failed',
'trial',
'cancelled',
'draft',
'returned',
'review',
'held',
'archived',
];
// Per status: an "old" failure at 10:MM and a "new" success at 11:MM.
const rows: HistoricSqlRawQueryRow[] = statuses.flatMap((status, index) => [
{
id: `${status}-old`,
sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
user: 'analyst-a',
startedAt: `2026-05-04T10:${String(index).padStart(2, '0')}:00.000Z`,
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: false,
errorMessage: 'old failed sample',
},
{
id: `${status}-new`,
sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
user: 'analyst-a',
startedAt: `2026-05-04T11:${String(index).padStart(2, '0')}:00.000Z`,
endedAt: null,
runtimeMs: 90,
rowsProduced: 2,
success: true,
errorMessage: null,
},
]);
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(rows),
sqlAnalysis: diverseSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_diverse_samples/usage.json'));
expect(usage.samples).toHaveLength(5);
expect(usage.samples.every((sample) => sample.success)).toBe(true);
expect(new Set(usage.samples.map((sample) => sample.bound_sql.match(/status = '([^']+)'/)?.[1])).size).toBe(5);
expect(usage.samples.map((sample) => sample.started_at)).toEqual([
'2026-05-04T11:10:00.000Z',
'2026-05-04T11:09:00.000Z',
'2026-05-04T11:08:00.000Z',
'2026-05-04T11:07:00.000Z',
'2026-05-04T11:06:00.000Z',
]);
});
// Two templates with one execution each compete for a single slot
// (maxTemplatesPerRun = 1); the one last seen most recently must be kept.
it('uses recency as a tie-breaker when the templates-per-run cap overflows', async () => {
const stagedDir = await tempDir();
// Inline port: the fingerprint is derived from which table the SQL mentions.
const sqlAnalysis: SqlAnalysisPort = {
async analyzeForFingerprint(sql) {
const table = sql.includes('fresh_orders') ? 'fresh_orders' : 'stale_orders';
return {
fingerprint: `fp_${table}`,
normalizedSql: `SELECT count(*) FROM analytics.${table}`,
tablesTouched: [`analytics.${table}`],
literalSlots: [],
};
},
};
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'stale-1',
sql: 'SELECT count(*) FROM analytics.stale_orders',
user: 'analyst-a',
startedAt: '2026-02-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
{
id: 'fresh-1',
sql: 'SELECT count(*) FROM analytics.fresh_orders',
user: 'analyst-a',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
]),
sqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 1,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templates.map((template) => template.id)).toEqual(['fp_fresh_orders']);
});
// Fail-closed redaction: when a configured pattern cannot be compiled ('[' is
// not a valid regular expression), staging must still succeed, record a
// warning in the manifest, and write no bound-SQL samples at all.
it('does not persist bound SQL samples when redaction patterns are invalid', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'q1',
sql: "SELECT * FROM analytics.orders WHERE email = 'analyst@example.com'",
user: 'analyst@example.com',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: {
async analyzeForFingerprint() {
return {
fingerprint: 'fp_redaction',
normalizedSql: 'SELECT * FROM analytics.orders WHERE email = ?',
tablesTouched: ['analytics.orders'],
literalSlots: [{ position: 1, type: 'string', exampleValue: 'analyst@example.com' }],
};
},
},
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: ['['],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_redaction/usage.json'));
expect(manifest.warnings.some((warning) => warning.startsWith('redaction_skipped:invalid_redaction_pattern'))).toBe(
true,
);
expect(usage.samples).toEqual([]);
});
});

View file

@ -1,630 +0,0 @@
import { createHash } from 'node:crypto';
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import type {
SqlAnalysisFingerprintResult,
SqlAnalysisLiteralSlot,
SqlAnalysisLiteralSlotType,
SqlAnalysisPort,
} from '../../../sql-analysis/index.js';
import {
HISTORIC_SQL_OBJECT_TYPE,
HISTORIC_SQL_SOURCE_KEY,
historicSqlPullConfigSchema,
historicSqlRawQueryRowSchema,
type HistoricSqlLiteralSlotClassification,
type HistoricSqlManifest,
type HistoricSqlMetadata,
type HistoricSqlPullConfig,
type HistoricSqlQueryHistoryReader,
type HistoricSqlRawQueryRow,
type HistoricSqlUsage,
} from './types.js';
/** Arguments accepted by `stageHistoricSqlTemplates`. */
interface StageHistoricSqlTemplatesInput {
/** Directory that receives `manifest.json` and the `templates/` tree; created if missing. */
stagedDir: string;
/** Copied verbatim into the manifest's `connectionId`. */
connectionId: string;
/** Opaque handle passed through to `reader.probe` and `reader.fetch`. */
queryClient: unknown;
/** Source of raw query-history rows. */
reader: HistoricSqlQueryHistoryReader;
/** Produces the fingerprint, normalized SQL, touched tables, and literal slots for each row. */
sqlAnalysis: SqlAnalysisPort;
/** Pull settings; validated with `historicSqlPullConfigSchema` before use. */
pullConfig: HistoricSqlPullConfig;
/** Clock override used as the window end and `fetchedAt`; defaults to the current time. */
now?: Date;
}
/** One recorded occurrence of a literal slot value. */
interface SlotObservation {
/** Slot value after redaction has been applied. */
value: string;
/** `startedAt` of the row the value came from. */
rowStartedAt: string;
}
/** Accumulated observations for one literal slot position of a template. */
interface SlotStats {
position: number;
/** Slot type as reported by the first analysis result that mentioned this position. */
type: SqlAnalysisLiteralSlotType;
/** Redacted value -> number of times it was observed. */
values: Map<string, number>;
/** Every observation in the order it was recorded. */
observations: SlotObservation[];
}
/** Everything collected for one fingerprint while streaming rows from the reader. */
interface TemplateAccumulator {
fingerprint: string;
/** Normalized SQL taken from the first row seen for this fingerprint. */
normalizedSql: string;
/** Union of the tables reported across all rows of the fingerprint. */
tablesTouched: Set<string>;
/** Each raw row paired with its analysis result, in fetch order. */
rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>;
/** Slot statistics keyed by slot position. */
slotStats: Map<number, SlotStats>;
}
/** A literal slot together with the classification written to template metadata. */
interface ClassifiedLiteralSlot {
position: number;
type: SqlAnalysisLiteralSlotType;
classification: HistoricSqlLiteralSlotClassification;
}
/**
 * A template as it will be staged: either a whole fingerprint group or one
 * categorical sub-cluster of it.
 */
interface TemplateVariant {
/** The fingerprint itself, or `<fingerprint>__<subClusterId>` for a sub-cluster. */
id: string;
fingerprint: string;
/** `null` when the fingerprint has no categorical slots and was not split. */
subClusterId: string | null;
normalizedSql: string;
/** Shared with the originating fingerprint group (not narrowed per sub-cluster). */
tablesTouched: Set<string>;
/** Rows that belong to this variant only. */
rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>;
/** Slot statistics computed over this variant's rows. */
slotStats: Map<number, SlotStats>;
/** Classifications computed over the whole fingerprint group. */
slotClassifications: ClassifiedLiteralSlot[];
}
/** One (position, redacted value) pair of the tuple that identifies a categorical sub-cluster. */
interface CategoricalTupleEntry {
position: number;
/** Redacted slot value, or `'<missing>'` when the row has no slot at this position. */
value: string;
}
/** Redaction settings returned by `compileRedactors` and threaded through staging. */
interface RedactionPolicy {
/** Compiled patterns applied to slot values via `redactText`. */
redactors: RegExp[];
// NOTE(review): presumably false when a pattern failed to compile, so that no
// bound-SQL samples are persisted — confirm against compileRedactors/selectSamples.
samplesAllowed: boolean;
}
// Statements that begin (after optional whitespace) with one of these keywords
// are never analysed. Matching is case-insensitive.
const HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i;
// Statements that mention one of these catalog/system prefixes anywhere are
// never analysed. Note that `pg_` matches any identifier starting with `pg_`.
const HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i;
/**
 * Pulls query history through `input.reader`, groups the rows by SQL
 * fingerprint, splits groups with categorical slots into sub-clusters, applies
 * the per-run template cap, and writes the result under `input.stagedDir`:
 *
 * - `templates/<id>/metadata.json`, `page.md`, and `usage.json` per template
 * - `manifest.json` describing the run
 *
 * The fetch window starts at `lastSuccessfulCursor` when one is configured,
 * otherwise `windowDays` before `now`, and always ends at `now`.
 *
 * Rows matching the hard-skip patterns are ignored (they still advance the
 * cursor). Rows whose analysis fails add an `analysis_failed:<rowId>` warning
 * and are skipped.
 *
 * @param input Staging arguments; `pullConfig` is validated before use.
 * @throws Whatever the schema parsers, the reader, the analysis port, or the
 *   file-system writes throw.
 */
export async function stageHistoricSqlTemplates(input: StageHistoricSqlTemplatesInput): Promise<void> {
  const { stagedDir, queryClient, reader, sqlAnalysis } = input;
  const config = historicSqlPullConfigSchema.parse(input.pullConfig);
  const now = input.now ?? new Date();
  const windowStart = config.lastSuccessfulCursor
    ? new Date(config.lastSuccessfulCursor)
    : new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000);
  const warnings: string[] = [];
  const redaction = compileRedactors(config.redactionPatterns, warnings);
  const groups = new Map<string, TemplateAccumulator>();
  let nextSuccessfulCursor: string | null = null;

  await reader.probe(queryClient);
  const history = reader.fetch(queryClient, { start: windowStart, end: now }, config.lastSuccessfulCursor);
  for await (const rawRow of history) {
    const row = historicSqlRawQueryRowSchema.parse(rawRow);
    // The cursor tracks the latest start time seen, including rows skipped below.
    if (!nextSuccessfulCursor || row.startedAt > nextSuccessfulCursor) {
      nextSuccessfulCursor = row.startedAt;
    }
    if (shouldSkipSql(row.sql)) {
      continue;
    }

    const analysis = await sqlAnalysis.analyzeForFingerprint(row.sql, config.dialect);
    if (analysis.error || !analysis.fingerprint || !analysis.normalizedSql) {
      warnings.push(`analysis_failed:${row.id}`);
      continue;
    }

    let group = groups.get(analysis.fingerprint);
    if (!group) {
      group = {
        fingerprint: analysis.fingerprint,
        normalizedSql: analysis.normalizedSql,
        tablesTouched: new Set<string>(),
        rows: [],
        slotStats: new Map<number, SlotStats>(),
      };
    }
    analysis.tablesTouched.forEach((table) => group.tablesTouched.add(table));
    for (const slot of analysis.literalSlots) {
      recordSlot(group.slotStats, slot, redaction.redactors, row.startedAt);
    }
    group.rows.push({ row, analysis });
    groups.set(analysis.fingerprint, group);
  }

  const expandedTemplates = expandCategoricalTemplates(Array.from(groups.values()), redaction.redactors);
  const selected = selectTemplates(expandedTemplates, config.maxTemplatesPerRun, now);
  const capped = selected.length < expandedTemplates.length;
  if (capped) {
    warnings.push(`templates_truncated: kept ${selected.length} of ${expandedTemplates.length} templates`);
  }

  await mkdir(stagedDir, { recursive: true });
  const templates: HistoricSqlManifest['templates'] = [];
  for (const template of selected) {
    const { metadata, pageMarkdown, usage } = buildStagedTemplate(template, config, redaction, now);
    const basePath = `templates/${metadata.id}`;
    await writeJson(stagedDir, `${basePath}/metadata.json`, metadata);
    await writeText(stagedDir, `${basePath}/page.md`, pageMarkdown);
    await writeJson(stagedDir, `${basePath}/usage.json`, usage);
    templates.push({
      id: metadata.id,
      fingerprint: metadata.properties.fingerprint,
      subClusterId: metadata.properties.sub_cluster_id,
      path: metadata.path,
    });
  }

  // The manifest is written last, after every template file has been written.
  await writeJson(stagedDir, 'manifest.json', {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: input.connectionId,
    dialect: config.dialect,
    fetchedAt: now.toISOString(),
    windowStart: windowStart.toISOString(),
    windowEnd: now.toISOString(),
    nextSuccessfulCursor,
    templateCount: selected.length,
    capped,
    warnings,
    degraded: false,
    statsResetAt: null,
    baselineFirstRun: false,
    pgServerVersion: null,
    deallocCount: null,
    templates,
  } satisfies HistoricSqlManifest);
}
/**
 * Reports whether a statement is excluded from template staging, either
 * because of its leading keyword or because it references a catalog/system
 * object.
 */
function shouldSkipSql(sql: string): boolean {
  if (HARD_SKIP_PREFIX_RE.test(sql)) {
    return true;
  }
  return HARD_SKIP_TABLE_RE.test(sql);
}
/**
 * Adds one observation of `slot` to `slotStats`, creating the entry for the
 * slot's position on first use. The value is redacted before it is counted or
 * stored, so only redacted text is retained.
 *
 * @param slotStats Per-position statistics, mutated in place.
 * @param slot Literal slot reported by SQL analysis for one row.
 * @param redactors Patterns handed to `redactText`.
 * @param rowStartedAt Start time of the row the slot came from.
 */
function recordSlot(
  slotStats: Map<number, SlotStats>,
  slot: SqlAnalysisLiteralSlot,
  redactors: RegExp[],
  rowStartedAt: string,
): void {
  const value = redactText(slot.exampleValue, redactors);
  let stats = slotStats.get(slot.position);
  if (stats === undefined) {
    stats = {
      position: slot.position,
      type: slot.type,
      values: new Map<string, number>(),
      observations: [],
    };
    slotStats.set(slot.position, stats);
  }
  const seenSoFar = stats.values.get(value) ?? 0;
  stats.values.set(value, seenSoFar + 1);
  stats.observations.push({ value, rowStartedAt });
}
/**
 * Expands every fingerprint group into its staged variants and returns them as
 * one flat list, preserving group order.
 */
function expandCategoricalTemplates(groups: TemplateAccumulator[], redactors: RegExp[]): TemplateVariant[] {
  const variants: TemplateVariant[] = [];
  for (const group of groups) {
    variants.push(...expandTemplateGroup(group, redactors));
  }
  return variants;
}
/**
 * Turns one fingerprint group into the template variants to stage.
 *
 * Slots are classified over the whole group. If no slot is categorical the
 * group becomes a single variant whose id is the fingerprint. Otherwise rows
 * are partitioned by their tuple of (redacted) categorical slot values and each
 * partition becomes a variant with id `<fingerprint>__<subClusterId>`, its own
 * rows and slot statistics, and the group-level classifications.
 *
 * @returns Variants sorted by id; empty when the group has no rows or its
 *   earliest row has an empty `startedAt`.
 */
function expandTemplateGroup(group: TemplateAccumulator, redactors: RegExp[]): TemplateVariant[] {
  const chronological = group.rows
    .slice()
    .sort((left, right) => left.row.startedAt.localeCompare(right.row.startedAt));
  const earliest = chronological[0];
  if (!earliest || !earliest.row.startedAt) {
    return [];
  }
  const firstSeen = earliest.row.startedAt;

  const slotClassifications = classifySlots(group.slotStats, chronological.length, firstSeen);
  const categoricalPositions: number[] = [];
  for (const slot of slotClassifications) {
    if (slot.classification === 'categorical') {
      categoricalPositions.push(slot.position);
    }
  }
  categoricalPositions.sort((left, right) => left - right);

  if (categoricalPositions.length === 0) {
    const whole: TemplateVariant = {
      id: group.fingerprint,
      fingerprint: group.fingerprint,
      subClusterId: null,
      normalizedSql: group.normalizedSql,
      tablesTouched: group.tablesTouched,
      rows: chronological,
      slotStats: group.slotStats,
      slotClassifications,
    };
    return [whole];
  }

  // Partition rows by categorical tuple; the JSON form of the tuple is the key.
  type Partition = { tuple: CategoricalTupleEntry[]; rows: TemplateAccumulator['rows'] };
  const partitions = new Map<string, Partition>();
  for (const entry of chronological) {
    const tuple = categoricalTuple(entry.analysis.literalSlots, categoricalPositions, redactors);
    const key = JSON.stringify(tuple);
    let partition = partitions.get(key);
    if (!partition) {
      partition = { tuple, rows: [] };
      partitions.set(key, partition);
    }
    partition.rows.push(entry);
  }

  const variants: TemplateVariant[] = [];
  for (const { tuple, rows: partitionRows } of partitions.values()) {
    const subClusterId = subClusterIdForTuple(tuple);
    variants.push({
      id: `${group.fingerprint}__${subClusterId}`,
      fingerprint: group.fingerprint,
      subClusterId,
      normalizedSql: group.normalizedSql,
      tablesTouched: group.tablesTouched,
      rows: partitionRows,
      slotStats: collectSlotStats(partitionRows, redactors),
      slotClassifications,
    });
  }
  variants.sort((left, right) => left.id.localeCompare(right.id));
  return variants;
}
/**
 * Classifies every slot in `slotStats` and returns the results ordered by slot
 * position.
 *
 * @param slotStats Per-position statistics for one fingerprint group.
 * @param executions Number of rows in the group.
 * @param firstSeen Earliest `startedAt` in the group.
 */
function classifySlots(
  slotStats: Map<number, SlotStats>,
  executions: number,
  firstSeen: string,
): ClassifiedLiteralSlot[] {
  const byPosition = Array.from(slotStats.values());
  byPosition.sort((left, right) => left.position - right.position);

  const classified: ClassifiedLiteralSlot[] = [];
  for (const slot of byPosition) {
    classified.push({
      position: slot.position,
      type: slot.type,
      classification: classifySlot(slot, executions, firstSeen),
    });
  }
  return classified;
}
/**
 * Builds fresh slot statistics from scratch for the given rows by recording
 * every literal slot of every row, in row order.
 */
function collectSlotStats(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redactors: RegExp[],
): Map<number, SlotStats> {
  const collected = new Map<number, SlotStats>();
  rows.forEach(({ row, analysis }) => {
    analysis.literalSlots.forEach((slot) => {
      recordSlot(collected, slot, redactors, row.startedAt);
    });
  });
  return collected;
}
/**
 * Extracts the redacted values a row has at the categorical slot positions.
 *
 * @param literalSlots All literal slots reported for the row.
 * @param categoricalPositions Positions to extract, in the order to emit them.
 * @param redactors Patterns handed to `redactText`.
 * @returns One entry per requested position; `'<missing>'` stands in for a
 *   position the row does not have.
 */
function categoricalTuple(
  literalSlots: SqlAnalysisLiteralSlot[],
  categoricalPositions: number[],
  redactors: RegExp[],
): CategoricalTupleEntry[] {
  // Every slot is redacted (not just the categorical ones); a later slot with
  // a repeated position overwrites the earlier one.
  const redactedByPosition = new Map<number, string>();
  for (const slot of literalSlots) {
    redactedByPosition.set(slot.position, redactText(slot.exampleValue, redactors));
  }

  const tuple: CategoricalTupleEntry[] = [];
  for (const position of categoricalPositions) {
    const value = redactedByPosition.get(position) ?? '<missing>';
    tuple.push({ position, value });
  }
  return tuple;
}
/**
 * Derives a stable sub-cluster id from a categorical tuple: `cat_` followed by
 * the first 12 hex characters of the SHA-256 of the tuple's JSON form.
 */
function subClusterIdForTuple(tuple: CategoricalTupleEntry[]): string {
  const serialized = JSON.stringify(tuple);
  const digest = createHash('sha256').update(serialized).digest('hex');
  return 'cat_' + digest.substring(0, 12);
}
/**
 * Produces the three staged artifacts for one template variant: the metadata
 * document, the markdown page, and the usage document (aggregate stats,
 * per-slot value distributions, and selected samples).
 *
 * Assumes the variant has at least one row; `firstSeen`/`lastSeen` are read
 * from the first and last row in chronological order.
 *
 * @param template Variant to stage.
 * @param config Validated pull configuration (dialect, service-account patterns).
 * @param redaction Redaction policy forwarded to sample selection.
 * @param now Reference time for the recency triage signal.
 */
function buildStagedTemplate(
  template: TemplateVariant,
  config: HistoricSqlPullConfig,
  redaction: RedactionPolicy,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const chronological = template.rows.map(({ row }) => row);
  chronological.sort((left, right) => left.startedAt.localeCompare(right.startedAt));
  const executions = chronological.length;
  const firstSeen = chronological[0].startedAt;
  const lastSeen = chronological[executions - 1].startedAt;

  // Single pass over the rows for users, failures, and known runtimes.
  const users = new Set<string>();
  const runtimes: number[] = [];
  let errorCount = 0;
  for (const row of chronological) {
    if (row.user) {
      users.add(row.user);
    }
    if (!row.success) {
      errorCount += 1;
    }
    if (typeof row.runtimeMs === 'number') {
      runtimes.push(row.runtimeMs);
    }
  }
  runtimes.sort((left, right) => left - right);
  const distinctUsers = users.size;
  const errorRate = executions === 0 ? 0 : errorCount / executions;

  const triageSignals = buildTriageSignals({
    executions,
    distinctUsers,
    errorRate,
    lastSeen,
    now,
    serviceAccountOnly: isServiceAccountOnly(chronological, config.serviceAccountUserPatterns),
    slotClassifications: template.slotClassifications.map((slot) => slot.classification),
  });

  const tablesTouched = Array.from(template.tablesTouched).sort();
  const firstTable = tablesTouched[0] ?? 'query';
  const id = template.id;
  const rowsProduced = sumRowsProduced(chronological);

  const metadata: HistoricSqlMetadata = {
    id,
    title: buildTemplateTitle(config.dialect, firstTable, template.fingerprint, template.subClusterId),
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint: template.fingerprint,
      sub_cluster_id: template.subClusterId,
      dialect: config.dialect,
      tables_touched: tablesTouched,
      literal_slots: template.slotClassifications,
      triage_signals: triageSignals,
    },
  };

  const pageMarkdown = renderTemplatePage(id, template.normalizedSql, tablesTouched);

  const stats = {
    executions,
    distinct_users: distinctUsers,
    first_seen: firstSeen,
    last_seen: lastSeen,
    p50_runtime_ms: percentile(runtimes, 0.5),
    p95_runtime_ms: percentile(runtimes, 0.95),
    error_rate: errorRate,
    // rows_produced is left out entirely when no total is available.
    ...(rowsProduced === null ? {} : { rows_produced: rowsProduced }),
  };

  const slotsByPosition = Array.from(template.slotStats.values());
  slotsByPosition.sort((left, right) => left.position - right.position);
  const literalSlots = slotsByPosition.map((slot) => {
    // Most frequent first, ties broken by value; keep the top ten.
    const ranked = Array.from(slot.values.entries());
    ranked.sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]));
    return {
      position: slot.position,
      distinct_values: slot.values.size,
      top_values: ranked.slice(0, 10),
    };
  });

  return {
    metadata,
    pageMarkdown,
    usage: {
      stats,
      literal_slots: literalSlots,
      samples: selectSamples(template.rows, redaction),
    },
  };
}
// Slot types eligible for the "moving temporal literal" check in isMovingTemporalSlot.
const TEMPORAL_SLOT_TYPES = new Set<SqlAnalysisLiteralSlotType>(['date', 'timestamp']);
/**
 * Reports whether `value` is a parseable literal in a `date` slot that sorts
 * (as a string) before the calendar-date prefix of `firstSeen`, i.e. the
 * literal predates the template's first execution.
 */
function isStaleDateConstant(slot: SlotStats, value: string, firstSeen: string): boolean {
  if (slot.type !== 'date') {
    return false;
  }
  if (parseTemporalSlotValue(value) === null) {
    return false;
  }
  const firstSeenDay = firstSeen.slice(0, 10);
  return value < firstSeenDay;
}
/**
 * Reports whether a date/timestamp slot's literal moves in one direction as
 * time passes: with observations ordered by row start time, the parsed literal
 * values must be monotonic.
 *
 * Returns `false` for non-temporal slots, for slots with fewer than two
 * distinct values, and as soon as any row start time or literal fails to parse.
 */
function isMovingTemporalSlot(slot: SlotStats): boolean {
  if (slot.values.size < 2 || !TEMPORAL_SLOT_TYPES.has(slot.type)) {
    return false;
  }

  const points: Array<{ rowStartedAt: number; literalTime: number }> = [];
  for (const { rowStartedAt: startedText, value } of slot.observations) {
    const rowStartedAt = Date.parse(startedText);
    const literalTime = parseTemporalSlotValue(value);
    if (literalTime === null || Number.isNaN(rowStartedAt)) {
      return false;
    }
    points.push({ rowStartedAt, literalTime });
  }

  points.sort((left, right) => left.rowStartedAt - right.rowStartedAt);
  return isMonotonic(points.map((point) => point.literalTime));
}
/**
 * Parses a slot value with `Date.parse`.
 *
 * @returns Milliseconds since the epoch, or `null` when the value is not a
 *   date string `Date.parse` accepts.
 */
function parseTemporalSlotValue(value: string): number | null {
  const epochMs = Date.parse(value);
  if (Number.isNaN(epochMs)) {
    return null;
  }
  return epochMs;
}
/**
 * Reports whether a sequence of at least two numbers never changes direction,
 * i.e. it is entirely non-decreasing or entirely non-increasing. Equal
 * neighbours are allowed.
 */
function isMonotonic(values: number[]): boolean {
  if (values.length < 2) {
    return false;
  }
  let sawRise = false;
  let sawFall = false;
  values.forEach((current, index) => {
    if (index === 0) {
      return;
    }
    const previous = values[index - 1];
    if (current > previous) {
      sawRise = true;
    }
    if (current < previous) {
      sawFall = true;
    }
  });
  // Monotonic unless the sequence both rose and fell somewhere.
  return !(sawRise && sawFall);
}
/**
 * Classifies one literal slot as `constant`, `categorical`, or `runtime`.
 *
 * Rules, applied in order:
 * 1. `constant` when there is exactly one distinct value, or the most frequent
 *    value covers at least 95% of executions — unless that value is a stale
 *    date constant, in which case neither rule applies.
 * 2. `runtime` when the slot is a moving temporal literal.
 * 3. `categorical` when there are 2–10 distinct values and each accounts for at
 *    least 5% of executions.
 * 4. `runtime` otherwise.
 *
 * @param slot Statistics for the slot across the fingerprint group.
 * @param executions Number of rows in the group.
 * @param firstSeen Earliest `startedAt` in the group.
 */
function classifySlot(
  slot: SlotStats,
  executions: number,
  firstSeen: string,
): HistoricSqlLiteralSlotClassification {
  const byFrequency = Array.from(slot.values.entries());
  byFrequency.sort((left, right) => right[1] - left[1]);
  const distinct = byFrequency.length;
  const [topValue = '', topCount = 0] = byFrequency[0] ?? [];
  const hasExecutions = executions > 0;

  if (!isStaleDateConstant(slot, topValue, firstSeen)) {
    if (distinct === 1) {
      return 'constant';
    }
    if (hasExecutions && topCount / executions >= 0.95) {
      return 'constant';
    }
  }

  if (isMovingTemporalSlot(slot)) {
    return 'runtime';
  }

  const fewDistinctValues = distinct >= 2 && distinct <= 10;
  const isCategorical =
    hasExecutions && fewDistinctValues && byFrequency.every(([, count]) => count / executions >= 0.05);
  return isCategorical ? 'categorical' : 'runtime';
}
/**
 * Reduces a template's usage figures to coarse string buckets.
 *
 * Bucket thresholds:
 * - `executions_bucket`: `< 3` low, `< 50` mid, otherwise high.
 * - `distinct_users_bucket`: `<= 1` solo, `<= 5` team, otherwise broad.
 * - `error_rate_bucket`: `<= 0.01` ok, `<= 0.1` noisy, otherwise broken.
 * - `recency_bucket`: delegated to `recencyBucket(lastSeen, now)`.
 *
 * @param input - Usage figures plus the per-slot classifications.
 * @returns A string-to-string record; `slot_summary` counts only the `constant`
 *   and `runtime` classifications.
 */
function buildTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  errorRate: number;
  lastSeen: string;
  now: Date;
  serviceAccountOnly: boolean;
  slotClassifications: HistoricSqlLiteralSlotClassification[];
}): Record<string, string> {
  const { executions, distinctUsers, errorRate, slotClassifications } = input;

  let constantSlots = 0;
  let runtimeSlots = 0;
  for (const classification of slotClassifications) {
    if (classification === 'runtime') {
      runtimeSlots += 1;
    } else if (classification === 'constant') {
      constantSlots += 1;
    }
  }

  let executionsBucket = 'high';
  if (executions < 3) {
    executionsBucket = 'low';
  } else if (executions < 50) {
    executionsBucket = 'mid';
  }

  let distinctUsersBucket = 'broad';
  if (distinctUsers <= 1) {
    distinctUsersBucket = 'solo';
  } else if (distinctUsers <= 5) {
    distinctUsersBucket = 'team';
  }

  let errorRateBucket = 'broken';
  if (errorRate <= 0.01) {
    errorRateBucket = 'ok';
  } else if (errorRate <= 0.1) {
    errorRateBucket = 'noisy';
  }

  // Key order is kept stable because the record may be serialized.
  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: distinctUsersBucket,
    error_rate_bucket: errorRateBucket,
    recency_bucket: recencyBucket(input.lastSeen, input.now),
    service_account_only: String(input.serviceAccountOnly),
    slot_summary: `${constantSlots} constant, ${runtimeSlots} runtime`,
  };
}
/**
 * Buckets how long ago a template was last seen.
 *
 * @param lastSeen - Timestamp string, parsed with `new Date(...)`.
 * @param now - Reference instant.
 * @returns `'active'` for up to 14 days, `'warm'` for up to 60 days, else `'cold'`.
 *   Future timestamps are clamped to an age of zero; an unparseable `lastSeen`
 *   yields a `NaN` age, which falls through to `'cold'`.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const msPerDay = 24 * 60 * 60 * 1000;
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / msPerDay);
  return ageDays <= 14 ? 'active' : ageDays <= 60 ? 'warm' : 'cold';
}
/**
 * Tells whether every named user in `rows` matches at least one service-account pattern.
 *
 * Rows without a user (null or empty) are ignored. The result is `false` when no
 * named users remain or when no patterns are supplied.
 *
 * @param rows - Query rows; only `row.user` is read.
 * @param patterns - Regular-expression sources, compiled without flags.
 * @returns `true` when all named users match some pattern.
 * @throws SyntaxError when a pattern is not a valid regular expression
 *   (`new RegExp` is called unguarded).
 */
function isServiceAccountOnly(rows: HistoricSqlRawQueryRow[], patterns: string[]): boolean {
  const namedUsers: string[] = [];
  for (const { user } of rows) {
    if (user) {
      namedUsers.push(user);
    }
  }
  if (namedUsers.length === 0 || patterns.length === 0) {
    return false;
  }

  const matchers = patterns.map((source) => new RegExp(source));
  for (const user of namedUsers) {
    const matched = matchers.some((matcher) => matcher.test(user));
    if (!matched) {
      return false;
    }
  }
  return true;
}
/**
 * Builds a human-readable title for a template.
 *
 * Format: `<dialect> · <firstTable> [<fp6>]`, or `[<fp6>:<sub6>]` when a sub-cluster
 * id is present. `fp6` is the first six characters of the fingerprint and `sub6`
 * the last six characters of the sub-cluster id.
 *
 * @param dialect - SQL dialect label.
 * @param firstTable - Table name shown in the title.
 * @param fingerprint - Template fingerprint.
 * @param subClusterId - Optional sub-cluster id; `null` or empty omits the suffix.
 */
function buildTemplateTitle(
  dialect: HistoricSqlPullConfig['dialect'],
  firstTable: string,
  fingerprint: string,
  subClusterId: string | null,
): string {
  const shortFingerprint = fingerprint.slice(0, 6);
  const subClusterSuffix = subClusterId ? `:${subClusterId.slice(-6)}` : '';
  return `${dialect} · ${firstTable} [${shortFingerprint}${subClusterSuffix}]`;
}
/**
 * Renders a Markdown page for a template.
 *
 * The page has a `# <fingerprint>` heading, a fenced `sql` block with the normalized
 * SQL, and a "Tables touched" section with one bullet per table. The output always
 * ends with a newline.
 *
 * @param fingerprint - Used verbatim as the page heading.
 * @param normalizedSql - Placed verbatim inside the code fence.
 * @param tablesTouched - Rendered as `- <table>` bullets, in the given order.
 */
function renderTemplatePage(fingerprint: string, normalizedSql: string, tablesTouched: string[]): string {
  const heading = `# ${fingerprint}\n\n`;
  const sqlSection = `## Normalized SQL\n\`\`\`sql\n${normalizedSql}\n\`\`\`\n\n`;
  const tableBullets = tablesTouched.map((table) => `- ${table}\n`).join('');
  return `${heading}${sqlSection}## Tables touched\n${tableBullets}`;
}
/**
 * Picks up to five representative executions of a template.
 *
 * Returns nothing when the redaction policy disallows samples. Otherwise one row is
 * kept per distinct tuple of literal example values, preferring successful rows and
 * then the most recent. The survivors are ordered newest first, truncated to five,
 * and their SQL is passed through the redactors.
 *
 * @param rows - Raw rows paired with their SQL analysis.
 * @param redaction - Policy carrying `samplesAllowed` and the compiled `redactors`.
 * @returns Sample records; `rows_produced` is present only when the row defines it.
 */
function selectSamples(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redaction: RedactionPolicy,
): HistoricSqlUsage['samples'] {
  if (!redaction.samplesAllowed) {
    return [];
  }

  const newestFirst = (a: (typeof rows)[number], b: (typeof rows)[number]): number =>
    b.row.startedAt.localeCompare(a.row.startedAt);

  // Successful rows first, newest first within each group, so the first row seen
  // for a literal tuple is the preferred representative.
  const ranked = Array.from(rows).sort((a, b) => {
    if (a.row.success === b.row.success) {
      return newestFirst(a, b);
    }
    return a.row.success ? -1 : 1;
  });

  const representatives = new Map<string, { row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>();
  for (const candidate of ranked) {
    const tupleKey = Array.from(candidate.analysis.literalSlots)
      .sort((a, b) => a.position - b.position)
      .map((slot) => slot.exampleValue)
      .join('\u001f');
    if (representatives.has(tupleKey)) {
      continue;
    }
    representatives.set(tupleKey, candidate);
  }

  const chosen = Array.from(representatives.values()).sort(newestFirst).slice(0, 5);
  return chosen.map(({ row }) => {
    const rowsProducedField = row.rowsProduced === undefined ? {} : { rows_produced: row.rowsProduced ?? null };
    return {
      started_at: row.startedAt,
      user: row.user,
      bound_sql: redactText(row.sql, redaction.redactors),
      ...rowsProducedField,
      runtime_ms: row.runtimeMs,
      success: row.success,
    };
  });
}
/**
 * Keeps the highest-ranked templates, up to `maxTemplatesPerRun`.
 *
 * Templates are scored with `rankTemplate` and ordered by descending score; ties
 * fall back to ascending `id` order.
 *
 * @param templates - Candidate templates (not mutated).
 * @param maxTemplatesPerRun - Maximum number of templates to return.
 * @param now - Reference instant passed to the ranking function.
 */
function selectTemplates(templates: TemplateVariant[], maxTemplatesPerRun: number, now: Date): TemplateVariant[] {
  const scored = templates.map((template) => ({ template, score: rankTemplate(template, now) }));
  scored.sort((a, b) => {
    const byScore = b.score - a.score;
    if (byScore) {
      return byScore;
    }
    return a.template.id.localeCompare(b.template.id);
  });
  return scored.slice(0, maxTemplatesPerRun).map(({ template }) => template);
}
/**
 * Scores a template as `distinctUsers * log1p(rowCount) * recencyWeight`.
 *
 * `recencyWeight` is `1 / (1 + ageDays / 30)`, where `ageDays` is the age of the
 * latest `startedAt` among the rows (clamped at zero). A template with no rows is
 * treated as 365 days old.
 *
 * @param template - Template whose `rows` are inspected (`row.user`, `row.startedAt`).
 * @param now - Reference instant for the age calculation.
 */
function rankTemplate(template: TemplateVariant, now: Date): number {
  const distinctUsers = new Set<string>();
  let latestStartedAt: string | null = null;
  for (const { row } of template.rows) {
    if (row.user) {
      distinctUsers.add(row.user);
    }
    // Timestamps are compared as strings, as in the rest of this file.
    if (latestStartedAt === null || row.startedAt > latestStartedAt) {
      latestStartedAt = row.startedAt;
    }
  }

  let ageDays = 365;
  if (latestStartedAt !== null) {
    ageDays = Math.max(0, (now.getTime() - new Date(latestStartedAt).getTime()) / 86400000);
  }
  const recencyWeight = 1 / (1 + ageDays / 30);
  return distinctUsers.size * Math.log1p(template.rows.length) * recencyWeight;
}
/**
 * Nearest-rank percentile lookup.
 *
 * The function indexes into `values` as given and does not sort them, so callers
 * are expected to pass the values in ascending order.
 *
 * @param values - Candidate values.
 * @param percentileValue - Fraction such as `0.5` or `0.95`.
 * @returns The element at rank `ceil(n * p) - 1`, clamped to the valid index range,
 *   or `null` for an empty array.
 */
function percentile(values: number[], percentileValue: number): number | null {
  const count = values.length;
  if (count === 0) {
    return null;
  }
  const lastIndex = count - 1;
  const nearestRank = Math.ceil(count * percentileValue) - 1;
  const index = Math.max(0, Math.min(lastIndex, nearestRank));
  return values[index];
}
/**
 * Totals `rowsProduced` across rows, ignoring rows where it is not a number.
 *
 * @param rows - Query rows; only `row.rowsProduced` is read.
 * @returns The sum, or `null` when no row carries a numeric `rowsProduced`.
 */
function sumRowsProduced(rows: HistoricSqlRawQueryRow[]): number | null {
  let total = 0;
  let sawNumber = false;
  for (const { rowsProduced } of rows) {
    if (typeof rowsProduced === 'number') {
      total += rowsProduced;
      sawNumber = true;
    }
  }
  return sawNumber ? total : null;
}
/**
 * Compiles redaction patterns into global regular expressions.
 *
 * A pattern that fails to compile is skipped: a
 * `redaction_skipped:invalid_redaction_pattern:<pattern>:<reason>` warning is
 * appended to `warnings` and samples are disallowed for the whole policy.
 *
 * @param patterns - Regular-expression sources.
 * @param warnings - Mutable list that receives one entry per invalid pattern.
 * @returns The compiled redactors and whether samples may still be emitted.
 */
function compileRedactors(patterns: string[], warnings: string[]): RedactionPolicy {
  const redactors: RegExp[] = [];
  let samplesAllowed = true;

  for (const pattern of patterns) {
    let compiled: RegExp;
    try {
      compiled = new RegExp(pattern, 'g');
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      samplesAllowed = false;
      warnings.push(`redaction_skipped:invalid_redaction_pattern:${pattern}:${reason}`);
      continue;
    }
    redactors.push(compiled);
  }

  return { redactors, samplesAllowed };
}
/**
 * Applies each redactor in order, replacing its matches with `<redacted>`.
 *
 * @param value - Text to redact.
 * @param redactors - Regular expressions applied sequentially; later ones see the
 *   output of earlier ones.
 * @returns The redacted text (unchanged when `redactors` is empty).
 */
function redactText(value: string, redactors: RegExp[]): string {
  let redacted = value;
  for (const redactor of redactors) {
    redacted = redacted.replace(redactor, '<redacted>');
  }
  return redacted;
}
/**
 * Serializes `value` as JSON with two-space indentation and a trailing newline,
 * then writes it via `writeText`.
 *
 * @param stagedDir - Base directory.
 * @param relPath - Path of the file relative to `stagedDir`.
 * @param value - Value to serialize.
 */
async function writeJson(stagedDir: string, relPath: string, value: unknown): Promise<void> {
  const serialized = JSON.stringify(value, null, 2);
  await writeText(stagedDir, relPath, serialized + '\n');
}
/**
 * Writes UTF-8 text to `stagedDir/relPath`, creating missing parent directories first.
 *
 * @param stagedDir - Base directory.
 * @param relPath - Path of the file relative to `stagedDir`.
 * @param value - Text content to write.
 */
async function writeText(stagedDir: string, relPath: string, value: string): Promise<void> {
  const destination = join(stagedDir, relPath);
  const parentDir = dirname(destination);
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, value, 'utf-8');
}

View file

@ -0,0 +1,98 @@
import { describe, expect, it } from 'vitest';
import {
aggregatedTemplateSchema,
historicSqlUnifiedPullConfigSchema,
stagedManifestSchema,
stagedPatternsInputSchema,
stagedTableInputSchema,
} from './types.js';
// Contract tests for the unified historic-SQL zod schemas imported from ./types.js.
describe('historic-sql unified contracts', () => {
// Config parsing: defaults are filled in, and a legacy `minCalls` value surfaces as `minExecutions`.
it('parses minExecutions and accepts minCalls as a one-release alias', () => {
expect(historicSqlUnifiedPullConfigSchema.parse({ dialect: 'postgres', minExecutions: 9 })).toMatchObject({
dialect: 'postgres',
minExecutions: 9,
windowDays: 90,
concurrency: 12,
redactionPatterns: [],
staleArchiveAfterDays: 90,
});
expect(historicSqlUnifiedPullConfigSchema.parse({ dialect: 'postgres', minCalls: 7 }).minExecutions).toBe(7);
});
// A fully populated aggregated template must parse and keep its id and topUsers unchanged.
it('validates aggregate templates from warehouse readers', () => {
const parsed = aggregatedTemplateSchema.parse({
templateId: 'pg:123',
canonicalSql: 'select status, count(*) from public.orders group by status',
dialect: 'postgres',
stats: {
executions: 42,
distinctUsers: 3,
firstSeen: '2026-05-01T00:00:00.000Z',
lastSeen: '2026-05-11T00:00:00.000Z',
p50RuntimeMs: 12.5,
p95RuntimeMs: 40,
errorRate: 0,
rowsProduced: 100,
},
topUsers: [{ user: 'analyst', executions: 40 }],
});
expect(parsed.templateId).toBe('pg:123');
expect(parsed.topUsers).toEqual([{ user: 'analyst', executions: 40 }]);
});
// One representative payload per staged artifact schema: table input, patterns input, manifest.
it('validates staged table, patterns, and manifest artifacts', () => {
expect(
stagedTableInputSchema.parse({
table: 'public.orders',
stats: {
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
errorRateBucket: 'none',
p95RuntimeBucket: '<100ms',
recencyBucket: 'current',
},
columnsByClause: {
select: [['status', 'high']],
where: [['created_at', 'mid']],
},
observedJoins: [{ withTable: 'public.customers', on: ['customer_id'], freq: 'high' }],
topTemplates: [{ id: 'pg:123', canonicalSql: 'select * from public.orders', topUsers: [{ user: 'analyst' }] }],
}).table,
).toBe('public.orders');
expect(
stagedPatternsInputSchema.parse({
templates: [
{
id: 'pg:123',
canonicalSql: 'select * from public.orders',
tablesTouched: ['public.orders'],
executionsBucket: '10-100',
distinctUsersBucket: '2-5',
dialect: 'postgres',
},
],
}).templates,
).toHaveLength(1);
expect(
stagedManifestSchema.parse({
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 2,
touchedTableCount: 1,
parseFailures: 1,
warnings: ['parse_failed:bad'],
probeWarnings: [],
staleArchiveAfterDays: 90,
}).staleArchiveAfterDays,
).toBe(90);
});
});

View file

@ -2,200 +2,161 @@ import { z } from 'zod';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
/** Source key literal for historic-SQL; the manifest schemas in this file require `source` to equal it. */
export const HISTORIC_SQL_SOURCE_KEY = 'historic-sql' as const;
/** Object-type literal required by `historicSqlMetadataSchema.objectType`. */
export const HISTORIC_SQL_OBJECT_TYPE = 'historic_sql_template' as const;
// The SQL dialects accepted by the schemas in this file.
const historicSqlDialectSchema = z.enum(['snowflake', 'bigquery', 'postgres']);
/** Union of the supported dialect names, inferred from `historicSqlDialectSchema`. */
export type HistoricSqlDialect = z.infer<typeof historicSqlDialectSchema>;
export const historicSqlPullConfigSchema = z.object({
const filterModeSchema = z.enum(['exclude', 'include', 'mark-only']);
/**
 * Type guard for plain object-like values.
 *
 * @param value - Any value.
 * @returns `true` for non-null objects that are not arrays.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== 'object') {
    return false;
  }
  return !Array.isArray(value);
}
export const historicSqlUnifiedPullConfigSchema = z.preprocess((value) => {
if (!isRecord(value)) {
return value;
}
const next: Record<string, unknown> = { ...value };
if (next.minExecutions === undefined && typeof next.minCalls === 'number') {
next.minExecutions = next.minCalls;
}
if (!next.filters && Array.isArray(next.serviceAccountUserPatterns)) {
next.filters = {
serviceAccounts: { patterns: next.serviceAccountUserPatterns, mode: 'exclude' },
dropTrivialProbes: true,
};
}
return next;
}, z.object({
dialect: historicSqlDialectSchema,
windowDays: z.number().int().min(1).max(365).default(90),
lastSuccessfulCursor: z.string().datetime().nullable().default(null),
serviceAccountUserPatterns: z.array(z.string()).default([]),
windowDays: z.number().int().positive().default(90),
minExecutions: z.number().int().nonnegative().default(5),
concurrency: z.number().int().positive().default(12),
filters: z.object({
serviceAccounts: z.object({
patterns: z.array(z.string()).default([]),
mode: filterModeSchema.default('exclude'),
}).optional(),
orchestrators: z.object({
mode: filterModeSchema.default('mark-only'),
}).optional(),
dropTrivialProbes: z.boolean().default(true),
dropFailedBelow: z.object({
errorRate: z.number().min(0).max(1),
executions: z.number().int().nonnegative(),
}).optional(),
}).default({ dropTrivialProbes: true }),
redactionPatterns: z.array(z.string()).default([]),
maxTemplatesPerRun: z.number().int().min(1).max(5000).default(5000),
minCalls: z.number().int().min(1).default(5),
staleArchiveAfterDays: z.number().int().positive().default(90),
}));
export type HistoricSqlUnifiedPullConfig = z.infer<typeof historicSqlUnifiedPullConfigSchema>;
/**
 * Schema for one aggregated query template: its id, canonical SQL, dialect,
 * usage statistics, and a list of top users (defaults to an empty list).
 */
export const aggregatedTemplateSchema = z.object({
templateId: z.string().min(1),
canonicalSql: z.string().min(1),
dialect: historicSqlDialectSchema,
stats: z.object({
executions: z.number().int().nonnegative(),
distinctUsers: z.number().int().nonnegative(),
firstSeen: z.iso.datetime(),
lastSeen: z.iso.datetime(),
// Runtime percentiles and rowsProduced may be null.
p50RuntimeMs: z.number().nonnegative().nullable(),
p95RuntimeMs: z.number().nonnegative().nullable(),
// Constrained to the range 0..1.
errorRate: z.number().min(0).max(1),
rowsProduced: z.number().int().nonnegative().nullable(),
}),
topUsers: z.array(z.object({
user: z.string().nullable(),
executions: z.number().int().nonnegative(),
})).default([]),
});
export type HistoricSqlPullConfig = z.infer<typeof historicSqlPullConfigSchema>;
export type AggregatedTemplate = z.infer<typeof aggregatedTemplateSchema>;
/**
 * Schema for a staged per-table input: bucketed statistics, columns grouped by
 * clause, observed joins, and the table's top templates.
 */
export const stagedTableInputSchema = z.object({
table: z.string().min(1),
// Every statistic is carried as a free-form bucket label string.
stats: z.object({
executionsBucket: z.string(),
distinctUsersBucket: z.string(),
errorRateBucket: z.string(),
p95RuntimeBucket: z.string(),
recencyBucket: z.string(),
}),
// Keyed by clause name; each entry is a pair of strings.
// NOTE(review): the pairs look like [column, frequency label] — confirm against the producer.
columnsByClause: z.record(z.string(), z.array(z.tuple([z.string(), z.string()]))),
observedJoins: z.array(z.object({
withTable: z.string(),
on: z.array(z.string()),
freq: z.string(),
})),
topTemplates: z.array(z.object({
id: z.string(),
canonicalSql: z.string(),
topUsers: z.array(z.object({ user: z.string().nullable() })),
})),
});
/** Parsed shape of `stagedTableInputSchema`. */
export type StagedTableInput = z.infer<typeof stagedTableInputSchema>;
/**
 * Schema for the staged patterns input: a list of templates, each with its
 * canonical SQL, touched tables, bucket labels, and dialect.
 */
export const stagedPatternsInputSchema = z.object({
templates: z.array(z.object({
id: z.string(),
canonicalSql: z.string(),
tablesTouched: z.array(z.string()),
executionsBucket: z.string(),
distinctUsersBucket: z.string(),
dialect: historicSqlDialectSchema,
})),
});
/** Parsed shape of `stagedPatternsInputSchema`. */
export type StagedPatternsInput = z.infer<typeof stagedPatternsInputSchema>;
/**
 * Schema for the staged manifest: source and connection identity, the fetch
 * window, row/table/parse-failure counters, and warning lists.
 */
export const stagedManifestSchema = z.object({
// Must be the historic-SQL source key literal.
source: z.literal(HISTORIC_SQL_SOURCE_KEY),
connectionId: z.string().min(1),
dialect: historicSqlDialectSchema,
fetchedAt: z.iso.datetime(),
windowStart: z.iso.datetime(),
windowEnd: z.iso.datetime(),
snapshotRowCount: z.number().int().nonnegative(),
touchedTableCount: z.number().int().nonnegative(),
parseFailures: z.number().int().nonnegative(),
warnings: z.array(z.string()),
probeWarnings: z.array(z.string()),
// Falls back to 90 when omitted.
staleArchiveAfterDays: z.number().int().positive().default(90),
});
/** Parsed shape of `stagedManifestSchema`. */
export type StagedManifest = z.infer<typeof stagedManifestSchema>;
/** Outcome of a reader probe: required warnings plus optional informational messages. */
export interface HistoricSqlProbeResult {
warnings: string[];
info?: string[];
}
/**
 * Reader contract for aggregated query history: `probe` reports warnings for a
 * client, and `fetchAggregated` yields aggregated templates for a time window and config.
 */
export interface HistoricSqlReader {
// The client is typed as unknown here; the interface does not constrain its shape.
probe(client: unknown): Promise<HistoricSqlProbeResult>;
fetchAggregated(
client: unknown,
window: HistoricSqlTimeWindow,
config: HistoricSqlUnifiedPullConfig,
): AsyncIterable<AggregatedTemplate>;
}
/** Time window passed to readers, expressed as a start and an end `Date`. */
export interface HistoricSqlTimeWindow {
start: Date;
end: Date;
}
/**
 * Schema for a single raw query-history row. Optional facts default to `null`
 * (`user`, `endedAt`, `runtimeMs`, `errorMessage`) and `success` defaults to `true`.
 */
export const historicSqlRawQueryRowSchema = z.object({
id: z.string().min(1),
sql: z.string().min(1),
user: z.string().nullable().default(null),
startedAt: z.string().datetime(),
endedAt: z.string().datetime().nullable().default(null),
runtimeMs: z.number().nonnegative().nullable().default(null),
// May be absent entirely, or present as null.
rowsProduced: z.number().int().nonnegative().nullable().optional(),
success: z.boolean().default(true),
errorMessage: z.string().nullable().default(null),
});
/** Parsed shape of `historicSqlRawQueryRowSchema`. */
export type HistoricSqlRawQueryRow = z.infer<typeof historicSqlRawQueryRowSchema>;
/**
 * Reader contract for row-level query history: `probe` resolves with no value, and
 * `fetch` yields raw rows for a time window with an optional cursor.
 */
export interface HistoricSqlQueryHistoryReader {
probe(client: unknown): Promise<void>;
fetch(
client: unknown,
window: HistoricSqlTimeWindow,
cursor?: string | null,
): AsyncIterable<HistoricSqlRawQueryRow>;
}
/** Minimal query client: runs SQL with optional parameters and resolves to headers, rows, and an optional total. */
export interface KtxPostgresQueryClient {
executeQuery(sql: string, params?: unknown[]): Promise<{ headers: string[]; rows: unknown[][]; totalRows?: number }>;
}
export interface PostgresPgssProbeResult {
export interface PostgresPgssProbeResult extends HistoricSqlProbeResult {
pgServerVersion: string;
warnings: string[];
}
/** Result of `PostgresPgssReader.readSnapshot`: the rows plus a nullable stats-reset timestamp and dealloc counter. */
export interface PostgresPgssSnapshot {
statsResetAt: string | null;
deallocCount: number | null;
rows: PostgresPgssRow[];
}
/**
 * Reader contract for Postgres statement statistics: `probe` returns a
 * `PostgresPgssProbeResult`, and `readSnapshot` returns a snapshot for the given
 * `minCalls` / `maxTemplates` options.
 */
export interface PostgresPgssReader {
probe(client: KtxPostgresQueryClient): Promise<PostgresPgssProbeResult>;
readSnapshot(
client: KtxPostgresQueryClient,
options: { minCalls: number; maxTemplates: number },
): Promise<PostgresPgssSnapshot>;
}
/**
 * One row of a `PostgresPgssSnapshot`.
 * NOTE(review): field names appear to mirror pg_stat_statements columns — confirm against the reader's SQL.
 */
export interface PostgresPgssRow {
// Identifiers are carried as strings; username and database may be null.
queryid: string;
userid: string;
username: string | null;
dbid: string;
database: string | null;
query: string;
calls: number;
totalExecTime: number;
meanExecTime: number;
totalRows: number;
}
/**
 * Aggregated statement-statistics row for one query, carrying delta counters,
 * the users involved, and informational messages.
 * NOTE(review): the `delta*` fields are presumably differences against a stored baseline — confirm in the staging code.
 */
export interface PostgresPgssAggregateRow {
id: string;
queryid: string;
dbid: string;
database: string | null;
query: string;
deltaCalls: number;
deltaExecTime: number;
deltaRows: number;
meanExecTime: number;
distinctUsersDelta: number;
users: string[];
firstObservedAt: string;
info: string[];
}
export interface HistoricSqlSourceAdapterDeps {
sqlAnalysis: SqlAnalysisPort;
reader: HistoricSqlQueryHistoryReader;
reader: HistoricSqlReader;
queryClient: unknown;
postgresReader?: PostgresPgssReader;
postgresQueryClient?: KtxPostgresQueryClient;
postgresBaselineRootDir?: string;
legacyPostgresBaselineRootDir?: string;
now?: () => Date;
onPullSucceeded?: (ctx: {
connectionId: string;
sourceKey: string;
syncId: string;
trigger: import('../../types.js').IngestTrigger;
completedAt: Date;
stagedDir: string;
nextSuccessfulCursor: string | null;
}) => Promise<void>;
}
// The three classifications a literal slot can receive.
const historicSqlLiteralSlotClassificationSchema = z.enum(['constant', 'runtime', 'categorical']);
/** Union of the literal-slot classification names. */
export type HistoricSqlLiteralSlotClassification = z.infer<typeof historicSqlLiteralSlotClassificationSchema>;
/**
 * Schema for a template's metadata record: identity (`id`, `title`, `path`), a fixed
 * object type, and properties describing the fingerprint, dialect, touched tables,
 * classified literal slots, and triage signals.
 */
export const historicSqlMetadataSchema = z.object({
id: z.string().min(1),
title: z.string().min(1),
path: z.string().min(1),
objectType: z.literal(HISTORIC_SQL_OBJECT_TYPE),
// Only null is accepted for this field.
lastEditedAt: z.null(),
properties: z.object({
fingerprint: z.string().min(1),
sub_cluster_id: z.string().nullable(),
dialect: historicSqlDialectSchema,
tables_touched: z.array(z.string()),
literal_slots: z.array(
z.object({
// Positions start at 1.
position: z.number().int().min(1),
type: z.enum(['string', 'number', 'timestamp', 'date', 'boolean', 'null', 'unknown']),
classification: historicSqlLiteralSlotClassificationSchema,
}),
),
triage_signals: z.record(z.string(), z.string()),
}),
});
/** Parsed shape of `historicSqlMetadataSchema`. */
export type HistoricSqlMetadata = z.infer<typeof historicSqlMetadataSchema>;
/**
 * Schema for a template's usage record: aggregate statistics, per-slot value
 * distributions, and sample executions.
 */
export const historicSqlUsageSchema = z.object({
stats: z.object({
executions: z.number().int().nonnegative(),
distinct_users: z.number().int().nonnegative(),
first_seen: z.string().datetime(),
last_seen: z.string().datetime(),
p50_runtime_ms: z.number().nonnegative().nullable(),
p95_runtime_ms: z.number().nonnegative().nullable(),
// mean_runtime_ms and rows_produced may be omitted or null.
mean_runtime_ms: z.number().nonnegative().nullable().optional(),
error_rate: z.number().min(0).max(1),
rows_produced: z.number().int().nonnegative().nullable().optional(),
}),
literal_slots: z.array(
z.object({
position: z.number().int().min(1),
distinct_values: z.number().int().nonnegative(),
// Each entry pairs a value with its non-negative integer count.
top_values: z.array(z.tuple([z.string(), z.number().int().nonnegative()])),
}),
),
samples: z.array(
z.object({
started_at: z.string().datetime(),
user: z.string().nullable(),
bound_sql: z.string(),
rows_produced: z.number().int().nonnegative().nullable().optional(),
runtime_ms: z.number().nonnegative().nullable(),
success: z.boolean(),
}),
),
});
/** Parsed shape of `historicSqlUsageSchema`. */
export type HistoricSqlUsage = z.infer<typeof historicSqlUsageSchema>;
/**
 * Schema for the template-level manifest: source and connection identity, the fetch
 * window and next cursor, template count and cap flag, warnings, several defaulted
 * status fields, and the list of staged templates.
 */
export const historicSqlManifestSchema = z.object({
source: z.literal(HISTORIC_SQL_SOURCE_KEY),
connectionId: z.string().min(1),
dialect: historicSqlDialectSchema,
fetchedAt: z.string().datetime(),
windowStart: z.string().datetime(),
windowEnd: z.string().datetime(),
nextSuccessfulCursor: z.string().datetime().nullable(),
templateCount: z.number().int().nonnegative(),
capped: z.boolean(),
warnings: z.array(z.string()),
// The following fields are optional on input: booleans default to false, the rest to null.
degraded: z.boolean().default(false),
statsResetAt: z.string().datetime().nullable().default(null),
baselineFirstRun: z.boolean().default(false),
pgServerVersion: z.string().nullable().default(null),
deallocCount: z.number().int().nonnegative().nullable().default(null),
templates: z.array(
z.object({
id: z.string().min(1),
fingerprint: z.string().min(1),
subClusterId: z.string().nullable(),
path: z.string().min(1),
}),
),
});
/** Parsed shape of `historicSqlManifestSchema`. */
export type HistoricSqlManifest = z.infer<typeof historicSqlManifestSchema>;

View file

@ -186,6 +186,62 @@ describe('buildLiveDatabaseManifestShards', () => {
});
});
// Existing usage carries both managed keys (narrative, frequencyTier, ...) and an external
// key (ownerNote). After the build, the managed keys must come from the incoming table usage
// while ownerNote survives untouched.
it('preserves external usage keys while replacing historic SQL managed keys', () => {
const existingUsage = new Map([
[
'orders',
{
narrative: 'Old generated usage narrative.',
frequencyTier: 'low' as const,
commonFilters: ['old_status'],
commonJoins: [],
ownerNote: 'Pinned analyst note',
},
],
]);
const result = buildLiveDatabaseManifestShards({
connectionType: 'POSTGRESQL',
mapColumnType: (nativeType) => nativeType.toLowerCase(),
existingUsage,
tables: [
{
name: 'orders',
catalog: null,
db: 'public',
usage: {
narrative: 'Fresh generated usage narrative.',
frequencyTier: 'high',
commonFilters: ['status'],
commonGroupBys: ['created_at'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
},
columns: [{ name: 'id', type: 'INTEGER' }],
},
],
joins: [],
});
// The old commonFilters/commonJoins values are gone; only ownerNote is carried over.
expect(shardObject(result.shards)).toEqual({
public: {
tables: {
orders: {
table: 'public.orders',
usage: {
ownerNote: 'Pinned analyst note',
narrative: 'Fresh generated usage narrative.',
frequencyTier: 'high',
commonFilters: ['status'],
commonGroupBys: ['created_at'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
},
columns: [{ name: 'id', type: 'integer' }],
},
},
},
});
});
it('renders ordered multi-column joins in both directions', () => {
const result = buildLiveDatabaseManifestShards({
connectionType: 'POSTGRESQL',

View file

@ -1,3 +1,5 @@
import type { TableUsageOutput } from '../historic-sql/skill-schemas.js';
const RELATIONSHIP_MAP: Record<string, string> = {
MANY_TO_ONE: 'many_to_one',
ONE_TO_MANY: 'one_to_many',
@ -11,6 +13,14 @@ const RELATIONSHIP_INVERSE: Record<string, string> = {
};
const SCAN_MANAGED_DESCRIPTION_KEYS = new Set(['db', 'ai']);
// Usage keys that mergeUsagePreservingExternal drops from the existing usage before
// applying incoming usage; any key not listed here is carried over from the existing entry.
const HISTORIC_SQL_MANAGED_USAGE_KEYS = new Set([
'narrative',
'frequencyTier',
'commonFilters',
'commonGroupBys',
'commonJoins',
'staleSince',
]);
export interface LiveDatabaseManifestColumn {
name: string;
@ -30,6 +40,7 @@ export interface LiveDatabaseManifestJoinEntry {
export interface LiveDatabaseManifestTableEntry {
table: string;
descriptions?: Record<string, string>;
usage?: TableUsageOutput;
columns: LiveDatabaseManifestColumn[];
joins?: LiveDatabaseManifestJoinEntry[];
}
@ -43,6 +54,7 @@ export interface LiveDatabaseManifestTableData {
catalog: string | null;
db: string | null;
descriptions?: Record<string, string>;
usage?: TableUsageOutput;
columns: Array<{
name: string;
type: string;
@ -73,6 +85,7 @@ export interface BuildLiveDatabaseManifestShardsInput {
mapColumnType: (nativeType: string) => string;
existingPreservedJoins?: Map<string, LiveDatabaseManifestJoinEntry[]>;
existingDescriptions?: Map<string, LiveDatabaseManifestExistingDescriptions>;
existingUsage?: Map<string, TableUsageOutput>;
}
export interface BuildLiveDatabaseManifestShardsResult {
@ -101,6 +114,28 @@ function mergeDescriptionsPreservingExternal(
return Object.keys(result).length > 0 ? result : undefined;
}
/**
 * Merges freshly generated table usage into an existing usage entry.
 *
 * - With no incoming usage, a shallow copy of the existing entry is returned
 *   (or `undefined` when there is none).
 * - Otherwise, existing keys that are not historic-SQL managed are kept, and every
 *   incoming key is then applied on top.
 *
 * @param existing - Usage currently stored for the table, if any.
 * @param incoming - Newly generated usage, if any.
 * @returns The merged usage, or `undefined` when the result would have no keys.
 */
export function mergeUsagePreservingExternal(
  existing: TableUsageOutput | undefined,
  incoming: TableUsageOutput | undefined,
): TableUsageOutput | undefined {
  if (!incoming) {
    return existing ? { ...existing } : undefined;
  }

  const merged: Record<string, unknown> = {};
  for (const [key, value] of Object.entries(existing ?? {})) {
    if (HISTORIC_SQL_MANAGED_USAGE_KEYS.has(key)) {
      continue;
    }
    merged[key] = value;
  }
  Object.assign(merged, incoming);

  if (Object.keys(merged).length === 0) {
    return undefined;
  }
  return merged as TableUsageOutput;
}
function getShardKey(connectionType: string, catalog: string | null, db: string | null): string {
const normalized = connectionType.toUpperCase();
@ -254,6 +289,11 @@ export function buildLiveDatabaseManifestShards(
entry.descriptions = tableDescriptions;
}
const usage = mergeUsagePreservingExternal(input.existingUsage?.get(table.name), table.usage);
if (usage) {
entry.usage = usage;
}
const tableJoins = joinsByTable.get(table.name);
if (tableJoins && tableJoins.length > 0) {
entry.joins = tableJoins;

View file

@ -318,7 +318,8 @@ export { NOTION_ORG_KNOWLEDGE_WARNING } from './adapters/notion/chunk.js';
export { NOTION_DEFAULT_MAX_KNOWLEDGE_CREATES_PER_RUN } from './adapters/notion/types.js';
export { NotionSourceAdapter, type NotionSourceAdapterDeps } from './adapters/notion/notion.adapter.js';
export { NotionClient, type NotionApi, type NotionBotInfo } from './adapters/notion/notion-client.js';
export { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './adapters/historic-sql/chunk.js';
export { bucketDistinctUsers, bucketErrorRate, bucketExecutions, bucketP95Runtime, bucketRecency } from './adapters/historic-sql/buckets.js';
export { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './adapters/historic-sql/chunk-unified.js';
export { detectHistoricSqlStagedDir } from './adapters/historic-sql/detect.js';
export {
HistoricSqlExtensionMissingError,
@ -328,41 +329,55 @@ export {
export { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.adapter.js';
export { BigQueryHistoricSqlQueryHistoryReader } from './adapters/historic-sql/bigquery-query-history-reader.js';
export type { BigQueryHistoricSqlQueryHistoryReaderOptions } from './adapters/historic-sql/bigquery-query-history-reader.js';
export { PostgresPgssQueryHistoryReader } from './adapters/historic-sql/postgres-pgss-query-history-reader.js';
export { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js';
export { SnowflakeHistoricSqlQueryHistoryReader } from './adapters/historic-sql/snowflake-query-history-reader.js';
export { stageHistoricSqlTemplates } from './adapters/historic-sql/stage.js';
export { stageHistoricSqlAggregatedSnapshot } from './adapters/historic-sql/stage-unified.js';
export {
pgssBaselinePath,
readPgssBaseline,
stagePgStatStatementsTemplates,
writePgssBaselineAtomic,
} from './adapters/historic-sql/stage-pgss.js';
export type { PgssBaseline, StagePgStatStatementsTemplatesResult } from './adapters/historic-sql/stage-pgss.js';
historicSqlEvidenceEnvelopeSchema,
historicSqlEvidencePath,
historicSqlPatternEvidenceSchema,
historicSqlTableUsageEvidenceSchema,
serializeHistoricSqlEvidence,
} from './adapters/historic-sql/evidence.js';
export type {
HistoricSqlEvidenceEnvelope,
HistoricSqlPatternEvidence,
HistoricSqlTableUsageEvidence,
} from './adapters/historic-sql/evidence.js';
export { createEmitHistoricSqlEvidenceTool } from './adapters/historic-sql/evidence-tool.js';
export { HistoricSqlProjectionPostProcessor } from './adapters/historic-sql/post-processor.js';
export { projectHistoricSqlEvidence } from './adapters/historic-sql/projection.js';
export type { HistoricSqlProjectionInput, HistoricSqlProjectionResult } from './adapters/historic-sql/projection.js';
export {
patternOutputSchema,
patternsArraySchema,
tableUsageOutputSchema,
} from './adapters/historic-sql/skill-schemas.js';
export type {
PatternOutput,
TableUsageOutput,
} from './adapters/historic-sql/skill-schemas.js';
export type {
AggregatedTemplate,
HistoricSqlDialect,
HistoricSqlManifest,
HistoricSqlMetadata,
HistoricSqlPullConfig,
HistoricSqlQueryHistoryReader,
HistoricSqlRawQueryRow,
HistoricSqlProbeResult,
HistoricSqlReader,
HistoricSqlSourceAdapterDeps,
HistoricSqlTimeWindow,
HistoricSqlUsage,
HistoricSqlUnifiedPullConfig,
KtxPostgresQueryClient,
PostgresPgssAggregateRow,
PostgresPgssProbeResult,
PostgresPgssReader,
PostgresPgssRow,
PostgresPgssSnapshot,
StagedManifest,
StagedPatternsInput,
StagedTableInput,
} from './adapters/historic-sql/types.js';
export {
HISTORIC_SQL_OBJECT_TYPE,
HISTORIC_SQL_SOURCE_KEY,
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlPullConfigSchema,
historicSqlRawQueryRowSchema,
historicSqlUsageSchema,
aggregatedTemplateSchema,
historicSqlUnifiedPullConfigSchema,
stagedManifestSchema,
stagedPatternsInputSchema,
stagedTableInputSchema,
} from './adapters/historic-sql/types.js';
export type { CanonicalPin } from './canonical-pins.js';
export { buildCanonicalPinsPromptBlock, selectRelevantCanonicalPins } from './canonical-pins.js';

View file

@ -405,44 +405,44 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
);
});
it('reuses document evidence indexing and page triage for historic-SQL WorkUnits', async () => {
it('reuses document evidence indexing and page triage for document WorkUnits', async () => {
const deps = makeDeps();
deps.adapter.source = 'historic-sql';
deps.adapter.skillNames = ['historic_sql_ingest'];
deps.adapter.reconcileSkillNames = ['historic_sql_curator'];
deps.adapter.source = 'notion';
deps.adapter.skillNames = ['notion_synthesize'];
deps.adapter.reconcileSkillNames = [];
deps.adapter.evidenceIndexing = 'documents';
deps.adapter.triageSupported = true;
deps.adapter.chunk.mockResolvedValue({
workUnits: [
{ unitKey: 'full', rawFiles: ['templates/full/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
{ unitKey: 'skip', rawFiles: ['templates/skip/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
{ unitKey: 'full', rawFiles: ['pages/full/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
{ unitKey: 'skip', rawFiles: ['pages/skip/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
],
});
deps.diffSetService.compute.mockResolvedValue({
added: ['templates/full/metadata.json', 'templates/skip/metadata.json'],
added: ['pages/full/metadata.json', 'pages/skip/metadata.json'],
modified: [],
deleted: [],
unchanged: [],
});
deps.pageTriage.triageRun.mockResolvedValue({
enabled: true,
fullRawPaths: new Set(['templates/full/metadata.json']),
fullRawPaths: new Set(['pages/full/metadata.json']),
warnings: [],
});
const runner = buildRunner(deps);
(runner as any).stageRawFilesStage1 = vi.fn().mockResolvedValue({
currentHashes: new Map([
['templates/full/metadata.json', 'h-full'],
['templates/skip/metadata.json', 'h-skip'],
['pages/full/metadata.json', 'h-full'],
['pages/skip/metadata.json', 'h-skip'],
]),
rawDirInWorktree: 'raw-sources/c1/historic-sql/s',
rawDirInWorktree: 'raw-sources/c1/notion/s',
});
(runner as any).resolveStagedDir = vi.fn().mockResolvedValue('/tmp/stage/upload-x');
const result = await runner.run({
jobId: 'j1',
connectionId: 'c1',
sourceKey: 'historic-sql',
sourceKey: 'notion',
trigger: 'upload',
bundleRef: { kind: 'upload', uploadId: 'upload-x' },
});
@ -1428,6 +1428,67 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
expect(deps.sessionWorktreeService.cleanup).toHaveBeenCalledWith(expect.any(Object), 'success');
});
// Runs a historic-sql bundle with a stubbed post-processor and checks that the memory-flow
// `saved` event reports the expected wiki and semantic-layer counts.
it('includes historic-sql post-processor output in memory-flow saved counts', async () => {
const deps = makeDeps();
deps.adapter.source = 'historic-sql';
deps.registry.get.mockReturnValue(deps.adapter);
deps.adapter.chunk.mockResolvedValue({
workUnits: [
{
unitKey: 'historic-sql-table-public-orders',
rawFiles: ['tables/public/orders.json'],
peerFileIndex: [],
dependencyPaths: [],
},
],
});
// Stub post-processor returning fixed result counters.
// NOTE(review): how these counters map to wikiCount/slCount is decided by
// postProcessorSavedMemoryCounts, which is not visible here — confirm there.
const postProcessor = {
run: vi.fn().mockResolvedValue({
result: {
tableUsageMerged: 2,
staleTablesMarked: 1,
patternPagesWritten: 3,
stalePatternPagesMarked: 1,
archivedPatternPages: 1,
legacyPagesDeleted: 1,
},
warnings: [],
errors: [],
touchedSources: [{ connectionId: 'c1', sourceName: 'orders' }],
}),
};
const runner = buildRunner(deps, { postProcessors: { 'historic-sql': postProcessor } });
// Replace the staging internals with stubs so no real staging work is performed.
(runner as any).stageRawFilesStage1 = vi.fn().mockResolvedValue({
currentHashes: new Map([['tables/public/orders.json', 'h1']]),
rawDirInWorktree: 'raw-sources/c1/historic-sql/s',
});
(runner as any).resolveStagedDir = vi.fn().mockResolvedValue('/tmp/stage/upload-x');
const memoryFlow = createMemoryFlowLiveBuffer(bundleReplayInput());
await runner.run(
{
jobId: 'j1',
connectionId: 'c1',
sourceKey: 'historic-sql',
trigger: 'upload',
bundleRef: { kind: 'upload', uploadId: 'upload-x' },
},
{
jobId: 'j1',
memoryFlow,
startPhase: () => new TestJobContext('j1', null, () => Promise.resolve(), () => Promise.resolve()),
},
);
// The buffered events must include a `saved` event with these totals.
expect(memoryFlow.snapshot().events).toContainEqual(
expect.objectContaining({
type: 'saved',
wikiCount: 6,
slCount: 3,
}),
);
});
it('marks post-processor infrastructure failure as failed and preserves worktree cleanup state', async () => {
const deps = makeDeps();
deps.adapter.source = 'metricflow';

View file

@ -16,6 +16,7 @@ import type { ContextEvidenceIndexSummary, IngestBundleRunnerDeps, PageTriageRun
import { buildSyncId, rawSourcesDirForSync } from './raw-sources-paths.js';
import {
buildStageIndexFromReportBody,
postProcessorSavedMemoryCounts,
type IngestReportPostProcessorOutcome,
type IngestReportSnapshot,
} from './reports.js';
@ -1087,11 +1088,12 @@ export class IngestBundleRunner {
}
const commitSha = mergeResult.touchedPaths.length === 0 ? null : mergeResult.squashSha;
const memoryFlowSavedActions = stageIndex.workUnits.flatMap((wu) => wu.actions).concat(reconcileActions);
const postProcessorMemoryCounts = postProcessorSavedMemoryCounts(postProcessorOutcome);
memoryFlow?.emit({
type: 'saved',
commitSha,
wikiCount: countMemoryFlowActions(memoryFlowSavedActions, 'wiki'),
slCount: countMemoryFlowActions(memoryFlowSavedActions, 'sl'),
wikiCount: countMemoryFlowActions(memoryFlowSavedActions, 'wiki') + postProcessorMemoryCounts.wikiCount,
slCount: countMemoryFlowActions(memoryFlowSavedActions, 'sl') + postProcessorMemoryCounts.slCount,
});
await stage6?.updateProgress(1.0, commitSha ? `Saved changes (${commitSha.slice(0, 8)})` : 'No changes to save');

View file

@ -29,48 +29,10 @@ describe('ingest prompt assets', () => {
expect(prompt).not.toMatch(forbiddenProductPattern());
});
it('pins historic-SQL triage rules with synthetic signal fixtures', async () => {
it('does not route historic-SQL through page-triage prompt examples', async () => {
const prompt = await readFile(new URL('../../prompts/skills/page_triage_classifier.md', import.meta.url), 'utf-8');
expect(prompt).toContain('signals.objectType === "historic_sql_template"');
expect(prompt).toContain('executions_bucket=low AND distinct_users_bucket=solo');
expect(prompt).toContain('service_account_only=true AND below the frequency floor');
expect(prompt).toContain('shared human usage with mid or high execution volume');
const fixtures = [
{
label: 'skip low solo template',
objectType: '"objectType": "historic_sql_template"',
executions: '"executions_bucket": "low"',
users: '"distinct_users_bucket": "solo"',
serviceAccount: '"service_account_only": "false"',
lane: '-> `skip`',
},
{
label: 'light service-account-only template',
objectType: '"objectType": "historic_sql_template"',
executions: '"executions_bucket": "high"',
users: '"distinct_users_bucket": "solo"',
serviceAccount: '"service_account_only": "true"',
lane: '-> `light`',
},
{
label: 'full shared human template',
objectType: '"objectType": "historic_sql_template"',
executions: '"executions_bucket": "high"',
users: '"distinct_users_bucket": "team"',
serviceAccount: '"service_account_only": "false"',
lane: '-> `full`',
},
];
for (const fixture of fixtures) {
expect(prompt).toContain(fixture.label);
expect(prompt).toContain(fixture.objectType);
expect(prompt).toContain(fixture.executions);
expect(prompt).toContain(fixture.users);
expect(prompt).toContain(fixture.serviceAccount);
expect(prompt).toContain(fixture.lane);
}
expect(prompt).not.toContain(['historic_sql', 'template'].join('_'));
expect(prompt).not.toContain('service_account_only=true AND below the frequency floor');
});
});

View file

@ -14,14 +14,14 @@ const adapterSkillNames = [
'metabase_ingest',
'metricflow_ingest',
'notion_synthesize',
'historic_sql_ingest',
'historic_sql_table_digest',
'historic_sql_patterns',
'ingest_triage',
'knowledge_capture',
'sl_capture',
] as const;
const adapterReconcileSkillNames = [
'historic_sql_curator',
'ingest_triage',
'knowledge_capture',
'sl_capture',
@ -58,75 +58,37 @@ describe('ingest runtime assets', () => {
}
await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain('# Page Triage Classifier');
await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain(
'signals.objectType === "historic_sql_template"',
);
await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain(
'service_account_only=true AND below the frequency floor',
);
await expect(prompts.loadPrompt('skills/light_extraction')).resolves.toContain('# Light Context Extraction');
});
it('packages historic-SQL WorkUnit skill guidance from KTX assets', async () => {
it('packages historic-SQL table digest guidance from KTX assets', async () => {
const registry = new SkillsRegistryService({ skillsDir });
const skills = await registry.listSkills(['historic_sql_ingest'], 'memory_agent');
const skills = await registry.listSkills(['historic_sql_table_digest'], 'memory_agent');
expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_ingest']);
expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_table_digest']);
const [skill] = skills;
if (!skill) {
throw new Error('historic_sql_ingest skill missing');
}
expect(skill.path.startsWith(skillsDir)).toBe(true);
const body = await readFile(join(skill.path, 'SKILL.md'), 'utf-8');
expect(body).toContain('# Historic SQL Ingest');
expect(body).toContain('Read exactly one historic-SQL template WorkUnit');
expect(body).toContain('metadata.json');
expect(body).toContain('page.md');
expect(body).toContain('usage.json');
expect(body).toContain('manifest.json');
expect(body).toContain('wiki_write');
expect(body).toContain('key: "queries/<intent_slug>"');
expect(body).toContain('"source": "historic-sql"');
expect(body).toContain('representative_sql');
expect(body).toContain('fingerprints');
expect(body).toContain('usage');
expect(body).toContain('SL proposal threshold');
expect(body).toContain('Do not group sibling templates');
expect(body).toContain('Do not copy sample bound_sql');
expect(body).not.toContain('store historic-SQL provenance in the markdown body');
const body = await readFile(join(skills[0]!.path, 'SKILL.md'), 'utf-8');
expect(body).toContain('# Historic SQL Table Digest');
expect(body).toContain('tables/<schema>.<name>.json');
expect(body).toContain('tableUsageOutputSchema');
expect(body).toContain('emit_historic_sql_evidence');
expect(body).toContain('Do not call wiki_write');
expect(body).toContain('Do not call sl_write_source');
expect(body).not.toMatch(forbiddenProductPattern());
});
it('packages historic-SQL curator reconcile guidance from KTX assets', async () => {
it('packages historic-SQL patterns guidance from KTX assets', async () => {
const registry = new SkillsRegistryService({ skillsDir });
const skills = await registry.listSkills(['historic_sql_curator'], 'memory_agent');
const skills = await registry.listSkills(['historic_sql_patterns'], 'memory_agent');
expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_curator']);
expect(skills.map((skill) => skill.name)).toEqual(['historic_sql_patterns']);
const [skill] = skills;
if (!skill) {
throw new Error('historic_sql_curator skill missing');
}
expect(skill.path.startsWith(skillsDir)).toBe(true);
const body = await readFile(join(skill.path, 'SKILL.md'), 'utf-8');
expect(body).toContain('# Historic SQL Curator');
expect(body).toContain('curator pagination');
expect(body).toContain('stage_list');
expect(body).toContain('stage_diff');
expect(body).toContain('read_raw_span');
expect(body).toContain('wiki_search');
expect(body).toContain('wiki_read');
expect(body).toContain('wiki_write');
expect(body).toContain('emit_artifact_resolution');
expect(body).toContain('emit_eviction_decision');
expect(body).toContain('categorical sub-cluster');
expect(body).toContain('historic-sql-demoted');
expect(body).toContain('Do not call `context_candidate_write`');
const body = await readFile(join(skills[0]!.path, 'SKILL.md'), 'utf-8');
expect(body).toContain('# Historic SQL Patterns');
expect(body).toContain('patterns-input/part-0001.json');
expect(body).toContain('patternsArraySchema');
expect(body).toContain('emit_historic_sql_evidence');
expect(body).toContain('cross-table');
expect(body).not.toMatch(forbiddenProductPattern());
});
});

View file

@ -4,6 +4,7 @@ import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { initKtxProject, type KtxLocalProject, loadKtxProject } from '../project/index.js';
import type { SqlAnalysisPort } from '../sql-analysis/index.js';
import type { HistoricSqlReader } from './adapters/historic-sql/types.js';
import { LocalLookerRuntimeStore } from './adapters/looker/local-runtime-store.js';
import { createDefaultLocalIngestAdapters, localPullConfigForAdapter } from './local-adapters.js';
@ -92,6 +93,9 @@ describe('local ingest adapters', () => {
literalSlots: [],
};
},
async analyzeBatch() {
return new Map();
},
};
const adapters = createDefaultLocalIngestAdapters(project, {
historicSql: {
@ -107,6 +111,44 @@ describe('local ingest adapters', () => {
expect(adapters.map((adapter) => adapter.source)).toContain('historic-sql');
expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.fetch).toBeTypeOf('function');
expect(adapters.find((adapter) => adapter.source === 'historic-sql')?.skillNames).toEqual([
'historic_sql_table_digest',
'historic_sql_patterns',
]);
});
// Verifies that the historic-sql adapter is registered when the caller supplies its own
// `reader` and `queryClient` and passes no `postgresQueryClient`.
it('registers historic-sql with an injected non-Postgres reader and query client', () => {
// Stub reader: the probe reports nothing and the aggregated stream yields no rows.
const reader: HistoricSqlReader = {
async probe() {
return { warnings: [], info: [] };
},
async *fetchAggregated() {},
};
// Stub query client that always resolves to an empty result set.
const queryClient = { executeQuery: async () => ({ headers: [], rows: [], totalRows: 0 }) };
const adapters = createDefaultLocalIngestAdapters(project, {
historicSql: {
sqlAnalysis: {
async analyzeForFingerprint(sql) {
return {
fingerprint: 'fp',
normalizedSql: sql,
tablesTouched: [],
literalSlots: [],
};
},
async analyzeBatch() {
return new Map();
},
},
reader,
queryClient,
},
});
const adapter = adapters.find((candidate) => candidate.source === 'historic-sql');
expect(adapter).toBeDefined();
expect(adapter?.fetch).toBeTypeOf('function');
});
it('builds Postgres historic-sql pull config from a local connection', async () => {
@ -121,6 +163,9 @@ describe('local ingest adapters', () => {
literalSlots: [],
};
},
async analyzeBatch() {
return new Map();
},
},
postgresQueryClient: {
async executeQuery() {
@ -146,11 +191,14 @@ describe('local ingest adapters', () => {
await expect(localPullConfigForAdapter(postgresProject, historicSql!, 'warehouse')).resolves.toEqual({
dialect: 'postgres',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: ['^svc_'],
minExecutions: 7,
concurrency: 12,
filters: {
serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' },
dropTrivialProbes: true,
},
redactionPatterns: [],
maxTemplatesPerRun: 123,
minCalls: 7,
staleArchiveAfterDays: 90,
});
});
@ -166,6 +214,9 @@ describe('local ingest adapters', () => {
literalSlots: [],
};
},
async analyzeBatch() {
return new Map();
},
},
postgresQueryClient: {
async executeQuery() {

View file

@ -6,11 +6,11 @@ import type { SqlAnalysisPort } from '../sql-analysis/index.js';
import { DbtSourceAdapter } from './adapters/dbt/dbt.adapter.js';
import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js';
import { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.adapter.js';
import { PostgresPgssQueryHistoryReader } from './adapters/historic-sql/postgres-pgss-query-history-reader.js';
import { SnowflakeHistoricSqlQueryHistoryReader } from './adapters/historic-sql/snowflake-query-history-reader.js';
import { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js';
import {
HISTORIC_SQL_SOURCE_KEY,
historicSqlPullConfigSchema,
historicSqlUnifiedPullConfigSchema,
type HistoricSqlReader,
type KtxPostgresQueryClient,
} from './adapters/historic-sql/types.js';
import {
@ -43,7 +43,9 @@ export interface DefaultLocalIngestAdaptersOptions {
databaseIntrospection?: Omit<DaemonLiveDatabaseIntrospectionOptions, 'connections' | 'baseUrl'>;
historicSql?: {
sqlAnalysis: SqlAnalysisPort;
postgresQueryClient: KtxPostgresQueryClient;
reader?: HistoricSqlReader;
queryClient?: unknown;
postgresQueryClient?: KtxPostgresQueryClient;
postgresBaselineRootDir?: string;
now?: () => Date;
};
@ -91,18 +93,16 @@ export function createDefaultLocalIngestAdapters(
];
if (options.historicSql) {
const queryClient = options.historicSql.queryClient ?? options.historicSql.postgresQueryClient;
if (!queryClient) {
throw new Error('Historic SQL local adapter requires queryClient or postgresQueryClient');
}
adapters.push(
new HistoricSqlSourceAdapter({
sqlAnalysis: options.historicSql.sqlAnalysis,
reader: new SnowflakeHistoricSqlQueryHistoryReader(),
queryClient: {
executeQuery: async () => {
throw new Error('Local historic-SQL currently supports Postgres pg_stat_statements only');
},
},
postgresReader: new PostgresPgssQueryHistoryReader(),
postgresQueryClient: options.historicSql.postgresQueryClient,
postgresBaselineRootDir: options.historicSql.postgresBaselineRootDir,
reader: options.historicSql.reader ?? new PostgresPgssReader(),
queryClient,
legacyPostgresBaselineRootDir: options.historicSql.postgresBaselineRootDir,
now: options.historicSql.now,
}),
);
@ -180,9 +180,8 @@ export async function localPullConfigForAdapter(
if (historicSql?.enabled !== true) {
throw new Error(`Connection "${connectionId}" does not have historicSql.enabled: true`);
}
return historicSqlPullConfigSchema.parse({
return historicSqlUnifiedPullConfigSchema.parse({
...historicSql,
lastSuccessfulCursor: stringField(historicSql.lastSuccessfulCursor),
});
}
if (adapter.source === 'looker') {

View file

@ -2,6 +2,7 @@ import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import Database from 'better-sqlite3';
import YAML from 'yaml';
import { AgentRunnerService } from '../agent/index.js';
import { initKtxProject, type KtxLocalProject, loadKtxProject } from '../project/index.js';
import { makeLocalGitRepo } from '../test/make-local-git-repo.js';
@ -10,6 +11,7 @@ import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js';
import { LocalLookerRuntimeStore } from './adapters/looker/local-runtime-store.js';
import { createDefaultLocalIngestAdapters, localPullConfigForAdapter } from './local-adapters.js';
import { getLocalIngestStatus, runLocalIngest } from './local-ingest.js';
import type { ChunkResult, DiffSet, SourceAdapter } from './types.js';
class TestAgentRunner extends AgentRunnerService {
override runLoop = vi.fn().mockResolvedValue({ stopReason: 'natural' as const });
@ -86,6 +88,70 @@ class WikiWritingAgentRunner extends AgentRunnerService {
}
}
/**
 * Agent-runner test double that replaces the agent loop with a scripted tool call.
 *
 * When `runLoop` is invoked for the `historic-sql-table-public-orders` unit of an
 * `ingest-bundle-wu` operation, it calls the `emit_historic_sql_evidence` tool from the supplied
 * tool set with a fixed `table_usage` payload. It throws if the tool is missing or if the tool's
 * reply does not contain the expected acknowledgement text. Any other invocation does nothing.
 * Every successful call resolves to `{ stopReason: 'natural' }`.
 */
class HistoricSqlEvidenceAgentRunner extends AgentRunnerService {
  override runLoop = vi.fn(async (params: any) => {
    const tags = params.telemetryTags;
    const targetsOrdersWorkUnit =
      tags?.operationName === 'ingest-bundle-wu' && tags?.unitKey === 'historic-sql-table-public-orders';
    if (!targetsOrdersWorkUnit) {
      return { stopReason: 'natural' as const };
    }

    const evidenceTool = params.toolSet.emit_historic_sql_evidence;
    if (!evidenceTool?.execute) {
      throw new Error('emit_historic_sql_evidence tool was not available to the historic-SQL WorkUnit');
    }

    // Fixed evidence payload the scripted "agent" submits for public.orders.
    const tableUsageEvidence = {
      kind: 'table_usage',
      table: 'public.orders',
      rawPath: 'tables/public.orders.json',
      usage: {
        narrative: 'Orders are repeatedly queried by lifecycle status.',
        frequencyTier: 'high',
        commonFilters: ['status'],
        commonJoins: [],
        staleSince: null,
      },
    };
    const reply = String(await evidenceTool.execute(tableUsageEvidence, { toolCallId: 'historic-sql-evidence' }));
    if (!reply.includes('Recorded historic-SQL table_usage evidence')) {
      throw new Error(`Unexpected historic-SQL evidence result: ${reply}`);
    }

    return { stopReason: 'natural' as const };
  });

  /** Builds the runner with a placeholder LLM provider whose `getModel` returns an empty object. */
  constructor() {
    super({ llmProvider: { getModel: () => ({}) as never } as never });
  }
}
/**
 * Minimal `historic-sql` source adapter used by the local-ingest tests.
 *
 * It always detects successfully and always chunks into a single WorkUnit for
 * `tables/public.orders.json`, ignoring both the staged directory and the diff set.
 */
class HistoricSqlEvidenceTestAdapter implements SourceAdapter {
  readonly source = 'historic-sql';
  readonly skillNames = ['historic_sql_table_digest'];
  readonly reconcileSkillNames: string[] = [];
  readonly triageSupported = false;

  /** Always resolves to `true`. */
  async detect(): Promise<boolean> {
    return true;
  }

  /** Resolves to one fixed WorkUnit; both arguments are unused. */
  async chunk(_stagedDir: string, _diffSet?: DiffSet): Promise<ChunkResult> {
    return {
      workUnits: [
        {
          unitKey: 'historic-sql-table-public-orders',
          displayLabel: 'public.orders',
          rawFiles: ['tables/public.orders.json'],
          peerFileIndex: [],
          dependencyPaths: ['manifest.json'],
          notes:
            'Use historic_sql_table_digest. Read this table usage JSON and emit exactly one table_usage object with emit_historic_sql_evidence.',
        },
      ],
    };
  }
}
function makeLookerRuntimeClient() {
const lookerModels = {
models: [{ name: 'ecommerce', label: 'Ecommerce', explores: [{ name: 'orders', label: 'Orders' }] }],
@ -308,6 +374,90 @@ describe('canonical local ingest', () => {
}
});
// Runs a local historic-sql ingest with the scripted agent runner and the test adapter, then
// checks that the post-processor outcome in the report shows one merged table usage and that the
// usage narrative was written into the seeded schema shard.
it('runs historic-SQL evidence projection through the local bundle post-processor', async () => {
const projectDir = join(tempDir, 'historic-sql-project');
await initKtxProject({ projectDir, projectName: 'warehouse' });
// Overwrite ktx.yaml with a config that has a postgres `warehouse` connection and the historic-sql adapter.
await writeFile(
join(projectDir, 'ktx.yaml'),
[
'project: warehouse',
'connections:',
' warehouse:',
' driver: postgres',
'ingest:',
' adapters:',
' - historic-sql',
' embeddings:',
' backend: deterministic',
'storage:',
' state: sqlite',
' search: sqlite-fts5',
' git:',
' auto_commit: false',
' author: KTX Test <system@ktx.local>',
'',
].join('\n'),
'utf-8',
);
const historicProject = await loadKtxProject({ projectDir });
// Seed the schema shard that the final assertion reads back.
await historicProject.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',
YAML.stringify({ tables: { orders: { table: 'public.orders', columns: [{ name: 'id', type: 'string' }] } } }),
'KTX Test',
'system@ktx.local',
'Seed schema shard',
);
// Build the source directory handed to runLocalIngest: manifest, one table file, empty patterns input.
const sourceDir = join(tempDir, 'historic-sql-source');
await mkdir(join(sourceDir, 'tables'), { recursive: true });
await writeFile(
join(sourceDir, 'manifest.json'),
`${JSON.stringify(
{
source: 'historic-sql',
connectionId: 'warehouse',
dialect: 'postgres',
fetchedAt: '2026-05-11T00:00:00.000Z',
windowStart: '2026-02-10T00:00:00.000Z',
windowEnd: '2026-05-11T00:00:00.000Z',
snapshotRowCount: 1,
touchedTableCount: 1,
parseFailures: 0,
warnings: [],
probeWarnings: [],
staleArchiveAfterDays: 90,
},
null,
2,
)}\n`,
'utf-8',
);
await writeFile(join(sourceDir, 'tables/public.orders.json'), '{"table":"public.orders"}\n', 'utf-8');
await writeFile(join(sourceDir, 'patterns-input.json'), '{"templates":[]}\n', 'utf-8');
const agentRunner = new HistoricSqlEvidenceAgentRunner();
const result = await runLocalIngest({
project: historicProject,
adapters: [new HistoricSqlEvidenceTestAdapter()],
adapter: 'historic-sql',
connectionId: 'warehouse',
sourceDir,
jobId: 'historic-sql-local-projection',
agentRunner,
});
expect(result.result.failedWorkUnits).toEqual([]);
expect(result.report.body.postProcessor).toMatchObject({
sourceKey: 'historic-sql',
status: 'success',
result: { tableUsageMerged: 1 },
touchedSources: [{ connectionId: 'warehouse', sourceName: 'orders' }],
});
// The narrative is the one the scripted agent runner passes to emit_historic_sql_evidence.
await expect(readFile(join(projectDir, 'semantic-layer/warehouse/_schema/public.yaml'), 'utf-8')).resolves.toContain(
'Orders are repeatedly queried by lifecycle status.',
);
});
it('rejects direct Metabase scheduled pulls before requiring a local ingest LLM provider', async () => {
const projectDir = join(tempDir, 'metabase-project');
await initKtxProject({ projectDir, projectName: 'warehouse' });

View file

@ -2,6 +2,7 @@ import { mkdirSync } from 'node:fs';
import { join } from 'node:path';
import { fileURLToPath } from 'node:url';
import type { KtxLlmProvider } from '@ktx/llm';
import type { Tool } from 'ai';
import YAML from 'yaml';
import type { AgentRunnerService } from '../agent/index.js';
import { AgentRunnerService as DefaultAgentRunnerService } from '../agent/index.js';
@ -70,6 +71,8 @@ import {
ContextCandidateCarryforwardService,
CuratorPaginationService,
} from './context-candidates/index.js';
import { createEmitHistoricSqlEvidenceTool } from './adapters/historic-sql/evidence-tool.js';
import { HistoricSqlProjectionPostProcessor } from './adapters/historic-sql/post-processor.js';
import { ContextEvidenceIndexService, SqliteContextEvidenceStore } from './context-evidence/index.js';
import { DiffSetService } from './diff-set.service.js';
import { IngestBundleRunner } from './ingest-bundle.runner.js';
@ -439,10 +442,16 @@ class NoopKnowledgeEventPort implements KnowledgeEventPort {
}
class LocalIngestToolSet implements IngestToolsetLike {
constructor(private readonly tools: BaseTool[]) {}
constructor(
private readonly tools: BaseTool[],
private readonly sourceTools: Record<string, Tool> = {},
) {}
toAiSdkTools(context: ToolContext) {
return Object.fromEntries(this.tools.map((tool) => [tool.name, tool.toAiSdkTool(context)]));
return {
...Object.fromEntries(this.tools.map((tool) => [tool.name, tool.toAiSdkTool(context)])),
...this.sourceTools,
};
}
}
@ -510,9 +519,19 @@ class LocalIngestToolsetFactory implements IngestToolsetFactoryPort {
];
}
createIngestWuToolset(_session: ToolSession, options?: { includeContextEvidenceTools?: boolean }): IngestToolsetLike {
createIngestWuToolset(session: ToolSession, options?: { includeContextEvidenceTools?: boolean }): IngestToolsetLike {
const sourceTools: Record<string, Tool> =
session.ingest?.sourceKey === 'historic-sql'
? {
emit_historic_sql_evidence: createEmitHistoricSqlEvidenceTool({
connectionId: session.connectionId,
session,
}),
}
: {};
return new LocalIngestToolSet(
options?.includeContextEvidenceTools ? [...this.baseTools, ...this.contextTools] : this.baseTools,
sourceTools,
);
}
}
@ -668,6 +687,9 @@ export function createLocalBundleIngestRuntime(
settings: { batchSize: 8, maxPasses: 8, stepBudgetPerPass: 60 },
logger,
}),
postProcessors: {
'historic-sql': new HistoricSqlProjectionPostProcessor(),
},
logger,
};

View file

@ -1,4 +1,4 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
@ -120,14 +120,6 @@ describe('PageTriageService', () => {
await rm(stagedDir, { recursive: true, force: true });
});
/**
 * Extracts and parses the JSON payload embedded in a classifier prompt.
 *
 * The payload is the text between a `<signals>` line and the following `</signals>` line.
 *
 * @param prompt Full classifier prompt text.
 * @returns The parsed JSON value of the first `<signals>` block.
 * @throws Error when the prompt has no `<signals>` block; `JSON.parse` errors propagate unchanged.
 */
function parseSignalsFromClassifierPrompt(prompt: string): unknown {
  const signalsBlock = prompt.match(/<signals>\n([\s\S]*?)\n<\/signals>/);
  if (signalsBlock === null) {
    throw new Error('classifier prompt did not include a <signals> block');
  }
  const [, signalsJson] = signalsBlock;
  return JSON.parse(signalsJson);
}
it('writes light-lane candidates and keeps the page out of full WorkUnits', async () => {
generateTextMock
.mockResolvedValueOnce({ text: JSON.stringify({ lane: 'light', reason: 'short durable policy' }) } as any)
@ -282,163 +274,6 @@ describe('PageTriageService', () => {
expect(repository.setDocumentTriageLane).toHaveBeenCalledWith('run-1', 'pages/page-1/page.md', 'light');
});
it.each([
{
name: 'skip low solo template',
propertyHints: {
executions_bucket: 'low',
distinct_users_bucket: 'solo',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 1 runtime',
},
expectedLane: 'skip',
expectedReport: { skip: 1, light: 0, full: 0 },
},
{
name: 'light service-account-only template',
propertyHints: {
executions_bucket: 'high',
distinct_users_bucket: 'solo',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'true',
slot_summary: '1 constant, 0 runtime',
},
expectedLane: 'light',
expectedReport: { skip: 0, light: 1, full: 0 },
},
{
name: 'full shared human template',
propertyHints: {
executions_bucket: 'high',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '2 constant, 1 runtime',
},
expectedLane: 'full',
expectedReport: { skip: 0, light: 0, full: 1 },
},
] as const)('triages historic-SQL synthetic signal fixture as $expectedLane for $name', async ({
name,
propertyHints,
expectedLane,
expectedReport,
}) => {
const externalId = name.replace(/[^a-z0-9]+/g, '_');
const templateDir = join(stagedDir, 'templates', externalId);
await mkdir(templateDir, { recursive: true });
await writeFile(
join(templateDir, 'metadata.json'),
JSON.stringify({
id: externalId,
title: `snowflake - analytics.orders [${externalId.slice(0, 6)}]`,
path: `templates/${externalId}/page.md`,
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: externalId,
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
triage_signals: propertyHints,
},
}),
'utf-8',
);
await writeFile(
join(templateDir, 'page.md'),
[
`# ${externalId}`,
'',
'## Normalized SQL',
'```sql',
'SELECT count(*) FROM analytics.orders WHERE status = ?',
'```',
'',
'## Tables touched',
'- analytics.orders',
].join('\n'),
'utf-8',
);
adapter.getTriageSignals.mockResolvedValueOnce({
objectType: 'historic_sql_template',
lastEditedAt: '2026-05-04T12:00:00.000Z',
propertyHints,
});
promptService.loadPrompt.mockImplementation((promptName: string) => {
if (promptName === 'skills/page_triage_classifier') {
return readFile(new URL('../../../prompts/skills/page_triage_classifier.md', import.meta.url), 'utf-8');
}
return Promise.resolve(`prompt:${promptName}`);
});
generateTextMock.mockImplementationOnce((args: any) => {
const prompt = args.messages[0].content as string;
expect(prompt).toContain('signals.objectType === "historic_sql_template"');
expect(prompt).toContain('executions_bucket=low AND distinct_users_bucket=solo');
expect(prompt).toContain('service_account_only=true AND below the frequency floor');
expect(prompt).toContain('shared human usage with mid or high execution volume');
expect(parseSignalsFromClassifierPrompt(prompt)).toEqual({
objectType: 'historic_sql_template',
lastEditedAt: '2026-05-04T12:00:00.000Z',
propertyHints,
});
return { text: JSON.stringify({ lane: expectedLane, reason: `${name} fixture` }) } as any;
});
if (expectedLane === 'light') {
generateTextMock.mockResolvedValueOnce({
text: JSON.stringify({
candidates: [
{
candidateKey: 'historic-sql-service-account-template',
topic: 'Historic SQL Service Account Template',
assertion: 'A service-account-only historic SQL template can remain as light evidence.',
rationale: 'The synthetic historic-SQL fixture is service-account-only and below the frequency floor.',
evidenceChunkIds: ['00000000-0000-0000-0000-000000000101'],
suggestedPageKey: 'historic-sql-service-account-template',
actionHint: 'create',
durabilityScore: 2,
authorityScore: 1,
reuseScore: 2,
noveltyScore: 1,
riskScore: 0,
},
],
}),
} as any);
}
const result = await service.triageRun({
stagedDir,
runId: 'run-1',
connectionId: 'conn-1',
sourceKey: 'historic-sql',
syncId: 'sync-1',
jobId: 'job-1',
diffSet: {
added: [`templates/${externalId}/metadata.json`, `templates/${externalId}/page.md`],
modified: [],
deleted: [],
unchanged: [],
},
adapter: adapter as any,
});
expect(result.report).toMatchObject({ pageCount: 1, ...expectedReport });
expect(repository.setDocumentTriageLane).toHaveBeenCalledWith(
'run-1',
`templates/${externalId}/page.md`,
expectedLane,
);
expect(result.fullRawPaths.has(`templates/${externalId}/metadata.json`)).toBe(expectedLane === 'full');
expect(result.fullRawPaths.has(`templates/${externalId}/page.md`)).toBe(expectedLane === 'full');
});
it('triages Notion data-source row pages without reading data-source metadata as page markdown', async () => {
triageSettings.lightExtractionEnabled = false;

View file

@ -79,6 +79,50 @@ export interface IngestReportSnapshot {
createdAt: string;
}
/** Saved-item totals for an ingest run, split by target. */
export interface IngestSavedMemoryCounts {
/** Total for the wiki side: actions whose target is `'wiki'` plus any wiki-side post-processor counters. */
wikiCount: number;
/** Total for the `'sl'` side: actions whose target is `'sl'` plus any sl-side post-processor counters. */
slCount: number;
}
/**
 * Reads one counter from a loosely typed result record.
 *
 * @param result Record to read from.
 * @param field Name of the property holding the counter.
 * @returns The property value when it is a finite number greater than zero; otherwise `0`
 *   (missing field, non-number, `NaN`, infinity, zero or negative).
 */
function numericResultField(result: Record<string, unknown>, field: string): number {
  const candidate = result[field];
  if (typeof candidate !== 'number') {
    return 0;
  }
  if (!Number.isFinite(candidate) || candidate <= 0) {
    return 0;
  }
  return candidate;
}
/**
 * Converts a post-processor outcome into saved wiki / sl counts.
 *
 * Only outcomes whose `sourceKey` is `'historic-sql'` and whose `result` is a plain (non-array)
 * object contribute; every other input yields zero for both counts.
 *
 * @param postProcessor Post-processor outcome of an ingest run, if there was one.
 * @returns `wikiCount` is the sum of `patternPagesWritten`, `stalePatternPagesMarked`,
 *   `archivedPatternPages` and `legacyPagesDeleted`; `slCount` is the sum of `tableUsageMerged`
 *   and `staleTablesMarked`. Each field is read through `numericResultField`.
 */
export function postProcessorSavedMemoryCounts(
  postProcessor: IngestReportPostProcessorOutcome | undefined,
): IngestSavedMemoryCounts {
  if (postProcessor?.sourceKey !== 'historic-sql') {
    return { wikiCount: 0, slCount: 0 };
  }

  const { result } = postProcessor;
  if (typeof result !== 'object' || result === null || Array.isArray(result)) {
    return { wikiCount: 0, slCount: 0 };
  }

  const counters = result as Record<string, unknown>;
  const sumOf = (fields: readonly string[]): number =>
    fields.reduce((total, field) => total + numericResultField(counters, field), 0);

  const wikiCount = sumOf([
    'patternPagesWritten',
    'stalePatternPagesMarked',
    'archivedPatternPages',
    'legacyPagesDeleted',
  ]);
  const slCount = sumOf(['tableUsageMerged', 'staleTablesMarked']);
  return { wikiCount, slCount };
}
/**
 * Computes the saved wiki / sl totals for a finished ingest report.
 *
 * The totals combine the work-unit actions recorded in the report body (counted by their
 * `target`) with the counts that `postProcessorSavedMemoryCounts` derives from the report's
 * post-processor outcome.
 *
 * @param report Ingest report snapshot to summarize.
 * @returns Combined counts per target.
 */
export function savedMemoryCountsForReport(report: IngestReportSnapshot): IngestSavedMemoryCounts {
  let directWikiCount = 0;
  let directSlCount = 0;
  for (const workUnit of report.body.workUnits) {
    for (const action of workUnit.actions) {
      if (action.target === 'wiki') {
        directWikiCount += 1;
      } else if (action.target === 'sl') {
        directSlCount += 1;
      }
    }
  }

  const fromPostProcessor = postProcessorSavedMemoryCounts(report.body.postProcessor);
  return {
    wikiCount: directWikiCount + fromPostProcessor.wikiCount,
    slCount: directSlCount + fromPostProcessor.slCount,
  };
}
export function buildStageIndexFromReportBody(jobId: string, connectionId: string, body: IngestReportBody): StageIndex {
return {
jobId,

View file

@ -520,6 +520,54 @@ describe('createLocalProjectMcpContextPorts', () => {
});
});
// Seeds a schema shard whose `orders` table carries a `usage` block, then checks that
// `listSources` with a query taken from the usage narrative returns that source with its
// `frequencyTier`, a snippet containing a `<mark>` tag, a numeric score and a `lexical` match reason.
it('returns historic SQL usage frequency and snippet through semantic-layer list search', async () => {
const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' });
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',
`tables:
orders:
table: public.orders
usage:
narrative: Analysts inspect paid order lifecycle by customer segment.
frequencyTier: high
commonFilters:
- status
commonGroupBys:
- customer_segment
commonJoins:
- table: public.customers
on:
- customer_id
columns:
- name: order_id
type: string
- name: status
type: string
`,
'ktx',
'ktx@example.com',
'Seed usage-backed manifest shard',
);
const ports = createLocalProjectMcpContextPorts(project);
// The query text 'paid order lifecycle' appears in the seeded usage narrative.
await expect(
ports.semanticLayer?.listSources({ connectionId: 'warehouse', query: 'paid order lifecycle' }),
).resolves.toEqual({
sources: [
expect.objectContaining({
connectionId: 'warehouse',
connectionName: 'warehouse',
name: 'orders',
frequencyTier: 'high',
snippet: expect.stringContaining('<mark>'),
score: expect.any(Number),
matchReasons: expect.arrayContaining(['lexical']),
}),
],
totalSources: 1,
});
});
it('uses configured local embeddings for semantic-layer search when available', async () => {
const project = await initKtxProject({ projectDir: tempDir, projectName: 'warehouse' });
project.config.ingest.embeddings = { backend: 'none', dimensions: 2 };

View file

@ -479,6 +479,8 @@ export function createLocalProjectMcpContextPorts(
columnCount: source.columnCount,
measureCount: source.measureCount,
joinCount: source.joinCount,
...(hasSlSearchMetadata(source) && source.frequencyTier ? { frequencyTier: source.frequencyTier } : {}),
...(hasSlSearchMetadata(source) && source.snippet ? { snippet: source.snippet } : {}),
...(hasSlSearchMetadata(source) ? { score: source.score } : {}),
...(hasSlSearchMetadata(source) && source.matchReasons ? { matchReasons: source.matchReasons } : {}),
...(hasSlSearchMetadata(source) && source.dictionaryMatches

View file

@ -1,4 +1,4 @@
import type { IngestReportSnapshot, MemoryFlowReplayInput } from '../ingest/index.js';
import type { IngestReportSnapshot, MemoryFlowReplayInput, TableUsageOutput } from '../ingest/index.js';
import type { MemoryCaptureService } from '../memory/index.js';
import type { KtxScanMode, KtxScanReport } from '../scan/index.js';
import type {
@ -131,6 +131,8 @@ export interface KtxSemanticLayerSourceSummary {
columnCount: number;
measureCount: number;
joinCount: number;
frequencyTier?: TableUsageOutput['frequencyTier'];
snippet?: string;
score?: number;
matchReasons?: SlSearchMatchReason[];
dictionaryMatches?: SlDictionaryMatch[];

View file

@ -15,7 +15,8 @@ const expectedSkillHeadings: Record<string, string> = {
sl_capture: '# Semantic Layer',
};
const expectedAdapterSkillHeadings: Record<string, string> = {
historic_sql_ingest: '# Historic SQL Ingest',
historic_sql_patterns: '# Historic SQL Patterns',
historic_sql_table_digest: '# Historic SQL Table Digest',
live_database_ingest: '# Live Database Ingest',
looker_ingest: '# Looker Runtime Ingest',
lookml_ingest: '# LookML to KTX Semantic Layer',

View file

@ -232,14 +232,17 @@ describe('@ktx/context package exports', () => {
expect(ingest.HistoricSqlSourceAdapter).toBeTypeOf('function');
expect(ingest.SnowflakeHistoricSqlQueryHistoryReader).toBeTypeOf('function');
expect(ingest.BigQueryHistoricSqlQueryHistoryReader).toBeTypeOf('function');
expect(ingest.PostgresPgssQueryHistoryReader).toBeTypeOf('function');
expect(ingest.stagePgStatStatementsTemplates).toBeTypeOf('function');
expect(ingest.pgssBaselinePath).toBeTypeOf('function');
expect(ingest.readPgssBaseline).toBeTypeOf('function');
expect(ingest.writePgssBaselineAtomic).toBeTypeOf('function');
expect(ingest.PostgresPgssReader).toBeTypeOf('function');
expect(ingest.HistoricSqlExtensionMissingError).toBeTypeOf('function');
expect(ingest.HistoricSqlVersionUnsupportedError).toBeTypeOf('function');
expect(ingest.HISTORIC_SQL_SOURCE_KEY).toBe('historic-sql');
expect(ingest.historicSqlUnifiedPullConfigSchema).toBeDefined();
expect(ingest.aggregatedTemplateSchema).toBeDefined();
expect(ingest.stagedTableInputSchema).toBeDefined();
expect(ingest.historicSqlEvidenceEnvelopeSchema).toBeDefined();
expect(ingest.historicSqlEvidencePath).toBeTypeOf('function');
expect(ingest.createEmitHistoricSqlEvidenceTool).toBeTypeOf('function');
expect(ingest.HistoricSqlProjectionPostProcessor).toBeTypeOf('function');
expect(ingest.SqliteContextEvidenceStore).toBeTypeOf('function');
expect(ingest.SqliteBundleIngestStore).toBeTypeOf('function');
expect(ingest.CuratorPaginationService).toBeTypeOf('function');

View file

@ -742,6 +742,13 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
orders: {
table: 'public.orders',
descriptions: { user: 'Pinned structural description', ai: 'Old generated text' },
usage: {
narrative: 'Orders are commonly filtered by lifecycle status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
ownerNote: 'Preserve analyst note',
},
columns: [
{
name: 'id',
@ -797,6 +804,7 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
tables: {
orders: {
descriptions: Record<string, string>;
usage?: Record<string, unknown>;
columns: Array<{ name: string; descriptions?: Record<string, string> }>;
joins: Array<{ to: string; on: string; source: string }>;
};
@ -807,6 +815,13 @@ describe('writeLocalScanEnrichmentArtifacts', () => {
user: 'Pinned structural description',
db: 'DB orders table',
});
expect(manifest.tables.orders.usage).toEqual({
narrative: 'Orders are commonly filtered by lifecycle status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
ownerNote: 'Preserve analyst note',
});
expect(manifest.tables.orders.columns.find((column) => column.name === 'id')?.descriptions).toEqual({
user: 'Pinned structural id',
db: 'DB order id',

View file

@ -6,6 +6,7 @@ import {
type LiveDatabaseManifestJoinEntry,
type LiveDatabaseManifestShard,
type LiveDatabaseManifestTableData,
type TableUsageOutput,
} from '../ingest/index.js';
import type { KtxScanRelationshipConfig } from '../project/config.js';
import type { KtxLocalProject } from '../project/index.js';
@ -56,6 +57,7 @@ export interface WriteLocalScanEnrichmentArtifactsResult extends WriteLocalScanM
interface ExistingManifestState {
descriptions: Map<string, LiveDatabaseManifestExistingDescriptions>;
preservedJoins: Map<string, LiveDatabaseManifestJoinEntry[]>;
usage: Map<string, TableUsageOutput>;
}
type LocalDescriptionUpdates = KtxLocalScanEnrichmentResult['descriptionUpdates'];
@ -196,6 +198,7 @@ async function loadExistingManifestState(
): Promise<ExistingManifestState> {
const descriptions = new Map<string, LiveDatabaseManifestExistingDescriptions>();
const preservedJoins = new Map<string, LiveDatabaseManifestJoinEntry[]>();
const usage = new Map<string, TableUsageOutput>();
const validTableNames = new Set(snapshot.tables.map((table) => table.name));
const columnsByTable = validColumns(snapshot);
@ -203,7 +206,7 @@ async function loadExistingManifestState(
try {
files = (await project.fileStore.listFiles(schemaDir(connectionId))).files.filter((file) => file.endsWith('.yaml'));
} catch {
return { descriptions, preservedJoins };
return { descriptions, preservedJoins, usage };
}
for (const file of files) {
@ -225,6 +228,9 @@ async function loadExistingManifestState(
),
),
});
if (entry.usage) {
usage.set(tableName, { ...entry.usage });
}
const joins = (entry.joins ?? []).filter((join) => {
return (
(join.source === 'manual' || join.source === 'inferred') &&
@ -241,7 +247,7 @@ async function loadExistingManifestState(
}
}
return { descriptions, preservedJoins };
return { descriptions, preservedJoins, usage };
}
async function writeJsonArtifact(
@ -276,6 +282,7 @@ export async function writeLocalScanManifestShards(
joins: relationshipJoins(input.snapshot, input.relationshipUpdate),
existingDescriptions: existing.descriptions,
existingPreservedJoins: existing.preservedJoins,
existingUsage: existing.usage,
mapColumnType: (dimensionType) => dimensionType,
});

View file

@ -187,6 +187,53 @@ describe('local semantic-layer helpers', () => {
await expect(access(join(project.projectDir, '.ktx/db.sqlite'))).resolves.toBeUndefined();
});
// End-to-end check that `usage` metadata stored in a manifest shard is searchable.
// The test writes a schema shard for `orders` with a usage section, runs
// searchLocalSlSources against it, and expects the single hit to carry the
// table's frequency tier, a highlighted snippet, and a 'lexical' match reason.
it('searches historic SQL usage and returns frequency tier plus FTS snippet', async () => {
await project.fileStore.writeFile(
'semantic-layer/warehouse/_schema/public.yaml',
`tables:
orders:
table: public.orders
usage:
narrative: Analysts inspect paid order lifecycle by customer segment.
frequencyTier: high
commonFilters:
- status
- created_at
commonGroupBys:
- customer_segment
commonJoins:
- table: public.customers
on:
- customer_id
columns:
- name: order_id
type: string
- name: status
type: string
`,
'ktx',
'ktx@example.com',
'Add usage-backed manifest shard',
);
// The query terms appear only in the usage narrative above, not in any
// table or column name written by this test.
const results = await searchLocalSlSources(project, {
connectionId: 'warehouse',
query: 'paid lifecycle customer segment',
});
// Exactly one result is expected; objectContaining leaves other fields unchecked.
expect(results).toEqual([
expect.objectContaining({
connectionId: 'warehouse',
name: 'orders',
path: 'semantic-layer/warehouse/_schema/public.yaml#orders',
frequencyTier: 'high',
snippet: expect.stringContaining('<mark>'),
matchReasons: expect.arrayContaining(['lexical']),
}),
]);
// The snippet must include matched narrative text, not only the highlight markup.
expect(results[0]?.snippet).toContain('lifecycle');
});
it('searches all connections with one global hybrid ranking pass', async () => {
await writeLocalSlSource(project, {
connectionId: 'warehouse',

View file

@ -26,6 +26,8 @@ export interface LocalSlSourceSummary {
export interface LocalSlSourceSearchResult extends LocalSlSourceSummary {
score: number;
frequencyTier?: NonNullable<SemanticLayerSource['usage']>['frequencyTier'];
snippet?: string;
matchReasons?: SlSearchMatchReason[];
dictionaryMatches?: SlDictionaryMatch[];
lanes?: SlSearchLaneSummary[];
@ -367,6 +369,10 @@ function candidateKey(summary: LocalSlSourceSummary): string {
return `${summary.connectionId}/${summary.name}`;
}
/**
 * Picks the usage-derived fields that are copied onto a search result.
 *
 * @param source - Semantic-layer source whose optional `usage` block is inspected.
 * @returns `{ frequencyTier }` when the source has a truthy usage frequency tier,
 *   otherwise an empty object so spreading it adds no keys.
 */
function searchResultUsageFields(source: SemanticLayerSource): Pick<LocalSlSourceSearchResult, 'frequencyTier'> {
  const tier = source.usage?.frequencyTier;
  if (!tier) {
    return {};
  }
  return { frequencyTier: tier };
}
function tokenLaneCandidates(candidates: LocalSlSearchCandidate[], terms: readonly string[]) {
if (terms.length === 0) {
return [];
@ -483,6 +489,7 @@ export async function searchLocalSlSources(
...result.candidate.summary,
score: result.score,
matchReasons: ['token'],
...searchResultUsageFields(result.candidate.source),
}))
.sort(
(left, right) =>
@ -500,6 +507,7 @@ export async function searchLocalSlSources(
const finalLimit = input.limit ?? candidates.length;
const core = new HybridSearchCore();
const dictionaryEvidence = new Map<string, SlDictionaryMatch[]>();
const lexicalSnippets = new Map<string, string>();
const generators: SearchCandidateGenerator[] = [
{
@ -510,6 +518,11 @@ export async function searchLocalSlSources(
queryText: args.queryText,
limit: args.laneCandidatePoolLimit,
});
for (const row of rows) {
if (row.snippet) {
lexicalSnippets.set(row.id, row.snippet);
}
}
return {
candidates: rows.map((row) => ({ id: row.id, rank: row.rank, rawScore: row.rawScore })),
};
@ -584,9 +597,12 @@ export async function searchLocalSlSources(
continue;
}
const dictionaryMatches = dictionaryEvidence.get(fused.id);
const snippet = lexicalSnippets.get(fused.id);
hydrated.push({
...candidate.summary,
score: fused.score,
...searchResultUsageFields(candidate.source),
...(snippet ? { snippet } : {}),
matchReasons: fused.matchReasons as SlSearchMatchReason[],
...(dictionaryMatches && dictionaryMatches.length > 0 ? { dictionaryMatches } : {}),
lanes: result.lanes,

View file

@ -554,9 +554,11 @@ export async function searchLocalSlSourcesWithPglitePrototype(
continue;
}
const dictionaryMatches = dictionaryEvidence.get(result.id);
const frequencyTier = candidate.source.usage?.frequencyTier;
hydrated.push({
...candidate.summary,
score: result.score,
...(frequencyTier ? { frequencyTier } : {}),
matchReasons: result.matchReasons as SlSearchMatchReason[],
...(dictionaryMatches && dictionaryMatches.length > 0 ? { dictionaryMatches } : {}),
lanes: fused.lanes,

View file

@ -49,5 +49,5 @@ export interface SlSourcesIndexPort {
queryText: string,
limit: number,
minRrfScore?: number,
): Promise<Array<{ sourceName: string; rrfScore: number }>>;
): Promise<Array<{ sourceName: string; rrfScore: number; snippet?: string }>>;
}

View file

@ -1,4 +1,5 @@
import { z } from 'zod';
import { tableUsageOutputSchema } from '../ingest/adapters/historic-sql/skill-schemas.js';
// Literal vocabularies — kept in lockstep with the Python Pydantic model at
// python/ktx-sl/semantic_layer/models.py (SourceColumn / ColumnRole /
@ -125,6 +126,7 @@ export const sourceDefinitionSchema = z
default_time_dimension: defaultTimeDimensionDbtSchema.optional(),
tags: sourceKeyedStringArraySchema.optional(),
freshness: sourceFreshnessSchema.optional(),
usage: tableUsageOutputSchema.optional(),
})
.strict()
.refine((s) => (s.table || s.sql) && !(s.table && s.sql), {
@ -145,6 +147,7 @@ export const sourceOverlaySchema = z
exclude_columns: z.array(z.string()).optional(),
disable_joins: z.array(z.string()).optional(),
default_time_dimension: defaultTimeDimensionDbtSchema.optional(),
usage: tableUsageOutputSchema.optional(),
})
.strict();

View file

@ -5,6 +5,7 @@ import {
composeOverlay,
enrichColumnsFromManifest,
findDanglingSegmentRefs,
projectManifestEntry,
SemanticLayerService,
} from './semantic-layer.service.js';
import { sourceDefinitionSchema } from './schemas.js';
@ -129,6 +130,39 @@ describe('composeOverlay', () => {
dbt: 'dbt description',
});
});
// composeOverlay and `usage`: an overlay that omits `usage` leaves the base
// value untouched, while an overlay that supplies `usage` replaces it as a whole
// (the expected result equals the overlay's usage exactly).
it('replaces manifest usage only when an overlay explicitly provides usage', () => {
const baseWithUsage: SemanticLayerSource = {
...baseTable,
usage: {
narrative: 'Orders are commonly queried by lifecycle status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
},
};
// Overlay without a `usage` key: base usage is preserved.
expect(composeOverlay(baseWithUsage, { name: 'fct_labs', measures: [] }).usage).toEqual(baseWithUsage.usage);
// Overlay with a `usage` key: the overlay's value wins.
const composed = composeOverlay(baseWithUsage, {
name: 'fct_labs',
usage: {
narrative: 'Overlay-curated usage note.',
frequencyTier: 'mid',
commonFilters: ['created_at'],
commonGroupBys: ['created_at'],
commonJoins: [],
},
});
expect(composed.usage).toEqual({
narrative: 'Overlay-curated usage note.',
frequencyTier: 'mid',
commonFilters: ['created_at'],
commonGroupBys: ['created_at'],
commonJoins: [],
});
});
});
describe('enrichColumnsFromManifest', () => {
@ -299,6 +333,61 @@ describe('sourceDefinitionSchema', () => {
dbt: { loaded_at_field: 'updated_at', raw: { warn_after: { count: 12, period: 'hour' } } },
});
});
// sourceDefinitionSchema must accept a `usage` block on a standalone
// (table-backed) source and return its fields, including `externalOwner`,
// in the parsed output.
it('accepts historic SQL usage on standalone sources', () => {
const result = sourceDefinitionSchema.safeParse({
name: 'orders',
table: 'public.orders',
grain: ['id'],
columns: [{ name: 'id', type: 'string' }],
joins: [],
measures: [],
usage: {
narrative: 'Orders are queried for fulfillment and revenue analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
externalOwner: 'analytics',
},
});
expect(result.success).toBe(true);
// Early return narrows `result` to the success variant for the type checker;
// the assertion above has already failed the test if parsing did not succeed.
if (!result.success) {
return;
}
// toMatchObject tolerates additional keys in the parsed usage object.
expect(result.data.usage).toMatchObject({
narrative: 'Orders are queried for fulfillment and revenue analysis.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
externalOwner: 'analytics',
});
});
});
// projectManifestEntry: a manifest table entry's `usage` block must appear
// unchanged on the projected semantic-layer source.
describe('projectManifestEntry', () => {
it('projects manifest usage onto the semantic-layer source', () => {
const source = projectManifestEntry('orders', {
table: 'public.orders',
usage: {
narrative: 'Orders are frequently filtered by status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
},
columns: [
{ name: 'id', type: 'string', pk: true },
{ name: 'status', type: 'string' },
],
});
// Deep equality: no usage field is added, dropped, or rewritten by the projection.
expect(source.usage).toEqual({
narrative: 'Orders are frequently filtered by status.',
frequencyTier: 'high',
commonFilters: ['status'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
});
});
});
describe('findManifestEntryByTableRef', () => {

View file

@ -1,6 +1,7 @@
import YAML from 'yaml';
import type { KtxFileStorePort, KtxLogger } from '../core/index.js';
import { noopLogger } from '../core/index.js';
import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js';
import type { SlConnectionCatalogPort, SlPythonPort } from './ports.js';
import { normalizeSemanticLayerDescriptions } from './description-normalization.js';
import { isOverlaySource, sourceDefinitionSchema, sourceOverlaySchema } from './schemas.js';
@ -884,6 +885,7 @@ export interface ManifestTableEntry {
joins?: ManifestJoinEntry[];
tags?: { dbt?: string[] };
freshness?: { dbt?: { raw?: unknown; loaded_at_field?: string | null } };
usage?: TableUsageOutput;
}
/** Migrate legacy flat description/db_description fields to a descriptions map. */
@ -930,6 +932,7 @@ export function projectManifestEntry(name: string, entry: ManifestTableEntry): S
measures: [],
...(entry.tags?.dbt?.length ? { tags: entry.tags } : {}),
...(entry.freshness?.dbt ? { freshness: entry.freshness } : {}),
...(entry.usage ? { usage: entry.usage } : {}),
};
}
@ -1005,6 +1008,7 @@ const COMPOSE_KNOWN_KEYS = new Set([
'exclude_columns',
'disable_joins',
'default_time_dimension',
'usage',
]);
export function composeOverlay(base: SemanticLayerSource, overlay: Record<string, unknown>): SemanticLayerSource {
@ -1028,6 +1032,10 @@ export function composeOverlay(base: SemanticLayerSource, overlay: Record<string
};
}
if (normalizedOverlay.usage !== undefined) {
result.usage = normalizedOverlay.usage as SemanticLayerSource['usage'];
}
// Filter out excluded columns
const excluded = new Set((normalizedOverlay.exclude_columns as string[] | undefined) ?? []);
let columns = result.columns.filter((c) => !excluded.has(c.name));

View file

@ -162,4 +162,65 @@ describe('SlSearchService', () => {
expect(text).toContain('loaded_at=updated_at');
expect(text).toContain('warn_after');
});
// buildSemanticLayerSourceSearchText must render every populated usage field
// into the indexed text, each with the label asserted below.
it('includes historic SQL usage in semantic-layer search text', () => {
const source: SemanticLayerSource = {
name: 'orders',
descriptions: { user: 'Customer orders' },
table: 'public.orders',
grain: ['order_id'],
columns: [{ name: 'order_id', type: 'string' }],
joins: [],
measures: [],
usage: {
narrative: 'Analysts inspect paid and refunded order lifecycle trends by customer segment.',
frequencyTier: 'high',
commonFilters: ['status', 'created_at'],
commonGroupBys: ['customer_segment'],
commonJoins: [{ table: 'public.customers', on: ['customer_id'] }],
staleSince: '2026-05-01T00:00:00.000Z',
},
};
const text = buildSemanticLayerSourceSearchText(source);
// One assertion per usage field: narrative, tier, filters, group-bys, joins, staleness.
expect(text).toContain('usage: Analysts inspect paid and refunded order lifecycle trends by customer segment.');
expect(text).toContain('frequency: high');
expect(text).toContain('commonly filtered by: status, created_at');
expect(text).toContain('commonly grouped by: customer_segment');
expect(text).toContain('commonly joined to public.customers on customer_id');
expect(text).toContain('stale since 2026-05-01T00:00:00.000Z');
});
// SlSearchService.search must pass the index's `snippet` through to its own
// result and expose the index's `rrfScore` as `score`.
it('preserves FTS snippets returned by the source index', async () => {
const service = new SlSearchService(
// First argument: stubbed embedding service; computeEmbedding resolves to a fixed vector.
{
maxBatchSize: 16,
computeEmbedding: vi.fn(async () => [1, 0]),
computeEmbeddingsBulk: vi.fn(),
},
// Second argument: stubbed sources index; only `search` returns data.
{
upsertSources: vi.fn(),
getExistingSearchTexts: vi.fn(),
deleteStale: vi.fn(),
deleteByConnection: vi.fn(),
deleteByConnectionAndName: vi.fn(),
search: vi.fn(async () => [
{
sourceName: 'orders',
rrfScore: 0.75,
snippet: 'usage: paid <mark>order</mark> lifecycle',
},
]),
},
);
await expect(service.search('warehouse', 'order lifecycle', 10)).resolves.toEqual([
{
sourceName: 'orders',
score: 0.75,
snippet: 'usage: paid <mark>order</mark> lifecycle',
},
]);
});
});

View file

@ -71,6 +71,24 @@ export function buildSemanticLayerSourceSearchText(
}
}
if (source.usage) {
const usage = source.usage;
parts.push(`usage: ${usage.narrative}`);
parts.push(`frequency: ${usage.frequencyTier}`);
if (usage.commonFilters.length > 0) {
parts.push(`commonly filtered by: ${usage.commonFilters.join(', ')}`);
}
if (usage.commonGroupBys?.length) {
parts.push(`commonly grouped by: ${usage.commonGroupBys.join(', ')}`);
}
for (const join of usage.commonJoins) {
parts.push(`commonly joined to ${join.table} on ${join.on.join(',')}`);
}
if (usage.staleSince) {
parts.push(`stale since ${usage.staleSince}`);
}
}
return parts.join('. ');
}
@ -150,7 +168,7 @@ export class SlSearchService {
query: string,
limit = 15,
minRrfScore = 0,
): Promise<Array<{ sourceName: string; score: number }>> {
): Promise<Array<{ sourceName: string; score: number; snippet?: string }>> {
let queryEmbedding: number[] | null = null;
try {
queryEmbedding = await this.embeddingService.computeEmbedding(query);
@ -161,7 +179,11 @@ export class SlSearchService {
}
const results = await this.slSourcesRepository.search(connectionId, queryEmbedding, query, limit, minRrfScore);
return results.map((r) => ({ sourceName: r.sourceName, score: r.rrfScore }));
return results.map((result) => ({
sourceName: result.sourceName,
score: result.rrfScore,
...(result.snippet ? { snippet: result.snippet } : {}),
}));
}
buildSearchText(source: SemanticLayerSource, priority: string[] = DEFAULT_PRIORITY): string {

View file

@ -17,7 +17,7 @@ describe('SqliteSlSourcesIndex', () => {
await rm(tempDir, { recursive: true, force: true });
});
it('creates SQLite tables and searches indexed source text', async () => {
it('creates SQLite tables and searches indexed source text with FTS snippets', async () => {
const index = new SqliteSlSourcesIndex({ dbPath });
await index.upsertSources('warehouse', [
@ -34,10 +34,24 @@ describe('SqliteSlSourcesIndex', () => {
]);
await expect(access(dbPath)).resolves.toBeUndefined();
expect(await index.search('warehouse', null, 'gross revenue', 10)).toEqual([
const directResults = await index.search('warehouse', null, 'gross revenue', 10);
expect(directResults).toEqual([
expect.objectContaining({
sourceName: 'orders',
rrfScore: expect.any(Number),
snippet: expect.stringContaining('<mark>'),
}),
]);
expect(directResults[0]?.snippet).toContain('revenue');
const lexicalCandidates = await index.searchLexicalCandidates({ queryText: 'gross revenue', limit: 10 });
expect(lexicalCandidates).toEqual([
expect.objectContaining({
id: 'warehouse/orders',
connectionId: 'warehouse',
sourceName: 'orders',
snippet: expect.stringContaining('<mark>'),
}),
]);
});

View file

@ -19,6 +19,7 @@ type SearchRow = {
connection_id?: string;
source_name: string;
rank: number;
snippet?: string | null;
};
export interface SlSqliteLaneCandidate {
@ -27,6 +28,7 @@ export interface SlSqliteLaneCandidate {
sourceName: string;
rank: number;
rawScore: number;
snippet?: string;
}
export interface SlSqliteDictionaryCandidate extends SlSqliteLaneCandidate {
@ -334,7 +336,11 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
const rows = this.db
.prepare(
`
SELECT connection_id, source_name, bm25(local_sl_sources_fts) AS rank
SELECT
connection_id,
source_name,
bm25(local_sl_sources_fts) AS rank,
snippet(local_sl_sources_fts, 2, '<mark>', '</mark>', '...', 12) AS snippet
FROM local_sl_sources_fts
WHERE local_sl_sources_fts MATCH ?
${connectionPredicate}
@ -350,6 +356,7 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
sourceName: row.source_name,
rank: index + 1,
rawScore: Number(row.rank),
...(typeof row.snippet === 'string' && row.snippet.length > 0 ? { snippet: row.snippet } : {}),
}));
}
@ -499,7 +506,7 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
queryText: string,
limit: number,
minRrfScore = 0,
): Promise<Array<{ sourceName: string; rrfScore: number }>> {
): Promise<Array<{ sourceName: string; rrfScore: number; snippet?: string }>> {
const ftsQuery = normalizeFtsQuery(queryText);
if (!ftsQuery) {
return [];
@ -508,7 +515,10 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
const rows = this.db
.prepare(
`
SELECT source_name, bm25(local_sl_sources_fts) AS rank
SELECT
source_name,
bm25(local_sl_sources_fts) AS rank,
snippet(local_sl_sources_fts, 2, '<mark>', '</mark>', '...', 12) AS snippet
FROM local_sl_sources_fts
WHERE connection_id = ?
AND local_sl_sources_fts MATCH ?
@ -519,7 +529,11 @@ export class SqliteSlSourcesIndex implements SlSourcesIndexPort {
.all(connectionId, ftsQuery, Math.max(1, limit)) as SearchRow[];
return rows
.map((row) => ({ sourceName: row.source_name, rrfScore: scoreFromRank(row.rank) }))
.map((row) => ({
sourceName: row.source_name,
rrfScore: scoreFromRank(row.rank),
...(typeof row.snippet === 'string' && row.snippet.length > 0 ? { snippet: row.snippet } : {}),
}))
.filter((row) => row.rrfScore >= minRrfScore);
}

View file

@ -1,3 +1,5 @@
import type { TableUsageOutput } from '../ingest/adapters/historic-sql/skill-schemas.js';
export interface SemanticLayerSource {
name: string;
descriptions?: Record<string, string>;
@ -42,6 +44,7 @@ export interface SemanticLayerSource {
default_time_dimension?: { dbt?: string };
tags?: { dbt?: string[] };
freshness?: { dbt?: { raw?: unknown; loaded_at_field?: string | null } };
usage?: TableUsageOutput;
}
export interface SemanticLayerQueryInput {

View file

@ -45,6 +45,85 @@ describe('createHttpSqlAnalysisPort', () => {
});
});
// analyzeBatch happy path: the port posts `{ dialect, items }` to
// '/sql/analyze-batch' and converts the snake_case `results` object into a Map
// of camelCase results keyed by the response's result keys. A per-item `error`
// (null or a message) is carried through as data rather than thrown.
it('calls the SQL batch endpoint and maps snake_case response fields into a Map', async () => {
// Stubbed transport: one successful item and one item reporting a parse error.
const requestJson = vi.fn(async () => ({
results: {
orders: {
tables_touched: ['public.orders', 'public.customers'],
columns_by_clause: {
select: ['status'],
where: ['created_at'],
join: ['customer_id', 'id'],
},
error: null,
},
broken: {
tables_touched: [],
columns_by_clause: {},
error: 'Invalid expression / Unexpected token',
},
},
}));
const port = createHttpSqlAnalysisPort({ baseUrl: 'http://python.test', requestJson });
await expect(
port.analyzeBatch(
[
{ id: 'orders', sql: 'select status from public.orders' },
{ id: 'broken', sql: 'select * from where' },
],
'postgres',
),
).resolves.toEqual(
new Map([
[
'orders',
{
tablesTouched: ['public.orders', 'public.customers'],
columnsByClause: {
select: ['status'],
where: ['created_at'],
join: ['customer_id', 'id'],
},
error: null,
},
],
[
'broken',
{
tablesTouched: [],
columnsByClause: {},
error: 'Invalid expression / Unexpected token',
},
],
]),
);
// The request body forwards the items untouched alongside the dialect.
expect(requestJson).toHaveBeenCalledWith('/sql/analyze-batch', {
dialect: 'postgres',
items: [
{ id: 'orders', sql: 'select status from public.orders' },
{ id: 'broken', sql: 'select * from where' },
],
});
});
// analyzeBatch must reject when a clause's column list contains a non-string
// value, and the error message must name the offending clause path.
it('rejects malformed SQL batch responses instead of inventing defaults', async () => {
const requestJson = vi.fn(async () => ({
results: {
orders: {
tables_touched: ['public.orders'],
// `where` holds a number, which violates the string[] contract.
columns_by_clause: { select: ['status'], where: [42] },
error: null,
},
},
}));
const port = createHttpSqlAnalysisPort({ baseUrl: 'http://python.test', requestJson });
await expect(port.analyzeBatch([{ id: 'orders', sql: 'select status from public.orders' }], 'postgres')).rejects
.toThrow('sql analysis response is missing string[] field columns_by_clause.where');
});
it('rejects malformed daemon responses instead of inventing defaults', async () => {
const requestJson = vi.fn(async () => ({
fingerprint: 'abc',

View file

@ -2,6 +2,8 @@ import { request as httpRequest } from 'node:http';
import { request as httpsRequest } from 'node:https';
import { URL } from 'node:url';
import type {
SqlAnalysisBatchItem,
SqlAnalysisBatchResult,
SqlAnalysisDialect,
SqlAnalysisFingerprintResult,
SqlAnalysisLiteralSlot,
@ -94,6 +96,14 @@ function requiredStringArray(raw: Record<string, unknown>, field: string): strin
return value;
}
/**
 * Reads `field` from a raw response object and requires it to be a non-null,
 * non-array object.
 *
 * @param raw - Parsed response object to read from.
 * @param field - Name of the property that must hold an object.
 * @returns The property value, typed as a string-keyed record.
 * @throws Error when the property is absent, null, a primitive, or an array.
 */
function requiredObject(raw: Record<string, unknown>, field: string): Record<string, unknown> {
  const candidate = raw[field];
  // typeof null is 'object', so null has to be excluded explicitly.
  const isRecord = typeof candidate === 'object' && candidate !== null && !Array.isArray(candidate);
  if (!isRecord) {
    throw new Error(`sql analysis response is missing object field ${field}`);
  }
  return candidate as Record<string, unknown>;
}
function isLiteralSlotType(value: unknown): value is SqlAnalysisLiteralSlotType {
return (
value === 'string' ||
@ -144,6 +154,39 @@ function mapResult(raw: Record<string, unknown>): SqlAnalysisFingerprintResult {
};
}
/**
 * Validates and copies the `columns_by_clause` object of a raw batch result.
 *
 * Clause names are passed through unchanged; every value must be an array of
 * strings.
 *
 * @param raw - One raw batch result from the analysis response.
 * @returns A clause-name to column-list mapping containing exactly the keys of
 *   `raw.columns_by_clause`.
 * @throws Error when `columns_by_clause` is not an object, or when any clause's
 *   value is not a `string[]`.
 */
function mapColumnsByClause(raw: Record<string, unknown>): SqlAnalysisBatchResult['columnsByClause'] {
  const clauses = requiredObject(raw, 'columns_by_clause');
  // Object.fromEntries defines own data properties. Plain `result[clause] = ...`
  // on an object literal would treat a "__proto__" key from the parsed response
  // as a prototype assignment, dropping that clause and replacing the result's
  // prototype with the array.
  return Object.fromEntries(
    Object.entries(clauses).map(([clause, columns]): [string, string[]] => {
      if (!Array.isArray(columns) || columns.some((item) => typeof item !== 'string')) {
        throw new Error(`sql analysis response is missing string[] field columns_by_clause.${clause}`);
      }
      return [clause, columns];
    }),
  );
}
/**
 * Converts one raw snake_case batch result into a {@link SqlAnalysisBatchResult}.
 *
 * @param raw - Raw result object for a single batch item.
 * @returns The camelCase result; `error` is included only when the value read
 *   from `raw.error` is not `undefined`.
 */
function mapBatchResult(raw: Record<string, unknown>): SqlAnalysisBatchResult {
  // Read `error` first so the helpers run in the same order as before.
  const error = optionalString(raw, 'error');
  const mapped: SqlAnalysisBatchResult = {
    tablesTouched: requiredStringArray(raw, 'tables_touched'),
    columnsByClause: mapColumnsByClause(raw),
  };
  if (error !== undefined) {
    mapped.error = error;
  }
  return mapped;
}
/**
 * Converts a raw batch-analysis response into a Map of mapped results.
 *
 * @param raw - Parsed response body; must contain an object-valued `results` field.
 * @returns A Map keyed by each key of `results`, in the response's key order.
 * @throws Error when `results` is not an object, when any entry is not a
 *   non-array object, or when an entry fails field validation.
 */
function mapBatchResponse(raw: Record<string, unknown>): Map<string, SqlAnalysisBatchResult> {
  const byId = new Map<string, SqlAnalysisBatchResult>();
  for (const [id, entry] of Object.entries(requiredObject(raw, 'results'))) {
    if (!entry || typeof entry !== 'object' || Array.isArray(entry)) {
      throw new Error(`sql analysis response contains invalid batch result ${id}`);
    }
    byId.set(id, mapBatchResult(entry as Record<string, unknown>));
  }
  return byId;
}
export function createHttpSqlAnalysisPort(options: HttpSqlAnalysisPortOptions): SqlAnalysisPort {
const requestJson = options.requestJson ?? postJson(options.baseUrl);
@ -155,5 +198,12 @@ export function createHttpSqlAnalysisPort(options: HttpSqlAnalysisPortOptions):
});
return mapResult(raw);
},
// Posts the whole batch to the analysis service in one request and maps the
// response into a Map of per-item results.
async analyzeBatch(items: SqlAnalysisBatchItem[], dialect: SqlAnalysisDialect) {
  const response = await requestJson('/sql/analyze-batch', { dialect, items });
  return mapBatchResponse(response);
},
};
}

View file

@ -1,6 +1,9 @@
export { createHttpSqlAnalysisPort } from './http-sql-analysis-port.js';
export type { HttpSqlAnalysisPortOptions, KtxSqlAnalysisHttpJsonRunner } from './http-sql-analysis-port.js';
export type {
SqlAnalysisBatchItem,
SqlAnalysisBatchResult,
SqlAnalysisClause,
SqlAnalysisDialect,
SqlAnalysisFingerprintResult,
SqlAnalysisLiteralSlot,

View file

@ -25,6 +25,23 @@ export interface SqlAnalysisFingerprintResult {
error?: string | null;
}
/**
 * Name of a SQL clause, used as a key of `SqlAnalysisBatchResult.columnsByClause`.
 *
 * The literals are the known clause names. The trailing `(string & {})` member
 * keeps the type open to any other string while the literal members stay
 * visible to tooling.
 */
export type SqlAnalysisClause = 'select' | 'where' | 'join' | 'groupBy' | 'having' | 'orderBy' | (string & {});
/** One SQL statement submitted to `SqlAnalysisPort.analyzeBatch`. */
export interface SqlAnalysisBatchItem {
// Caller-chosen identifier for this statement.
// NOTE(review): presumably the key under which the result is returned — confirm against the service.
id: string;
// SQL text to analyze.
sql: string;
}
/** Analysis outcome for a single statement of a batch. */
export interface SqlAnalysisBatchResult {
// Table references reported for the statement.
tablesTouched: string[];
// Column names grouped by the clause they were reported under; clauses may be absent.
columnsByClause: Partial<Record<SqlAnalysisClause, string[]>>;
// Per-statement failure message; may be null or omitted.
// NOTE(review): presumably null/omitted means the statement analyzed cleanly — confirm.
error?: string | null;
}
/** Port through which callers request SQL analysis for a given dialect. */
export interface SqlAnalysisPort {
// Analyzes a single statement and resolves to its fingerprint-oriented result.
analyzeForFingerprint(sql: string, dialect: SqlAnalysisDialect): Promise<SqlAnalysisFingerprintResult>;
// Analyzes several statements in one call and resolves to a Map of results.
// NOTE(review): presumably keyed by each item's `id` — confirm per implementation.
analyzeBatch(
items: SqlAnalysisBatchItem[],
dialect: SqlAnalysisDialect,
): Promise<Map<string, SqlAnalysisBatchResult>>;
}

View file

@ -24,6 +24,7 @@ export interface WikiFrontmatter {
representative_sql?: string;
usage?: HistoricSqlWikiUsageFrontmatter;
fingerprints?: string[];
stale_since?: string;
}
export interface WikiPage {