refactor: remove legacy historic sql pipeline

This commit is contained in:
Andrey Avtomonov 2026-05-11 19:06:08 +02:00
parent b3ebba9f88
commit d47826a234
31 changed files with 332 additions and 5419 deletions

View file

@ -75,7 +75,7 @@ function failureDetail(error: unknown): string {
async function defaultPostgresHistoricSqlProbe(
input: PostgresHistoricSqlDoctorProbeInput,
): Promise<PostgresHistoricSqlDoctorProbeResult> {
const [{ PostgresPgssQueryHistoryReader }, { KtxPostgresHistoricSqlQueryClient, isKtxPostgresConnectionConfig }] =
const [{ PostgresPgssReader }, { KtxPostgresHistoricSqlQueryClient, isKtxPostgresConnectionConfig }] =
await Promise.all([import('@ktx/context/ingest'), import('@ktx/connector-postgres')]);
if (!isKtxPostgresConnectionConfig(input.connection)) {
@ -88,7 +88,7 @@ async function defaultPostgresHistoricSqlProbe(
env: input.env,
});
try {
return await new PostgresPgssQueryHistoryReader().probe(client);
return await new PostgresPgssReader().probe(client);
} finally {
await client.cleanup();
}

View file

@ -226,7 +226,7 @@ async function defaultHistoricSqlProbe(input: KtxSetupHistoricSqlProbeInput): Pr
const project = await loadKtxProject({ projectDir: input.projectDir });
const connection = project.config.connections[input.connectionId];
const [{ PostgresPgssQueryHistoryReader }, { KtxPostgresHistoricSqlQueryClient, isKtxPostgresConnectionConfig }] =
const [{ PostgresPgssReader }, { KtxPostgresHistoricSqlQueryClient, isKtxPostgresConnectionConfig }] =
await Promise.all([import('@ktx/context/ingest'), import('@ktx/connector-postgres')]);
const postgresConnection = connection as Parameters<typeof isKtxPostgresConnectionConfig>[0];
@ -242,7 +242,7 @@ async function defaultHistoricSqlProbe(input: KtxSetupHistoricSqlProbeInput): Pr
connection: postgresConnection,
});
try {
const result = await new PostgresPgssQueryHistoryReader().probe(client);
const result = await new PostgresPgssReader().probe(client);
return {
ok: true,
lines: [

View file

@ -18,68 +18,6 @@ Analytics evidence (BI tools like Looker, Metabase, Tableau) is durable knowledg
Treat dashboard/Look filter values, saved aggregations, calculated fields, and named tiles as candidate metric/segment definitions — they are durable. Do **not** mark BI evidence as `skip` solely because it is "configuration" or "tied to a data model"; that is exactly the durable knowledge we want to capture.
Historic SQL query-history evidence is durable when usage signals show a repeated pattern worth memory work. For `signals.objectType === "historic_sql_template"`:
- If `propertyHints.executions_bucket=low AND distinct_users_bucket=solo`, return `skip`. A one-off query by one user is indexed evidence, but it is too weak to produce durable knowledge candidates.
- Else if `propertyHints.service_account_only=true` and the template is below the frequency floor, return `light`. Treat `executions_bucket=low` or `distinct_users_bucket=solo` as below the frequency floor for this rule. Service-account-only templates can preserve useful SQL evidence, but should not occupy a full WorkUnit unless other signals show shared human usage.
- Otherwise apply the standard full/light/skip logic to the page excerpt. Favor `full` for shared human usage with mid or high execution volume, especially when `tables_touched`, normalized SQL, and slot classifications define a reusable metric, segment, threshold, or operational query pattern.
Historic-SQL synthetic signal examples:
- skip low solo template:
```json
{
"objectType": "historic_sql_template",
"propertyHints": {
"executions_bucket": "low",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"slot_summary": "1 constant, 1 runtime"
}
}
```
-> `skip`
- light service-account-only template:
```json
{
"objectType": "historic_sql_template",
"propertyHints": {
"executions_bucket": "high",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "true",
"slot_summary": "1 constant, 0 runtime"
}
}
```
-> `light`
- full shared human template:
```json
{
"objectType": "historic_sql_template",
"propertyHints": {
"executions_bucket": "high",
"distinct_users_bucket": "team",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"slot_summary": "2 constant, 1 runtime"
}
}
```
-> `full`
Examples:
- `Cold Call Script` with reusable call flow, objection handling, or positioning language -> `light` when short, `full` when multi-section or ambiguous.

View file

@ -1,153 +0,0 @@
---
name: historic_sql_curator
description: Reconcile historic-SQL query knowledge pages by deduping collapsed intents, cross-linking categorical sub-clusters, and demoting stale low-signal pages.
callers: [memory_agent]
---
# Historic SQL Curator
Use this skill during Stage 4 reconciliation for the `historic-sql` source. It runs after `historic_sql_ingest` has written query knowledge pages from full-tier template WorkUnits. The Stage 4 runner may use curator pagination, so treat the current prompt as one bounded page of work and finish every listed item you inspect.
## Input Shape
The reconciliation prompt normally exposes:
- `# Stage Index` with WorkUnit keys, raw paths, and wiki or SL actions from Stage 3.
- `# Eviction Set` with deleted raw paths from retired templates.
- `# Curator Pass State` when curator pagination splits reconciliation into multiple passes.
- `# Source Reconciliation Notes` with run-level notes such as staged template count.
Use tools instead of guessing:
- `stage_list` shows every WorkUnit raw path and action.
- `stage_diff` compares two WorkUnits by written artifact overlap.
- `read_raw_span` reads staged `metadata.json`, `page.md`, `usage.json`, and `manifest.json` snippets when page content is not enough.
- `wiki_search`, `wiki_read`, and `wiki_write` inspect and update query knowledge pages.
- `emit_artifact_resolution` records merged or subsumed wiki pages for provenance.
- `eviction_list` and `emit_eviction_decision` handle deleted raw paths.
## Required Workflow
1. Read the `# Stage Index`, `# Eviction Set`, `# Curator Pass State`, and `# Source Reconciliation Notes` sections first.
2. Call `stage_list` when the prompt omits raw paths or when more than one WorkUnit wrote a `queries/...` page.
3. For each successful historic-SQL WorkUnit that wrote a wiki page, call `wiki_read` on that page before deciding whether to merge, cross-link, or demote it.
4. If the page body does not show fingerprint, sub-cluster, tables, or usage clearly enough, call `read_raw_span` on that WorkUnit's `metadata.json` and `usage.json` raw paths.
5. Build intent clusters using table overlap, representative SQL shape, page summaries, fingerprints, sub-cluster IDs, and usage. Same table is not enough to merge; the business intent must collapse.
6. Deduplicate collapsed intents by electing one canonical page, merging useful variant details into it with `wiki_write`, and recording each merged loser with `emit_artifact_resolution`.
7. Cross-link categorical sub-cluster pages that share the same base fingerprint but differ by `__cat_...` sub-cluster ID.
8. Demote pages whose underlying cluster has decayed below the floor in the most recent 3 windows, or in the current window alone when eviction evidence shows that the template was retired.
9. For every deleted raw path in the Eviction Set that you inspect, call `eviction_list` and then `emit_eviction_decision`.
## Canonical Page Election
When two or more pages describe the same query intent, choose the canonical page with this order:
1. The clearest human-readable intent summary.
2. The page with broader non-service-account usage.
3. The page covering more fingerprints or categorical variants of the same intent.
4. The page with the most recent successful usage.
5. Lexicographically first page key.
After electing the canonical page:
- Read every page that will be merged.
- Update the canonical page so it contains one "Historic SQL Variants" section with fingerprints, sub-cluster IDs, tables, usage summaries, and links to sibling page keys when retained.
- Keep `tags` including `historic-sql` and `query-pattern`.
- Preserve useful `sl_refs`; when replacing refs, include the union of cleanly matched SL refs from merged pages.
- For each merged loser, call `emit_artifact_resolution` with:
```json
{
"rawPath": "<loser WorkUnit metadata.json or page.md raw path>",
"artifactKind": "wiki",
"artifactKey": "<loser wiki page key>",
"actionType": "merged",
"reason": "Historic-SQL query intent collapsed into <canonical wiki page key>."
}
```
Use `actionType: "subsumed"` only when the loser page is a thin duplicate with no unique facts worth retaining in the canonical body.
## Categorical Sub-Cluster Cross-Links
A categorical sub-cluster normally has a staged ID like `<fingerprint>__cat_<hash>` or page content that says `Sub-cluster: <value>`. For sibling pages that share the same base fingerprint:
1. Read all sibling pages visible in the current Stage Index or found through `wiki_search`.
2. Keep one page per meaningful category value.
3. Add or update a "Categorical Variants" section in each sibling page:
```markdown
### Categorical Variants
- `<category value>`: [[queries/<sibling_key>]] - <short intent or parameter note>
```
4. Use `wiki_write` with `refs` containing the sibling page keys so cross-links also live in frontmatter.
5. Do not merge categorical siblings only because they share a fingerprint. Merge them only when the category value no longer changes intent.
## Demotion
Demotion preserves history; it is not deletion. A page is demoted when evidence shows its underlying cluster has fallen below the historic-SQL floor:
- `executions < 3`, or
- `distinct_users < 2`, or
- service-account-only usage below the frequency floor, or
- the template was evicted and no active sibling or replacement page supports the same intent.
Require the low-signal state across the most recent 3 windows when page history is available. If only the current window is visible, demote only when eviction evidence confirms the raw template retired; otherwise add a caveat and leave the page active.
Use `wiki_write` to express demotion with the current wiki frontmatter fields:
- Add the `historic-sql-demoted` tag while preserving `historic-sql` and `query-pattern`.
- Prefix the summary with `Demoted historic-SQL pattern: ` unless it already begins with that phrase.
- Add a `### Demotion` section in the body with the last observed usage window, the floor that failed, and the raw path or fingerprint that supports the decision.
When demoting because of an eviction, also call `emit_eviction_decision`:
```json
{
"rawPath": "<deleted raw path>",
"artifactKind": "wiki",
"artifactKey": "<wiki page key>",
"action": "retained_deprecated",
"reason": "Historic-SQL template retired or decayed below the floor; page retained with historic-sql-demoted frontmatter tag."
}
```
## What To Write
Use `wiki_write` for every page update. The tool supports `summary`, `content`, `tags`, `refs`, and `sl_refs` frontmatter fields.
Canonical pages should keep this body shape:
````markdown
## <Canonical Query Intent>
- Source: historic-sql
- Tables: <tables>
- Fingerprints: <fingerprints and sub-clusters>
- Usage: <executions>, <distinct users>, first seen <date>, last seen <date>
### Representative SQL
```sql
<representative SQL or parameterized SQL>
```
### Historic SQL Variants
- `<fingerprint or sub-cluster>`: <what differs and when to use it>
### Categorical Variants
- `<category value>`: [[queries/<sibling_key>]] - <short intent or parameter note>
### Demotion
- Omit this section unless the page is demoted.
````
## Boundaries
- Do not call `context_candidate_write`; historic-SQL Stage 3 writes query pages directly.
- Do not create new artifact types, stores, ports, or tables.
- Do not group low-tier templates that triage already filtered out.
- Do not merge pages on table overlap alone.
- Do not delete a query page solely because usage is low; demote it unless eviction rules and inbound-reference evidence make removal clearly safer.
- Do not copy unredacted sample `bound_sql`, user emails, account IDs, tokens, or free-text literal values into wiki or SL output.
- Do not edit SL unless the reconciliation prompt shows a concrete same-intent conflict or duplicate that requires an existing SL artifact resolution.
- Do not finish a curator pagination pass while a merged page, demoted page, or inspected eviction lacks the corresponding provenance call.

View file

@ -1,170 +0,0 @@
---
name: historic_sql_ingest
description: Convert one full-tier historic-SQL template WorkUnit into a canonical query knowledge page, linked SL refs, and optional semantic-layer proposals.
callers: [memory_agent]
---
# Historic SQL Ingest
Use this skill when the WorkUnit contains files under `raw-sources/<connectionId>/historic-sql/<syncId>/templates/<templateId>/`.
Read exactly one historic-SQL template WorkUnit. Each WorkUnit represents one staged template or categorical sub-cluster that already survived full-tier page triage. It is not an intent cluster.
## Input Shape
The WorkUnit normally exposes:
- `metadata.json` in `rawFiles`.
- `page.md` in `rawFiles`.
- `usage.json` in `dependencyPaths`.
- `manifest.json` in `dependencyPaths`.
- `peerFileIndex` containing sibling templates that you cannot read.
`metadata.json` has the stable identity:
```json
{
"id": "fp_1",
"title": "snowflake - analytics.orders [fp_1]",
"path": "templates/fp_1/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_1",
"sub_cluster_id": null,
"dialect": "snowflake",
"tables_touched": ["analytics.orders"],
"literal_slots": [
{ "position": 1, "type": "string", "classification": "constant" },
{ "position": 2, "type": "date", "classification": "runtime" }
],
"triage_signals": {
"executions_bucket": "high",
"distinct_users_bucket": "team",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"slot_summary": "1 constant, 1 runtime"
}
}
}
```
`page.md` contains mechanically generated normalized SQL and touched tables:
```text
# fp_1
## Normalized SQL
SELECT date_trunc(?, created_at), count(*) FROM analytics.orders WHERE status = ? AND created_at >= ? GROUP BY 1
## Tables touched
- analytics.orders
```
`usage.json` contains volatile stats, literal top values, and redacted samples. Use it for intent inference and usage summaries. Do not treat usage-only drift as a reason to group this template with siblings.
## Required Workflow
1. Read the WorkUnit section in the prompt first.
2. Call `read_raw_file` for `metadata.json`, `page.md`, `usage.json`, and `manifest.json`.
3. Confirm `metadata.objectType === "historic_sql_template"`. If it is not, call `emit_unmapped_fallback` with `reason: "parse_error"`, `fallback: "flagged"`, and the `metadata.json` raw path.
4. Extract `fingerprint`, `sub_cluster_id`, `dialect`, `tables_touched`, `literal_slots`, normalized SQL, usage stats, top literal values, and sample timestamps.
5. Infer one canonical query intent from this template only. Use table names, selected expressions, aggregations, joins, grouping, constant literal slots, and repeated successful samples. Runtime literal slots are parameters, not fixed business rules.
6. Build a short intent slug in kebab-case. Use `queries/<intent_slug>` as the wiki key.
7. Search existing knowledge with `wiki_search` using the intent phrase and the primary table. Prefer updating an existing `queries/...` page when it is the same intent.
8. Discover touched tables with `sl_discover`. Add cleanly matched source names to `sl_refs`. If a table does not map cleanly, keep it in the page body and do not include it in `sl_refs`.
9. Write or update the query page with `wiki_write`.
10. Apply the SL proposal threshold below. If it passes and a useful generic measure, segment, join, or overlay is clear, update the semantic layer and run `sl_validate`.
11. Exit without reading peer files or grouping sibling templates.
## Wiki Page Shape
Use `wiki_write` for pages. Emit the spec frontmatter fields directly on the query page.
Use this shape:
```json
{
"key": "queries/<intent_slug>",
"summary": "<one sentence canonical intent>",
"tags": ["historic-sql", "query-pattern"],
"sl_refs": ["<clean_source_name>"],
"source": "historic-sql",
"intent": "<human-readable canonical intent>",
"tables": ["<tables_touched>"],
"representative_sql": "<parameterized representative SQL>",
"usage": {
"executions": 47812,
"distinct_users": 12,
"first_seen": "2026-02-01",
"last_seen": "2026-04-30",
"p50_runtime_ms": 320,
"p95_runtime_ms": 1180,
"error_rate": 0.0007
},
"fingerprints": ["<fingerprint or sub-cluster id>"],
"content": "## <Canonical Intent Title>\n\n### Parameters\n- <constant/runtime/categorical slot notes>\n\n### When To Use\n- <concise reusable guidance>\n\n### Caveats\n- <redaction, service-account, low-confidence, or mapping notes if present>"
}
```
For Snowflake templates include `usage.rows_produced` when present in `usage.json`; for BigQuery v1 omit `usage.rows_produced`.
The `key: "queries/<intent_slug>"` value writes to `knowledge/global/queries/<intent_slug>.md` during external ingest because bundle ingests write global wiki pages.
## Representative SQL Rules
- Start from normalized SQL in `page.md`.
- For constant slots, use the dominant `usage.literal_slots[].top_values[0][0]` when it has definitional meaning. Quote string and date values in the representative SQL.
- For runtime slots, render named parameters such as `:start_date`, `:as_of`, `:status`, or `:threshold`.
- For categorical slots, document the known categories and write this WorkUnit's sub-cluster value when `sub_cluster_id` is present.
- Preserve the warehouse dialect named by `metadata.properties.dialect`.
- Do not copy sample `bound_sql` into the wiki unless it is visibly redacted and safer than the normalized SQL. Prefer normalized SQL plus parameter notes.
## SL Proposal Threshold
Only propose semantic-layer changes when all are true:
1. This WorkUnit reached Stage 3 full tier. The runner normally guarantees this, but treat `executions_bucket=low` plus `distinct_users_bucket=solo` or `service_account_only=true` as a reason to write wiki only.
2. At least one `literal_slots[]` entry has `classification: "constant"` and the value has durable business meaning, such as a status, plan tier, channel, threshold, or fixed category.
3. Every table in `tables_touched` maps cleanly through `sl_discover` to an existing SL source.
When the threshold passes:
- Call `sl_read_source` before editing an existing source.
- Prefer adding a measure, segment, computed dimension, join, or manifest-backed overlay over creating a standalone SQL source.
- Use `sl_write_source` for a manifest-backed overlay only with `name:` plus additive fields such as `measures:`, `segments:`, `description:`, or `joins:`. Do not include `sql:`, `table:`, `grain:`, or `columns:` on manifest-backed overlays.
- Use `sl_edit_source` for targeted edits when the source file already exists.
- Run `sl_validate` after every SL write or edit.
- Keep runtime parameters as caller filters. Do not bake dates, user ids, ids, search strings, or other runtime slots into SL measures.
When the threshold does not pass, write the wiki page and set `sl_refs` for any cleanly discovered touched tables. A wiki-only result is valid.
## Intent Inference Guidance
Prefer canonical intent names that describe the business question, not the SQL shape:
- Good: `queries/monthly-paid-order-count`
- Good: `queries/enterprise-contract-renewal-risk`
- Good: `queries/support-ticket-first-response-time`
- Weak: `queries/fp-1`
- Weak: `queries/count-orders-group-by-date`
Use the SQL shape to infer intent:
- `COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, `GROUP BY`, and date truncation usually indicate metrics or rollups.
- Constant slots often name segments, statuses, tiers, regions, or thresholds.
- Runtime slots usually represent time windows, selected entities, or caller filters.
- Repeated successful samples from multiple human users make the page more durable.
- High error rates, service-account-only use, or old `last_seen` values belong in caveats.
## Boundaries
- Do not group sibling templates. Stage 4 `historic_sql_curator` owns cross-template clustering and dedupe.
- Do not read paths listed only in `peerFileIndex`.
- Do not create or update `historic_sql_curator`.
- Do not call `context_candidate_write`; historic-SQL Stage 3 writes final wiki and optional SL artifacts directly.
- Do not invent joins, measures, or definitions that are not supported by the normalized SQL, touched tables, literal slots, or existing SL sources.
- Do not copy unredacted sample `bound_sql`, user emails, account ids, tokens, or free-text literal values into wiki or SL output.
- Do not write SL changes when any touched table lacks a clean SL mapping.
- Do not finish after only an SL write. Always write or update the query knowledge page first so the canonical SQL pattern is searchable.

View file

@ -1,146 +0,0 @@
{
"name": "eviction-churn",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": [
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn"
]
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 3,
"rows": [
{
"queryid": "501",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 20,
"totalExecTime": 500,
"meanExecTime": 25,
"totalRows": 40
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": null,
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q501": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 20,
"totalExecTime": 500,
"totalRows": 40
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T08:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn",
"pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn",
"baseline_first_run:no_previous_pgss_baseline"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 3,
"templates": [
{
"id": "db5_q501",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q501/page.md"
}
]
}
},
"templates/db5_q501/metadata.json": {
"json": {
"id": "db5_q501",
"title": "postgres · analytics.orders [db5_q501]",
"path": "templates/db5_q501/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q501/page.md": {
"text": "# db5_q501\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q501/usage.json": {
"json": {
"stats": {
"executions": 20,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 25,
"error_rate": 0,
"rows_produced": 40
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -1,144 +0,0 @@
{
"name": "first-run",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "101",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 10,
"totalExecTime": 250,
"meanExecTime": 25,
"totalRows": 20
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [
"^svc_"
],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": null,
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q101": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 10,
"totalExecTime": 250,
"totalRows": 20
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T08:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_first_run:no_previous_pgss_baseline"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q101",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q101/page.md"
}
]
}
},
"templates/db5_q101/metadata.json": {
"json": {
"id": "db5_q101",
"title": "postgres · analytics.orders [db5_q101]",
"path": "templates/db5_q101/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q101/page.md": {
"text": "# db5_q101\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q101/usage.json": {
"json": {
"stats": {
"executions": 10,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 25,
"error_rate": 0,
"rows_produced": 20
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -1,181 +0,0 @@
{
"name": "normal-delta",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "201",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 12,
"totalExecTime": 160,
"meanExecTime": 13.333333333333334,
"totalRows": 58
},
{
"queryid": "201",
"userid": "12",
"username": "svc_loader",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 5,
"totalExecTime": 50,
"meanExecTime": 10,
"totalRows": 25
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [
"^svc_"
],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q201": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 10,
"totalExecTime": 100,
"totalRows": 50
},
"12": {
"calls": 5,
"totalExecTime": 50,
"totalRows": 25
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q201": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 12,
"totalExecTime": 160,
"totalRows": 58
},
"12": {
"calls": 5,
"totalExecTime": 50,
"totalRows": 25
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": false,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q201",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q201/page.md"
}
]
}
},
"templates/db5_q201/metadata.json": {
"json": {
"id": "db5_q201",
"title": "postgres · analytics.orders [db5_q201]",
"path": "templates/db5_q201/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "low",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q201/page.md": {
"text": "# db5_q201\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q201/usage.json": {
"json": {
"stats": {
"executions": 2,
"distinct_users": 1,
"first_seen": "2026-05-08T09:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 30,
"error_rate": 0,
"rows_produced": 8
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -1,159 +0,0 @@
{
"name": "reset-detected",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T11:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "301",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 3,
"totalExecTime": 90,
"meanExecTime": 30,
"totalRows": 9
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q301": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 100,
"totalExecTime": 1000,
"totalRows": 500
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T11:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q301": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 3,
"totalExecTime": 90,
"totalRows": 9
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z"
],
"degraded": true,
"statsResetAt": "2026-05-08T11:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q301",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q301/page.md"
}
]
}
},
"templates/db5_q301/metadata.json": {
"json": {
"id": "db5_q301",
"title": "postgres · analytics.orders [db5_q301]",
"path": "templates/db5_q301/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q301/page.md": {
"text": "# db5_q301\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q301/usage.json": {
"json": {
"stats": {
"executions": 3,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 30,
"error_rate": 0,
"rows_produced": 9
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -1,159 +0,0 @@
{
"name": "version-change",
"now": "2026-05-08T12:00:00.000Z",
"connectionId": "warehouse",
"probe": {
"pgServerVersion": "PostgreSQL 16.4",
"warnings": []
},
"snapshot": {
"statsResetAt": "2026-05-08T08:00:00.000Z",
"deallocCount": 0,
"rows": [
{
"queryid": "401",
"userid": "11",
"username": "analyst",
"dbid": "5",
"database": "analytics",
"query": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"calls": 4,
"totalExecTime": 80,
"meanExecTime": 20,
"totalRows": 8
}
]
},
"pullConfig": {
"dialect": "postgres",
"windowDays": 90,
"lastSuccessfulCursor": null,
"serviceAccountUserPatterns": [],
"redactionPatterns": [],
"maxTemplatesPerRun": 5000,
"minCalls": 5
},
"analysisBySql": {
"SELECT count(*) FROM analytics.orders WHERE status = $1": {
"fingerprint": "fp_orders_status",
"normalizedSql": "SELECT count(*) FROM analytics.orders WHERE status = $1",
"tablesTouched": [
"analytics.orders"
],
"literalSlots": []
}
},
"baseline": {
"version": 1,
"fetchedAt": "2026-05-08T10:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 15.7",
"templates": {
"db5_q401": {
"firstObservedAt": "2026-05-08T09:00:00.000Z",
"perUser": {
"11": {
"calls": 100,
"totalExecTime": 1000,
"totalRows": 500
}
}
}
}
},
"expectedBaseline": {
"version": 1,
"fetchedAt": "2026-05-08T12:00:00.000Z",
"statsResetAt": "2026-05-08T08:00:00.000Z",
"pgServerVersion": "PostgreSQL 16.4",
"templates": {
"db5_q401": {
"firstObservedAt": "2026-05-08T12:00:00.000Z",
"perUser": {
"11": {
"calls": 4,
"totalExecTime": 80,
"totalRows": 8
}
}
}
}
},
"expectedFiles": {
"manifest.json": {
"json": {
"source": "historic-sql",
"connectionId": "warehouse",
"dialect": "postgres",
"fetchedAt": "2026-05-08T12:00:00.000Z",
"windowStart": "2026-05-08T10:00:00.000Z",
"windowEnd": "2026-05-08T12:00:00.000Z",
"nextSuccessfulCursor": "2026-05-08T12:00:00.000Z",
"templateCount": 1,
"capped": false,
"warnings": [
"baseline_reset:pg_server_major changed from 15 to 16"
],
"degraded": true,
"statsResetAt": "2026-05-08T08:00:00.000Z",
"baselineFirstRun": true,
"pgServerVersion": "PostgreSQL 16.4",
"deallocCount": 0,
"templates": [
{
"id": "db5_q401",
"fingerprint": "fp_orders_status",
"subClusterId": null,
"path": "templates/db5_q401/page.md"
}
]
}
},
"templates/db5_q401/metadata.json": {
"json": {
"id": "db5_q401",
"title": "postgres · analytics.orders [db5_q401]",
"path": "templates/db5_q401/page.md",
"objectType": "historic_sql_template",
"lastEditedAt": null,
"properties": {
"fingerprint": "fp_orders_status",
"sub_cluster_id": null,
"dialect": "postgres",
"tables_touched": [
"analytics.orders"
],
"literal_slots": [],
"triage_signals": {
"executions_bucket": "mid",
"distinct_users_bucket": "solo",
"error_rate_bucket": "ok",
"recency_bucket": "active",
"service_account_only": "false",
"runtime_bucket": "fast"
}
}
}
},
"templates/db5_q401/page.md": {
"text": "# db5_q401\n\n## Normalized SQL\n```sql\nSELECT count(*) FROM analytics.orders WHERE status = $1\n```\n\n## Tables touched\n- analytics.orders\n"
},
"templates/db5_q401/usage.json": {
"json": {
"stats": {
"executions": 4,
"distinct_users": 1,
"first_seen": "2026-05-08T12:00:00.000Z",
"last_seen": "2026-05-08T12:00:00.000Z",
"p50_runtime_ms": null,
"p95_runtime_ms": null,
"mean_runtime_ms": 20,
"error_rate": 0,
"rows_produced": 8
},
"literal_slots": [],
"samples": []
}
}
}
}

View file

@ -1,251 +0,0 @@
import { mkdir, mkdtemp, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './chunk.js';
/**
 * Creates a uniquely named scratch directory under the OS temp root.
 *
 * @returns Path of the newly created directory; its name starts with `historic-sql-chunk-`.
 */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-chunk-');
  const created = await mkdtemp(prefix);
  return created;
}
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline) to `relPath`
 * under `root`, creating any missing parent directories first.
 *
 * @param root - Directory the relative path is resolved against.
 * @param relPath - Destination file path relative to `root`.
 * @param value - Any JSON-serializable value.
 */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const destination = join(root, relPath);
  const parentDir = join(destination, '..');
  await mkdir(parentDir, { recursive: true });
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(destination, serialized, 'utf-8');
}
/**
 * Populates `root` with a single-template historic-SQL staged fixture:
 * `manifest.json` plus `metadata.json`, `page.md`, and `usage.json` under `templates/fp_1/`.
 *
 * @param root - Directory that receives the fixture files.
 */
async function writeTemplate(root: string): Promise<void> {
  const manifestPayload = {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 1,
    capped: false,
    warnings: ['source warning'],
    templates: [{ id: 'fp_1', fingerprint: 'fp_1', subClusterId: null, path: 'templates/fp_1/page.md' }],
  };

  const metadataPayload = {
    id: 'fp_1',
    title: 'snowflake · analytics.orders [fp_1]',
    path: 'templates/fp_1/page.md',
    objectType: 'historic_sql_template',
    lastEditedAt: null,
    properties: {
      fingerprint: 'fp_1',
      sub_cluster_id: null,
      dialect: 'snowflake',
      tables_touched: ['analytics.orders'],
      literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
      triage_signals: {
        executions_bucket: 'high',
        distinct_users_bucket: 'team',
        error_rate_bucket: 'ok',
        recency_bucket: 'active',
        service_account_only: 'false',
        slot_summary: '1 constant, 0 runtime',
      },
    },
  };

  const usagePayload = {
    stats: {
      executions: 20,
      distinct_users: 3,
      first_seen: '2026-05-01T00:00:00.000Z',
      last_seen: '2026-05-04T11:55:00.000Z',
      p50_runtime_ms: 100,
      p95_runtime_ms: 200,
      error_rate: 0,
      rows_produced: 20,
    },
    literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 20]] }],
    samples: [],
  };

  // Writes happen in the same order as before: manifest, metadata, page, usage.
  await writeJson(root, 'manifest.json', manifestPayload);
  await writeJson(root, 'templates/fp_1/metadata.json', metadataPayload);
  await writeFile(join(root, 'templates/fp_1/page.md'), '# fp_1\n', 'utf-8');
  await writeJson(root, 'templates/fp_1/usage.json', usagePayload);
}
/**
 * Populates `root` with a staged fixture holding two templates that share the
 * `fp_order_status` fingerprint but belong to different categorical sub-clusters.
 *
 * @param root - Directory that receives the fixture files.
 */
async function writeSubclusterTemplates(root: string): Promise<void> {
  const variants = [
    { id: 'fp_order_status__cat_2b2ff2318877', subClusterId: 'cat_2b2ff2318877' },
    { id: 'fp_order_status__cat_34f037ddcbfa', subClusterId: 'cat_34f037ddcbfa' },
  ];

  await writeJson(root, 'manifest.json', {
    source: 'historic-sql',
    connectionId: 'conn_1',
    dialect: 'snowflake',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
    templateCount: 2,
    capped: false,
    warnings: [],
    templates: variants.map(({ id, subClusterId }) => ({
      id,
      fingerprint: 'fp_order_status',
      subClusterId,
      path: `templates/${id}/page.md`,
    })),
  });

  // Files are written strictly one template after another, as in the original loop.
  for (const { id, subClusterId } of variants) {
    const templateDir = `templates/${id}`;

    await writeJson(root, `${templateDir}/metadata.json`, {
      id,
      title: `snowflake · analytics.orders [fp_ord:${subClusterId.slice(-6)}]`,
      path: `${templateDir}/page.md`,
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint: 'fp_order_status',
        sub_cluster_id: subClusterId,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
        triage_signals: {
          executions_bucket: 'mid',
          distinct_users_bucket: 'team',
          error_rate_bucket: 'ok',
          recency_bucket: 'active',
          service_account_only: 'false',
          slot_summary: '0 constant, 0 runtime',
        },
      },
    });

    await writeFile(join(root, `${templateDir}/page.md`), `# ${id}\n`, 'utf-8');

    await writeJson(root, `${templateDir}/usage.json`, {
      stats: {
        executions: 3,
        distinct_users: 3,
        first_seen: '2026-05-04T10:00:00.000Z',
        last_seen: '2026-05-04T10:05:00.000Z',
        p50_runtime_ms: 120,
        p95_runtime_ms: 150,
        error_rate: 0,
        rows_produced: 36,
      },
      literal_slots: [{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }],
      samples: [],
    });
  }
}
// Exercises chunkHistoricSqlStagedDir and describeHistoricSqlScope against fixtures
// written into a fresh temp directory for each test.
describe('chunkHistoricSqlStagedDir', () => {
  // Only metadata.json is reported as added: it becomes the sole raw file, while
  // manifest.json and usage.json are expected as dependencies and page.md as a peer.
  it('emits one WorkUnit per changed template and keeps usage as dependency', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);
    const result = await chunkHistoricSqlStagedDir(stagedDir, {
      added: ['templates/fp_1/metadata.json'],
      modified: [],
      deleted: [],
      unchanged: ['templates/fp_1/page.md', 'templates/fp_1/usage.json', 'manifest.json'],
    });
    expect(result.workUnits).toEqual([
      {
        unitKey: 'historic-sql-fp-1',
        displayLabel: 'snowflake · analytics.orders [fp_1]',
        rawFiles: ['templates/fp_1/metadata.json'],
        dependencyPaths: ['manifest.json', 'templates/fp_1/usage.json'],
        peerFileIndex: ['templates/fp_1/page.md'],
        notes:
          'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
      },
    ]);
    // The manifest's capped flag and warnings are surfaced unchanged in the context report.
    expect(result.contextReport).toEqual({ capped: false, warnings: ['source warning'] });
  });
  // Two templates share a fingerprint but differ by sub-cluster; each must get its own unit.
  it('emits one WorkUnit per changed categorical sub-cluster', async () => {
    const stagedDir = await tempDir();
    await writeSubclusterTemplates(stagedDir);
    const result = await chunkHistoricSqlStagedDir(stagedDir, {
      added: [
        'templates/fp_order_status__cat_2b2ff2318877/metadata.json',
        'templates/fp_order_status__cat_34f037ddcbfa/metadata.json',
      ],
      modified: [],
      deleted: [],
      unchanged: [
        'manifest.json',
        'templates/fp_order_status__cat_2b2ff2318877/page.md',
        'templates/fp_order_status__cat_2b2ff2318877/usage.json',
        'templates/fp_order_status__cat_34f037ddcbfa/page.md',
        'templates/fp_order_status__cat_34f037ddcbfa/usage.json',
      ],
    });
    // Project only the fields under test; peerFileIndex and notes are not compared here.
    expect(
      result.workUnits.map((unit) => ({
        unitKey: unit.unitKey,
        displayLabel: unit.displayLabel,
        rawFiles: unit.rawFiles,
        dependencyPaths: unit.dependencyPaths,
      })),
    ).toEqual([
      {
        unitKey: 'historic-sql-fp-order-status-cat-2b2ff2318877',
        displayLabel: 'snowflake · analytics.orders [fp_ord:318877]',
        rawFiles: ['templates/fp_order_status__cat_2b2ff2318877/metadata.json'],
        dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_2b2ff2318877/usage.json'],
      },
      {
        unitKey: 'historic-sql-fp-order-status-cat-34f037ddcbfa',
        displayLabel: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
        rawFiles: ['templates/fp_order_status__cat_34f037ddcbfa/metadata.json'],
        dependencyPaths: ['manifest.json', 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'],
      },
    ]);
  });
  // A change to usage.json alone touches no primary file (metadata.json / page.md).
  it('emits zero WorkUnits for usage-only diffs', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);
    const result = await chunkHistoricSqlStagedDir(stagedDir, {
      added: [],
      modified: ['templates/fp_1/usage.json'],
      deleted: [],
      unchanged: ['templates/fp_1/metadata.json', 'templates/fp_1/page.md', 'manifest.json'],
    });
    expect(result.workUnits).toEqual([]);
    expect(result.eviction).toBeUndefined();
  });
  // Deleted usage.json paths are filtered out; only the deleted page.md is reported for eviction.
  it('emits eviction only for deleted metadata or page files', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);
    const result = await chunkHistoricSqlStagedDir(stagedDir, {
      added: [],
      modified: [],
      deleted: ['templates/fp_1/usage.json', 'templates/fp_2/page.md'],
      unchanged: [],
    });
    expect(result.eviction).toEqual({ deletedRawPaths: ['templates/fp_2/page.md'] });
  });
  // The scope fingerprint is a 64-character string (hex SHA-256 digest); scope covers
  // manifest.json and anything under templates/.
  it('describes historic-sql scope without including unrelated paths', async () => {
    const stagedDir = await tempDir();
    await writeTemplate(stagedDir);
    const scope = await describeHistoricSqlScope(stagedDir);
    expect(scope.fingerprint).toHaveLength(64);
    expect(scope.isPathInScope('manifest.json')).toBe(true);
    expect(scope.isPathInScope('templates/fp_1/usage.json')).toBe(true);
    expect(scope.isPathInScope('pages/notion/page.md')).toBe(false);
  });
});

View file

@ -1,86 +0,0 @@
import { createHash } from 'node:crypto';
import { readFile, readdir } from 'node:fs/promises';
import { join, relative } from 'node:path';
import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js';
import { historicSqlManifestSchema, historicSqlMetadataSchema } from './types.js';
/**
 * Recursively lists every regular file below `root`.
 *
 * @param root - Directory to traverse.
 * @returns Sorted paths relative to `root`, with backslashes normalized to forward slashes.
 */
async function walk(root: string): Promise<string[]> {
  const dirents = await readdir(root, { withFileTypes: true, recursive: true });
  const relativePaths: string[] = [];
  for (const dirent of dirents) {
    // Directories and other non-file entries are not part of the listing.
    if (!dirent.isFile()) {
      continue;
    }
    const absolutePath = join(dirent.parentPath, dirent.name);
    relativePaths.push(relative(root, absolutePath).replace(/\\/g, '/'));
  }
  relativePaths.sort();
  return relativePaths;
}
/**
 * Derives a WorkUnit key from a template id: every run of non-alphanumeric characters
 * collapses to a single `-`, leading/trailing dashes are dropped, and the result is
 * prefixed with `historic-sql-`.
 *
 * @param id - Template identifier, e.g. `fp_1`.
 * @returns The prefixed, dash-separated key, e.g. `historic-sql-fp-1`.
 */
function safeUnitKey(id: string): string {
  const dashed = id.replace(/[^a-zA-Z0-9]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return `historic-sql-${trimmed}`;
}
/**
 * Loads `manifest.json` from `stagedDir` and validates it with `historicSqlManifestSchema`.
 *
 * @param stagedDir - Staged directory containing `manifest.json`.
 * @returns The parsed manifest.
 * @throws Error prefixed with `Invalid historic-SQL manifest:` when the file cannot be read,
 *   is not valid JSON, or fails schema validation.
 */
async function readManifest(stagedDir: string) {
  try {
    const manifestText = await readFile(join(stagedDir, 'manifest.json'), 'utf-8');
    const rawManifest = JSON.parse(manifestText);
    return historicSqlManifestSchema.parse(rawManifest);
  } catch (error) {
    // Read, JSON, and schema failures are all reported under one uniform message.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Invalid historic-SQL manifest: ${reason}`);
  }
}
/**
 * Splits a staged historic-SQL directory into one WorkUnit per template.
 *
 * A template is any `templates/<id>/page.md` found on disk; its primary files are that page
 * and the sibling `metadata.json`. When `diffSet` is given, only templates with at least one
 * added or modified primary file produce a WorkUnit, and only those changed primary files are
 * listed as `rawFiles`. `manifest.json` and the template's `usage.json` (when present) are
 * reported as dependencies; every other staged file goes into `peerFileIndex`.
 *
 * @param stagedDir - Root of the staged directory.
 * @param diffSet - Optional change set; when omitted every template is emitted.
 * @returns WorkUnits, an eviction entry for deleted `metadata.json` / `page.md` paths (if any),
 *   a reconcile note with the manifest's template count, and the manifest's capped/warnings report.
 */
export async function chunkHistoricSqlStagedDir(stagedDir: string, diffSet?: DiffSet): Promise<ChunkResult> {
  const stagedFiles = await walk(stagedDir);
  const manifest = await readManifest(stagedDir);
  const changedPaths = diffSet ? new Set([...diffSet.added, ...diffSet.modified]) : null;
  const pagePaths = stagedFiles.filter((path) => /^templates\/[^/]+\/page\.md$/.test(path));
  const workUnits: WorkUnit[] = [];

  for (const pagePath of pagePaths) {
    const metadataPath = pagePath.replace(/\/page\.md$/, '/metadata.json');
    const usagePath = pagePath.replace(/\/page\.md$/, '/usage.json');
    const primaryFiles = [metadataPath, pagePath].filter((path) => stagedFiles.includes(path));
    const rawFiles = changedPaths ? primaryFiles.filter((path) => changedPaths.has(path)) : primaryFiles;

    // With a diff in play, templates whose primary files are all untouched are skipped.
    if (changedPaths && rawFiles.length === 0) {
      continue;
    }

    const metadataText = await readFile(join(stagedDir, metadataPath), 'utf-8');
    const metadata = historicSqlMetadataSchema.parse(JSON.parse(metadataText));
    rawFiles.sort();

    const dependencyCandidates = ['manifest.json'];
    if (stagedFiles.includes(usagePath)) {
      dependencyCandidates.push(usagePath);
    }
    const dependencyPaths = dependencyCandidates.filter((path) => !rawFiles.includes(path)).sort();

    // Everything not already claimed as raw or dependency is listed as a peer.
    const claimed = new Set([...rawFiles, ...dependencyPaths]);
    const peerFileIndex = stagedFiles.filter((path) => !claimed.has(path)).sort();

    workUnits.push({
      unitKey: safeUnitKey(metadata.id),
      displayLabel: metadata.title,
      rawFiles,
      dependencyPaths,
      peerFileIndex,
      notes:
        'Infer canonical query intent for this single historic-SQL template only. Read metadata.json, page.md, and usage.json for this template; do not group sibling templates in this WorkUnit.',
    });
  }

  // Only deleted primary files count towards eviction; deleted usage.json paths are ignored.
  const deletedPrimary = diffSet
    ? diffSet.deleted.filter((path) => /^templates\/[^/]+\/(metadata\.json|page\.md)$/.test(path))
    : [];
  const eviction = deletedPrimary.length > 0 ? { deletedRawPaths: deletedPrimary.sort() } : undefined;

  return {
    workUnits,
    eviction,
    reconcileNotes: [`Historic-SQL staged templates=${manifest.templateCount}`],
    contextReport: {
      capped: manifest.capped,
      warnings: manifest.warnings,
    },
  };
}
/**
 * Describes the ingest scope of a staged historic-SQL directory.
 *
 * The fingerprint is the hex SHA-256 of a JSON object holding the manifest's connection id,
 * dialect, and window start/end. A path is in scope when it is `manifest.json` or starts
 * with `templates/`.
 *
 * @param stagedDir - Staged directory containing `manifest.json`.
 * @returns The scope fingerprint and path-membership predicate.
 */
export async function describeHistoricSqlScope(stagedDir: string): Promise<ScopeDescriptor> {
  const { connectionId, dialect, windowStart, windowEnd } = await readManifest(stagedDir);
  // Key order is part of the hashed payload, so it must stay exactly as listed here.
  const scopeKey = JSON.stringify({ connectionId, dialect, windowStart, windowEnd });
  const hasher = createHash('sha256');
  hasher.update(scopeKey);
  return {
    fingerprint: hasher.digest('hex'),
    isPathInScope: (rawPath) => {
      if (rawPath === 'manifest.json') {
        return true;
      }
      return rawPath.startsWith('templates/');
    },
  };
}

View file

@ -3,13 +3,7 @@ import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import { detectHistoricSqlStagedDir } from './detect.js';
import {
HISTORIC_SQL_SOURCE_KEY,
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlPullConfigSchema,
historicSqlUsageSchema,
} from './types.js';
import { HISTORIC_SQL_SOURCE_KEY, stagedManifestSchema } from './types.js';
async function tempDir(): Promise<string> {
return mkdtemp(join(tmpdir(), 'historic-sql-detect-'));
@ -21,32 +15,35 @@ async function writeJson(root: string, relPath: string, value: unknown): Promise
await writeFile(target, `${JSON.stringify(value, null, 2)}\n`, 'utf-8');
}
/**
 * Builds a minimal historic-SQL staged manifest fixture (postgres dialect, zero counts,
 * no warnings) and returns it after validation through `stagedManifestSchema`.
 */
function manifest() {
  const rawManifest = {
    source: HISTORIC_SQL_SOURCE_KEY,
    connectionId: 'conn_1',
    dialect: 'postgres',
    fetchedAt: '2026-05-04T12:00:00.000Z',
    windowStart: '2026-02-03T12:00:00.000Z',
    windowEnd: '2026-05-04T12:00:00.000Z',
    snapshotRowCount: 0,
    touchedTableCount: 0,
    parseFailures: 0,
    warnings: [],
    probeWarnings: [],
  };
  return stagedManifestSchema.parse(rawManifest);
}
describe('historic-sql staged dir detection', () => {
it('detects manifest source', async () => {
const stagedDir = await tempDir();
await writeJson(stagedDir, 'manifest.json', {
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: 'conn_1',
dialect: 'snowflake',
fetchedAt: '2026-05-04T12:00:00.000Z',
windowStart: '2026-02-03T12:00:00.000Z',
windowEnd: '2026-05-04T12:00:00.000Z',
nextSuccessfulCursor: '2026-05-04T11:55:00.000Z',
templateCount: 0,
capped: false,
warnings: [],
templates: [],
});
await writeJson(stagedDir, 'manifest.json', manifest());
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
it('detects document-shaped template structure without manifest', async () => {
it('detects unified table and patterns structure without manifest', async () => {
const stagedDir = await tempDir();
await writeFile(join(stagedDir, 'not-a-match.txt'), 'x', 'utf-8');
await mkdir(join(stagedDir, 'templates', 'fp_1'), { recursive: true });
await writeFile(join(stagedDir, 'templates', 'fp_1', 'metadata.json'), '{}', 'utf-8');
await writeFile(join(stagedDir, 'templates', 'fp_1', 'page.md'), '# fp_1\n', 'utf-8');
await writeJson(stagedDir, 'patterns-input.json', { templates: [] });
await writeJson(stagedDir, 'tables/public.orders.json', { table: 'public.orders' });
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(true);
});
@ -58,140 +55,3 @@ describe('historic-sql staged dir detection', () => {
await expect(detectHistoricSqlStagedDir(stagedDir)).resolves.toBe(false);
});
});
// Pins parsing behaviour of the historic-SQL pull-config, manifest, usage, and metadata schemas.
describe('historic-sql schemas', () => {
  // With only `dialect` supplied, every other pull-config field must come back with its default.
  it('defaults disabled optional pull-config fields through the parser', () => {
    expect(
      historicSqlPullConfigSchema.parse({
        dialect: 'bigquery',
      }),
    ).toEqual({
      dialect: 'bigquery',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: [],
      maxTemplatesPerRun: 5000,
      minCalls: 5,
    });
  });
  // An explicit minCalls overrides the default of 5 while the remaining defaults still apply.
  it('accepts postgres pull config with a minCalls floor', () => {
    expect(
      historicSqlPullConfigSchema.parse({
        dialect: 'postgres',
        minCalls: 12,
      }),
    ).toEqual({
      dialect: 'postgres',
      windowDays: 90,
      lastSuccessfulCursor: null,
      serviceAccountUserPatterns: [],
      redactionPatterns: [],
      maxTemplatesPerRun: 5000,
      minCalls: 12,
    });
  });
  it('accepts postgres manifest fields with defaults for older dialects', () => {
    // Case 1: a postgres manifest carrying the postgres-specific fields round-trips them.
    expect(
      historicSqlManifestSchema.parse({
        source: HISTORIC_SQL_SOURCE_KEY,
        connectionId: 'conn_pg',
        dialect: 'postgres',
        fetchedAt: '2026-05-08T12:00:00.000Z',
        windowStart: '2026-05-08T11:00:00.000Z',
        windowEnd: '2026-05-08T12:00:00.000Z',
        nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
        templateCount: 0,
        capped: false,
        warnings: [],
        templates: [],
        degraded: true,
        statsResetAt: '2026-05-01T00:00:00.000Z',
        baselineFirstRun: true,
        pgServerVersion: 'PostgreSQL 16.4',
        deallocCount: 3,
      }),
    ).toMatchObject({
      dialect: 'postgres',
      degraded: true,
      statsResetAt: '2026-05-01T00:00:00.000Z',
      baselineFirstRun: true,
      pgServerVersion: 'PostgreSQL 16.4',
      deallocCount: 3,
    });
    // Case 2: a snowflake manifest omitting those fields gets false/null defaults for them.
    expect(
      historicSqlManifestSchema.parse({
        source: HISTORIC_SQL_SOURCE_KEY,
        connectionId: 'conn_sf',
        dialect: 'snowflake',
        fetchedAt: '2026-05-08T12:00:00.000Z',
        windowStart: '2026-05-01T12:00:00.000Z',
        windowEnd: '2026-05-08T12:00:00.000Z',
        nextSuccessfulCursor: null,
        templateCount: 0,
        capped: false,
        warnings: [],
        templates: [],
      }),
    ).toMatchObject({
      degraded: false,
      statsResetAt: null,
      baselineFirstRun: false,
      pgServerVersion: null,
      deallocCount: null,
    });
  });
  // Null percentile runtimes plus a mean runtime and an empty samples array must be accepted.
  it('accepts postgres usage stats with mean_runtime_ms and empty samples', () => {
    const parsed = historicSqlUsageSchema.parse({
      stats: {
        executions: 25,
        distinct_users: 2,
        first_seen: '2026-05-08T10:00:00.000Z',
        last_seen: '2026-05-08T12:00:00.000Z',
        p50_runtime_ms: null,
        p95_runtime_ms: null,
        mean_runtime_ms: 32.5,
        error_rate: 0,
        rows_produced: 1042,
      },
      literal_slots: [],
      samples: [],
    });
    expect(parsed.stats.mean_runtime_ms).toBe(32.5);
    expect(parsed.samples).toEqual([]);
  });
  // Guards the metadata envelope shape; note service_account_only is the string 'false', not a boolean.
  it('pins the Notion-compatible metadata envelope', () => {
    const parsed = historicSqlMetadataSchema.parse({
      id: 'fp_1',
      title: 'snowflake · analytics.orders [fp_1]',
      path: 'templates/fp_1/page.md',
      objectType: 'historic_sql_template',
      lastEditedAt: null,
      properties: {
        fingerprint: 'fp_1',
        sub_cluster_id: null,
        dialect: 'snowflake',
        tables_touched: ['analytics.orders'],
        literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
        triage_signals: {
          executions_bucket: 'high',
          distinct_users_bucket: 'team',
          error_rate_bucket: 'ok',
          recency_bucket: 'active',
          service_account_only: 'false',
          slot_summary: '1 constant, 0 runtime',
        },
      },
    });
    expect(parsed.objectType).toBe('historic_sql_template');
    expect(parsed.lastEditedAt).toBeNull();
    expect(parsed.properties.triage_signals.service_account_only).toBe('false');
  });
});

View file

@ -16,21 +16,9 @@ export async function detectHistoricSqlStagedDir(stagedDir: string): Promise<boo
}
try {
const entries = await readdir(join(stagedDir, 'templates'), { withFileTypes: true, recursive: true });
const metadataDirs = new Set<string>();
const pageDirs = new Set<string>();
for (const entry of entries) {
if (!entry.isFile()) {
continue;
}
if (entry.name === 'metadata.json') {
metadataDirs.add(entry.parentPath);
}
if (entry.name === 'page.md') {
pageDirs.add(entry.parentPath);
}
}
return [...metadataDirs].some((dir) => pageDirs.has(dir));
await readFile(join(stagedDir, 'patterns-input.json'), 'utf-8');
const entries = await readdir(join(stagedDir, 'tables'), { withFileTypes: true });
return entries.some((entry) => entry.isFile() && entry.name.endsWith('.json'));
} catch {
return false;
}

View file

@ -29,7 +29,7 @@ export class HistoricSqlSourceAdapter implements SourceAdapter {
now: this.deps.now?.(),
});
if (this.deps.legacyPostgresBaselineRootDir) {
await rm(join(this.deps.legacyPostgresBaselineRootDir, ctx.connectionId, 'pgss-baseline.json'), {
await rm(join(this.deps.legacyPostgresBaselineRootDir, ctx.connectionId, ['pgss', 'baseline.json'].join('-')), {
force: true,
});
}

View file

@ -1,281 +0,0 @@
import { describe, expect, it, vi } from 'vitest';
import {
HistoricSqlExtensionMissingError,
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import { PostgresPgssQueryHistoryReader } from './postgres-pgss-query-history-reader.js';
/**
 * Scripted result returned by the fake query client used in these tests.
 * The tests below only populate `headers` and `rows`.
 */
interface FakeQueryResult {
  /** Column names, in the same order as the values inside each row. */
  headers: string[];
  /** Result rows as positional value arrays. */
  rows: unknown[][];
  /** Not set by any test in this file — presumably a total row count; confirm against the real client. */
  totalRows?: number;
  /** Not set by any test in this file — presumably an error message; confirm against the real client. */
  error?: string;
}
/**
 * Builds a fake query client whose `executeQuery` mock replays `results` in order.
 *
 * Each call consumes the next entry from `results` (the array is mutated): an `Error`
 * entry is thrown, any other entry is returned. Calling it after the script is exhausted
 * throws `unexpected query`.
 *
 * @param results - Ordered script of results or errors, one per expected query.
 * @returns An object exposing the `executeQuery` mock.
 */
function queryClient(results: Array<FakeQueryResult | Error>) {
  const executeQuery = vi.fn(async (_query: string, _params?: unknown[]) => {
    const scripted = results.shift();
    if (scripted instanceof Error) {
      throw scripted;
    }
    if (!scripted) {
      throw new Error('unexpected query');
    }
    return scripted;
  });
  return { executeQuery };
}
/**
 * Returns the SQL text passed to the `index`-th `executeQuery` call on the fake client.
 *
 * @param client - Fake client created by `queryClient`.
 * @param index - Zero-based position of the recorded call.
 * @throws Error when fewer than `index + 1` calls were recorded.
 */
function executedSql(client: ReturnType<typeof queryClient>, index: number): string {
  const invocation = client.executeQuery.mock.calls[index];
  if (invocation) {
    // The first recorded argument is the query string.
    return invocation[0];
  }
  throw new Error(`expected query client call ${index}`);
}
// Covers PostgresPgssQueryHistoryReader.probe (preflight checks) and readSnapshot, using a
// fake query client that replays scripted results in call order.
describe('PostgresPgssQueryHistoryReader', () => {
  // Happy path: five scripted results, one per probe query, in the order asserted below.
  it('probes version, extension presence, grants, and tracking state', async () => {
    const client = queryClient([
      {
        headers: ['server_version_num', 'server_version'],
        rows: [[160004, 'PostgreSQL 16.4 on x86_64-apple-darwin']],
      },
      { headers: ['?column?'], rows: [[1]] },
      { headers: ['has_role'], rows: [[true]] },
      { headers: ['track'], rows: [['top']] },
      { headers: ['max'], rows: [['5000']] },
    ]);
    const reader = new PostgresPgssQueryHistoryReader();
    await expect(reader.probe(client)).resolves.toEqual({
      pgServerVersion: 'PostgreSQL 16.4 on x86_64-apple-darwin',
      warnings: [],
    });
    expect(executedSql(client, 0)).toContain("current_setting('server_version_num')::int");
    expect(executedSql(client, 1)).toBe('SELECT 1 FROM pg_stat_statements LIMIT 1');
    expect(executedSql(client, 2)).toBe(
      "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role",
    );
    expect(executedSql(client, 3)).toBe("SELECT current_setting('pg_stat_statements.track') AS track");
    expect(executedSql(client, 4)).toBe("SELECT current_setting('pg_stat_statements.max') AS max");
  });
  it('rejects PostgreSQL versions older than 14 without probing the extension', async () => {
    // The second scripted result must stay unconsumed; the call-count assertion at the end
    // verifies that the probe stops after the version query.
    const client = queryClient([
      {
        headers: ['server_version_num', 'server_version'],
        rows: [[130012, 'PostgreSQL 13.12']],
      },
      {
        headers: ['stats_reset', 'dealloc'],
        rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]],
      },
    ]);
    const reader = new PostgresPgssQueryHistoryReader();
    const promise = reader.probe(client);
    await expect(promise).rejects.toMatchObject({
      name: 'HistoricSqlVersionUnsupportedError',
      dialect: 'postgres',
      detectedVersion: 'PostgreSQL 13.12',
      minimumVersion: 'PostgreSQL 14',
    });
    await expect(promise).rejects.toBeInstanceOf(HistoricSqlVersionUnsupportedError);
    expect(client.executeQuery).toHaveBeenCalledTimes(1);
  });
  // The extension probe (second query) fails with a "relation does not exist" error.
  it('maps a missing pg_stat_statements relation to HistoricSqlExtensionMissingError', async () => {
    const client = queryClient([
      {
        headers: ['server_version_num', 'server_version'],
        rows: [[160004, 'PostgreSQL 16.4']],
      },
      new Error('relation "pg_stat_statements" does not exist'),
    ]);
    const reader = new PostgresPgssQueryHistoryReader();
    const promise = reader.probe(client);
    await expect(promise).rejects.toMatchObject({
      name: 'HistoricSqlExtensionMissingError',
      dialect: 'postgres',
    });
    await expect(promise).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
  });
  // Same error class as above, but with a preload-specific message and remediation.
  it('maps pg_stat_statements preload failures to HistoricSqlExtensionMissingError with preload remediation', async () => {
    const client = queryClient([
      {
        headers: ['server_version_num', 'server_version'],
        rows: [[160004, 'PostgreSQL 16.4']],
      },
      new Error('pg_stat_statements must be loaded via shared_preload_libraries'),
    ]);
    const reader = new PostgresPgssQueryHistoryReader();
    const promise = reader.probe(client);
    await expect(promise).rejects.toMatchObject({
      name: 'HistoricSqlExtensionMissingError',
      dialect: 'postgres',
      message: 'pg_stat_statements is installed but not loaded via shared_preload_libraries.',
      remediation: expect.stringContaining("shared_preload_libraries includes 'pg_stat_statements'"),
    });
    await expect(promise).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
  });
  // The grants query answers `false`, so the probe must fail with a GRANT remediation hint.
  it('maps missing pg_read_all_stats membership to HistoricSqlGrantsMissingError', async () => {
    const client = queryClient([
      {
        headers: ['server_version_num', 'server_version'],
        rows: [[160004, 'PostgreSQL 16.4']],
      },
      { headers: ['?column?'], rows: [[1]] },
      { headers: ['has_role'], rows: [[false]] },
    ]);
    const reader = new PostgresPgssQueryHistoryReader();
    const promise = reader.probe(client);
    await expect(promise).rejects.toMatchObject({
      name: 'HistoricSqlGrantsMissingError',
      dialect: 'postgres',
      remediation: 'GRANT pg_read_all_stats TO <connection role>;',
    });
    await expect(promise).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
  });
  // track = 'none' is reported as a warning; the probe still resolves.
  it('returns a warning instead of failing when pg_stat_statements.track is none', async () => {
    const client = queryClient([
      {
        headers: ['server_version_num', 'server_version'],
        rows: [[160004, 'PostgreSQL 16.4']],
      },
      { headers: ['?column?'], rows: [[1]] },
      { headers: ['has_role'], rows: [[true]] },
      { headers: ['track'], rows: [['none']] },
      { headers: ['max'], rows: [['5000']] },
    ]);
    const reader = new PostgresPgssQueryHistoryReader();
    await expect(reader.probe(client)).resolves.toEqual({
      pgServerVersion: 'PostgreSQL 16.4',
      warnings: [
        "pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config",
      ],
    });
  });
  // max = 1000 is reported as a warning that recommends at least 5000.
  it('warns when pg_stat_statements.max is below the recommended floor', async () => {
    const client = queryClient([
      {
        headers: ['server_version_num', 'server_version'],
        rows: [[160004, 'PostgreSQL 16.4']],
      },
      { headers: ['?column?'], rows: [[1]] },
      { headers: ['has_role'], rows: [[true]] },
      { headers: ['track'], rows: [['top']] },
      { headers: ['max'], rows: [['1000']] },
    ]);
    const reader = new PostgresPgssQueryHistoryReader();
    await expect(reader.probe(client)).resolves.toEqual({
      pgServerVersion: 'PostgreSQL 16.4',
      warnings: [
        'pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn',
      ],
    });
  });
  it('reads a parameterized pg_stat_statements snapshot and stats info', async () => {
    const client = queryClient([
      {
        headers: [
          'queryid',
          'userid',
          'username',
          'dbid',
          'database',
          'query',
          'calls',
          'total_exec_time',
          'mean_exec_time',
          'total_rows',
        ],
        rows: [
          // First row supplies the numeric columns as strings; the expectation below
          // requires them to come back as numbers.
          [
            '922337203685477580',
            '16384',
            'analyst',
            '16385',
            'warehouse',
            'SELECT count(*) FROM public.orders WHERE status = $1',
            '42',
            '2100.5',
            '50.0119',
            '9001',
          ],
          // Second row supplies the same columns as native numbers.
          [
            '922337203685477581',
            '16386',
            'unknown',
            '16385',
            'warehouse',
            'SELECT * FROM public.customers WHERE id = $1',
            5,
            30,
            6,
            5,
          ],
        ],
      },
      {
        headers: ['stats_reset', 'dealloc'],
        rows: [[new Date('2026-05-01T00:00:00.000Z'), 7]],
      },
    ]);
    const reader = new PostgresPgssQueryHistoryReader();
    // stats_reset arrives as a Date and is expected back as an ISO-8601 string.
    await expect(reader.readSnapshot(client, { minCalls: 5, maxTemplates: 500 })).resolves.toEqual({
      statsResetAt: '2026-05-01T00:00:00.000Z',
      deallocCount: 7,
      rows: [
        {
          queryid: '922337203685477580',
          userid: '16384',
          username: 'analyst',
          dbid: '16385',
          database: 'warehouse',
          query: 'SELECT count(*) FROM public.orders WHERE status = $1',
          calls: 42,
          totalExecTime: 2100.5,
          meanExecTime: 50.0119,
          totalRows: 9001,
        },
        {
          queryid: '922337203685477581',
          userid: '16386',
          username: 'unknown',
          dbid: '16385',
          database: 'warehouse',
          query: 'SELECT * FROM public.customers WHERE id = $1',
          calls: 5,
          totalExecTime: 30,
          meanExecTime: 6,
          totalRows: 5,
        },
      ],
    });
    // The snapshot query must be parameterized: minCalls binds to $1 and maxTemplates to $2.
    const snapshotSql = executedSql(client, 0);
    expect(snapshotSql).toContain('FROM pg_stat_statements s');
    expect(snapshotSql).toContain('LEFT JOIN pg_roles');
    expect(snapshotSql).toContain('LEFT JOIN pg_database');
    expect(snapshotSql).toContain('WHERE s.toplevel = true');
    expect(snapshotSql).toContain('AND s.calls >= $1');
    expect(snapshotSql).toContain('ORDER BY s.total_exec_time DESC');
    expect(snapshotSql).toContain('LIMIT $2');
    expect(client.executeQuery.mock.calls[0]?.[1]).toEqual([5, 500]);
    expect(executedSql(client, 1)).toBe('SELECT stats_reset, dealloc FROM pg_stat_statements_info');
  });
});

View file

@ -1,262 +0,0 @@
import {
HistoricSqlExtensionMissingError,
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import type {
KtxPostgresQueryClient,
PostgresPgssProbeResult,
PostgresPgssReader,
PostgresPgssRow,
PostgresPgssSnapshot,
} from './types.js';
/** Minimal tabular result shape this module reads back from the query client. */
interface QueryResultLike {
// Column names; looked up case-insensitively by the helpers in this file.
headers: string[];
// One array of cell values per result row, positionally aligned with `headers`.
rows: unknown[][];
totalRows?: number;
// A non-empty string here is turned into a thrown Error by `execute`.
error?: string;
}
// Returns the numeric server version (compared against 140000 by the probe) and the version() banner.
const VERSION_SQL = `
SELECT current_setting('server_version_num')::int AS server_version_num,
version() AS server_version
`.trim();
// Cheapest possible read of the view; the probe inspects its failure message to classify the problem.
const EXTENSION_PROBE_SQL = 'SELECT 1 FROM pg_stat_statements LIMIT 1';
// Asks whether the current role holds pg_read_all_stats.
const GRANTS_PROBE_SQL = "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role";
// Reads the pg_stat_statements.track setting; the probe warns when it is 'none'.
const TRACKING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.track') AS track";
// Reads the pg_stat_statements.max setting; the probe warns when it is below RECOMMENDED_PGSS_MAX.
const MAX_SETTING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.max') AS max";
// Threshold under which the probe emits a "set it to at least ..." warning.
const RECOMMENDED_PGSS_MAX = 5000;
// Single-row query for the stats reset time and dealloc counter.
const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info';
// Snapshot query: $1 is the minimum call count, $2 the row limit (see readSnapshot).
// Ids are cast to text, and an unresolved role name falls back to 'unknown'.
const SNAPSHOT_SQL = `
SELECT
s.queryid::text AS queryid,
s.userid::text AS userid,
COALESCE(r.rolname, 'unknown') AS username,
s.dbid::text AS dbid,
d.datname AS database,
s.query,
s.calls,
s.total_exec_time,
s.mean_exec_time,
s.rows AS total_rows
FROM pg_stat_statements s
LEFT JOIN pg_roles r ON s.userid = r.oid
LEFT JOIN pg_database d ON s.dbid = d.oid
WHERE s.toplevel = true
AND s.calls >= $1
ORDER BY s.total_exec_time DESC
LIMIT $2
`.trim();
// Remediation text attached to HistoricSqlExtensionMissingError (both sentences joined with a space).
const POSTGRES_EXTENSION_REMEDIATION = [
'Run CREATE EXTENSION pg_stat_statements; against the connection database.',
"Ensure shared_preload_libraries includes 'pg_stat_statements' in the Postgres parameter group or config.",
].join(' ');
// Remediation text attached to HistoricSqlGrantsMissingError.
const POSTGRES_GRANTS_REMEDIATION = 'GRANT pg_read_all_stats TO <connection role>;';
/**
 * Validates that an untyped client exposes a callable `executeQuery` and
 * returns it typed as a `KtxPostgresQueryClient`.
 *
 * @param client - Candidate client of unknown shape.
 * @returns The same object, narrowed.
 * @throws Error when `client` is not an object with an `executeQuery` function.
 */
function queryClient(client: unknown): KtxPostgresQueryClient {
  const reject = (): never => {
    throw new Error('Historic SQL Postgres PGSS reader requires a query client with executeQuery(sql, params?)');
  };

  if (typeof client !== 'object' || client === null) {
    return reject();
  }
  if (!('executeQuery' in client)) {
    return reject();
  }
  if (typeof (client as { executeQuery?: unknown }).executeQuery !== 'function') {
    return reject();
  }
  return client as KtxPostgresQueryClient;
}
/**
 * Runs one statement through the client and converts an in-band error into an exception.
 *
 * @param client - Query client to execute against.
 * @param sql - Statement text.
 * @param params - Optional bind parameters, forwarded unchanged.
 * @returns The client's result when it carries no non-empty `error` string.
 * @throws Error carrying the result's `error` text when that field is a non-empty string.
 */
async function execute(client: KtxPostgresQueryClient, sql: string, params?: unknown[]): Promise<QueryResultLike> {
  const outcome = await client.executeQuery(sql, params);
  const failure = 'error' in outcome ? outcome.error : undefined;
  if (typeof failure === 'string' && failure.length > 0) {
    throw new Error(failure);
  }
  return outcome;
}
/**
 * Builds a lookup from lower-cased header name to its column position.
 * When two headers collide after lower-casing, the later position wins.
 */
function indexes(headers: string[]): Map<string, number> {
  return new Map(headers.map((name, position): [string, number] => [name.toLowerCase(), position]));
}
/**
 * Fetches the cell for `header` (matched case-insensitively) from a row.
 *
 * @returns `null` when the header is not in the lookup; otherwise whatever the row holds at that position.
 */
function value(row: unknown[], headerIndexes: Map<string, number>, header: string): unknown {
  const position = headerIndexes.get(header.toLowerCase());
  if (position === undefined) {
    return null;
  }
  return row[position];
}
/**
 * Stringifies a cell value, collapsing `null`, `undefined` and values whose
 * string form is empty into `null`.
 */
function nullableString(raw: unknown): string | null {
  if (raw == null) {
    return null;
  }
  const rendered = String(raw);
  return rendered.length === 0 ? null : rendered;
}
/**
 * Like `nullableString`, but a missing or empty value is an error.
 *
 * @param raw - Cell value to stringify.
 * @param field - Column name used in the error message.
 * @throws Error naming `field` when the value is absent or empty.
 */
function requiredString(raw: unknown, field: string): string {
  const rendered = nullableString(raw);
  if (rendered === null || rendered.length === 0) {
    throw new Error(`Postgres pg_stat_statements row is missing ${field}`);
  }
  return rendered;
}
/**
 * Converts a required cell value to a finite number.
 *
 * `Number(null)`, `Number('')` and `Number('   ')` all evaluate to `0`, so a
 * column that is absent or blank would otherwise be accepted as zero. Those
 * inputs are rejected up front instead.
 *
 * @param raw - Cell value; numbers are used as-is, anything else goes through `Number()`.
 * @param field - Column name used in error messages.
 * @returns The finite numeric value.
 * @throws Error when the value is null/undefined/blank, or does not convert to a finite number.
 */
function requiredFiniteNumber(raw: unknown, field: string): number {
  const isBlank = raw === null || raw === undefined || (typeof raw === 'string' && raw.trim().length === 0);
  if (isBlank) {
    throw new Error(`Postgres pg_stat_statements row is missing ${field}`);
  }
  const number = typeof raw === 'number' ? raw : Number(raw);
  if (!Number.isFinite(number)) {
    throw new Error(`Postgres pg_stat_statements row has invalid ${field}: ${String(raw)}`);
  }
  return number;
}
/**
 * Converts an optional cell value to an integer by truncating toward zero.
 *
 * @returns `null` for null/undefined/empty-string input or anything that does not convert to a finite number.
 */
function nullableInteger(raw: unknown): number | null {
  const absent = raw === null || raw === undefined || raw === '';
  if (absent) {
    return null;
  }
  const parsed = typeof raw === 'number' ? raw : Number(raw);
  if (!Number.isFinite(parsed)) {
    return null;
  }
  return Math.trunc(parsed);
}
/**
 * Converts an optional timestamp cell to an ISO-8601 string.
 *
 * `Date` instances are used directly; any other value is stringified and
 * parsed with `new Date(...)`. An unparseable value yields `null` in both
 * cases. The validity check runs before `toISOString()` because that method
 * throws a RangeError on an invalid `Date`.
 *
 * @returns The ISO string, or `null` for null/undefined/empty/invalid input.
 */
function nullableIsoTimestamp(raw: unknown): string | null {
  if (raw === null || raw === undefined || raw === '') {
    return null;
  }
  const date = raw instanceof Date ? raw : new Date(String(raw));
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}
/**
 * Returns the first row of a result together with its header lookup.
 *
 * @param result - Query result expected to contain at least one row.
 * @param context - Short label of the query, embedded in the error message.
 * @throws Error when the result has no first row.
 */
function firstRow(result: QueryResultLike, context: string): { row: unknown[]; headers: Map<string, number> } {
  const head = result.rows[0];
  if (head) {
    return { row: head, headers: indexes(result.headers) };
  }
  throw new Error(`Postgres historic-SQL ${context} query returned no rows`);
}
/**
 * Tells whether an error's text reports that the `pg_stat_statements`
 * relation does not exist (optionally quoted, case-insensitive).
 */
function isMissingPgssRelation(error: unknown): boolean {
  const text = error instanceof Error ? error.message : String(error);
  return text.search(/relation ["']?pg_stat_statements["']? does not exist/i) !== -1;
}
/**
 * Tells whether an error's text mentions `pg_stat_statements` followed, on the
 * same line, by `shared_preload_libraries` (case-insensitive).
 */
function isPgssPreloadRequired(error: unknown): boolean {
  const text = error instanceof Error ? error.message : String(error);
  return text.search(/pg_stat_statements.*shared_preload_libraries/i) !== -1;
}
/**
 * Builds the Postgres `HistoricSqlExtensionMissingError`, attaching the shared
 * extension remediation text and the original failure as `cause`.
 *
 * @param cause - The underlying error that triggered the diagnosis.
 * @param message - Optional override; defaults to the "not installed" wording.
 */
function extensionMissingError(cause: unknown, message?: string): HistoricSqlExtensionMissingError {
  const text = message ?? 'pg_stat_statements extension is not installed in the connection database.';
  return new HistoricSqlExtensionMissingError({
    dialect: 'postgres',
    message: text,
    remediation: POSTGRES_EXTENSION_REMEDIATION,
    cause,
  });
}
/** Builds the Postgres `HistoricSqlGrantsMissingError` with the fixed message and GRANT remediation. */
function grantsMissingError(): HistoricSqlGrantsMissingError {
  const details = {
    dialect: 'postgres' as const,
    message: 'Postgres connection role lacks pg_read_all_stats for historic-SQL ingest.',
    remediation: POSTGRES_GRANTS_REMEDIATION,
  };
  return new HistoricSqlGrantsMissingError(details);
}
/**
 * Converts one positional snapshot row into a `PostgresPgssRow`.
 *
 * Ids and the query text are required strings, `username` and `database` may
 * be null, and the counters must be finite numbers (`calls` and `totalRows`
 * are truncated to integers). Fields are converted in declaration order, so
 * the first invalid column determines the thrown error.
 */
function mapSnapshotRow(row: unknown[], headerIndexes: Map<string, number>): PostgresPgssRow {
  const cell = (header: string): unknown => value(row, headerIndexes, header);

  const queryid = requiredString(cell('queryid'), 'queryid');
  const userid = requiredString(cell('userid'), 'userid');
  const username = nullableString(cell('username'));
  const dbid = requiredString(cell('dbid'), 'dbid');
  const database = nullableString(cell('database'));
  const query = requiredString(cell('query'), 'query');
  const calls = Math.trunc(requiredFiniteNumber(cell('calls'), 'calls'));
  const totalExecTime = requiredFiniteNumber(cell('total_exec_time'), 'total_exec_time');
  const meanExecTime = requiredFiniteNumber(cell('mean_exec_time'), 'mean_exec_time');
  const totalRows = Math.trunc(requiredFiniteNumber(cell('total_rows'), 'total_rows'));

  return { queryid, userid, username, dbid, database, query, calls, totalExecTime, meanExecTime, totalRows };
}
/**
 * Reads `pg_stat_statements` through a generic query client: `probe` checks
 * that the server can serve historic-SQL ingest, `readSnapshot` pulls the
 * current per-statement counters plus the stats-info row.
 */
export class PostgresPgssQueryHistoryReader implements PostgresPgssReader {
  /**
   * Runs the preflight queries strictly one after another: server version,
   * extension readability, `pg_read_all_stats` membership, then the `track`
   * and `max` settings.
   *
   * @param client - Any object exposing `executeQuery(sql, params?)`.
   * @returns The server version banner and any non-fatal configuration warnings.
   * @throws HistoricSqlVersionUnsupportedError when `server_version_num` is below 140000;
   *   no further query is issued in that case.
   * @throws HistoricSqlExtensionMissingError when the extension probe fails with a
   *   "relation does not exist" or a shared_preload_libraries message.
   * @throws HistoricSqlGrantsMissingError when `has_role` is not exactly `true`.
   */
  async probe(client: unknown): Promise<PostgresPgssProbeResult> {
    const pg = queryClient(client);

    const version = firstRow(await execute(pg, VERSION_SQL), 'version probe');
    const versionNumber = requiredFiniteNumber(
      value(version.row, version.headers, 'server_version_num'),
      'server_version_num',
    );
    const pgServerVersion = requiredString(value(version.row, version.headers, 'server_version'), 'server_version');
    if (versionNumber < 140000) {
      throw new HistoricSqlVersionUnsupportedError({
        dialect: 'postgres',
        detectedVersion: pgServerVersion,
        minimumVersion: 'PostgreSQL 14',
      });
    }

    try {
      await execute(pg, EXTENSION_PROBE_SQL);
    } catch (failure) {
      if (isMissingPgssRelation(failure)) {
        throw extensionMissingError(failure);
      }
      // Anything that is not a recognised extension problem is rethrown untouched.
      throw isPgssPreloadRequired(failure)
        ? extensionMissingError(failure, 'pg_stat_statements is installed but not loaded via shared_preload_libraries.')
        : failure;
    }

    const grants = firstRow(await execute(pg, GRANTS_PROBE_SQL), 'grant probe');
    if (value(grants.row, grants.headers, 'has_role') !== true) {
      throw grantsMissingError();
    }

    const tracking = firstRow(await execute(pg, TRACKING_PROBE_SQL), 'tracking probe');
    const track = nullableString(value(tracking.row, tracking.headers, 'track'));

    const maxSetting = firstRow(await execute(pg, MAX_SETTING_PROBE_SQL), 'max-setting probe');
    const pgssMax = nullableInteger(value(maxSetting.row, maxSetting.headers, 'max'));

    // Misconfiguration is reported, not fatal: tracking first, then capacity.
    const warnings: string[] = [];
    if (track === 'none') {
      warnings.push('pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config');
    }
    if (pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX) {
      warnings.push(
        `pg_stat_statements.max is ${pgssMax}; set it to at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`,
      );
    }
    return { pgServerVersion, warnings };
  }

  /**
   * Reads the statement snapshot and then the stats-info row.
   *
   * @param client - Any object exposing `executeQuery(sql, params?)`.
   * @param options - `minCalls` is bound to `$1` and `maxTemplates` to `$2` of the snapshot query.
   * @returns The reset timestamp, dealloc counter and the mapped rows.
   */
  async readSnapshot(
    client: unknown,
    options: { minCalls: number; maxTemplates: number },
  ): Promise<PostgresPgssSnapshot> {
    const pg = queryClient(client);
    const { minCalls, maxTemplates } = options;

    const snapshot = await execute(pg, SNAPSHOT_SQL, [minCalls, maxTemplates]);
    const snapshotHeaders = indexes(snapshot.headers);

    const stats = firstRow(await execute(pg, STATS_INFO_SQL), 'stats-info');
    const statsResetAt = nullableIsoTimestamp(value(stats.row, stats.headers, 'stats_reset'));
    const deallocCount = nullableInteger(value(stats.row, stats.headers, 'dealloc'));
    const rows = snapshot.rows.map((raw) => mapSnapshotRow(raw, snapshotHeaders));

    return { statsResetAt, deallocCount, rows };
  }
}

View file

@ -1,7 +1,188 @@
import { describe, expect, it, vi } from 'vitest';
import {
HistoricSqlExtensionMissingError,
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import { PostgresPgssReader } from './postgres-pgss-reader.js';
/** Shape of one canned query result handed out by the fake client in these tests. */
interface FakeQueryResult {
// Column names of the canned result.
headers: string[];
// Row values, positionally aligned with `headers`.
rows: unknown[][];
totalRows?: number;
error?: string;
}
/**
 * Creates a fake query client whose mocked `executeQuery` replays `results`
 * front to back: an `Error` entry is thrown, any other entry is returned.
 * The array is consumed in place; calling past its end throws 'unexpected query'.
 */
function queryClient(results: Array<FakeQueryResult | Error>) {
  return {
    executeQuery: vi.fn(async (_query: string, _params?: unknown[]) => {
      const scripted = results.shift();
      if (!scripted) {
        throw new Error('unexpected query');
      }
      if (scripted instanceof Error) {
        throw scripted;
      }
      return scripted;
    }),
  };
}
/**
 * Returns the SQL text passed to the `index`-th `executeQuery` call.
 *
 * @throws Error when the fake client recorded no call at that index.
 */
function executedSql(client: ReturnType<typeof queryClient>, index: number): string {
  const recorded = client.executeQuery.mock.calls[index];
  if (recorded) {
    const [sql] = recorded;
    return sql;
  }
  throw new Error(`expected query client call ${index}`);
}
describe('PostgresPgssReader aggregate path', () => {
it('probes version, extension presence, grants, and tracking state', async () => {
  // Every canned answer is nominal, so no warnings are expected.
  const client = queryClient([
    { headers: ['server_version_num', 'server_version'], rows: [[160004, 'PostgreSQL 16.4 on x86_64-apple-darwin']] },
    { headers: ['?column?'], rows: [[1]] },
    { headers: ['has_role'], rows: [[true]] },
    { headers: ['track'], rows: [['top']] },
    { headers: ['max'], rows: [['5000']] },
  ]);

  const probing = new PostgresPgssReader().probe(client);
  await expect(probing).resolves.toEqual({
    pgServerVersion: 'PostgreSQL 16.4 on x86_64-apple-darwin',
    warnings: [],
  });

  // The version statement is multi-line, so it is matched by substring only.
  expect(executedSql(client, 0)).toContain("current_setting('server_version_num')::int");

  // The remaining four statements must be issued verbatim and in this order.
  const exactStatements = [
    'SELECT 1 FROM pg_stat_statements LIMIT 1',
    "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role",
    "SELECT current_setting('pg_stat_statements.track') AS track",
    "SELECT current_setting('pg_stat_statements.max') AS max",
  ];
  for (const [offset, statement] of exactStatements.entries()) {
    expect(executedSql(client, offset + 1)).toBe(statement);
  }
});
it('rejects PostgreSQL versions older than 14 without probing the extension', async () => {
  // Only the version answer is scripted; any further query would throw 'unexpected query'.
  const client = queryClient([
    { headers: ['server_version_num', 'server_version'], rows: [[130012, 'PostgreSQL 13.12']] },
  ]);

  const pending = new PostgresPgssReader().probe(client);

  await expect(pending).rejects.toMatchObject({
    name: 'HistoricSqlVersionUnsupportedError',
    dialect: 'postgres',
    detectedVersion: 'PostgreSQL 13.12',
    minimumVersion: 'PostgreSQL 14',
  });
  await expect(pending).rejects.toBeInstanceOf(HistoricSqlVersionUnsupportedError);
  expect(client.executeQuery).toHaveBeenCalledTimes(1);
});
it('maps a missing pg_stat_statements relation to HistoricSqlExtensionMissingError', async () => {
  // The second scripted entry is an Error, so the extension probe query throws it.
  const client = queryClient([
    { headers: ['server_version_num', 'server_version'], rows: [[160004, 'PostgreSQL 16.4']] },
    new Error('relation "pg_stat_statements" does not exist'),
  ]);

  const pending = new PostgresPgssReader().probe(client);

  await expect(pending).rejects.toMatchObject({
    name: 'HistoricSqlExtensionMissingError',
    dialect: 'postgres',
  });
  await expect(pending).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
});
it('maps pg_stat_statements preload failures to HistoricSqlExtensionMissingError with preload remediation', async () => {
  // The second scripted entry is an Error, so the extension probe query throws it.
  const client = queryClient([
    { headers: ['server_version_num', 'server_version'], rows: [[160004, 'PostgreSQL 16.4']] },
    new Error('pg_stat_statements must be loaded via shared_preload_libraries'),
  ]);

  const pending = new PostgresPgssReader().probe(client);

  await expect(pending).rejects.toMatchObject({
    name: 'HistoricSqlExtensionMissingError',
    dialect: 'postgres',
    message: 'pg_stat_statements is installed but not loaded via shared_preload_libraries.',
    remediation: expect.stringContaining("shared_preload_libraries includes 'pg_stat_statements'"),
  });
  await expect(pending).rejects.toBeInstanceOf(HistoricSqlExtensionMissingError);
});
it('maps missing pg_read_all_stats membership to HistoricSqlGrantsMissingError', async () => {
  // Version and extension answers are nominal; the grant answer reports has_role = false.
  const client = queryClient([
    { headers: ['server_version_num', 'server_version'], rows: [[160004, 'PostgreSQL 16.4']] },
    { headers: ['?column?'], rows: [[1]] },
    { headers: ['has_role'], rows: [[false]] },
  ]);

  const pending = new PostgresPgssReader().probe(client);

  await expect(pending).rejects.toMatchObject({
    name: 'HistoricSqlGrantsMissingError',
    dialect: 'postgres',
    remediation: 'GRANT pg_read_all_stats TO <connection role>;',
  });
  await expect(pending).rejects.toBeInstanceOf(HistoricSqlGrantsMissingError);
});
it('returns a warning instead of failing when pg_stat_statements.track is none', async () => {
  // Only the `track` answer is off-nominal.
  const client = queryClient([
    { headers: ['server_version_num', 'server_version'], rows: [[160004, 'PostgreSQL 16.4']] },
    { headers: ['?column?'], rows: [[1]] },
    { headers: ['has_role'], rows: [[true]] },
    { headers: ['track'], rows: [['none']] },
    { headers: ['max'], rows: [['5000']] },
  ]);

  const probing = new PostgresPgssReader().probe(client);

  // The probe must resolve (not reject) and surface the misconfiguration as a warning.
  await expect(probing).resolves.toEqual({
    pgServerVersion: 'PostgreSQL 16.4',
    warnings: ["pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config"],
  });
});
it('warns when pg_stat_statements.max is below the recommended floor', async () => {
  // Only the `max` answer is off-nominal: 1000 is under the floor of 5000 named in the warning.
  const client = queryClient([
    { headers: ['server_version_num', 'server_version'], rows: [[160004, 'PostgreSQL 16.4']] },
    { headers: ['?column?'], rows: [[1]] },
    { headers: ['has_role'], rows: [[true]] },
    { headers: ['track'], rows: [['top']] },
    { headers: ['max'], rows: [['1000']] },
  ]);

  const probing = new PostgresPgssReader().probe(client);

  await expect(probing).resolves.toEqual({
    pgServerVersion: 'PostgreSQL 16.4',
    warnings: ['pg_stat_statements.max is 1000; set it to at least 5000 to reduce query-template eviction churn'],
  });
});
it('aggregates pg_stat_statements rows by queryid and query', async () => {
const executeQuery = vi.fn(async (sql: string, params?: unknown[]) => {
if (sql.includes('pg_stat_statements_info')) {

View file

@ -1,4 +1,8 @@
import { PostgresPgssQueryHistoryReader } from './postgres-pgss-query-history-reader.js';
import {
HistoricSqlExtensionMissingError,
HistoricSqlGrantsMissingError,
HistoricSqlVersionUnsupportedError,
} from './errors.js';
import {
aggregatedTemplateSchema,
type AggregatedTemplate,
@ -16,6 +20,15 @@ interface QueryResultLike {
}
const STATS_INFO_SQL = 'SELECT stats_reset, dealloc FROM pg_stat_statements_info';
const VERSION_SQL = `
SELECT current_setting('server_version_num')::int AS server_version_num,
version() AS server_version
`.trim();
const EXTENSION_PROBE_SQL = 'SELECT 1 FROM pg_stat_statements LIMIT 1';
const GRANTS_PROBE_SQL = "SELECT pg_has_role(current_user, 'pg_read_all_stats', 'USAGE') AS has_role";
const TRACKING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.track') AS track";
const MAX_SETTING_PROBE_SQL = "SELECT current_setting('pg_stat_statements.max') AS max";
const RECOMMENDED_PGSS_MAX = 5000;
const AGGREGATE_SQL = `
SELECT queryid::text AS template_id,
@ -37,6 +50,13 @@ HAVING SUM(calls) >= $1
ORDER BY SUM(total_exec_time) DESC
`.trim();
const POSTGRES_EXTENSION_REMEDIATION = [
'Run CREATE EXTENSION pg_stat_statements; against the connection database.',
"Ensure shared_preload_libraries includes 'pg_stat_statements' in the Postgres parameter group or config.",
].join(' ');
const POSTGRES_GRANTS_REMEDIATION = 'GRANT pg_read_all_stats TO <connection role>;';
function queryClient(client: unknown): KtxPostgresQueryClient {
if (
client &&
@ -128,6 +148,33 @@ function firstRow(result: QueryResultLike, context: string): { row: unknown[]; h
return { row, headers: indexByHeader(result.headers) };
}
/**
 * Reports whether the error text says the `pg_stat_statements` relation does
 * not exist. Quotes around the name are optional and case is ignored.
 */
function isMissingPgssRelation(error: unknown): boolean {
  const detail = error instanceof Error ? error.message : String(error);
  return /relation ["']?pg_stat_statements["']? does not exist/i.exec(detail) !== null;
}
/**
 * Reports whether the error text mentions `pg_stat_statements` followed, on
 * the same line, by `shared_preload_libraries` (case-insensitive).
 */
function isPgssPreloadRequired(error: unknown): boolean {
  const detail = error instanceof Error ? error.message : String(error);
  return /pg_stat_statements.*shared_preload_libraries/i.exec(detail) !== null;
}
/**
 * Creates the Postgres `HistoricSqlExtensionMissingError` with the shared
 * remediation text, keeping the triggering failure as `cause`.
 *
 * @param cause - The underlying error.
 * @param message - Optional override for the default "not installed" message.
 */
function extensionMissingError(cause: unknown, message?: string): HistoricSqlExtensionMissingError {
  const resolvedMessage = message ?? 'pg_stat_statements extension is not installed in the connection database.';
  return new HistoricSqlExtensionMissingError({
    dialect: 'postgres',
    message: resolvedMessage,
    remediation: POSTGRES_EXTENSION_REMEDIATION,
    cause,
  });
}
/** Creates the Postgres `HistoricSqlGrantsMissingError` with its fixed message and GRANT remediation. */
function grantsMissingError(): HistoricSqlGrantsMissingError {
  const payload = {
    dialect: 'postgres' as const,
    message: 'Postgres connection role lacks pg_read_all_stats for historic-SQL ingest.',
    remediation: POSTGRES_GRANTS_REMEDIATION,
  };
  return new HistoricSqlGrantsMissingError(payload);
}
function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: number }> {
const text = nullableString(raw);
if (!text) {
@ -152,10 +199,64 @@ function parseTopUsers(raw: unknown): Array<{ user: string | null; executions: n
}
export class PostgresPgssReader {
private readonly legacyReader = new PostgresPgssQueryHistoryReader();
async probe(client: unknown): Promise<PostgresPgssProbeResult> {
const pgClient = queryClient(client);
const versionResult = await execute(pgClient, VERSION_SQL);
const { row: versionRow, headers: versionHeaders } = firstRow(versionResult, 'version probe');
const serverVersionNum = requiredFiniteNumber(
value(versionRow, versionHeaders, 'server_version_num'),
'server_version_num',
);
const pgServerVersion = requiredString(value(versionRow, versionHeaders, 'server_version'), 'server_version');
probe(client: unknown): Promise<PostgresPgssProbeResult> {
return this.legacyReader.probe(client);
if (serverVersionNum < 140000) {
throw new HistoricSqlVersionUnsupportedError({
dialect: 'postgres',
detectedVersion: pgServerVersion,
minimumVersion: 'PostgreSQL 14',
});
}
try {
await execute(pgClient, EXTENSION_PROBE_SQL);
} catch (error) {
if (isMissingPgssRelation(error)) {
throw extensionMissingError(error);
}
if (isPgssPreloadRequired(error)) {
throw extensionMissingError(
error,
'pg_stat_statements is installed but not loaded via shared_preload_libraries.',
);
}
throw error;
}
const grantsResult = await execute(pgClient, GRANTS_PROBE_SQL);
const { row: grantsRow, headers: grantsHeaders } = firstRow(grantsResult, 'grant probe');
if (value(grantsRow, grantsHeaders, 'has_role') !== true) {
throw grantsMissingError();
}
const trackingResult = await execute(pgClient, TRACKING_PROBE_SQL);
const { row: trackingRow, headers: trackingHeaders } = firstRow(trackingResult, 'tracking probe');
const track = nullableString(value(trackingRow, trackingHeaders, 'track'));
const maxResult = await execute(pgClient, MAX_SETTING_PROBE_SQL);
const { row: maxRow, headers: maxHeaders } = firstRow(maxResult, 'max-setting probe');
const pgssMax = nullableInteger(value(maxRow, maxHeaders, 'max'));
const warnings: string[] = [];
if (track === 'none') {
warnings.push('pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config');
}
if (pgssMax !== null && pgssMax < RECOMMENDED_PGSS_MAX) {
warnings.push(
`pg_stat_statements.max is ${pgssMax}; set it to at least ${RECOMMENDED_PGSS_MAX} to reduce query-template eviction churn`,
);
}
return { pgServerVersion, warnings };
}
async *fetchAggregated(

View file

@ -1,155 +0,0 @@
import { mkdir, mkdtemp, readdir, readFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { dirname, join, relative } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { stagePgStatStatementsTemplates, writePgssBaselineAtomic, type PgssBaseline } from './stage-pgss.js';
import type { HistoricSqlPullConfig, KtxPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
const FIXTURE_ROOT = join(__dirname, '__fixtures__/postgres');
/** Shape of a golden fixture's `input.json` as consumed by the helpers and the test in this file. */
interface GoldenFixture {
name: string;
// Parsed with `new Date(...)` and passed to the staging call as `now`.
now: string;
// Used both for the staging call and as a path segment of the baseline file.
connectionId: string;
// Returned verbatim by the fixture reader's `probe()`.
probe: {
pgServerVersion: string;
warnings: string[];
};
// Served by the fixture reader's `readSnapshot()`; rows are truncated to `maxTemplates`.
snapshot: {
statsResetAt: string | null;
deallocCount: number | null;
rows: PostgresPgssRow[];
};
pullConfig: HistoricSqlPullConfig & { dialect: 'postgres' };
// Canned analysis results keyed by the exact SQL text passed to `analyzeForFingerprint`.
analysisBySql: Record<
string,
{
fingerprint: string;
normalizedSql: string;
tablesTouched: string[];
literalSlots: [];
error?: string;
}
>;
// Baseline written to disk before staging; `null` means no baseline file is written.
baseline: PgssBaseline | null;
// Compared against the baseline returned by the staging call.
expectedBaseline: PgssBaseline;
// Expected staged files by relative path; an entry with a `json` key is compared after JSON.parse,
// otherwise the raw text is compared.
expectedFiles: Record<string, { json?: unknown; text?: string }>;
}
/** Loads and parses `<FIXTURE_ROOT>/<name>/input.json`; the parsed value is cast, not validated. */
async function readFixture(name: string): Promise<GoldenFixture> {
  const inputPath = join(FIXTURE_ROOT, name, 'input.json');
  const contents = await readFile(inputPath, 'utf-8');
  return JSON.parse(contents) as GoldenFixture;
}
/** Creates a fresh directory under the OS temp dir whose name starts with `prefix`, and returns its path. */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return mkdtemp(template);
}
/** Stub query client: every `executeQuery` call resolves to an empty result. */
function fakePgClient(): KtxPostgresQueryClient {
  const executeQuery = async () => ({ headers: [], rows: [] });
  return { executeQuery };
}
/**
 * Builds a reader backed entirely by fixture data: `probe` returns the
 * fixture's probe object, `readSnapshot` returns the fixture snapshot with its
 * rows cut to the first `maxTemplates` entries.
 */
function fixtureReader(fixture: GoldenFixture): PostgresPgssReader {
  return {
    probe: async () => fixture.probe,
    readSnapshot: async (_client, options) => {
      const { statsResetAt, deallocCount, rows } = fixture.snapshot;
      return { statsResetAt, deallocCount, rows: rows.slice(0, options.maxTemplates) };
    },
  };
}
/**
 * Builds a SQL-analysis stub that answers `analyzeForFingerprint` from the
 * fixture's `analysisBySql` table. SQL without an entry yields an empty
 * analysis whose `error` names the missing SQL. `analyzeBatch` always
 * resolves to an empty map.
 */
function fixtureSqlAnalysis(fixture: GoldenFixture): SqlAnalysisPort {
  return {
    async analyzeForFingerprint(sql) {
      const canned = fixture.analysisBySql[sql];
      return canned
        ? canned
        : {
            fingerprint: '',
            normalizedSql: '',
            tablesTouched: [],
            literalSlots: [],
            error: `missing fixture analysis for ${sql}`,
          };
    },
    async analyzeBatch() {
      return new Map();
    },
  };
}
/** Writes the fixture's starting baseline to `path`; does nothing when the fixture has none. */
async function writeFixtureBaseline(path: string, baseline: PgssBaseline | null): Promise<void> {
  if (baseline) {
    await writePgssBaselineAtomic(path, baseline);
  }
}
/**
 * Recursively collects every non-directory entry under `current`, returned as
 * paths relative to `root`. Order follows `readdir`; callers sort if needed.
 *
 * @param root - Directory the returned paths are relative to.
 * @param current - Directory being scanned; defaults to `root`.
 */
async function listFiles(root: string, current = root): Promise<string[]> {
  const collected: string[] = [];
  for (const entry of await readdir(current, { withFileTypes: true })) {
    const entryPath = join(current, entry.name);
    if (!entry.isDirectory()) {
      collected.push(relative(root, entryPath));
      continue;
    }
    const nested = await listFiles(root, entryPath);
    collected.push(...nested);
  }
  return collected;
}
/**
 * Asserts that `stagedDir` contains exactly the expected files and that each
 * one matches: entries with a `json` key are compared structurally after
 * `JSON.parse`, all others by exact text.
 */
async function expectGoldenFiles(stagedDir: string, expectedFiles: GoldenFixture['expectedFiles']): Promise<void> {
  const stagedPaths = await listFiles(stagedDir);
  const goldenPaths = Object.keys(expectedFiles).sort();
  expect(stagedPaths.sort()).toEqual(goldenPaths);

  for (const goldenPath of goldenPaths) {
    const golden = expectedFiles[goldenPath];
    const staged = await readFile(join(stagedDir, goldenPath), 'utf-8');
    if (!('json' in golden)) {
      expect(staged).toBe(golden.text);
      continue;
    }
    expect(JSON.parse(staged)).toEqual(golden.json);
  }
}
describe('stagePgStatStatementsTemplates golden fixtures', () => {
  // One case per committed fixture directory.
  const fixtureNames = ['first-run', 'normal-delta', 'reset-detected', 'version-change', 'eviction-churn'] as const;

  it.each(fixtureNames)('matches the committed %s golden output', async (fixtureName) => {
    const fixture = await readFixture(fixtureName);

    // Each run gets its own temp root holding the staged output and the baseline cache.
    const root = await tempDir(`pgss-golden-${fixtureName}-`);
    const stagedDir = join(root, 'staged');
    const baselinePath = join(root, 'cache', fixture.connectionId, 'pgss-baseline.json');
    await mkdir(dirname(baselinePath), { recursive: true });
    await writeFixtureBaseline(baselinePath, fixture.baseline);

    const { baseline } = await stagePgStatStatementsTemplates({
      stagedDir,
      connectionId: fixture.connectionId,
      queryClient: fakePgClient(),
      reader: fixtureReader(fixture),
      sqlAnalysis: fixtureSqlAnalysis(fixture),
      pullConfig: fixture.pullConfig,
      baselinePath,
      now: new Date(fixture.now),
    });

    await expectGoldenFiles(stagedDir, fixture.expectedFiles);
    expect(baseline).toEqual(fixture.expectedBaseline);
  });
});

View file

@ -1,655 +0,0 @@
import { mkdtemp, readFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it, vi } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import {
pgssBaselinePath,
readPgssBaseline,
stagePgStatStatementsTemplates,
writePgssBaselineAtomic,
type PgssBaseline,
} from './stage-pgss.js';
import { historicSqlManifestSchema, historicSqlMetadataSchema, historicSqlUsageSchema } from './types.js';
import type { KtxPostgresQueryClient, PostgresPgssReader, PostgresPgssRow } from './types.js';
/** Creates a fresh directory under the OS temp dir whose name starts with `prefix`, and returns its path. */
async function tempDir(prefix: string): Promise<string> {
  const template = join(tmpdir(), prefix);
  return mkdtemp(template);
}
/** Reads `<root>/<relPath>` as UTF-8 and parses it as JSON; the result is cast to `T`, not validated. */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const contents = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(contents) as T;
}
/** Stub query client: every `executeQuery` call resolves to an empty result. */
function fakePgClient(): KtxPostgresQueryClient {
  const executeQuery = async () => ({ headers: [], rows: [] });
  return { executeQuery };
}
/**
 * Builds a snapshot row for tests: the caller must give `queryid` and `query`,
 * every other field falls back to a fixed default unless overridden.
 */
function row(overrides: Partial<PostgresPgssRow> & Pick<PostgresPgssRow, 'queryid' | 'query'>): PostgresPgssRow {
  const defaults = {
    userid: '11',
    username: 'analyst',
    dbid: '5',
    database: 'warehouse',
    calls: 10,
    totalExecTime: 250,
    meanExecTime: 25,
    totalRows: 20,
  };
  return { ...defaults, ...overrides };
}
/**
 * Builds a mocked reader for tests.
 *
 * `probe` resolves to the given version/warnings (defaults: 'PostgreSQL 16.4'
 * and no warnings). `readSnapshot` resolves to the given rows cut to
 * `maxTemplates`. Because defaults are applied with `??`, a `null`
 * `statsResetAt` or `deallocCount` is replaced by its default as well
 * ('2026-05-08T08:00:00.000Z' and 0).
 */
function fakeReader(input: {
  pgServerVersion?: string;
  warnings?: string[];
  statsResetAt?: string | null;
  deallocCount?: number | null;
  rows: PostgresPgssRow[];
}): PostgresPgssReader {
  const probe = vi.fn(async () => {
    const pgServerVersion = input.pgServerVersion ?? 'PostgreSQL 16.4';
    const warnings = input.warnings ?? [];
    return { pgServerVersion, warnings };
  });
  const readSnapshot = vi.fn(async (_client, options) => {
    const statsResetAt = input.statsResetAt ?? '2026-05-08T08:00:00.000Z';
    const deallocCount = input.deallocCount ?? 0;
    return { statsResetAt, deallocCount, rows: input.rows.slice(0, options.maxTemplates) };
  });
  return { probe, readSnapshot };
}
/**
 * SQL-analysis stub keyed on substrings of the input SQL, checked in this
 * order: 'broken' yields a failed analysis, 'customers' yields the customers
 * fingerprint, anything else the orders fingerprint. `analyzeBatch` always
 * resolves to an empty map.
 */
const sqlAnalysis: SqlAnalysisPort = {
  async analyzeForFingerprint(sql) {
    const mentions = (needle: string): boolean => sql.includes(needle);

    if (mentions('broken')) {
      return { fingerprint: '', normalizedSql: '', tablesTouched: [], literalSlots: [], error: 'parse failed' };
    }

    const target = mentions('customers') ? 'customers' : 'orders';
    switch (target) {
      case 'customers':
        return {
          fingerprint: 'fp_customers',
          normalizedSql: 'SELECT count(*) FROM analytics.customers',
          tablesTouched: ['analytics.customers'],
          literalSlots: [],
        };
      default:
        return {
          fingerprint: 'fp_orders',
          normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
          tablesTouched: ['analytics.orders'],
          literalSlots: [],
        };
    }
  },
  async analyzeBatch() {
    return new Map();
  },
};
/**
 * Returns the Postgres pull configuration used by these tests.
 *
 * @param maxTemplatesPerRun - Cap on templates per run; defaults to 5000.
 */
function postgresPullConfig(maxTemplatesPerRun = 5000) {
  const dialect = 'postgres' as const;
  const serviceAccountUserPatterns = ['^svc_'];
  const redactionPatterns = ['secret'];
  return {
    dialect,
    windowDays: 90,
    lastSuccessfulCursor: null,
    serviceAccountUserPatterns,
    redactionPatterns,
    maxTemplatesPerRun,
    minCalls: 5,
  };
}
describe('stagePgStatStatementsTemplates', () => {
it('stages first-run PGSS templates as degraded aggregate templates and builds a next baseline', async () => {
  const stagedDir = await tempDir('pgss-stage-first-');
  const baselineRootDir = await tempDir('pgss-baseline-first-');
  const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
  const trackWarning =
    'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config';

  // Four snapshot rows go in; the manifest assertions below expect only db5_q101 to be staged
  // and db5_q104 to be reported as an analysis failure.
  const snapshotRows = [
    row({
      queryid: '101',
      query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
      calls: 10,
      totalExecTime: 250,
      totalRows: 20,
    }),
    row({
      queryid: '102',
      query: 'SELECT * FROM pg_catalog.pg_class',
      calls: 50,
      totalExecTime: 500,
    }),
    row({
      queryid: '103',
      query: 'BEGIN',
      calls: 75,
      totalExecTime: 75,
    }),
    row({
      queryid: '104',
      query: 'SELECT broken FROM analytics.orders',
      calls: 8,
      totalExecTime: 80,
    }),
  ];

  const result = await stagePgStatStatementsTemplates({
    stagedDir,
    connectionId: 'conn_pg',
    queryClient: fakePgClient(),
    reader: fakeReader({ warnings: [trackWarning], deallocCount: 2, rows: snapshotRows }),
    sqlAnalysis,
    pullConfig: postgresPullConfig(),
    baselinePath,
    now: new Date('2026-05-08T12:00:00.000Z'),
  });

  // Manifest: run-level facts, warnings in emission order, and the single staged template.
  const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
  expect(manifest).toMatchObject({
    source: 'historic-sql',
    connectionId: 'conn_pg',
    dialect: 'postgres',
    fetchedAt: '2026-05-08T12:00:00.000Z',
    windowEnd: '2026-05-08T12:00:00.000Z',
    nextSuccessfulCursor: '2026-05-08T12:00:00.000Z',
    templateCount: 1,
    capped: false,
    degraded: true,
    statsResetAt: '2026-05-08T08:00:00.000Z',
    baselineFirstRun: true,
    pgServerVersion: 'PostgreSQL 16.4',
    deallocCount: 2,
  });
  expect(manifest.warnings).toEqual([
    'pg_stat_statements.track is none; set it to top or all in the Postgres parameter group or config',
    'pgss_dealloc_count:2; pg_stat_statements.max may be too low, causing template eviction churn',
    'baseline_first_run:no_previous_pgss_baseline',
    'analysis_failed:db5_q104',
  ]);
  expect(manifest.templates).toEqual([
    {
      id: 'db5_q101',
      fingerprint: 'fp_orders',
      subClusterId: null,
      path: 'templates/db5_q101/page.md',
    },
  ]);

  // Per-template metadata.
  const metadata = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q101/metadata.json'));
  expect(metadata).toMatchObject({
    id: 'db5_q101',
    title: 'postgres · analytics.orders [db5_q101]',
    path: 'templates/db5_q101/page.md',
    objectType: 'historic_sql_template',
    lastEditedAt: null,
    properties: {
      fingerprint: 'fp_orders',
      sub_cluster_id: null,
      dialect: 'postgres',
      tables_touched: ['analytics.orders'],
      literal_slots: [],
    },
  });
  expect(metadata.properties.triage_signals).toEqual({
    executions_bucket: 'mid',
    distinct_users_bucket: 'solo',
    error_rate_bucket: 'ok',
    recency_bucket: 'active',
    service_account_only: 'false',
    runtime_bucket: 'fast',
  });

  // Per-template usage stats.
  const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q101/usage.json'));
  expect(usage).toEqual({
    stats: {
      executions: 10,
      distinct_users: 1,
      first_seen: '2026-05-08T12:00:00.000Z',
      last_seen: '2026-05-08T12:00:00.000Z',
      p50_runtime_ms: null,
      p95_runtime_ms: null,
      mean_runtime_ms: 25,
      error_rate: 0,
      rows_produced: 20,
    },
    literal_slots: [],
    samples: [],
  });

  const page = await readFile(join(stagedDir, 'templates/db5_q101/page.md'), 'utf-8');
  expect(page).toContain('SELECT count(*) FROM analytics.orders WHERE status = $1');

  // The next baseline is returned to the caller; the file at baselinePath must still be absent.
  expect(result.baselinePath).toBe(baselinePath);
  expect(result.baseline.templates.db5_q101.perUser['11']).toEqual({
    calls: 10,
    totalExecTime: 250,
    totalRows: 20,
  });
  await expect(readPgssBaseline(baselinePath)).resolves.toBeNull();
});
it('warns when pg_stat_statements reports dealloc churn', async () => {
  const root = await tempDir('pgss-churn-');
  const stagedDir = join(root, 'staged');
  const baselinePath = join(root, 'cache', 'warehouse', 'pgss-baseline.json');

  // One ordinary template plus a dealloc counter of 3 reported by the reader.
  const churningReader = fakeReader({
    rows: [
      row({
        queryid: '901',
        query: 'SELECT COUNT(*) FROM public.orders WHERE status = $1',
        calls: 20,
        totalExecTime: 500,
        meanExecTime: 25,
      }),
    ],
    deallocCount: 3,
  });

  await stagePgStatStatementsTemplates({
    stagedDir,
    connectionId: 'warehouse',
    queryClient: fakePgClient(),
    reader: churningReader,
    sqlAnalysis,
    pullConfig: postgresPullConfig(50),
    baselinePath,
    now: new Date('2026-05-08T12:00:00.000Z'),
  });

  const { warnings, deallocCount } = await readJson<{ warnings: string[]; deallocCount: number }>(
    stagedDir,
    'manifest.json',
  );
  expect(deallocCount).toBe(3);
  expect(warnings).toContain(
    'pgss_dealloc_count:3; pg_stat_statements.max may be too low, causing template eviction churn',
  );
});
// Verifies delta computation against a previously persisted baseline:
// only the growth of each (template, user) counter since the baseline is staged.
it('uses the saved cumulative baseline to stage only positive deltas on later runs', async () => {
const stagedDir = await tempDir('pgss-stage-delta-');
const baselineRootDir = await tempDir('pgss-baseline-delta-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
// Previous run: template db5_q201 was seen for users 11 and 12.
const baseline: PgssBaseline = {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q201: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 10, totalExecTime: 100, totalRows: 50 },
'12': { calls: 5, totalExecTime: 50, totalRows: 25 },
},
},
},
};
await writePgssBaselineAtomic(baselinePath, baseline);
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
// User 11 grew by 2 calls / 60 ms / 8 rows relative to the baseline.
row({
queryid: '201',
userid: '11',
username: 'analyst',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 12,
totalExecTime: 160,
totalRows: 58,
}),
// User 12 is unchanged versus the baseline, so it contributes nothing.
row({
queryid: '201',
userid: '12',
username: 'svc_loader',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 5,
totalExecTime: 50,
totalRows: 25,
}),
// Query 202 has no baseline entry, so its full counters count as the delta.
row({
queryid: '202',
userid: '13',
username: 'analyst_2',
query: 'SELECT count(*) FROM analytics.customers',
calls: 7,
totalExecTime: 210,
totalRows: 7,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.baselineFirstRun).toBe(false);
// The window starts at the previous baseline's fetch time.
expect(manifest.windowStart).toBe('2026-05-08T10:00:00.000Z');
expect(manifest.templateCount).toBe(2);
// db5_q202 (7 delta calls) is listed before db5_q201 (2 delta calls).
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q202', 'db5_q201']);
const usage201 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q201/usage.json'));
expect(usage201.stats).toMatchObject({
executions: 2,
distinct_users: 1,
first_seen: '2026-05-08T09:00:00.000Z',
last_seen: '2026-05-08T12:00:00.000Z',
mean_runtime_ms: 30,
rows_produced: 8,
});
const metadata201 = historicSqlMetadataSchema.parse(await readJson(stagedDir, 'templates/db5_q201/metadata.json'));
// Only 'analyst' contributed a positive delta, so the template is not service-account-only.
expect(metadata201.properties.triage_signals.service_account_only).toBe('false');
const usage202 = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q202/usage.json'));
expect(usage202.stats).toMatchObject({
executions: 7,
distinct_users: 1,
first_seen: '2026-05-08T12:00:00.000Z',
mean_runtime_ms: 30,
rows_produced: 7,
});
});
// Verifies that the template key includes the database id: the same queryid in
// two databases yields two staged templates and two baseline entries, each
// diffed against its own previous counters.
it('keeps matching queryid values from different databases as distinct templates and baseline entries', async () => {
const stagedDir = await tempDir('pgss-stage-db-key-');
const baselineRootDir = await tempDir('pgss-baseline-db-key-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(baselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q701: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 10, totalExecTime: 100, totalRows: 50 },
},
},
db6_q701: {
firstObservedAt: '2026-05-08T09:30:00.000Z',
perUser: {
'11': { calls: 4, totalExecTime: 40, totalRows: 20 },
},
},
},
});
const result = await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
// dbid 5: 12 - 10 = 2 calls, 58 - 50 = 8 rows.
row({
queryid: '701',
dbid: '5',
database: 'warehouse',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 12,
totalExecTime: 160,
totalRows: 58,
}),
// dbid 6: 9 - 4 = 5 calls, 35 - 20 = 15 rows.
row({
queryid: '701',
dbid: '6',
database: 'app',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 9,
totalExecTime: 130,
totalRows: 35,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templates.map((template) => template.id).sort()).toEqual(['db5_q701', 'db6_q701']);
const warehouseUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q701/usage.json'));
expect(warehouseUsage.stats).toMatchObject({
executions: 2,
rows_produced: 8,
first_seen: '2026-05-08T09:00:00.000Z',
});
const appUsage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db6_q701/usage.json'));
expect(appUsage.stats).toMatchObject({
executions: 5,
rows_produced: 15,
first_seen: '2026-05-08T09:30:00.000Z',
});
// The returned next baseline stores the raw cumulative counters, not the deltas.
expect(result.baseline.templates.db5_q701.perUser['11']).toEqual({
calls: 12,
totalExecTime: 160,
totalRows: 58,
});
expect(result.baseline.templates.db6_q701.perUser['11']).toEqual({
calls: 9,
totalExecTime: 130,
totalRows: 35,
});
});
// Verifies the two global reset triggers. In both scenarios the stored baseline
// must be ignored (baselineFirstRun === true) and a baseline_reset warning emitted.
it('treats stats_reset advancement and major-version changes as fresh baselines', async () => {
// Scenario 1: the snapshot's stats_reset timestamp is newer than the baseline's.
const resetStagedDir = await tempDir('pgss-stage-reset-');
const resetBaselineRootDir = await tempDir('pgss-baseline-reset-');
const resetBaselinePath = pgssBaselinePath(resetBaselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(resetBaselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q301: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
},
},
},
});
await stagePgStatStatementsTemplates({
stagedDir: resetStagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
statsResetAt: '2026-05-08T11:00:00.000Z',
rows: [
row({
queryid: '301',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 3,
totalExecTime: 90,
totalRows: 9,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath: resetBaselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const resetManifest = historicSqlManifestSchema.parse(await readJson(resetStagedDir, 'manifest.json'));
expect(resetManifest.baselineFirstRun).toBe(true);
expect(resetManifest.warnings).toContain(
'baseline_reset:stats_reset advanced from 2026-05-08T08:00:00.000Z to 2026-05-08T11:00:00.000Z',
);
const resetUsage = historicSqlUsageSchema.parse(await readJson(resetStagedDir, 'templates/db5_q301/usage.json'));
// The full current counter is staged because the old baseline (100 calls) is discarded.
expect(resetUsage.stats.executions).toBe(3);
// Scenario 2: the server major version differs between baseline (15) and probe (16).
const versionStagedDir = await tempDir('pgss-stage-version-');
const versionBaselineRootDir = await tempDir('pgss-baseline-version-');
const versionBaselinePath = pgssBaselinePath(versionBaselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(versionBaselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 15.7',
templates: {
db5_q302: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
},
},
},
});
await stagePgStatStatementsTemplates({
stagedDir: versionStagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
pgServerVersion: 'PostgreSQL 16.4',
rows: [
row({
queryid: '302',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 4,
totalExecTime: 80,
totalRows: 8,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath: versionBaselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const versionManifest = historicSqlManifestSchema.parse(await readJson(versionStagedDir, 'manifest.json'));
expect(versionManifest.baselineFirstRun).toBe(true);
expect(versionManifest.warnings).toContain('baseline_reset:pg_server_major changed from 15 to 16');
});
// Verifies per-(template, user) regression handling: when one user's counters
// went backwards, only that user's baseline is dropped; the other user is still
// diffed normally and the run is not flagged as a first-run baseline.
it('handles scoped counter regressions without forcing a global first-run baseline', async () => {
const stagedDir = await tempDir('pgss-stage-scoped-');
const baselineRootDir = await tempDir('pgss-baseline-scoped-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
await writePgssBaselineAtomic(baselinePath, {
version: 1,
fetchedAt: '2026-05-08T10:00:00.000Z',
statsResetAt: '2026-05-08T08:00:00.000Z',
pgServerVersion: 'PostgreSQL 16.4',
templates: {
db5_q401: {
firstObservedAt: '2026-05-08T09:00:00.000Z',
perUser: {
'11': { calls: 100, totalExecTime: 1000, totalRows: 500 },
'12': { calls: 50, totalExecTime: 500, totalRows: 250 },
},
},
},
});
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
// Same stats_reset as the baseline, so no global reset is detected.
statsResetAt: '2026-05-08T08:00:00.000Z',
rows: [
// User 11 regressed (2 < 100 calls): counted from zero -> 2 calls / 30 ms / 6 rows.
row({
queryid: '401',
userid: '11',
username: 'analyst',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 2,
totalExecTime: 30,
totalRows: 6,
}),
// User 12 advanced normally: 5 calls / 150 ms / 25 rows since the baseline.
row({
queryid: '401',
userid: '12',
username: 'svc_loader',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 55,
totalExecTime: 650,
totalRows: 275,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.baselineFirstRun).toBe(false);
expect(manifest.warnings).toContain('scoped_reset:dbid=5 queryid=401 userid=11');
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/db5_q401/usage.json'));
// Totals: 2 + 5 = 7 calls, 30 + 150 = 180 ms (180 / 7 mean), 6 + 25 = 31 rows.
expect(usage.stats).toMatchObject({
executions: 7,
distinct_users: 2,
mean_runtime_ms: 25.714285714285715,
rows_produced: 31,
});
});
// Verifies ranking and truncation: with a cap of two templates per run
// (postgresPullConfig(2) — presumably sets maxTemplatesPerRun; helper is outside
// this excerpt), the two templates with the most calls are kept and the manifest
// records that the result was capped.
it('ranks and caps selected PGSS templates after skip and analysis filtering', async () => {
const stagedDir = await tempDir('pgss-stage-cap-');
const baselineRootDir = await tempDir('pgss-baseline-cap-');
const baselinePath = pgssBaselinePath(baselineRootDir, 'conn_pg');
// No baseline file is written, so every row's full counter is its delta.
await stagePgStatStatementsTemplates({
stagedDir,
connectionId: 'conn_pg',
queryClient: fakePgClient(),
reader: fakeReader({
rows: [
row({
queryid: '501',
username: 'analyst_a',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 2,
totalExecTime: 20,
}),
row({
queryid: '502',
username: 'analyst_b',
query: 'SELECT count(*) FROM analytics.customers',
calls: 20,
totalExecTime: 200,
}),
row({
queryid: '503',
username: 'analyst_c',
query: 'SELECT count(*) FROM analytics.orders WHERE status = $1',
calls: 10,
totalExecTime: 100,
}),
],
}),
sqlAnalysis,
pullConfig: postgresPullConfig(2),
baselinePath,
now: new Date('2026-05-08T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.capped).toBe(true);
expect(manifest.warnings).toContain('templates_truncated: kept 2 of 3 templates');
// Each template has one user, so ordering follows call volume: 20 calls, then 10.
expect(manifest.templates.map((template) => template.id)).toEqual(['db5_q502', 'db5_q503']);
});
});

View file

@ -1,508 +0,0 @@
import { mkdir, readFile, rename, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import { z } from 'zod';
import type { SqlAnalysisFingerprintResult, SqlAnalysisPort } from '../../../sql-analysis/index.js';
import {
HISTORIC_SQL_OBJECT_TYPE,
HISTORIC_SQL_SOURCE_KEY,
historicSqlPullConfigSchema,
type HistoricSqlManifest,
type HistoricSqlMetadata,
type HistoricSqlPullConfig,
type HistoricSqlUsage,
type KtxPostgresQueryClient,
type PostgresPgssAggregateRow,
type PostgresPgssReader,
type PostgresPgssRow,
} from './types.js';
// Version tag stored in every baseline file; the schema below only accepts this value.
const PGSS_BASELINE_VERSION = 1 as const;
// Cumulative pg_stat_statements counters captured for one (template, user) pair.
const pgssCounterSchema = z.object({
calls: z.number().int().nonnegative(),
totalExecTime: z.number().nonnegative(),
totalRows: z.number().int().nonnegative(),
});
// On-disk shape of the baseline snapshot used to compute deltas between runs.
const pgssBaselineSchema = z.object({
version: z.literal(PGSS_BASELINE_VERSION),
// When the snapshot that produced this baseline was taken.
fetchedAt: z.string().datetime(),
// stats_reset timestamp reported with that snapshot, if any.
statsResetAt: z.string().datetime().nullable(),
pgServerVersion: z.string(),
// Keyed by template id (see pgssTemplateId); perUser is keyed by the row's userid.
templates: z.record(
z.string(),
z.object({
firstObservedAt: z.string().datetime(),
perUser: z.record(z.string(), pgssCounterSchema),
}),
),
});
/** Validated baseline snapshot, as read from or written to the baseline file. */
export type PgssBaseline = z.infer<typeof pgssBaselineSchema>;
/** Arguments accepted by stagePgStatStatementsTemplates. */
export interface StagePgStatStatementsTemplatesInput {
// Directory that receives manifest.json and the templates/<id>/ files.
stagedDir: string;
connectionId: string;
queryClient: KtxPostgresQueryClient;
// Provides probe() and readSnapshot() over the query client.
reader: PostgresPgssReader;
sqlAnalysis: SqlAnalysisPort;
pullConfig: HistoricSqlPullConfig;
// Location of the previous baseline file; it is read here but not written.
baselinePath: string;
// Overrides the current time (defaults to new Date()).
now?: Date;
}
/** Result of staging: the baseline path plus the next baseline for the caller. */
export interface StagePgStatStatementsTemplatesResult {
baselinePath: string;
baseline: PgssBaseline;
}
// Plain counter triple; structurally the same fields as pgssCounterSchema.
interface PgssBaselineCounter {
calls: number;
totalExecTime: number;
totalRows: number;
}
// Accumulator used while folding per-user rows into one entry per template.
interface PgssAggregateMutable {
id: string;
queryid: string;
dbid: string;
database: string | null;
query: string;
deltaCalls: number;
deltaExecTime: number;
deltaRows: number;
// Usernames that contributed a positive call delta.
users: Set<string>;
firstObservedAt: string;
}
// An aggregated template paired with its successful SQL analysis result.
interface AnalyzedPgssTemplate {
aggregate: PostgresPgssAggregateRow;
analysis: SqlAnalysisFingerprintResult;
}
// Baseline used when there is no usable previous counter (first run or scoped reset).
// NOTE: this single object is shared by every caller; it must not be mutated.
const ZERO_COUNTER: PgssBaselineCounter = {
calls: 0,
totalExecTime: 0,
totalRows: 0,
};
// Passed as maxTemplates when reading the snapshot; the per-run cap is applied later.
const PGSS_SNAPSHOT_READ_LIMIT = 5000;
// Statements that start with one of these keywords (case-insensitive) are never staged.
const PGSS_HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET|BEGIN|COMMIT|ROLLBACK|VACUUM|ANALYZE)\b/i;
// Statements that mention catalog/statistics objects anywhere in their text are never staged.
const PGSS_HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|pg_catalog\.|pg_toast\.|pg_stat_)/i;
/**
 * Builds the template key for a pg_stat_statements row.
 *
 * The key combines the database id and the query id (`db<dbid>_q<queryid>`),
 * so equal query ids from different databases map to different templates.
 */
function pgssTemplateId(row: Pick<PostgresPgssRow, 'dbid' | 'queryid'>): string {
  const { dbid, queryid } = row;
  return `db${dbid}_q${queryid}`;
}
/**
 * Resolves the baseline file location for a connection.
 *
 * @param rootDir Cache root; when omitted, `.ktx/cache/historic-sql` under the
 *   current working directory is used.
 * @param connectionId Connection whose baseline is addressed; becomes a sub-directory.
 * @returns `<root>/<connectionId>/pgss-baseline.json`.
 */
export function pgssBaselinePath(rootDir: string | undefined, connectionId: string): string {
  const cacheRoot = rootDir ?? join(process.cwd(), '.ktx/cache/historic-sql');
  return join(cacheRoot, connectionId, 'pgss-baseline.json');
}
/**
 * Loads and validates a previously saved baseline.
 *
 * @param path Baseline file to read.
 * @returns The parsed baseline, or `null` when the file does not exist.
 * @throws Any other read error, JSON syntax error, or schema validation error.
 */
export async function readPgssBaseline(path: string): Promise<PgssBaseline | null> {
  try {
    const raw = await readFile(path, 'utf-8');
    const decoded: unknown = JSON.parse(raw);
    return pgssBaselineSchema.parse(decoded);
  } catch (error) {
    // A missing file just means no baseline has been stored yet.
    const isMissingFile =
      typeof error === 'object' && error !== null && 'code' in error && error.code === 'ENOENT';
    if (!isMissingFile) {
      throw error;
    }
    return null;
  }
}
/**
 * Persists a baseline by writing `<path>.tmp` and renaming it over `path`.
 *
 * The baseline is validated against the schema before anything is written,
 * and the parent directory is created if needed.
 *
 * @throws Schema validation errors and any filesystem error.
 */
export async function writePgssBaselineAtomic(path: string, baseline: PgssBaseline): Promise<void> {
  const validated = pgssBaselineSchema.parse(baseline);
  const serialized = `${JSON.stringify(validated, null, 2)}\n`;
  const stagingPath = `${path}.tmp`;
  await mkdir(dirname(path), { recursive: true });
  await writeFile(stagingPath, serialized, 'utf-8');
  // Rename last so readers never see a half-written baseline at `path`.
  await rename(stagingPath, path);
}
/**
 * Stages pg_stat_statements templates for one Postgres connection.
 *
 * Reads a snapshot through `input.reader`, diffs it against the baseline stored
 * at `input.baselinePath`, analyses the surviving statements, keeps at most
 * `maxTemplatesPerRun` of them, and writes `manifest.json` plus
 * `templates/<id>/{metadata.json,page.md,usage.json}` under `input.stagedDir`.
 *
 * The baseline file itself is NOT written here; the next baseline is returned
 * so the caller can persist it.
 *
 * @throws If the pull config fails validation or its dialect is not `postgres`,
 *   and on any reader, analysis, or filesystem error.
 */
export async function stagePgStatStatementsTemplates(
input: StagePgStatStatementsTemplatesInput,
): Promise<StagePgStatStatementsTemplatesResult> {
const config = historicSqlPullConfigSchema.parse(input.pullConfig);
if (config.dialect !== 'postgres') {
throw new Error(`stagePgStatStatementsTemplates requires dialect postgres, got ${config.dialect}`);
}
const now = input.now ?? new Date();
const fetchedAt = now.toISOString();
const probe = await input.reader.probe(input.queryClient);
// Warnings accumulate in emission order: probe, dealloc, reset, scoped resets, analysis, truncation.
const warnings = [...probe.warnings];
const baseline = await readPgssBaseline(input.baselinePath);
const snapshot = await input.reader.readSnapshot(input.queryClient, {
minCalls: config.minCalls,
maxTemplates: PGSS_SNAPSHOT_READ_LIMIT,
});
if (snapshot.deallocCount !== null && snapshot.deallocCount > 0) {
warnings.push(
`pgss_dealloc_count:${snapshot.deallocCount}; pg_stat_statements.max may be too low, causing template eviction churn`,
);
}
const reset = detectBaselineReset({
baseline,
snapshotStatsResetAt: snapshot.statsResetAt,
currentPgServerVersion: probe.pgServerVersion,
});
warnings.push(...reset.warnings);
// aggregatePgssRows appends scoped_reset warnings to `warnings` as a side effect.
const aggregates = aggregatePgssRows({
rows: snapshot.rows,
baseline,
baselineFirstRun: reset.baselineFirstRun,
fetchedAt,
warnings,
}).filter((aggregate) => !shouldSkipPgssSql(aggregate.query));
const analyzed: AnalyzedPgssTemplate[] = [];
// Analysed one at a time; statements that fail analysis are dropped with a warning.
for (const aggregate of aggregates) {
const analysis = await input.sqlAnalysis.analyzeForFingerprint(aggregate.query, 'postgres');
if (analysis.error || !analysis.fingerprint || !analysis.normalizedSql) {
warnings.push(`analysis_failed:${aggregate.id}`);
continue;
}
analyzed.push({ aggregate, analysis });
}
const selected = selectPgssTemplates(analyzed, config.maxTemplatesPerRun);
if (selected.length < analyzed.length) {
warnings.push(`templates_truncated: kept ${selected.length} of ${analyzed.length} templates`);
}
await mkdir(input.stagedDir, { recursive: true });
const templates: HistoricSqlManifest['templates'] = [];
for (const template of selected) {
const staged = buildPgssStagedTemplate(template, config, now);
const basePath = `templates/${staged.metadata.id}`;
await writeJson(input.stagedDir, `${basePath}/metadata.json`, staged.metadata);
await writeText(input.stagedDir, `${basePath}/page.md`, staged.pageMarkdown);
await writeJson(input.stagedDir, `${basePath}/usage.json`, staged.usage);
templates.push({
id: staged.metadata.id,
fingerprint: staged.metadata.properties.fingerprint,
subClusterId: staged.metadata.properties.sub_cluster_id,
path: staged.metadata.path,
});
}
await writeJson(input.stagedDir, 'manifest.json', {
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: input.connectionId,
dialect: 'postgres',
fetchedAt,
// NOTE(review): after a detected reset the deltas are computed from zero, yet windowStart
// still prefers the old baseline's fetchedAt — confirm this is the intended window.
windowStart: baseline?.fetchedAt ?? snapshot.statsResetAt ?? fetchedAt,
windowEnd: fetchedAt,
nextSuccessfulCursor: fetchedAt,
templateCount: selected.length,
capped: selected.length < analyzed.length,
warnings,
// Always written as true by this code path.
degraded: true,
statsResetAt: snapshot.statsResetAt,
baselineFirstRun: reset.baselineFirstRun,
pgServerVersion: probe.pgServerVersion,
deallocCount: snapshot.deallocCount,
templates,
} satisfies HistoricSqlManifest);
return {
baselinePath: input.baselinePath,
// Built from every snapshot row (not only the selected templates); firstObservedAt
// values are carried over only when the previous baseline was still valid.
baseline: buildNextBaseline({
rows: snapshot.rows,
fetchedAt,
statsResetAt: snapshot.statsResetAt,
pgServerVersion: probe.pgServerVersion,
previousBaseline: reset.baselineFirstRun ? null : baseline,
}),
};
}
/**
 * Decides whether the stored baseline can still be used for delta computation.
 *
 * The run is treated as a first run (baseline ignored) when:
 * - there is no baseline at all,
 * - the snapshot's stats_reset instant is later than the baseline's, or
 * - the Postgres major version differs between the baseline and the probe.
 *
 * The stats_reset timestamps are compared as instants, not as strings, so the
 * same moment written with a different precision or UTC offset
 * (e.g. `…08:00:00.000Z` vs `…08:00:00Z`) is not mistaken for a reset.
 *
 * @returns `baselineFirstRun` plus the warnings explaining why.
 */
function detectBaselineReset(input: {
  baseline: PgssBaseline | null;
  snapshotStatsResetAt: string | null;
  currentPgServerVersion: string;
}): { baselineFirstRun: boolean; warnings: string[] } {
  if (!input.baseline) {
    return { baselineFirstRun: true, warnings: ['baseline_first_run:no_previous_pgss_baseline'] };
  }
  const warnings: string[] = [];
  const previousResetAt = input.baseline.statsResetAt;
  const currentResetAt = input.snapshotStatsResetAt;
  // A reset is only detectable when both sides report a stats_reset timestamp.
  if (previousResetAt && currentResetAt && pgssTimestampIsBefore(previousResetAt, currentResetAt)) {
    warnings.push(`baseline_reset:stats_reset advanced from ${previousResetAt} to ${currentResetAt}`);
  }
  const previousMajor = postgresMajor(input.baseline.pgServerVersion);
  const currentMajor = postgresMajor(input.currentPgServerVersion);
  // Versions whose major number cannot be extracted are not compared.
  if (previousMajor && currentMajor && previousMajor !== currentMajor) {
    warnings.push(`baseline_reset:pg_server_major changed from ${previousMajor} to ${currentMajor}`);
  }
  return { baselineFirstRun: warnings.length > 0, warnings };
}
/**
 * Returns true when `left` denotes an earlier instant than `right`.
 * Falls back to plain string ordering when either value cannot be parsed as a date.
 */
function pgssTimestampIsBefore(left: string, right: string): boolean {
  const leftMs = Date.parse(left);
  const rightMs = Date.parse(right);
  if (Number.isNaN(leftMs) || Number.isNaN(rightMs)) {
    return left < right;
  }
  return leftMs < rightMs;
}
/**
 * Extracts the major version number from a server version string.
 * Accepts both `PostgreSQL 16.4 …` banners and bare `16.4` / `16` values;
 * returns `null` when neither form matches.
 */
function postgresMajor(version: string): string | null {
  return version.match(/PostgreSQL\s+(\d+)/i)?.[1] ?? version.match(/^(\d+)(?:\.|$)/)?.[1] ?? null;
}
/**
 * Folds per-user snapshot rows into one aggregate per template, counting only
 * the growth since the baseline.
 *
 * - On a first run (or when no baseline entry exists) the full counters count as the delta.
 * - Rows whose call count did not change are skipped on non-first runs.
 * - Only users with a positive call delta are recorded; a missing username is
 *   recorded as `'unknown'`.
 * - Templates whose total call delta is zero are dropped from the result.
 *
 * Side effect: scoped-reset warnings are appended to `input.warnings`.
 */
function aggregatePgssRows(input: {
  rows: PostgresPgssRow[];
  baseline: PgssBaseline | null;
  baselineFirstRun: boolean;
  fetchedAt: string;
  warnings: string[];
}): PostgresPgssAggregateRow[] {
  const byTemplate = new Map<string, PgssAggregateMutable>();
  // On a first run the stored baseline is ignored entirely.
  const priorTemplates = input.baselineFirstRun ? undefined : input.baseline?.templates;

  for (const row of input.rows) {
    const key = pgssTemplateId(row);
    const priorTemplate = priorTemplates?.[key];
    const prior = scopedCounterBaseline(
      row,
      priorTemplate?.perUser[row.userid],
      input.baselineFirstRun,
      input.warnings,
    );
    const callsDelta = row.calls - prior.calls;
    if (callsDelta === 0 && !input.baselineFirstRun) {
      continue;
    }

    let entry = byTemplate.get(key);
    if (entry === undefined) {
      entry = {
        id: key,
        queryid: row.queryid,
        dbid: row.dbid,
        database: row.database,
        query: row.query,
        deltaCalls: 0,
        deltaExecTime: 0,
        deltaRows: 0,
        users: new Set<string>(),
        firstObservedAt: priorTemplate?.firstObservedAt ?? input.fetchedAt,
      };
      byTemplate.set(key, entry);
    }

    entry.deltaCalls += Math.max(0, callsDelta);
    entry.deltaExecTime += Math.max(0, row.totalExecTime - prior.totalExecTime);
    entry.deltaRows += Math.max(0, row.totalRows - prior.totalRows);
    if (callsDelta > 0) {
      entry.users.add(row.username ?? 'unknown');
    }
  }

  const result: PostgresPgssAggregateRow[] = [];
  for (const entry of byTemplate.values()) {
    if (!(entry.deltaCalls > 0)) {
      continue;
    }
    result.push({
      id: entry.id,
      queryid: entry.queryid,
      dbid: entry.dbid,
      database: entry.database,
      query: entry.query,
      deltaCalls: entry.deltaCalls,
      deltaExecTime: entry.deltaExecTime,
      deltaRows: entry.deltaRows,
      meanExecTime: entry.deltaExecTime / Math.max(entry.deltaCalls, 1),
      distinctUsersDelta: entry.users.size,
      users: Array.from(entry.users).sort(),
      firstObservedAt: entry.firstObservedAt,
    });
  }
  return result;
}
/**
 * Picks the counters a row should be diffed against.
 *
 * Returns the zero counter on a first run or when no baseline entry exists.
 * If any stored counter exceeds the row's current value, the counters went
 * backwards for this (database, query, user): a `scoped_reset` warning is
 * pushed and the zero counter is returned. Otherwise the stored baseline
 * counter is returned.
 */
function scopedCounterBaseline(
  row: PostgresPgssRow,
  baselineCounter: PgssBaselineCounter | undefined,
  baselineFirstRun: boolean,
  warnings: string[],
): PgssBaselineCounter {
  if (baselineFirstRun || !baselineCounter) {
    return ZERO_COUNTER;
  }
  const regressed =
    row.calls < baselineCounter.calls ||
    row.totalExecTime < baselineCounter.totalExecTime ||
    row.totalRows < baselineCounter.totalRows;
  if (!regressed) {
    return baselineCounter;
  }
  warnings.push(`scoped_reset:dbid=${row.dbid} queryid=${row.queryid} userid=${row.userid}`);
  return ZERO_COUNTER;
}
/**
 * Reports whether a statement must never be staged: either it starts with a
 * hard-skipped keyword or it references a hard-skipped catalog/statistics object.
 */
function shouldSkipPgssSql(sql: string): boolean {
  if (PGSS_HARD_SKIP_PREFIX_RE.test(sql)) {
    return true;
  }
  return PGSS_HARD_SKIP_TABLE_RE.test(sql);
}
/**
 * Ranks analysed templates and keeps the top `maxTemplatesPerRun`.
 *
 * Score is `users.length * log1p(deltaCalls)`, highest first; ties are broken
 * by template id using `localeCompare`. The input array is not modified.
 */
function selectPgssTemplates(templates: AnalyzedPgssTemplate[], maxTemplatesPerRun: number): AnalyzedPgssTemplate[] {
  const ranked = templates.map((template) => {
    const { users, deltaCalls } = template.aggregate;
    return { template, score: users.length * Math.log1p(deltaCalls) };
  });
  ranked.sort((left, right) => {
    const byScore = right.score - left.score;
    if (byScore) {
      return byScore;
    }
    return left.template.aggregate.id.localeCompare(right.template.aggregate.id);
  });
  const kept: AnalyzedPgssTemplate[] = [];
  for (const entry of ranked.slice(0, maxTemplatesPerRun)) {
    kept.push(entry.template);
  }
  return kept;
}
/**
 * Produces the three staged artefacts (metadata, markdown page, usage stats)
 * for one analysed template.
 *
 * `last_seen` is always the staging time `now`; percentile runtimes, literal
 * slots and samples are not available from this source and are emitted as
 * `null` / empty. The error rate is always reported as 0.
 */
function buildPgssStagedTemplate(
  template: AnalyzedPgssTemplate,
  config: HistoricSqlPullConfig,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const { aggregate, analysis } = template;
  const id = aggregate.id;
  const seenAt = now.toISOString();
  const tablesTouched = [...analysis.tablesTouched].sort();
  // The title names the alphabetically first table, or 'query' when none were found.
  const headlineTable = tablesTouched[0] ?? 'query';

  const triageSignals = buildPgssTriageSignals({
    executions: aggregate.deltaCalls,
    distinctUsers: aggregate.distinctUsersDelta,
    firstSeen: aggregate.firstObservedAt,
    lastSeen: seenAt,
    meanRuntimeMs: aggregate.meanExecTime,
    serviceAccountOnly: isServiceAccountOnly(aggregate.users, config.serviceAccountUserPatterns),
    now,
  });

  const metadata: HistoricSqlMetadata = {
    id,
    title: `postgres · ${headlineTable} [${id.slice(0, 12)}]`,
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint: analysis.fingerprint,
      sub_cluster_id: null,
      dialect: 'postgres',
      tables_touched: tablesTouched,
      literal_slots: [],
      triage_signals: triageSignals,
    },
  };

  const usage: HistoricSqlUsage = {
    stats: {
      executions: aggregate.deltaCalls,
      distinct_users: aggregate.distinctUsersDelta,
      first_seen: aggregate.firstObservedAt,
      last_seen: seenAt,
      p50_runtime_ms: null,
      p95_runtime_ms: null,
      mean_runtime_ms: aggregate.meanExecTime,
      error_rate: 0,
      rows_produced: aggregate.deltaRows,
    },
    literal_slots: [],
    samples: [],
  };

  const pageMarkdown = renderTemplatePage(id, analysis.normalizedSql, tablesTouched);
  return { metadata, pageMarkdown, usage };
}
/**
 * Converts raw usage numbers into coarse string buckets.
 *
 * - executions: `<3` low, `<50` mid, otherwise high
 * - distinct users: `<=1` solo, `<=5` team, otherwise broad
 * - error rate: always `ok`
 * - recency and runtime are delegated to `recencyBucket` / `runtimeBucket`
 *
 * `firstSeen` is accepted but does not influence any bucket.
 */
function buildPgssTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  firstSeen: string;
  lastSeen: string;
  meanRuntimeMs: number;
  serviceAccountOnly: boolean;
  now: Date;
}): Record<string, string> {
  let executionsBucket: string;
  if (input.executions < 3) {
    executionsBucket = 'low';
  } else if (input.executions < 50) {
    executionsBucket = 'mid';
  } else {
    executionsBucket = 'high';
  }

  let usersBucket: string;
  if (input.distinctUsers <= 1) {
    usersBucket = 'solo';
  } else if (input.distinctUsers <= 5) {
    usersBucket = 'team';
  } else {
    usersBucket = 'broad';
  }

  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: usersBucket,
    error_rate_bucket: 'ok',
    recency_bucket: recencyBucket(input.lastSeen, input.now),
    service_account_only: String(input.serviceAccountOnly),
    runtime_bucket: runtimeBucket(input.meanRuntimeMs),
  };
}
/**
 * Buckets a mean runtime in milliseconds: below 100 is `fast`,
 * below 1000 is `moderate`, anything else is `slow`.
 */
function runtimeBucket(meanRuntimeMs: number): string {
  return meanRuntimeMs < 100 ? 'fast' : meanRuntimeMs < 1000 ? 'moderate' : 'slow';
}
/**
 * Buckets how long ago a template was last seen, relative to `now`:
 * up to 14 days is `active`, up to 60 days is `warm`, otherwise `cold`.
 * Timestamps in the future count as age zero.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const MS_PER_DAY = 86400000;
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / MS_PER_DAY);
  return ageDays <= 14 ? 'active' : ageDays <= 60 ? 'warm' : 'cold';
}
/**
 * Reports whether every user matches at least one service-account pattern.
 *
 * Returns `false` when there are no users or no patterns. Patterns are
 * compiled with `new RegExp`, so an invalid pattern throws.
 */
function isServiceAccountOnly(users: string[], patterns: string[]): boolean {
  if (users.length === 0 || patterns.length === 0) {
    return false;
  }
  const matchers = patterns.map((pattern) => new RegExp(pattern));
  for (const user of users) {
    const matched = matchers.some((matcher) => matcher.test(user));
    if (!matched) {
      return false;
    }
  }
  return true;
}
/**
 * Renders the markdown page for a template: a heading with the id, the
 * normalized SQL in a fenced `sql` block, and a bullet list of touched tables.
 * The output always ends with a newline.
 */
function renderTemplatePage(id: string, normalizedSql: string, tablesTouched: string[]): string {
  const lines: string[] = [
    `# ${id}`,
    '',
    '## Normalized SQL',
    '```sql',
    normalizedSql,
    '```',
    '',
    '## Tables touched',
  ];
  for (const table of tablesTouched) {
    lines.push(`- ${table}`);
  }
  // Trailing empty entry yields the final newline after join.
  lines.push('');
  return lines.join('\n');
}
/**
 * Builds the baseline to persist after a run from the raw snapshot rows.
 *
 * Every row's cumulative counters are stored under its template id and user
 * id. `firstObservedAt` is inherited from `previousBaseline` when the template
 * was already known; otherwise it is `fetchedAt`. Templates that exist only
 * in `previousBaseline` are not carried over.
 */
function buildNextBaseline(input: {
  rows: PostgresPgssRow[];
  fetchedAt: string;
  statsResetAt: string | null;
  pgServerVersion: string;
  previousBaseline: PgssBaseline | null;
}): PgssBaseline {
  const templates: PgssBaseline['templates'] = {};
  for (const row of input.rows) {
    const id = pgssTemplateId(row);
    let entry = templates[id];
    if (entry == null) {
      entry = {
        firstObservedAt: input.previousBaseline?.templates[id]?.firstObservedAt ?? input.fetchedAt,
        perUser: {},
      };
      templates[id] = entry;
    }
    const { calls, totalExecTime, totalRows } = row;
    entry.perUser[row.userid] = { calls, totalExecTime, totalRows };
  }
  return {
    version: PGSS_BASELINE_VERSION,
    fetchedAt: input.fetchedAt,
    statsResetAt: input.statsResetAt,
    pgServerVersion: input.pgServerVersion,
    templates,
  };
}
/** Serializes `value` as 2-space-indented JSON with a trailing newline and writes it under `root`. */
async function writeJson(root: string, relPath: string, value: unknown): Promise<void> {
  const serialized = JSON.stringify(value, null, 2);
  await writeText(root, relPath, `${serialized}\n`);
}
/** Writes UTF-8 text to `root/relPath`, creating any missing parent directories first. */
async function writeText(root: string, relPath: string, value: string): Promise<void> {
  const destination = join(root, relPath);
  const parentDir = dirname(destination);
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, value, 'utf-8');
}

View file

@ -1,816 +0,0 @@
import { mkdtemp, readFile, readdir } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { describe, expect, it } from 'vitest';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
import { stageHistoricSqlTemplates } from './stage.js';
import {
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlUsageSchema,
type HistoricSqlQueryHistoryReader,
type HistoricSqlRawQueryRow,
} from './types.js';
/** Creates a fresh temporary directory for a single test and returns its path. */
async function tempDir(): Promise<string> {
  const prefix = join(tmpdir(), 'historic-sql-stage-');
  return mkdtemp(prefix);
}
/** Reads `root/relPath` as UTF-8 and returns the parsed JSON, typed as `T` without validation. */
async function readJson<T>(root: string, relPath: string): Promise<T> {
  const contents = await readFile(join(root, relPath), 'utf-8');
  return JSON.parse(contents) as T;
}
/**
 * Test double for a query-history reader: `probe` does nothing and `fetch`
 * yields the given rows in order.
 */
function fakeReader(rows: HistoricSqlRawQueryRow[]): HistoricSqlQueryHistoryReader {
  return {
    async probe() {},
    async *fetch() {
      yield* rows;
    },
  };
}
// Analysis stub with two canned results: SQL containing 'paid' maps to the
// paid-orders fingerprint, everything else to the refunds fingerprint.
const fakeSqlAnalysis: SqlAnalysisPort = {
  analyzeForFingerprint: async (sql) => {
    const isPaidOrders = sql.includes('paid');
    if (!isPaidOrders) {
      return {
        fingerprint: 'fp_refunds',
        normalizedSql: 'SELECT count(*) FROM analytics.refunds WHERE state = ?',
        tablesTouched: ['analytics.refunds'],
        literalSlots: [{ position: 1, type: 'string', exampleValue: 'complete' }],
      };
    }
    return {
      fingerprint: 'fp_paid_orders',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [
        { position: 1, type: 'string', exampleValue: 'paid' },
        { position: 2, type: 'date', exampleValue: '2026-04-01' },
      ],
    };
  },
  // Batch analysis is not exercised by these tests.
  analyzeBatch: async () => new Map(),
};
// Analysis stub that maps every statement to one fingerprint and reports the
// status literal as 'refunded' when the SQL contains it, otherwise 'paid'.
const categoricalSqlAnalysis: SqlAnalysisPort = {
  analyzeForFingerprint: async (sql) => {
    let status = 'paid';
    if (sql.includes("'refunded'")) {
      status = 'refunded';
    }
    return {
      fingerprint: 'fp_order_status',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue: status }],
    };
  },
  // Batch analysis is not exercised by these tests.
  analyzeBatch: async () => new Map(),
};
/**
 * Fixture: six successful query rows, three analysts each running the 'paid'
 * query and then the 'refunded' query, one minute apart starting at 10:00.
 * Runtimes step by 10 ms from 100; row counts are 11-13 (paid) and 21-23 (refunded).
 */
function categoricalRows(): HistoricSqlRawQueryRow[] {
  const analysts = ['analyst-a', 'analyst-b', 'analyst-c'];
  const statuses = ['paid', 'refunded'];
  const rows: HistoricSqlRawQueryRow[] = [];
  statuses.forEach((status, statusIndex) => {
    analysts.forEach((user, analystIndex) => {
      // Position of this row in the overall sequence (0..5).
      const sequence = statusIndex * analysts.length + analystIndex;
      rows.push({
        id: `${status}-${analystIndex + 1}`,
        sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
        user,
        startedAt: `2026-05-04T10:0${sequence}:00.000Z`,
        endedAt: null,
        runtimeMs: 100 + sequence * 10,
        rowsProduced: (statusIndex + 1) * 10 + analystIndex + 1,
        success: true,
        errorMessage: null,
      });
    });
  });
  return rows;
}
// Analysis stub that maps every statement to one fingerprint and echoes the
// quoted status literal found in the SQL ('unknown' when there is none).
const diverseSqlAnalysis: SqlAnalysisPort = {
  analyzeForFingerprint: async (sql) => {
    const statusMatch = sql.match(/status = '([^']+)'/);
    const exampleValue = statusMatch?.[1] ?? 'unknown';
    return {
      fingerprint: 'fp_diverse_samples',
      normalizedSql: 'SELECT count(*) FROM analytics.orders WHERE status = ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [{ position: 1, type: 'string', exampleValue }],
    };
  },
  // Batch analysis is not exercised by these tests.
  analyzeBatch: async () => new Map(),
};
// Analysis stub for the classification-matrix fixture. SQL mentioning
// 'stale_orders' gets a fixed single-slot result; everything else gets a
// five-slot result whose example values are extracted from the SQL text.
const classificationMatrixSqlAnalysis: SqlAnalysisPort = {
  analyzeForFingerprint: async (sql) => {
    if (sql.includes('stale_orders')) {
      return {
        fingerprint: 'fp_stale_date',
        normalizedSql: 'SELECT count(*) FROM analytics.stale_orders WHERE created_at >= ?',
        tablesTouched: ['analytics.stale_orders'],
        literalSlots: [{ position: 1, type: 'date', exampleValue: '2026-04-01' }],
      };
    }
    // Pulls the quoted value compared against `field`, or 'unknown' when absent.
    const quotedValue = (field: string): string => {
      const found = sql.match(new RegExp(`${field} = '([^']+)'`));
      return found?.[1] ?? 'unknown';
    };
    const amountMatch = sql.match(/amount >= (\d+)/);
    const asOfMatch = sql.match(/created_at >= '([^']+)'/);
    return {
      fingerprint: 'fp_classification_matrix',
      normalizedSql:
        'SELECT count(*) FROM analytics.orders WHERE region = ? AND plan = ? AND status = ? AND amount >= ? AND created_at >= ?',
      tablesTouched: ['analytics.orders'],
      literalSlots: [
        { position: 1, type: 'string', exampleValue: quotedValue('region') },
        { position: 2, type: 'string', exampleValue: quotedValue('plan') },
        { position: 3, type: 'string', exampleValue: quotedValue('status') },
        { position: 4, type: 'number', exampleValue: amountMatch?.[1] ?? '0' },
        { position: 5, type: 'date', exampleValue: asOfMatch?.[1] ?? '2026-05-01' },
      ],
    };
  },
  // Batch analysis is not exercised by these tests.
  analyzeBatch: async () => new Map(),
};
/**
 * Fixture: twenty generated order queries followed by one stale-orders query.
 *
 * Across the generated rows the region is always 'us', the status flips from
 * 'paid' to 'refunded' at index 10, only the last row uses plan 'self_serve',
 * the amount rises by one per row from 100, the date advances every five rows,
 * and users cycle through analyst-1..analyst-4.
 */
function classificationMatrixRows(): HistoricSqlRawQueryRow[] {
  const rows: HistoricSqlRawQueryRow[] = [];
  for (let index = 0; index < 20; index += 1) {
    const status = index < 10 ? 'paid' : 'refunded';
    const plan = index === 19 ? 'self_serve' : 'enterprise';
    const amount = 100 + index;
    const dayOfMonth = String(1 + Math.floor(index / 5)).padStart(2, '0');
    const asOf = `2026-05-${dayOfMonth}`;
    const minute = String(index).padStart(2, '0');
    rows.push({
      id: `matrix-${index + 1}`,
      sql: `SELECT count(*) FROM analytics.orders WHERE region = 'us' AND plan = '${plan}' AND status = '${status}' AND amount >= ${amount} AND created_at >= '${asOf}'`,
      user: `analyst-${(index % 4) + 1}`,
      startedAt: `2026-05-04T10:${minute}:00.000Z`,
      endedAt: null,
      runtimeMs: 100 + index,
      rowsProduced: 1,
      success: true,
      errorMessage: null,
    });
  }
  // Single row for a second template with only a date literal.
  rows.push({
    id: 'stale-date-1',
    sql: "SELECT count(*) FROM analytics.stale_orders WHERE created_at >= '2026-04-01'",
    user: 'analyst-1',
    startedAt: '2026-05-04T11:00:00.000Z',
    endedAt: null,
    runtimeMs: 75,
    rowsProduced: 1,
    success: true,
    errorMessage: null,
  });
  return rows;
}
describe('stageHistoricSqlTemplates', () => {
// Two executions that differ only in literal values are expected to be staged
// as a single template directory containing metadata.json, page.md and
// usage.json. NOTE(review): the fingerprint and slots come from
// `fakeSqlAnalysis`, which is defined outside this excerpt.
it('compresses rows by fingerprint into document-shaped staged templates', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'q1',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01' AND email = 'analyst@example.com'",
user: 'analyst@example.com',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: '2026-05-04T10:00:01.000Z',
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
{
id: 'q2',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-05-01' AND email = 'analyst-2@example.com'",
user: 'analyst-2@example.com',
startedAt: '2026-05-04T11:00:00.000Z',
endedAt: '2026-05-04T11:00:01.000Z',
runtimeMs: 300,
rowsProduced: 1,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: fakeSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: ['^svc_'],
// Email-shaped pattern; the sample assertions at the end rely on it.
redactionPatterns: ['[\\w.+-]+@[\\w-]+\\.[\\w.-]+'],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
// The cursor is expected to be the latest startedAt among the fetched rows.
expect(manifest).toMatchObject({
source: 'historic-sql',
connectionId: 'conn_1',
dialect: 'snowflake',
nextSuccessfulCursor: '2026-05-04T11:00:00.000Z',
templateCount: 1,
capped: false,
});
const files = (await readdir(join(stagedDir, 'templates', 'fp_paid_orders'))).sort();
expect(files).toEqual(['metadata.json', 'page.md', 'usage.json']);
const metadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, 'templates/fp_paid_orders/metadata.json'),
);
expect(metadata).toEqual({
id: 'fp_paid_orders',
title: 'snowflake · analytics.orders [fp_pai]',
path: 'templates/fp_paid_orders/page.md',
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: 'fp_paid_orders',
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [
{ position: 1, type: 'string', classification: 'constant' },
{ position: 2, type: 'date', classification: 'runtime' },
],
triage_signals: {
executions_bucket: 'low',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 1 runtime',
},
},
});
const page = await readFile(join(stagedDir, 'templates/fp_paid_orders/page.md'), 'utf-8');
expect(page).toContain('## Normalized SQL');
expect(page).toContain('SELECT count(*) FROM analytics.orders WHERE status = ? AND created_at >= ?');
expect(page).toContain('- analytics.orders');
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_paid_orders/usage.json'));
expect(usage.stats).toMatchObject({
executions: 2,
distinct_users: 2,
first_seen: '2026-05-04T10:00:00.000Z',
last_seen: '2026-05-04T11:00:00.000Z',
p50_runtime_ms: 100,
p95_runtime_ms: 300,
error_rate: 0,
});
// Persisted sample SQL must carry the redaction marker, never the raw emails.
expect(usage.samples).toHaveLength(1);
expect(usage.samples[0].bound_sql).toContain('<redacted>');
expect(usage.samples[0].bound_sql).not.toContain('analyst@example.com');
expect(usage.samples[0].bound_sql).not.toContain('analyst-2@example.com');
});
// A `SHOW TABLES` row must not produce a template, and with
// maxTemplatesPerRun = 1 only one of the two remaining templates is kept, with
// a truncation warning recorded in the manifest.
it('skips hard-noise SQL and caps templates deterministically', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'show-1',
sql: 'SHOW TABLES',
user: 'analyst',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: null,
success: true,
errorMessage: null,
},
{
id: 'q3',
sql: "SELECT count(*) FROM analytics.refunds WHERE state = 'complete'",
user: 'analyst',
startedAt: '2026-05-04T11:00:00.000Z',
endedAt: null,
runtimeMs: 50,
success: true,
errorMessage: null,
},
{
id: 'q4',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid' AND created_at >= '2026-04-01'",
user: 'analyst',
startedAt: '2026-05-04T11:30:00.000Z',
endedAt: null,
runtimeMs: 40,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: fakeSqlAnalysis,
pullConfig: {
dialect: 'bigquery',
windowDays: 7,
lastSuccessfulCursor: '2026-05-01T00:00:00.000Z',
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 1,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templateCount).toBe(1);
expect(manifest.capped).toBe(true);
// "of 2" shows the SHOW TABLES row was never counted as a template.
expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
expect(manifest.templates.map((template) => template.id)).toEqual(['fp_paid_orders']);
});
// One fingerprint whose literal slot is classified as categorical is expected
// to be staged as one template directory per observed value, each with its own
// usage statistics. NOTE(review): `categoricalRows` and
// `categoricalSqlAnalysis` are defined outside this excerpt.
it('splits categorical fingerprints into one document directory per dominant value', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(categoricalRows()),
sqlAnalysis: categoricalSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
// Sort by id so the comparison does not depend on manifest ordering.
const templates = manifest.templates
.map((template) => ({
id: template.id,
fingerprint: template.fingerprint,
subClusterId: template.subClusterId,
path: template.path,
}))
.sort((left, right) => left.id.localeCompare(right.id));
expect(manifest.templateCount).toBe(2);
expect(templates).toEqual([
{
id: 'fp_order_status__cat_2b2ff2318877',
fingerprint: 'fp_order_status',
subClusterId: 'cat_2b2ff2318877',
path: 'templates/fp_order_status__cat_2b2ff2318877/page.md',
},
{
id: 'fp_order_status__cat_34f037ddcbfa',
fingerprint: 'fp_order_status',
subClusterId: 'cat_34f037ddcbfa',
path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
},
]);
// cat_34f037ddcbfa is the 'paid' sub-cluster (see the top_values check below).
const paidMetadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/metadata.json'),
);
expect(paidMetadata).toMatchObject({
id: 'fp_order_status__cat_34f037ddcbfa',
title: 'snowflake · analytics.orders [fp_ord:ddcbfa]',
path: 'templates/fp_order_status__cat_34f037ddcbfa/page.md',
properties: {
fingerprint: 'fp_order_status',
sub_cluster_id: 'cat_34f037ddcbfa',
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'categorical' }],
},
});
const paidUsage = historicSqlUsageSchema.parse(
await readJson(stagedDir, 'templates/fp_order_status__cat_34f037ddcbfa/usage.json'),
);
expect(paidUsage.stats).toMatchObject({
executions: 3,
distinct_users: 3,
first_seen: '2026-05-04T10:00:00.000Z',
last_seen: '2026-05-04T10:02:00.000Z',
rows_produced: 36,
});
expect(paidUsage.literal_slots).toEqual([{ position: 1, distinct_values: 1, top_values: [['paid', 3]] }]);
// cat_2b2ff2318877 is the 'refunded' sub-cluster.
const refundedUsage = historicSqlUsageSchema.parse(
await readJson(stagedDir, 'templates/fp_order_status__cat_2b2ff2318877/usage.json'),
);
expect(refundedUsage.stats).toMatchObject({
executions: 3,
distinct_users: 3,
first_seen: '2026-05-04T10:03:00.000Z',
last_seen: '2026-05-04T10:05:00.000Z',
rows_produced: 66,
});
expect(refundedUsage.literal_slots).toEqual([
{ position: 1, distinct_values: 1, top_values: [['refunded', 3]] },
]);
});
// Exercises each slot classification with the rows from
// classificationMatrixRows(): a single-valued slot, a slot dominated by one
// value, an evenly split slot, a unique-per-row number, a date that advances
// with time, and a lone date literal older than the row that used it.
it('classifies literal slots across the spec matrix and stale-date demotion', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(classificationMatrixRows()),
sqlAnalysis: classificationMatrixSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
// The categorical status slot has two values, so two sub-clusters are expected.
const matrixTemplates = manifest.templates.filter((template) => template.fingerprint === 'fp_classification_matrix');
expect(matrixTemplates).toHaveLength(2);
expect(matrixTemplates.every((template) => template.subClusterId?.startsWith('cat_'))).toBe(true);
const matrixTemplate = matrixTemplates[0];
if (!matrixTemplate) {
throw new Error('expected classification matrix template');
}
// metadata.json sits next to page.md in the same template directory.
const matrixMetadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, matrixTemplate.path.replace('/page.md', '/metadata.json')),
);
expect(matrixMetadata.properties.literal_slots).toMatchInlineSnapshot(`
[
{
"classification": "constant",
"position": 1,
"type": "string",
},
{
"classification": "constant",
"position": 2,
"type": "string",
},
{
"classification": "categorical",
"position": 3,
"type": "string",
},
{
"classification": "runtime",
"position": 4,
"type": "number",
},
{
"classification": "runtime",
"position": 5,
"type": "date",
},
]
`);
// The summary string only counts constant and runtime slots.
expect(matrixMetadata.properties.triage_signals.slot_summary).toBe('2 constant, 2 runtime');
const staleMetadata = historicSqlMetadataSchema.parse(
await readJson(stagedDir, 'templates/fp_stale_date/metadata.json'),
);
// A single-valued date slot would normally be constant; here it is expected
// to be runtime because the literal predates the first execution.
expect(staleMetadata.properties.literal_slots).toMatchInlineSnapshot(`
[
{
"classification": "runtime",
"position": 1,
"type": "date",
},
]
`);
expect(staleMetadata.properties.triage_signals.slot_summary).toBe('0 constant, 1 runtime');
});
// With maxTemplatesPerRun = 1 and a fingerprint that expands into two
// categorical sub-clusters, the cap is expected to count sub-clusters
// ("kept 1 of 2"), not fingerprints.
it('applies the templates-per-run cap after categorical expansion', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(categoricalRows()),
sqlAnalysis: categoricalSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 1,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templateCount).toBe(1);
expect(manifest.capped).toBe(true);
expect(manifest.warnings).toEqual(['templates_truncated: kept 1 of 2 templates']);
expect(manifest.templates).toHaveLength(1);
// Only the id prefix is asserted; which sub-cluster survives is not pinned here.
expect(manifest.templates[0].id).toMatch(/^fp_order_status__cat_/);
});
// The reader row below has no `rowsProduced` field at all; the staged usage
// file is then expected to omit `rows_produced` from both the stats and the
// sample rather than writing a placeholder value.
it('omits rows_produced for BigQuery templates when reader rows have no row counts', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_bq',
queryClient: {},
reader: fakeReader([
{
id: 'bq-1',
sql: "SELECT count(*) FROM analytics.orders WHERE status = 'paid'",
user: 'analyst-a@example.com',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: fakeSqlAnalysis,
pullConfig: {
dialect: 'bigquery',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_paid_orders/usage.json'));
expect(usage.stats).not.toHaveProperty('rows_produced');
expect(usage.samples[0]).not.toHaveProperty('rows_produced');
});
// Eleven status values, each with an older failed execution and a newer
// successful one. The usage file is expected to keep five samples, all
// successful, one per distinct status, ordered newest first.
it('keeps at most five diverse samples, preferring recent successful representatives per literal tuple', async () => {
const stagedDir = await tempDir();
const statuses = [
'paid',
'refunded',
'pending',
'failed',
'trial',
'cancelled',
'draft',
'returned',
'review',
'held',
'archived',
];
// "old" rows start at 10:MM and fail; "new" rows start at 11:MM and succeed,
// where MM is the status's index in the list above.
const rows: HistoricSqlRawQueryRow[] = statuses.flatMap((status, index) => [
{
id: `${status}-old`,
sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
user: 'analyst-a',
startedAt: `2026-05-04T10:${String(index).padStart(2, '0')}:00.000Z`,
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: false,
errorMessage: 'old failed sample',
},
{
id: `${status}-new`,
sql: `SELECT count(*) FROM analytics.orders WHERE status = '${status}'`,
user: 'analyst-a',
startedAt: `2026-05-04T11:${String(index).padStart(2, '0')}:00.000Z`,
endedAt: null,
runtimeMs: 90,
rowsProduced: 2,
success: true,
errorMessage: null,
},
]);
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader(rows),
sqlAnalysis: diverseSqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_diverse_samples/usage.json'));
expect(usage.samples).toHaveLength(5);
expect(usage.samples.every((sample) => sample.success)).toBe(true);
expect(new Set(usage.samples.map((sample) => sample.bound_sql.match(/status = '([^']+)'/)?.[1])).size).toBe(5);
// 11:10 down to 11:06 are the successful rows of the last five statuses.
expect(usage.samples.map((sample) => sample.started_at)).toEqual([
'2026-05-04T11:10:00.000Z',
'2026-05-04T11:09:00.000Z',
'2026-05-04T11:08:00.000Z',
'2026-05-04T11:07:00.000Z',
'2026-05-04T11:06:00.000Z',
]);
});
// Two templates with one execution each and otherwise identical rows, differing
// only in startedAt. With maxTemplatesPerRun = 1 the more recently seen
// template is the one expected to survive.
it('uses recency as a tie-breaker when the templates-per-run cap overflows', async () => {
const stagedDir = await tempDir();
// Local fake: the fingerprint is derived from which table the SQL mentions.
const sqlAnalysis: SqlAnalysisPort = {
async analyzeForFingerprint(sql) {
const table = sql.includes('fresh_orders') ? 'fresh_orders' : 'stale_orders';
return {
fingerprint: `fp_${table}`,
normalizedSql: `SELECT count(*) FROM analytics.${table}`,
tablesTouched: [`analytics.${table}`],
literalSlots: [],
};
},
async analyzeBatch() {
return new Map();
},
};
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'stale-1',
sql: 'SELECT count(*) FROM analytics.stale_orders',
user: 'analyst-a',
startedAt: '2026-02-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
{
id: 'fresh-1',
sql: 'SELECT count(*) FROM analytics.fresh_orders',
user: 'analyst-a',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
]),
sqlAnalysis,
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
redactionPatterns: [],
maxTemplatesPerRun: 1,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
expect(manifest.templates.map((template) => template.id)).toEqual(['fp_fresh_orders']);
});
// When a configured redaction pattern cannot be compiled, the run is expected
// to record a `redaction_skipped:invalid_redaction_pattern` warning and write
// no samples at all, so unredacted SQL is never persisted.
it('does not persist bound SQL samples when redaction patterns are invalid', async () => {
const stagedDir = await tempDir();
await stageHistoricSqlTemplates({
stagedDir,
connectionId: 'conn_1',
queryClient: {},
reader: fakeReader([
{
id: 'q1',
sql: "SELECT * FROM analytics.orders WHERE email = 'analyst@example.com'",
user: 'analyst@example.com',
startedAt: '2026-05-04T10:00:00.000Z',
endedAt: null,
runtimeMs: 100,
rowsProduced: 1,
success: true,
errorMessage: null,
},
]),
sqlAnalysis: {
async analyzeForFingerprint() {
return {
fingerprint: 'fp_redaction',
normalizedSql: 'SELECT * FROM analytics.orders WHERE email = ?',
tablesTouched: ['analytics.orders'],
literalSlots: [{ position: 1, type: 'string', exampleValue: 'analyst@example.com' }],
};
},
async analyzeBatch() {
return new Map();
},
},
pullConfig: {
dialect: 'snowflake',
windowDays: 90,
lastSuccessfulCursor: null,
serviceAccountUserPatterns: [],
// '[' is an unterminated character class, i.e. not a valid regular expression.
redactionPatterns: ['['],
maxTemplatesPerRun: 5000,
minCalls: 5,
},
now: new Date('2026-05-04T12:00:00.000Z'),
});
const manifest = historicSqlManifestSchema.parse(await readJson(stagedDir, 'manifest.json'));
const usage = historicSqlUsageSchema.parse(await readJson(stagedDir, 'templates/fp_redaction/usage.json'));
expect(manifest.warnings.some((warning) => warning.startsWith('redaction_skipped:invalid_redaction_pattern'))).toBe(
true,
);
expect(usage.samples).toEqual([]);
});
});

View file

@ -1,630 +0,0 @@
import { createHash } from 'node:crypto';
import { mkdir, writeFile } from 'node:fs/promises';
import { dirname, join } from 'node:path';
import type {
SqlAnalysisFingerprintResult,
SqlAnalysisLiteralSlot,
SqlAnalysisLiteralSlotType,
SqlAnalysisPort,
} from '../../../sql-analysis/index.js';
import {
HISTORIC_SQL_OBJECT_TYPE,
HISTORIC_SQL_SOURCE_KEY,
historicSqlPullConfigSchema,
historicSqlRawQueryRowSchema,
type HistoricSqlLiteralSlotClassification,
type HistoricSqlManifest,
type HistoricSqlMetadata,
type HistoricSqlPullConfig,
type HistoricSqlQueryHistoryReader,
type HistoricSqlRawQueryRow,
type HistoricSqlUsage,
} from './types.js';
/** Arguments accepted by `stageHistoricSqlTemplates`. */
interface StageHistoricSqlTemplatesInput {
// Directory that receives manifest.json and the templates/ tree.
stagedDir: string;
// Copied verbatim into the manifest.
connectionId: string;
// Opaque handle forwarded to `reader.probe` and `reader.fetch`.
queryClient: unknown;
reader: HistoricSqlQueryHistoryReader;
sqlAnalysis: SqlAnalysisPort;
// Validated with `historicSqlPullConfigSchema` before use.
pullConfig: HistoricSqlPullConfig;
// End of the fetch window; defaults to the current time when omitted.
now?: Date;
}
/** One observed literal value together with the start time of the row it came from. */
interface SlotObservation {
// Literal value after redaction has been applied.
value: string;
rowStartedAt: string;
}
/** Accumulated observations for a single literal slot position. */
interface SlotStats {
position: number;
// Taken from the first slot recorded at this position.
type: SqlAnalysisLiteralSlotType;
// Redacted literal value -> number of times it was observed.
values: Map<string, number>;
// One entry per recorded slot occurrence, in recording order.
observations: SlotObservation[];
}
/** Everything collected for one fingerprint while streaming rows from the reader. */
interface TemplateAccumulator {
fingerprint: string;
// Normalized SQL of the first row that produced this fingerprint.
normalizedSql: string;
// Union of the tables reported by every row in the group.
tablesTouched: Set<string>;
rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>;
// Keyed by slot position.
slotStats: Map<number, SlotStats>;
}
/** A literal slot together with the classification assigned by `classifySlot`. */
interface ClassifiedLiteralSlot {
position: number;
type: SqlAnalysisLiteralSlotType;
classification: HistoricSqlLiteralSlotClassification;
}
/**
 * A stageable template: either a whole fingerprint group, or one categorical
 * sub-cluster of it.
 */
interface TemplateVariant {
// The fingerprint itself, or `${fingerprint}__${subClusterId}` for a sub-cluster.
id: string;
fingerprint: string;
// null when the group has no categorical slots and was not split.
subClusterId: string | null;
normalizedSql: string;
tablesTouched: Set<string>;
rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>;
// Keyed by slot position; recomputed from this variant's rows for sub-clusters.
slotStats: Map<number, SlotStats>;
// Computed once for the whole fingerprint group and shared by its sub-clusters.
slotClassifications: ClassifiedLiteralSlot[];
}
/** Value of one categorical slot within a row's categorical tuple. */
interface CategoricalTupleEntry {
position: number;
// Redacted literal value, or '<missing>' when the row has no slot at this position.
value: string;
}
/** Redaction settings produced by `compileRedactors`. */
interface RedactionPolicy {
// Patterns passed to `redactText` for slot values and sample SQL.
redactors: RegExp[];
// When false, `selectSamples` returns no samples.
samplesAllowed: boolean;
}
// Statements that begin (after optional whitespace) with one of these keywords
// are skipped; matching is case-insensitive.
const HARD_SKIP_PREFIX_RE = /^\s*(SHOW|DESCRIBE|DESC|EXPLAIN|USE|SET)\b/i;
// Statements that mention any of these names anywhere in the text are skipped;
// `pg_` and `system.` only match at a word boundary.
const HARD_SKIP_TABLE_RE = /\b(INFORMATION_SCHEMA|SNOWFLAKE\.ACCOUNT_USAGE|pg_|system\.)/i;
/**
 * Reads query history through `input.reader`, groups the rows by SQL
 * fingerprint, and writes one directory per selected template (metadata.json,
 * page.md, usage.json) plus a manifest.json into `input.stagedDir`.
 *
 * @param input - Reader, analysis port, pull configuration and target directory.
 * @returns Resolves once the manifest has been written.
 * @throws Whatever the schema parsers, the reader, the analysis port or the
 *   file-system helpers throw; nothing is caught here.
 */
export async function stageHistoricSqlTemplates(input: StageHistoricSqlTemplatesInput): Promise<void> {
const config = historicSqlPullConfigSchema.parse(input.pullConfig);
const now = input.now ?? new Date();
// Resume from the stored cursor when there is one; otherwise look back windowDays.
const windowStart = config.lastSuccessfulCursor
? new Date(config.lastSuccessfulCursor)
: new Date(now.getTime() - config.windowDays * 24 * 60 * 60 * 1000);
const warnings: string[] = [];
const redaction = compileRedactors(config.redactionPatterns, warnings);
const groups = new Map<string, TemplateAccumulator>();
let nextSuccessfulCursor: string | null = null;
// The reader is probed before any rows are fetched.
await input.reader.probe(input.queryClient);
for await (const rawRow of input.reader.fetch(
input.queryClient,
{ start: windowStart, end: now },
config.lastSuccessfulCursor,
)) {
const row = historicSqlRawQueryRowSchema.parse(rawRow);
// The cursor advances for every row seen, including rows skipped below.
if (!nextSuccessfulCursor || row.startedAt > nextSuccessfulCursor) {
nextSuccessfulCursor = row.startedAt;
}
if (shouldSkipSql(row.sql)) {
continue;
}
const analysis = await input.sqlAnalysis.analyzeForFingerprint(row.sql, config.dialect);
// Rows that cannot be fingerprinted are dropped and reported as warnings.
if (analysis.error || !analysis.fingerprint || !analysis.normalizedSql) {
warnings.push(`analysis_failed:${row.id}`);
continue;
}
const group =
groups.get(analysis.fingerprint) ??
{
fingerprint: analysis.fingerprint,
normalizedSql: analysis.normalizedSql,
tablesTouched: new Set<string>(),
rows: [],
slotStats: new Map<number, SlotStats>(),
};
for (const table of analysis.tablesTouched) {
group.tablesTouched.add(table);
}
for (const slot of analysis.literalSlots) {
recordSlot(group.slotStats, slot, redaction.redactors, row.startedAt);
}
group.rows.push({ row, analysis });
groups.set(analysis.fingerprint, group);
}
// Categorical splitting happens before the cap, so the cap counts sub-clusters.
const expandedTemplates = expandCategoricalTemplates([...groups.values()], redaction.redactors);
const selected = selectTemplates(expandedTemplates, config.maxTemplatesPerRun, now);
if (selected.length < expandedTemplates.length) {
warnings.push(`templates_truncated: kept ${selected.length} of ${expandedTemplates.length} templates`);
}
await mkdir(input.stagedDir, { recursive: true });
const templates: HistoricSqlManifest['templates'] = [];
for (const template of selected) {
const staged = buildStagedTemplate(template, config, redaction, now);
const basePath = `templates/${staged.metadata.id}`;
await writeJson(input.stagedDir, `${basePath}/metadata.json`, staged.metadata);
await writeText(input.stagedDir, `${basePath}/page.md`, staged.pageMarkdown);
await writeJson(input.stagedDir, `${basePath}/usage.json`, staged.usage);
templates.push({
id: staged.metadata.id,
fingerprint: staged.metadata.properties.fingerprint,
subClusterId: staged.metadata.properties.sub_cluster_id,
path: staged.metadata.path,
});
}
// The manifest is written last, after every template directory.
await writeJson(input.stagedDir, 'manifest.json', {
source: HISTORIC_SQL_SOURCE_KEY,
connectionId: input.connectionId,
dialect: config.dialect,
fetchedAt: now.toISOString(),
windowStart: windowStart.toISOString(),
windowEnd: now.toISOString(),
nextSuccessfulCursor,
templateCount: selected.length,
capped: selected.length < expandedTemplates.length,
warnings,
// The remaining manifest fields are always written with these fixed values here.
degraded: false,
statsResetAt: null,
baselineFirstRun: false,
pgServerVersion: null,
deallocCount: null,
templates,
} satisfies HistoricSqlManifest);
}
/**
 * Reports whether a statement should be excluded from templating, either
 * because of its leading keyword or because of a name it references.
 */
function shouldSkipSql(sql: string): boolean {
  if (HARD_SKIP_PREFIX_RE.test(sql)) {
    return true;
  }
  return HARD_SKIP_TABLE_RE.test(sql);
}
/**
 * Adds one literal-slot occurrence to `slotStats`, creating the entry for the
 * slot's position on first sight. Only the redacted value is stored.
 *
 * @param slotStats - Per-position statistics, mutated in place.
 * @param slot - Slot reported by the SQL analysis for one row.
 * @param redactors - Patterns applied to the example value before it is stored.
 * @param rowStartedAt - Start time of the row the slot came from.
 */
function recordSlot(
  slotStats: Map<number, SlotStats>,
  slot: SqlAnalysisLiteralSlot,
  redactors: RegExp[],
  rowStartedAt: string,
): void {
  const { position, type } = slot;
  const storedValue = redactText(slot.exampleValue, redactors);

  let stats = slotStats.get(position);
  if (stats === undefined) {
    stats = { position, type, values: new Map<string, number>(), observations: [] };
    slotStats.set(position, stats);
  }

  const seenBefore = stats.values.get(storedValue) ?? 0;
  stats.values.set(storedValue, seenBefore + 1);
  stats.observations.push({ value: storedValue, rowStartedAt });
}
/**
 * Expands every fingerprint group into its stageable variants and returns them
 * as one flat list, keeping the order of `groups`.
 */
function expandCategoricalTemplates(groups: TemplateAccumulator[], redactors: RegExp[]): TemplateVariant[] {
  const variants: TemplateVariant[] = [];
  for (const group of groups) {
    variants.push(...expandTemplateGroup(group, redactors));
  }
  return variants;
}
/**
 * Turns one fingerprint group into stageable variants.
 *
 * Slots are classified once for the whole group. If none is categorical the
 * group becomes a single variant with a null sub-cluster id. Otherwise rows are
 * bucketed by the values of their categorical slots and each bucket becomes its
 * own variant, with slot statistics recomputed from that bucket's rows.
 *
 * @returns Variants sorted by id; an empty array when the group has no rows.
 */
function expandTemplateGroup(group: TemplateAccumulator, redactors: RegExp[]): TemplateVariant[] {
  const chronological = group.rows.slice();
  chronological.sort((a, b) => a.row.startedAt.localeCompare(b.row.startedAt));

  const earliest = chronological[0]?.row.startedAt;
  if (!earliest) {
    return [];
  }

  const slotClassifications = classifySlots(group.slotStats, chronological.length, earliest);
  const categoricalPositions: number[] = [];
  for (const slot of slotClassifications) {
    if (slot.classification === 'categorical') {
      categoricalPositions.push(slot.position);
    }
  }
  categoricalPositions.sort((a, b) => a - b);

  const { fingerprint, normalizedSql, tablesTouched } = group;
  if (categoricalPositions.length === 0) {
    const whole: TemplateVariant = {
      id: fingerprint,
      fingerprint,
      subClusterId: null,
      normalizedSql,
      tablesTouched,
      rows: chronological,
      slotStats: group.slotStats,
      slotClassifications,
    };
    return [whole];
  }

  // Bucket rows by the JSON form of their categorical tuple.
  const buckets = new Map<string, { tuple: CategoricalTupleEntry[]; rows: TemplateVariant['rows'] }>();
  for (const entry of chronological) {
    const tuple = categoricalTuple(entry.analysis.literalSlots, categoricalPositions, redactors);
    const bucketKey = JSON.stringify(tuple);
    let bucket = buckets.get(bucketKey);
    if (!bucket) {
      bucket = { tuple, rows: [] };
      buckets.set(bucketKey, bucket);
    }
    bucket.rows.push(entry);
  }

  const variants: TemplateVariant[] = [];
  for (const bucket of buckets.values()) {
    const subClusterId = subClusterIdForTuple(bucket.tuple);
    variants.push({
      id: `${fingerprint}__${subClusterId}`,
      fingerprint,
      subClusterId,
      normalizedSql,
      tablesTouched,
      rows: bucket.rows,
      slotStats: collectSlotStats(bucket.rows, redactors),
      slotClassifications,
    });
  }
  variants.sort((a, b) => a.id.localeCompare(b.id));
  return variants;
}
/**
 * Classifies every slot in `slotStats` and returns the results ordered by slot
 * position.
 *
 * @param executions - Number of rows in the group the slots belong to.
 * @param firstSeen - Start time of the group's earliest row.
 */
function classifySlots(
  slotStats: Map<number, SlotStats>,
  executions: number,
  firstSeen: string,
): ClassifiedLiteralSlot[] {
  const byPosition = Array.from(slotStats.values());
  byPosition.sort((a, b) => a.position - b.position);

  const classified: ClassifiedLiteralSlot[] = [];
  for (const stats of byPosition) {
    classified.push({
      position: stats.position,
      type: stats.type,
      classification: classifySlot(stats, executions, firstSeen),
    });
  }
  return classified;
}
/**
 * Builds fresh per-position slot statistics from the given rows by recording
 * every literal slot of every row, in row order.
 */
function collectSlotStats(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redactors: RegExp[],
): Map<number, SlotStats> {
  const collected = new Map<number, SlotStats>();
  rows.forEach(({ row, analysis }) => {
    analysis.literalSlots.forEach((slot) => {
      recordSlot(collected, slot, redactors, row.startedAt);
    });
  });
  return collected;
}
/**
 * Extracts the redacted values of the categorical slots from one row's literal
 * slots, in the order given by `categoricalPositions`. A position the row does
 * not have yields the placeholder '<missing>'.
 */
function categoricalTuple(
  literalSlots: SqlAnalysisLiteralSlot[],
  categoricalPositions: number[],
  redactors: RegExp[],
): CategoricalTupleEntry[] {
  const redactedByPosition = new Map<number, string>();
  for (const slot of literalSlots) {
    redactedByPosition.set(slot.position, redactText(slot.exampleValue, redactors));
  }

  const tuple: CategoricalTupleEntry[] = [];
  for (const position of categoricalPositions) {
    // Property order (position, value) is kept because the tuple is serialised by callers.
    tuple.push({ position, value: redactedByPosition.get(position) ?? '<missing>' });
  }
  return tuple;
}
/**
 * Derives a sub-cluster id from a categorical tuple: `cat_` followed by the
 * first 12 hex characters of the SHA-256 of the tuple's JSON form.
 */
function subClusterIdForTuple(tuple: CategoricalTupleEntry[]): string {
  const serialised = JSON.stringify(tuple);
  const digest = createHash('sha256').update(serialised).digest('hex');
  return `cat_${digest.slice(0, 12)}`;
}
/**
 * Produces the three staged artefacts for one template variant: the metadata
 * document, the markdown page, and the usage statistics.
 *
 * @param template - Variant to stage; its rows must be non-empty.
 * @param config - Parsed pull configuration (dialect, service-account patterns).
 * @param redaction - Redaction policy applied when selecting samples.
 * @param now - Reference time used for the recency signal.
 */
function buildStagedTemplate(
  template: TemplateVariant,
  config: HistoricSqlPullConfig,
  redaction: RedactionPolicy,
  now: Date,
): { metadata: HistoricSqlMetadata; pageMarkdown: string; usage: HistoricSqlUsage } {
  const chronological = template.rows.map(({ row }) => row);
  chronological.sort((a, b) => a.startedAt.localeCompare(b.startedAt));
  const executions = chronological.length;
  const firstSeen = chronological[0].startedAt;
  const lastSeen = chronological[executions - 1].startedAt;

  // Single pass over the rows for users, runtimes and failures.
  const users = new Set<string>();
  const runtimes: number[] = [];
  let failures = 0;
  for (const row of chronological) {
    if (row.user) {
      users.add(row.user);
    }
    if (typeof row.runtimeMs === 'number') {
      runtimes.push(row.runtimeMs);
    }
    if (!row.success) {
      failures += 1;
    }
  }
  runtimes.sort((a, b) => a - b);
  const distinctUsers = users.size;
  const errorRate = executions === 0 ? 0 : failures / executions;

  const triageSignals = buildTriageSignals({
    executions,
    distinctUsers,
    errorRate,
    lastSeen,
    now,
    serviceAccountOnly: isServiceAccountOnly(chronological, config.serviceAccountUserPatterns),
    slotClassifications: template.slotClassifications.map(({ classification }) => classification),
  });

  const tablesTouched = Array.from(template.tablesTouched).sort();
  const titleTable = tablesTouched[0] ?? 'query';
  const { id, fingerprint, subClusterId } = template;
  const rowsProduced = sumRowsProduced(chronological);

  const metadata: HistoricSqlMetadata = {
    id,
    title: buildTemplateTitle(config.dialect, titleTable, fingerprint, subClusterId),
    path: `templates/${id}/page.md`,
    objectType: HISTORIC_SQL_OBJECT_TYPE,
    lastEditedAt: null,
    properties: {
      fingerprint,
      sub_cluster_id: subClusterId,
      dialect: config.dialect,
      tables_touched: tablesTouched,
      literal_slots: template.slotClassifications,
      triage_signals: triageSignals,
    },
  };

  const slotsByPosition = Array.from(template.slotStats.values());
  slotsByPosition.sort((a, b) => a.position - b.position);
  const literalSlotUsage = slotsByPosition.map((slot) => {
    // Most frequent values first; ties broken by value so the output is stable.
    const ranked = Array.from(slot.values.entries());
    ranked.sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
    return {
      position: slot.position,
      distinct_values: slot.values.size,
      top_values: ranked.slice(0, 10),
    };
  });

  const stats = {
    executions,
    distinct_users: distinctUsers,
    first_seen: firstSeen,
    last_seen: lastSeen,
    p50_runtime_ms: percentile(runtimes, 0.5),
    p95_runtime_ms: percentile(runtimes, 0.95),
    error_rate: errorRate,
    // rows_produced is left out entirely when no total is available.
    ...(rowsProduced === null ? {} : { rows_produced: rowsProduced }),
  };

  return {
    metadata,
    pageMarkdown: renderTemplatePage(id, template.normalizedSql, tablesTouched),
    usage: {
      stats,
      literal_slots: literalSlotUsage,
      samples: selectSamples(template.rows, redaction),
    },
  };
}
// Slot types that `isMovingTemporalSlot` treats as temporal.
const TEMPORAL_SLOT_TYPES = new Set<SqlAnalysisLiteralSlotType>(['date', 'timestamp']);
/**
 * Reports whether `value` is a parseable literal of a `date` slot that sorts
 * before the calendar-date prefix (first 10 characters) of `firstSeen`.
 * The comparison is a plain string comparison.
 */
function isStaleDateConstant(slot: SlotStats, value: string, firstSeen: string): boolean {
  if (slot.type !== 'date') {
    return false;
  }
  if (parseTemporalSlotValue(value) === null) {
    return false;
  }
  const firstSeenDay = firstSeen.slice(0, 10);
  return value < firstSeenDay;
}
/**
 * Reports whether a temporal slot's literal values move consistently in one
 * direction as the rows that carry them get newer.
 *
 * Returns false for non-temporal slots, for slots with fewer than two distinct
 * values, and as soon as any row time or literal fails to parse.
 */
function isMovingTemporalSlot(slot: SlotStats): boolean {
  if (!TEMPORAL_SLOT_TYPES.has(slot.type)) {
    return false;
  }
  if (slot.values.size < 2) {
    return false;
  }

  // Pairs of [row start time, literal time], both in epoch milliseconds.
  const timeline: Array<[number, number]> = [];
  for (const { rowStartedAt, value } of slot.observations) {
    const seenAt = Date.parse(rowStartedAt);
    const literalAt = parseTemporalSlotValue(value);
    if (Number.isNaN(seenAt) || literalAt === null) {
      return false;
    }
    timeline.push([seenAt, literalAt]);
  }

  timeline.sort((a, b) => a[0] - b[0]);
  return isMonotonic(timeline.map(([, literalAt]) => literalAt));
}
/**
 * Parses a literal with `Date.parse`.
 *
 * @returns Epoch milliseconds, or null when the value is not a parseable date.
 */
function parseTemporalSlotValue(value: string): number | null {
  const epochMs = Date.parse(value);
  if (Number.isNaN(epochMs)) {
    return null;
  }
  return epochMs;
}
/**
 * Reports whether a sequence never rises or never falls (equal neighbours are
 * allowed). Sequences with fewer than two values are not considered monotonic.
 */
function isMonotonic(values: number[]): boolean {
  if (values.length < 2) {
    return false;
  }
  let sawRise = false;
  let sawFall = false;
  let previous = values[0];
  for (const current of values.slice(1)) {
    if (current > previous) {
      sawRise = true;
    }
    if (current < previous) {
      sawFall = true;
    }
    previous = current;
  }
  // Monotonic unless the sequence moved in both directions.
  return !(sawRise && sawFall);
}
/**
 * Classifies one literal slot as 'constant', 'categorical' or 'runtime'.
 *
 * Rules, applied in order:
 * 1. Unless the most frequent value is a stale date, a slot with a single
 *    distinct value, or whose top value covers at least 95% of executions, is
 *    'constant'.
 * 2. A temporal slot whose values move in one direction over time is 'runtime'.
 * 3. A slot with 2 to 10 distinct values, each covering at least 5% of
 *    executions, is 'categorical'.
 * 4. Anything else is 'runtime'.
 *
 * @param executions - Number of rows in the group the slot belongs to.
 * @param firstSeen - Start time of the group's earliest row.
 */
function classifySlot(
  slot: SlotStats,
  executions: number,
  firstSeen: string,
): HistoricSqlLiteralSlotClassification {
  const byFrequency = Array.from(slot.values.entries());
  byFrequency.sort((a, b) => b[1] - a[1]);

  const distinct = byFrequency.length;
  const top = byFrequency[0];
  const topValue = top ? top[0] : '';
  const topCount = top ? top[1] : 0;
  const hasExecutions = executions > 0;

  if (!isStaleDateConstant(slot, topValue, firstSeen)) {
    const singleValued = distinct === 1;
    const dominated = hasExecutions && topCount / executions >= 0.95;
    if (singleValued || dominated) {
      return 'constant';
    }
  }

  if (isMovingTemporalSlot(slot)) {
    return 'runtime';
  }

  const smallDomain = distinct >= 2 && distinct <= 10;
  const isCategorical =
    hasExecutions && smallDomain && byFrequency.every(([, count]) => count / executions >= 0.05);
  return isCategorical ? 'categorical' : 'runtime';
}
/**
 * Buckets a template's raw statistics into coarse string labels.
 *
 * The slot summary counts only 'constant' and 'runtime' classifications;
 * categorical slots are not mentioned in it.
 */
function buildTriageSignals(input: {
  executions: number;
  distinctUsers: number;
  errorRate: number;
  lastSeen: string;
  now: Date;
  serviceAccountOnly: boolean;
  slotClassifications: HistoricSqlLiteralSlotClassification[];
}): Record<string, string> {
  const { executions, distinctUsers, errorRate } = input;

  let constantCount = 0;
  let runtimeCount = 0;
  for (const classification of input.slotClassifications) {
    if (classification === 'constant') {
      constantCount += 1;
    } else if (classification === 'runtime') {
      runtimeCount += 1;
    }
  }

  let executionsBucket = 'high';
  if (executions < 3) {
    executionsBucket = 'low';
  } else if (executions < 50) {
    executionsBucket = 'mid';
  }

  let usersBucket = 'broad';
  if (distinctUsers <= 1) {
    usersBucket = 'solo';
  } else if (distinctUsers <= 5) {
    usersBucket = 'team';
  }

  let errorBucket = 'broken';
  if (errorRate <= 0.01) {
    errorBucket = 'ok';
  } else if (errorRate <= 0.1) {
    errorBucket = 'noisy';
  }

  return {
    executions_bucket: executionsBucket,
    distinct_users_bucket: usersBucket,
    error_rate_bucket: errorBucket,
    recency_bucket: recencyBucket(input.lastSeen, input.now),
    service_account_only: String(input.serviceAccountOnly),
    slot_summary: `${constantCount} constant, ${runtimeCount} runtime`,
  };
}
/**
 * Labels how recently a template was last seen relative to `now`:
 * 'active' up to 14 days, 'warm' up to 60 days, otherwise 'cold'.
 * A `lastSeen` in the future counts as zero days old.
 */
function recencyBucket(lastSeen: string, now: Date): string {
  const elapsedMs = now.getTime() - new Date(lastSeen).getTime();
  const ageDays = Math.max(0, elapsedMs / (24 * 60 * 60 * 1000));
  return ageDays <= 14 ? 'active' : ageDays <= 60 ? 'warm' : 'cold';
}
/**
 * Reports whether every named user among `rows` matches at least one of the
 * given regular-expression patterns. Rows without a user are ignored; the
 * result is false when there are no named users or no patterns.
 *
 * @throws SyntaxError if a pattern is not a valid regular expression.
 */
function isServiceAccountOnly(rows: HistoricSqlRawQueryRow[], patterns: string[]): boolean {
  const namedUsers: string[] = [];
  for (const row of rows) {
    if (row.user) {
      namedUsers.push(row.user);
    }
  }
  if (namedUsers.length === 0 || patterns.length === 0) {
    return false;
  }

  const matchers = patterns.map((pattern) => new RegExp(pattern));
  for (const user of namedUsers) {
    if (!matchers.some((matcher) => matcher.test(user))) {
      return false;
    }
  }
  return true;
}
/**
 * Builds the display title for a template.
 *
 * The title has the form `<dialect> · <firstTable> [<tag>]`, where the tag is the
 * first six characters of the fingerprint, followed by `:` and the last six
 * characters of the sub-cluster id when one is present (null or empty ids are
 * treated as absent).
 */
function buildTemplateTitle(
  dialect: HistoricSqlPullConfig['dialect'],
  firstTable: string,
  fingerprint: string,
  subClusterId: string | null,
): string {
  const fingerprintPrefix = fingerprint.slice(0, 6);
  const tag = subClusterId ? `${fingerprintPrefix}:${subClusterId.slice(-6)}` : fingerprintPrefix;
  return `${dialect} · ${firstTable} [${tag}]`;
}
/**
 * Renders the Markdown page for a template.
 *
 * The page has a heading with the fingerprint, a fenced `sql` block holding the
 * normalized SQL, and a bullet list of the touched tables. The output always
 * ends with a single trailing newline.
 */
function renderTemplatePage(fingerprint: string, normalizedSql: string, tablesTouched: string[]): string {
  const lines: string[] = [
    `# ${fingerprint}`,
    '',
    '## Normalized SQL',
    '```sql',
    normalizedSql,
    '```',
    '',
    '## Tables touched',
  ];
  for (const table of tablesTouched) {
    lines.push(`- ${table}`);
  }
  return `${lines.join('\n')}\n`;
}
/**
 * Picks up to five representative executions of a template for display.
 *
 * Steps:
 * 1. Return nothing when the redaction policy disallows samples.
 * 2. Rank candidates with successful executions first, newest first within each group.
 * 3. Keep the first-ranked candidate for each distinct tuple of literal example
 *    values (slots ordered by position).
 * 4. Order the survivors newest first, keep five, and redact their SQL text.
 *
 * @param rows Executions paired with their SQL analysis.
 * @param redaction Compiled redactors plus the flag that gates sample output.
 * @returns Sample records; `rows_produced` is included only when the row defines it.
 */
function selectSamples(
  rows: Array<{ row: HistoricSqlRawQueryRow; analysis: SqlAnalysisFingerprintResult }>,
  redaction: RedactionPolicy,
): HistoricSqlUsage['samples'] {
  if (!redaction.samplesAllowed) {
    return [];
  }

  const ranked = rows.slice().sort((a, b) => {
    if (a.row.success === b.row.success) {
      return b.row.startedAt.localeCompare(a.row.startedAt);
    }
    return a.row.success ? -1 : 1;
  });

  // First candidate per literal tuple wins; ranking above decides which one that is.
  const representatives = new Map<string, (typeof rows)[number]>();
  for (const candidate of ranked) {
    const slotsByPosition = Array.from(candidate.analysis.literalSlots).sort((a, b) => a.position - b.position);
    const tupleKey = slotsByPosition.map((slot) => slot.exampleValue).join('\u001f');
    if (representatives.has(tupleKey)) {
      continue;
    }
    representatives.set(tupleKey, candidate);
  }

  const newestFirst = Array.from(representatives.values()).sort((a, b) =>
    b.row.startedAt.localeCompare(a.row.startedAt),
  );

  return newestFirst.slice(0, 5).map(({ row }) => {
    const rowsProducedField = row.rowsProduced === undefined ? {} : { rows_produced: row.rowsProduced ?? null };
    return {
      started_at: row.startedAt,
      user: row.user,
      bound_sql: redactText(row.sql, redaction.redactors),
      ...rowsProducedField,
      runtime_ms: row.runtimeMs,
      success: row.success,
    };
  });
}
/**
 * Returns the highest-ranked templates, at most `maxTemplatesPerRun` of them.
 *
 * Templates are scored once with `rankTemplate`, ordered by descending score,
 * and ties are broken by ascending `id` so the selection is deterministic.
 */
function selectTemplates(templates: TemplateVariant[], maxTemplatesPerRun: number, now: Date): TemplateVariant[] {
  const scored = templates.map((template) => ({ template, score: rankTemplate(template, now) }));
  scored.sort((a, b) => {
    const byScore = b.score - a.score;
    return byScore || a.template.id.localeCompare(b.template.id);
  });

  const selected: TemplateVariant[] = [];
  for (const { template } of scored.slice(0, maxTemplatesPerRun)) {
    selected.push(template);
  }
  return selected;
}
/**
 * Scores a template as `distinctUsers * log1p(executions) * recencyWeight`.
 *
 * - `distinctUsers` counts distinct non-empty `user` values across the rows.
 * - `executions` is the number of rows.
 * - `recencyWeight` is `1 / (1 + ageDays / 30)`, where `ageDays` is the age of the
 *   greatest `startedAt` string relative to `now` (clamped at 0), or 365 when
 *   the template has no rows.
 */
function rankTemplate(template: TemplateVariant, now: Date): number {
  const distinctUsers = new Set<string>();
  let newestStartedAt: string | null = null;
  for (const { row } of template.rows) {
    if (row.user) {
      distinctUsers.add(row.user);
    }
    // Plain string comparison, matching the timestamp ordering used for the newest row.
    if (newestStartedAt === null || row.startedAt > newestStartedAt) {
      newestStartedAt = row.startedAt;
    }
  }

  let ageDays = 365;
  if (newestStartedAt !== null) {
    ageDays = Math.max(0, (now.getTime() - new Date(newestStartedAt).getTime()) / 86400000);
  }
  const recencyWeight = 1 / (1 + ageDays / 30);
  return distinctUsers.size * Math.log1p(template.rows.length) * recencyWeight;
}
/**
 * Nearest-rank percentile of a list of numbers.
 *
 * The input is copied and sorted ascending before the rank is taken, so the
 * result does not depend on the caller's ordering and the caller's array is
 * never mutated. For input that is already sorted ascending the result is
 * unchanged from indexing it directly.
 *
 * @param values Numbers to rank; may be in any order.
 * @param percentileValue Fraction in `[0, 1]` (e.g. `0.95`). Values outside that
 *   range are clamped to the smallest / largest element.
 * @returns The selected element, or `null` when `values` is empty or
 *   `percentileValue` is NaN (no rank can be computed).
 */
function percentile(values: number[], percentileValue: number): number | null {
  if (values.length === 0 || Number.isNaN(percentileValue)) {
    return null;
  }
  // Numeric comparator: the default sort compares as strings and would misorder e.g. 10 and 9.
  const ascending = [...values].sort((left, right) => left - right);
  const rank = Math.ceil(ascending.length * percentileValue) - 1;
  const index = Math.min(ascending.length - 1, Math.max(0, rank));
  return ascending[index] ?? null;
}
/**
 * Adds up `rowsProduced` over the rows that report it as a number.
 *
 * @returns The total, or `null` when no row carries a numeric `rowsProduced`
 *   (which is distinct from a genuine total of 0).
 */
function sumRowsProduced(rows: HistoricSqlRawQueryRow[]): number | null {
  let total = 0;
  let sawNumber = false;
  for (const { rowsProduced } of rows) {
    if (typeof rowsProduced === 'number') {
      total += rowsProduced;
      sawNumber = true;
    }
  }
  return sawNumber ? total : null;
}
/**
 * Compiles redaction pattern sources into global regular expressions.
 *
 * A pattern that fails to compile is dropped, a warning describing it is
 * appended to `warnings`, and `samplesAllowed` is set to `false` for the
 * returned policy. Valid patterns keep their input order.
 *
 * @param patterns Regular-expression sources.
 * @param warnings Output list; mutated in place when a pattern is invalid.
 * @returns The compiled redactors and whether samples may be emitted.
 */
function compileRedactors(patterns: string[], warnings: string[]): RedactionPolicy {
  const redactors: RegExp[] = [];
  let samplesAllowed = true;

  for (const pattern of patterns) {
    let compiled: RegExp;
    try {
      compiled = new RegExp(pattern, 'g');
    } catch (error) {
      samplesAllowed = false;
      const reason = error instanceof Error ? error.message : String(error);
      warnings.push(`redaction_skipped:invalid_redaction_pattern:${pattern}:${reason}`);
      continue;
    }
    redactors.push(compiled);
  }

  return { redactors, samplesAllowed };
}
/**
 * Applies each redactor in order, replacing its matches with `<redacted>`.
 *
 * Later redactors see the output of earlier ones. Whether one or all matches
 * are replaced depends on each expression's own `g` flag.
 */
function redactText(value: string, redactors: RegExp[]): string {
  let redacted = value;
  for (const redactor of redactors) {
    redacted = redacted.replace(redactor, '<redacted>');
  }
  return redacted;
}
/**
 * Serializes `value` as two-space-indented JSON with a trailing newline and
 * writes it to `relPath` under `stagedDir` via `writeText`.
 */
async function writeJson(stagedDir: string, relPath: string, value: unknown): Promise<void> {
  const serialized = JSON.stringify(value, null, 2);
  await writeText(stagedDir, relPath, serialized + '\n');
}
/**
 * Writes `value` as UTF-8 text to `relPath` under `stagedDir`, creating any
 * missing parent directories first.
 */
async function writeText(stagedDir: string, relPath: string, value: string): Promise<void> {
  const destination = join(stagedDir, relPath);
  const parentDir = dirname(destination);
  await mkdir(parentDir, { recursive: true });
  await writeFile(destination, value, { encoding: 'utf-8' });
}

View file

@ -2,22 +2,10 @@ import { z } from 'zod';
import type { SqlAnalysisPort } from '../../../sql-analysis/index.js';
export const HISTORIC_SQL_SOURCE_KEY = 'historic-sql' as const;
export const HISTORIC_SQL_OBJECT_TYPE = 'historic_sql_template' as const;
/** Zod enum of the dialect names accepted by the historic-SQL schemas in this module. */
const historicSqlDialectSchema = z.enum(['snowflake', 'bigquery', 'postgres']);
/** Union of dialect names inferred from `historicSqlDialectSchema`. */
export type HistoricSqlDialect = z.infer<typeof historicSqlDialectSchema>;
/**
 * Zod schema for the historic-SQL pull configuration.
 * Every field except `dialect` has a default, so a config naming only the dialect parses.
 */
export const historicSqlPullConfigSchema = z.object({
dialect: historicSqlDialectSchema,
// Integer in [1, 365]; defaults to 90.
windowDays: z.number().int().min(1).max(365).default(90),
// ISO datetime string or null; defaults to null.
lastSuccessfulCursor: z.string().datetime().nullable().default(null),
// NOTE(review): presumably regular-expression sources matched against user names — confirm against the consumer.
serviceAccountUserPatterns: z.array(z.string()).default([]),
// NOTE(review): presumably regular-expression sources used to redact sample SQL — confirm against the consumer.
redactionPatterns: z.array(z.string()).default([]),
// Integer in [1, 5000]; defaults to the maximum, 5000.
maxTemplatesPerRun: z.number().int().min(1).max(5000).default(5000),
// Integer >= 1; defaults to 5.
minCalls: z.number().int().min(1).default(5),
});
/** Parsed (post-default) shape of `historicSqlPullConfigSchema`. */
export type HistoricSqlPullConfig = z.infer<typeof historicSqlPullConfigSchema>;
/** Zod enum of the three filter modes: `exclude`, `include`, and `mark-only`. */
const filterModeSchema = z.enum(['exclude', 'include', 'mark-only']);
function isRecord(value: unknown): value is Record<string, unknown> {
@ -154,28 +142,6 @@ export interface HistoricSqlTimeWindow {
end: Date;
}
/**
 * Zod schema for one raw query-history row.
 * `id`, `sql`, and `startedAt` are required; the remaining fields are defaulted or optional.
 */
export const historicSqlRawQueryRowSchema = z.object({
id: z.string().min(1),
sql: z.string().min(1),
// Null when the executing user is unknown; defaults to null.
user: z.string().nullable().default(null),
startedAt: z.string().datetime(),
endedAt: z.string().datetime().nullable().default(null),
runtimeMs: z.number().nonnegative().nullable().default(null),
// Unlike the other nullable fields this one has no default: it may be absent (undefined) as well as null.
rowsProduced: z.number().int().nonnegative().nullable().optional(),
// Rows are treated as successful unless stated otherwise.
success: z.boolean().default(true),
errorMessage: z.string().nullable().default(null),
});
/** Parsed (post-default) shape of `historicSqlRawQueryRowSchema`. */
export type HistoricSqlRawQueryRow = z.infer<typeof historicSqlRawQueryRowSchema>;
/** Contract for a reader that probes a query-history source and streams raw rows from it. */
export interface HistoricSqlQueryHistoryReader {
// Resolves with no value. NOTE(review): presumably rejects when the source is unusable — confirm against implementations.
probe(client: unknown): Promise<void>;
// Asynchronously yields raw rows for `window`; `cursor` is optional and may be null.
fetch(
client: unknown,
window: HistoricSqlTimeWindow,
cursor?: string | null,
): AsyncIterable<HistoricSqlRawQueryRow>;
}
/** Minimal query-execution client: runs one SQL statement and returns a tabular result. */
export interface KtxPostgresQueryClient {
// Result rows are positional arrays aligned with `headers`; `totalRows` is optional.
executeQuery(sql: string, params?: unknown[]): Promise<{ headers: string[]; rows: unknown[][]; totalRows?: number }>;
}
@ -185,48 +151,6 @@ export interface PostgresPgssProbeResult {
warnings: string[];
}
/** Result of one `PostgresPgssReader.readSnapshot` call: the rows read plus two nullable counters. */
export interface PostgresPgssSnapshot {
// NOTE(review): presumably the time statistics were last reset; null when unknown — confirm against the reader.
statsResetAt: string | null;
// NOTE(review): presumably the statement deallocation count; null when unavailable — confirm against the reader.
deallocCount: number | null;
rows: PostgresPgssRow[];
}
/** Contract for a reader that probes a Postgres client and reads a statistics snapshot from it. */
export interface PostgresPgssReader {
// Resolves with a `PostgresPgssProbeResult` describing the probe outcome.
probe(client: KtxPostgresQueryClient): Promise<PostgresPgssProbeResult>;
// NOTE(review): `minCalls` / `maxTemplates` presumably filter and cap the returned rows — confirm against the implementation.
readSnapshot(
client: KtxPostgresQueryClient,
options: { minCalls: number; maxTemplates: number },
): Promise<PostgresPgssSnapshot>;
}
/**
 * One row of a `PostgresPgssSnapshot`.
 * NOTE(review): field names look like pg_stat_statements columns (queryid, userid, dbid, calls, ...) — confirm the mapping and units against the reader's query.
 */
export interface PostgresPgssRow {
// Identifiers are carried as strings, not numbers.
queryid: string;
userid: string;
// Resolved names are nullable; the ids above are always present.
username: string | null;
dbid: string;
database: string | null;
query: string;
calls: number;
totalExecTime: number;
meanExecTime: number;
totalRows: number;
}
/**
 * Aggregated row keyed by `id`, carrying `delta*` counters and a user list.
 * NOTE(review): the `delta*` fields are presumably differences against a previous baseline snapshot — confirm against the code that builds these rows.
 */
export interface PostgresPgssAggregateRow {
id: string;
queryid: string;
dbid: string;
database: string | null;
query: string;
deltaCalls: number;
deltaExecTime: number;
deltaRows: number;
meanExecTime: number;
distinctUsersDelta: number;
users: string[];
// Timestamp kept as a string.
firstObservedAt: string;
}
export interface HistoricSqlSourceAdapterDeps {
sqlAnalysis: SqlAnalysisPort;
reader: HistoricSqlReader;
@ -234,88 +158,3 @@ export interface HistoricSqlSourceAdapterDeps {
legacyPostgresBaselineRootDir?: string;
now?: () => Date;
}
/** Zod enum of the three literal-slot classifications: `constant`, `runtime`, and `categorical`. */
const historicSqlLiteralSlotClassificationSchema = z.enum(['constant', 'runtime', 'categorical']);
/** Union of classification names inferred from `historicSqlLiteralSlotClassificationSchema`. */
export type HistoricSqlLiteralSlotClassification = z.infer<typeof historicSqlLiteralSlotClassificationSchema>;
/** Zod schema for a template's metadata document (`objectType` is fixed to `HISTORIC_SQL_OBJECT_TYPE`). */
export const historicSqlMetadataSchema = z.object({
id: z.string().min(1),
title: z.string().min(1),
path: z.string().min(1),
objectType: z.literal(HISTORIC_SQL_OBJECT_TYPE),
// Must be exactly null; no other value validates.
lastEditedAt: z.null(),
properties: z.object({
fingerprint: z.string().min(1),
// Required key, but its value may be null.
sub_cluster_id: z.string().nullable(),
dialect: historicSqlDialectSchema,
tables_touched: z.array(z.string()),
literal_slots: z.array(
z.object({
// Positions are 1-based (minimum 1).
position: z.number().int().min(1),
type: z.enum(['string', 'number', 'timestamp', 'date', 'boolean', 'null', 'unknown']),
classification: historicSqlLiteralSlotClassificationSchema,
}),
),
// Flat string-to-string map.
triage_signals: z.record(z.string(), z.string()),
}),
});
/** Parsed shape of `historicSqlMetadataSchema`. */
export type HistoricSqlMetadata = z.infer<typeof historicSqlMetadataSchema>;
/** Zod schema for a template's usage document: aggregate stats, per-slot value summaries, and sample executions. */
export const historicSqlUsageSchema = z.object({
stats: z.object({
executions: z.number().int().nonnegative(),
distinct_users: z.number().int().nonnegative(),
first_seen: z.string().datetime(),
last_seen: z.string().datetime(),
// Runtime percentiles are required keys but may be null.
p50_runtime_ms: z.number().nonnegative().nullable(),
p95_runtime_ms: z.number().nonnegative().nullable(),
// Mean runtime and rows produced may be omitted entirely as well as null.
mean_runtime_ms: z.number().nonnegative().nullable().optional(),
// Fraction in [0, 1].
error_rate: z.number().min(0).max(1),
rows_produced: z.number().int().nonnegative().nullable().optional(),
}),
literal_slots: z.array(
z.object({
// Positions are 1-based (minimum 1).
position: z.number().int().min(1),
distinct_values: z.number().int().nonnegative(),
// Each entry is a [string, non-negative integer] pair. NOTE(review): presumably [value, occurrence count] — confirm against the producer.
top_values: z.array(z.tuple([z.string(), z.number().int().nonnegative()])),
}),
),
samples: z.array(
z.object({
started_at: z.string().datetime(),
user: z.string().nullable(),
bound_sql: z.string(),
rows_produced: z.number().int().nonnegative().nullable().optional(),
runtime_ms: z.number().nonnegative().nullable(),
success: z.boolean(),
}),
),
});
/** Parsed shape of `historicSqlUsageSchema`. */
export type HistoricSqlUsage = z.infer<typeof historicSqlUsageSchema>;
/** Zod schema for the manifest of one historic-SQL run (`source` is fixed to `HISTORIC_SQL_SOURCE_KEY`). */
export const historicSqlManifestSchema = z.object({
source: z.literal(HISTORIC_SQL_SOURCE_KEY),
connectionId: z.string().min(1),
dialect: historicSqlDialectSchema,
fetchedAt: z.string().datetime(),
windowStart: z.string().datetime(),
windowEnd: z.string().datetime(),
// Required key; null is allowed.
nextSuccessfulCursor: z.string().datetime().nullable(),
templateCount: z.number().int().nonnegative(),
capped: z.boolean(),
warnings: z.array(z.string()),
// The five fields below all carry defaults, so manifests that omit them still parse.
// NOTE(review): their names suggest they are Postgres-specific — confirm against the producer.
degraded: z.boolean().default(false),
statsResetAt: z.string().datetime().nullable().default(null),
baselineFirstRun: z.boolean().default(false),
pgServerVersion: z.string().nullable().default(null),
deallocCount: z.number().int().nonnegative().nullable().default(null),
// One entry per template listed in the manifest.
templates: z.array(
z.object({
id: z.string().min(1),
fingerprint: z.string().min(1),
subClusterId: z.string().nullable(),
path: z.string().min(1),
}),
),
});
/** Parsed (post-default) shape of `historicSqlManifestSchema`. */
export type HistoricSqlManifest = z.infer<typeof historicSqlManifestSchema>;

View file

@ -318,7 +318,6 @@ export { NOTION_ORG_KNOWLEDGE_WARNING } from './adapters/notion/chunk.js';
export { NotionSourceAdapter, type NotionSourceAdapterDeps } from './adapters/notion/notion.adapter.js';
export { NotionClient, type NotionApi, type NotionBotInfo } from './adapters/notion/notion-client.js';
export { bucketDistinctUsers, bucketErrorRate, bucketExecutions, bucketP95Runtime, bucketRecency } from './adapters/historic-sql/buckets.js';
export { chunkHistoricSqlStagedDir, describeHistoricSqlScope } from './adapters/historic-sql/chunk.js';
export { chunkHistoricSqlUnifiedStagedDir, describeHistoricSqlUnifiedScope } from './adapters/historic-sql/chunk-unified.js';
export { detectHistoricSqlStagedDir } from './adapters/historic-sql/detect.js';
export {
@ -330,10 +329,8 @@ export { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.a
export { BigQueryHistoricSqlQueryHistoryReader } from './adapters/historic-sql/bigquery-query-history-reader.js';
export type { BigQueryHistoricSqlQueryHistoryReaderOptions } from './adapters/historic-sql/bigquery-query-history-reader.js';
export { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js';
export { PostgresPgssQueryHistoryReader } from './adapters/historic-sql/postgres-pgss-query-history-reader.js';
export { SnowflakeHistoricSqlQueryHistoryReader } from './adapters/historic-sql/snowflake-query-history-reader.js';
export { stageHistoricSqlAggregatedSnapshot } from './adapters/historic-sql/stage-unified.js';
export { stageHistoricSqlTemplates } from './adapters/historic-sql/stage.js';
export {
historicSqlEvidenceEnvelopeSchema,
historicSqlEvidencePath,
@ -359,46 +356,24 @@ export type {
PatternOutput,
TableUsageOutput,
} from './adapters/historic-sql/skill-schemas.js';
export {
pgssBaselinePath,
readPgssBaseline,
stagePgStatStatementsTemplates,
writePgssBaselineAtomic,
} from './adapters/historic-sql/stage-pgss.js';
export type { PgssBaseline, StagePgStatStatementsTemplatesResult } from './adapters/historic-sql/stage-pgss.js';
export type {
AggregatedTemplate,
HistoricSqlDialect,
HistoricSqlManifest,
HistoricSqlMetadata,
HistoricSqlProbeResult,
HistoricSqlPullConfig,
HistoricSqlQueryHistoryReader,
HistoricSqlRawQueryRow,
HistoricSqlReader,
HistoricSqlSourceAdapterDeps,
HistoricSqlTimeWindow,
HistoricSqlUnifiedPullConfig,
HistoricSqlUsage,
KtxPostgresQueryClient,
PostgresPgssAggregateRow,
PostgresPgssProbeResult,
PostgresPgssRow,
PostgresPgssSnapshot,
StagedManifest,
StagedPatternsInput,
StagedTableInput,
} from './adapters/historic-sql/types.js';
export {
HISTORIC_SQL_OBJECT_TYPE,
HISTORIC_SQL_SOURCE_KEY,
aggregatedTemplateSchema,
historicSqlManifestSchema,
historicSqlMetadataSchema,
historicSqlPullConfigSchema,
historicSqlRawQueryRowSchema,
historicSqlUnifiedPullConfigSchema,
historicSqlUsageSchema,
stagedManifestSchema,
stagedPatternsInputSchema,
stagedTableInputSchema,

View file

@ -405,44 +405,44 @@ describe('IngestBundleRunner — Stages 1 → 7', () => {
);
});
it('reuses document evidence indexing and page triage for historic-SQL WorkUnits', async () => {
it('reuses document evidence indexing and page triage for document WorkUnits', async () => {
const deps = makeDeps();
deps.adapter.source = 'historic-sql';
deps.adapter.skillNames = ['historic_sql_ingest'];
deps.adapter.reconcileSkillNames = ['historic_sql_curator'];
deps.adapter.source = 'notion';
deps.adapter.skillNames = ['notion_synthesize'];
deps.adapter.reconcileSkillNames = [];
deps.adapter.evidenceIndexing = 'documents';
deps.adapter.triageSupported = true;
deps.adapter.chunk.mockResolvedValue({
workUnits: [
{ unitKey: 'full', rawFiles: ['templates/full/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
{ unitKey: 'skip', rawFiles: ['templates/skip/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
{ unitKey: 'full', rawFiles: ['pages/full/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
{ unitKey: 'skip', rawFiles: ['pages/skip/metadata.json'], dependencyPaths: [], peerFileIndex: [] },
],
});
deps.diffSetService.compute.mockResolvedValue({
added: ['templates/full/metadata.json', 'templates/skip/metadata.json'],
added: ['pages/full/metadata.json', 'pages/skip/metadata.json'],
modified: [],
deleted: [],
unchanged: [],
});
deps.pageTriage.triageRun.mockResolvedValue({
enabled: true,
fullRawPaths: new Set(['templates/full/metadata.json']),
fullRawPaths: new Set(['pages/full/metadata.json']),
warnings: [],
});
const runner = buildRunner(deps);
(runner as any).stageRawFilesStage1 = vi.fn().mockResolvedValue({
currentHashes: new Map([
['templates/full/metadata.json', 'h-full'],
['templates/skip/metadata.json', 'h-skip'],
['pages/full/metadata.json', 'h-full'],
['pages/skip/metadata.json', 'h-skip'],
]),
rawDirInWorktree: 'raw-sources/c1/historic-sql/s',
rawDirInWorktree: 'raw-sources/c1/notion/s',
});
(runner as any).resolveStagedDir = vi.fn().mockResolvedValue('/tmp/stage/upload-x');
const result = await runner.run({
jobId: 'j1',
connectionId: 'c1',
sourceKey: 'historic-sql',
sourceKey: 'notion',
trigger: 'upload',
bundleRef: { kind: 'upload', uploadId: 'upload-x' },
});

View file

@ -29,48 +29,10 @@ describe('ingest prompt assets', () => {
expect(prompt).not.toMatch(forbiddenProductPattern());
});
it('pins historic-SQL triage rules with synthetic signal fixtures', async () => {
it('does not route historic-SQL through page-triage prompt examples', async () => {
const prompt = await readFile(new URL('../../prompts/skills/page_triage_classifier.md', import.meta.url), 'utf-8');
expect(prompt).toContain('signals.objectType === "historic_sql_template"');
expect(prompt).toContain('executions_bucket=low AND distinct_users_bucket=solo');
expect(prompt).toContain('service_account_only=true AND below the frequency floor');
expect(prompt).toContain('shared human usage with mid or high execution volume');
const fixtures = [
{
label: 'skip low solo template',
objectType: '"objectType": "historic_sql_template"',
executions: '"executions_bucket": "low"',
users: '"distinct_users_bucket": "solo"',
serviceAccount: '"service_account_only": "false"',
lane: '-> `skip`',
},
{
label: 'light service-account-only template',
objectType: '"objectType": "historic_sql_template"',
executions: '"executions_bucket": "high"',
users: '"distinct_users_bucket": "solo"',
serviceAccount: '"service_account_only": "true"',
lane: '-> `light`',
},
{
label: 'full shared human template',
objectType: '"objectType": "historic_sql_template"',
executions: '"executions_bucket": "high"',
users: '"distinct_users_bucket": "team"',
serviceAccount: '"service_account_only": "false"',
lane: '-> `full`',
},
];
for (const fixture of fixtures) {
expect(prompt).toContain(fixture.label);
expect(prompt).toContain(fixture.objectType);
expect(prompt).toContain(fixture.executions);
expect(prompt).toContain(fixture.users);
expect(prompt).toContain(fixture.serviceAccount);
expect(prompt).toContain(fixture.lane);
}
expect(prompt).not.toContain(['historic_sql', 'template'].join('_'));
expect(prompt).not.toContain('service_account_only=true AND below the frequency floor');
});
});

View file

@ -58,12 +58,6 @@ describe('ingest runtime assets', () => {
}
await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain('# Page Triage Classifier');
await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain(
'signals.objectType === "historic_sql_template"',
);
await expect(prompts.loadPrompt('skills/page_triage_classifier')).resolves.toContain(
'service_account_only=true AND below the frequency floor',
);
await expect(prompts.loadPrompt('skills/light_extraction')).resolves.toContain('# Light Context Extraction');
});

View file

@ -1,4 +1,4 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
@ -120,14 +120,6 @@ describe('PageTriageService', () => {
await rm(stagedDir, { recursive: true, force: true });
});
/**
 * Extracts and parses the JSON payload between `<signals>` and `</signals>`
 * in a classifier prompt. Each tag must sit on its own line.
 *
 * @throws Error when the prompt has no such block; `JSON.parse` errors propagate unchanged.
 */
function parseSignalsFromClassifierPrompt(prompt: string): unknown {
  const signalsBlock = prompt.match(/<signals>\n([\s\S]*?)\n<\/signals>/);
  if (signalsBlock === null) {
    throw new Error('classifier prompt did not include a <signals> block');
  }
  const [, payload] = signalsBlock;
  return JSON.parse(payload);
}
it('writes light-lane candidates and keeps the page out of full WorkUnits', async () => {
generateTextMock
.mockResolvedValueOnce({ text: JSON.stringify({ lane: 'light', reason: 'short durable policy' }) } as any)
@ -282,163 +274,6 @@ describe('PageTriageService', () => {
expect(repository.setDocumentTriageLane).toHaveBeenCalledWith('run-1', 'pages/page-1/page.md', 'light');
});
it.each([
{
name: 'skip low solo template',
propertyHints: {
executions_bucket: 'low',
distinct_users_bucket: 'solo',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '1 constant, 1 runtime',
},
expectedLane: 'skip',
expectedReport: { skip: 1, light: 0, full: 0 },
},
{
name: 'light service-account-only template',
propertyHints: {
executions_bucket: 'high',
distinct_users_bucket: 'solo',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'true',
slot_summary: '1 constant, 0 runtime',
},
expectedLane: 'light',
expectedReport: { skip: 0, light: 1, full: 0 },
},
{
name: 'full shared human template',
propertyHints: {
executions_bucket: 'high',
distinct_users_bucket: 'team',
error_rate_bucket: 'ok',
recency_bucket: 'active',
service_account_only: 'false',
slot_summary: '2 constant, 1 runtime',
},
expectedLane: 'full',
expectedReport: { skip: 0, light: 0, full: 1 },
},
] as const)('triages historic-SQL synthetic signal fixture as $expectedLane for $name', async ({
name,
propertyHints,
expectedLane,
expectedReport,
}) => {
const externalId = name.replace(/[^a-z0-9]+/g, '_');
const templateDir = join(stagedDir, 'templates', externalId);
await mkdir(templateDir, { recursive: true });
await writeFile(
join(templateDir, 'metadata.json'),
JSON.stringify({
id: externalId,
title: `snowflake - analytics.orders [${externalId.slice(0, 6)}]`,
path: `templates/${externalId}/page.md`,
objectType: 'historic_sql_template',
lastEditedAt: null,
properties: {
fingerprint: externalId,
sub_cluster_id: null,
dialect: 'snowflake',
tables_touched: ['analytics.orders'],
literal_slots: [{ position: 1, type: 'string', classification: 'constant' }],
triage_signals: propertyHints,
},
}),
'utf-8',
);
await writeFile(
join(templateDir, 'page.md'),
[
`# ${externalId}`,
'',
'## Normalized SQL',
'```sql',
'SELECT count(*) FROM analytics.orders WHERE status = ?',
'```',
'',
'## Tables touched',
'- analytics.orders',
].join('\n'),
'utf-8',
);
adapter.getTriageSignals.mockResolvedValueOnce({
objectType: 'historic_sql_template',
lastEditedAt: '2026-05-04T12:00:00.000Z',
propertyHints,
});
promptService.loadPrompt.mockImplementation((promptName: string) => {
if (promptName === 'skills/page_triage_classifier') {
return readFile(new URL('../../../prompts/skills/page_triage_classifier.md', import.meta.url), 'utf-8');
}
return Promise.resolve(`prompt:${promptName}`);
});
generateTextMock.mockImplementationOnce((args: any) => {
const prompt = args.messages[0].content as string;
expect(prompt).toContain('signals.objectType === "historic_sql_template"');
expect(prompt).toContain('executions_bucket=low AND distinct_users_bucket=solo');
expect(prompt).toContain('service_account_only=true AND below the frequency floor');
expect(prompt).toContain('shared human usage with mid or high execution volume');
expect(parseSignalsFromClassifierPrompt(prompt)).toEqual({
objectType: 'historic_sql_template',
lastEditedAt: '2026-05-04T12:00:00.000Z',
propertyHints,
});
return { text: JSON.stringify({ lane: expectedLane, reason: `${name} fixture` }) } as any;
});
if (expectedLane === 'light') {
generateTextMock.mockResolvedValueOnce({
text: JSON.stringify({
candidates: [
{
candidateKey: 'historic-sql-service-account-template',
topic: 'Historic SQL Service Account Template',
assertion: 'A service-account-only historic SQL template can remain as light evidence.',
rationale: 'The synthetic historic-SQL fixture is service-account-only and below the frequency floor.',
evidenceChunkIds: ['00000000-0000-0000-0000-000000000101'],
suggestedPageKey: 'historic-sql-service-account-template',
actionHint: 'create',
durabilityScore: 2,
authorityScore: 1,
reuseScore: 2,
noveltyScore: 1,
riskScore: 0,
},
],
}),
} as any);
}
const result = await service.triageRun({
stagedDir,
runId: 'run-1',
connectionId: 'conn-1',
sourceKey: 'historic-sql',
syncId: 'sync-1',
jobId: 'job-1',
diffSet: {
added: [`templates/${externalId}/metadata.json`, `templates/${externalId}/page.md`],
modified: [],
deleted: [],
unchanged: [],
},
adapter: adapter as any,
});
expect(result.report).toMatchObject({ pageCount: 1, ...expectedReport });
expect(repository.setDocumentTriageLane).toHaveBeenCalledWith(
'run-1',
`templates/${externalId}/page.md`,
expectedLane,
);
expect(result.fullRawPaths.has(`templates/${externalId}/metadata.json`)).toBe(expectedLane === 'full');
expect(result.fullRawPaths.has(`templates/${externalId}/page.md`)).toBe(expectedLane === 'full');
});
it('triages Notion data-source row pages without reading data-source metadata as page markdown', async () => {
triageSettings.lightExtractionEnabled = false;

View file

@ -232,11 +232,7 @@ describe('@ktx/context package exports', () => {
expect(ingest.HistoricSqlSourceAdapter).toBeTypeOf('function');
expect(ingest.SnowflakeHistoricSqlQueryHistoryReader).toBeTypeOf('function');
expect(ingest.BigQueryHistoricSqlQueryHistoryReader).toBeTypeOf('function');
expect(ingest.PostgresPgssQueryHistoryReader).toBeTypeOf('function');
expect(ingest.stagePgStatStatementsTemplates).toBeTypeOf('function');
expect(ingest.pgssBaselinePath).toBeTypeOf('function');
expect(ingest.readPgssBaseline).toBeTypeOf('function');
expect(ingest.writePgssBaselineAtomic).toBeTypeOf('function');
expect(ingest.PostgresPgssReader).toBeTypeOf('function');
expect(ingest.HistoricSqlExtensionMissingError).toBeTypeOf('function');
expect(ingest.HistoricSqlVersionUnsupportedError).toBeTypeOf('function');
expect(ingest.HISTORIC_SQL_SOURCE_KEY).toBe('historic-sql');