diff --git a/docs-site/content/docs/cli-reference/ktx-setup.mdx b/docs-site/content/docs/cli-reference/ktx-setup.mdx index 24469a63..0e6cb57c 100644 --- a/docs-site/content/docs/cli-reference/ktx-setup.mdx +++ b/docs-site/content/docs/cli-reference/ktx-setup.mdx @@ -148,6 +148,13 @@ fix the prerequisite. If the later schema-context build also fails, interactive setup offers **Disable query history and retry** so you can finish database setup with `connections..context.queryHistory.enabled: false`. +After the schema scan completes, setup can derive query-history service-account +filters from in-scope history. If **ktx** finds clear operational roles, it +prints each proposed exclusion with a reason and writes +`connections..context.queryHistory.filters.serviceAccounts` only when you +apply the proposal. In non-interactive setup with `--yes`, the proposal is +applied automatically. Existing `serviceAccounts` blocks are never overwritten. + For BigQuery, the remediation tells you to grant `roles/bigquery.resourceViewer` on the BigQuery project, or grant a custom role that contains `bigquery.jobs.listAll`. diff --git a/docs-site/content/docs/configuration/ktx-yaml.mdx b/docs-site/content/docs/configuration/ktx-yaml.mdx index a9298443..17a04c53 100644 --- a/docs-site/content/docs/configuration/ktx-yaml.mdx +++ b/docs-site/content/docs/configuration/ktx-yaml.mdx @@ -179,9 +179,22 @@ connections: context: queryHistory: enabled: true + enabledSchemas: + - orbit_raw + - orbit_analytics minExecutions: 5 ``` +- `enabledSchemas`: Optional list of schema or dataset names that query-history + ingest may mine. Omit it to let **ktx** derive the modeled schema floor from + the connection and semantic-layer sources. Use `["*"]` to disable the floor + for discovery runs. +- `filters.serviceAccounts`: Optional service-account filter block. During + setup, when query history is enabled and no service-account block already + exists, **ktx** can propose exact role patterns such as `^svc_loader$` from + observed in-scope query history. The block uses `mode: exclude` and remains + hand-editable. + ### Metabase ```yaml diff --git a/docs-site/content/docs/guides/building-context.mdx b/docs-site/content/docs/guides/building-context.mdx index 52179e70..9bcf2659 100644 --- a/docs-site/content/docs/guides/building-context.mdx +++ b/docs-site/content/docs/guides/building-context.mdx @@ -57,7 +57,10 @@ isolation. ## Query history PostgreSQL, BigQuery, and Snowflake can add query-history context: common joins, -filters, service-account patterns, redaction rules, and high-usage templates. +filters, redaction rules, high-usage templates, and service-account exclusions. +When query history is enabled during setup, **ktx** reviews observed in-scope +roles and can write exact `filters.serviceAccounts` patterns for operational +traffic such as loader or refresh roles. Enable it during setup, store it under `connections..context.queryHistory`, or request it for one run: diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/bigquery-query-history-reader.ts b/packages/cli/src/context/ingest/adapters/historic-sql/bigquery-query-history-reader.ts index ac4d4c71..fe637078 100644 --- a/packages/cli/src/context/ingest/adapters/historic-sql/bigquery-query-history-reader.ts +++ b/packages/cli/src/context/ingest/adapters/historic-sql/bigquery-query-history-reader.ts @@ -200,27 +200,78 @@ export class BigQueryHistoricSqlQueryHistoryReader { config: HistoricSqlUnifiedPullConfig, ): AsyncIterable { const sql = ` +WITH filtered_jobs AS ( + SELECT + COALESCE(query_info.query_hashes.normalized_literals, TO_HEX(SHA256(query))) AS template_id, + query, + user_email, + creation_time, + end_time, + error_result + FROM ${this.viewPath} + WHERE job_type = 'QUERY' + AND statement_type IN ('SELECT', 'MERGE') + AND creation_time >= ${timestampExpression(window.start)} + AND creation_time < ${timestampExpression(window.end)} + AND query IS NOT NULL +), +template_stats AS ( + SELECT + template_id, + MIN(query) AS canonical_sql, + COUNT(*) AS executions, + COUNT(DISTINCT user_email) AS distinct_users, + MIN(creation_time) AS first_seen, + MAX(creation_time) AS last_seen, + APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(50)] AS p50_ms, + APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(95)] AS p95_ms, + SAFE_DIVIDE(COUNTIF(error_result IS NOT NULL), COUNT(*)) AS error_rate, + CAST(NULL AS INT64) AS rows_produced + FROM filtered_jobs + GROUP BY template_id + HAVING COUNT(*) >= ${config.minExecutions} +), +template_users AS ( + SELECT + template_id, + user_email AS user, + COUNT(*) AS executions, + MAX(creation_time) AS last_seen + FROM filtered_jobs + GROUP BY template_id, user_email +) SELECT - query_hash AS template_id, - MIN(query) AS canonical_sql, - COUNT(*) AS executions, - COUNT(DISTINCT user_email) AS distinct_users, - MIN(creation_time) AS first_seen, - MAX(creation_time) AS last_seen, - APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(50)] AS p50_ms, - APPROX_QUANTILES(TIMESTAMP_DIFF(end_time, creation_time, MILLISECOND), 100)[OFFSET(95)] AS p95_ms, - SAFE_DIVIDE(COUNTIF(error_result IS NOT NULL), COUNT(*)) AS error_rate, - CAST(NULL AS INT64) AS rows_produced, - TO_JSON_STRING(ARRAY_AGG(STRUCT(user_email AS user, 1 AS executions) ORDER BY creation_time DESC LIMIT 5)) AS top_users -FROM ${this.viewPath} -WHERE job_type = 'QUERY' - AND statement_type IN ('SELECT', 'MERGE') - AND creation_time >= ${timestampExpression(window.start)} - AND creation_time < ${timestampExpression(window.end)} - AND query IS NOT NULL -GROUP BY query_hash -HAVING COUNT(*) >= ${config.minExecutions} -ORDER BY executions DESC`.trim(); + stats.template_id, + stats.canonical_sql, + stats.executions, + stats.distinct_users, + stats.first_seen, + stats.last_seen, + stats.p50_ms, + stats.p95_ms, + stats.error_rate, + stats.rows_produced, + TO_JSON_STRING( + ARRAY_AGG( + STRUCT(users.user AS user, users.executions AS executions) + ORDER BY users.executions DESC, users.last_seen DESC + ) + ) AS top_users +FROM template_stats AS stats +JOIN template_users AS users + ON users.template_id = stats.template_id +GROUP BY + stats.template_id, + stats.canonical_sql, + stats.executions, + stats.distinct_users, + stats.first_seen, + stats.last_seen, + stats.p50_ms, + stats.p95_ms, + stats.error_rate, + stats.rows_produced +ORDER BY stats.executions DESC`.trim(); const result = await queryClient(client).executeQuery(sql); if (result.error) { throw grantsError(result.error); diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/chunk-unified.ts b/packages/cli/src/context/ingest/adapters/historic-sql/chunk-unified.ts index 4477e753..56cc32e7 100644 --- a/packages/cli/src/context/ingest/adapters/historic-sql/chunk-unified.ts +++ b/packages/cli/src/context/ingest/adapters/historic-sql/chunk-unified.ts @@ -1,6 +1,7 @@ import { createHash } from 'node:crypto'; import { readFile, readdir } from 'node:fs/promises'; import { join, relative } from 'node:path'; +import { tableRefKey } from '../../../scan/table-ref.js'; import type { ChunkResult, DiffSet, ScopeDescriptor, WorkUnit } from '../../types.js'; import { isHistoricSqlPatternInputShardPath } from './pattern-inputs.js'; import { stagedManifestSchema, stagedPatternsInputSchema, stagedTableInputSchema } from './types.js'; @@ -37,7 +38,7 @@ export async function chunkHistoricSqlUnifiedStagedDir(stagedDir: string, diffSe } const table = stagedTableInputSchema.parse(await readJson(stagedDir, path)); workUnits.push({ - unitKey: `historic-sql-table-${safeUnitKey(table.table)}`, + unitKey: `historic-sql-table-${safeUnitKey(tableRefKey(table.tableRef))}`, displayLabel: `Historic SQL usage: ${table.table}`, rawFiles: [path], dependencyPaths: ['manifest.json'], diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/pattern-inputs.ts b/packages/cli/src/context/ingest/adapters/historic-sql/pattern-inputs.ts index 025fa43c..2e99836e 100644 --- a/packages/cli/src/context/ingest/adapters/historic-sql/pattern-inputs.ts +++ b/packages/cli/src/context/ingest/adapters/historic-sql/pattern-inputs.ts @@ -1,4 +1,5 @@ import { Buffer } from 'node:buffer'; +import { tableRefKey } from '../../../scan/table-ref.js'; import type { StagedPatternsInput } from './types.js'; const HISTORIC_SQL_PATTERN_WORKUNIT_DIR = 'patterns-input'; @@ -44,11 +45,16 @@ function sortedAuditTemplates(templates: readonly PatternTemplate[]): PatternTem function sortedPatternCandidates(templates: readonly PatternTemplate[]): PatternTemplate[] { return [...templates] .filter((template) => template.tablesTouched.length >= 2) - .map((template) => ({ ...template, tablesTouched: [...template.tablesTouched].sort() })) + .map((template) => ({ + ...template, + tablesTouched: [...template.tablesTouched].sort((left, right) => tableRefKey(left).localeCompare(tableRefKey(right))), + })) .sort((left, right) => { const cardinality = right.tablesTouched.length - left.tablesTouched.length; if (cardinality !== 0) return cardinality; - const tableSignature = left.tablesTouched.join('\0').localeCompare(right.tablesTouched.join('\0')); + const leftSignature = left.tablesTouched.map(tableRefKey).join('\0'); + const rightSignature = right.tablesTouched.map(tableRefKey).join('\0'); + const tableSignature = leftSignature.localeCompare(rightSignature); if (tableSignature !== 0) return tableSignature; return left.id.localeCompare(right.id); }); diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/query-history-filter-picker.ts b/packages/cli/src/context/ingest/adapters/historic-sql/query-history-filter-picker.ts new file mode 100644 index 00000000..bb296513 --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/historic-sql/query-history-filter-picker.ts @@ -0,0 +1,278 @@ +import { z } from 'zod'; +import type { KtxLlmRuntimePort } from '../../../../context/llm/runtime-port.js'; +import type { SqlAnalysisPort } from '../../../../context/sql-analysis/ports.js'; +import { tableRefKey } from '../../../scan/table-ref.js'; +import type { KtxTableRef } from '../../../scan/types.js'; +import { bucketDistinctUsers, bucketExecutions, bucketRecency } from './buckets.js'; +import { + compileHistoricSqlRedactionPatterns, + redactHistoricSqlText, + type HistoricSqlRedactionPattern, +} from './redaction.js'; +import { includedQueryHistoryTableRefs } from './scope-membership.js'; +import { + aggregatedTemplateSchema, + historicSqlUnifiedPullConfigSchema, + type AggregatedTemplate, + type HistoricSqlDialect, + type HistoricSqlReader, +} from './types.js'; + +export interface QueryHistoryFilterProposal { + excludedRoles: Array<{ role: string; reason: string; pattern: string }>; + consideredRoleCount: number; + skipped: { reason: 'no-llm' | 'no-daemon' | 'no-in-scope-history' | 'user-block-present' } | null; + warnings: string[]; +} + +export interface ProposeQueryHistoryServiceAccountFiltersInput { + connectionId: string; + dialect: HistoricSqlDialect; + queryClient: unknown; + reader: HistoricSqlReader; + sqlAnalysis: SqlAnalysisPort; + llmRuntime: KtxLlmRuntimePort | null; + pullConfig: unknown; + now?: Date; + userServiceAccountsPresent?: boolean; +} + +interface ParsedTemplateForPicker { + template: AggregatedTemplate; + tablesTouched: KtxTableRef[]; + includedTables: KtxTableRef[]; +} + +interface RoleAccumulator { + role: string; + executions: number; + distinctUsers: number; + lastSeen: string; + tables: Map; + templates: AggregatedTemplate[]; +} + +interface QueryHistoryRoleRecord { + role: string; + inScopeTables: string[]; + executionsBucket: string; + distinctUsersBucket: string; + recencyBucket: string; + representativeTemplates: Array<{ id: string; canonicalSql: string; dialect: HistoricSqlDialect }>; +} + +const queryHistoryFilterAdjudicationSchema = z.object({ + roles: z.array( + z.object({ + role: z.string().min(1), + exclude: z.boolean(), + reason: z.string().min(1), + }).strict(), + ), +}).strict(); + +type QueryHistoryFilterAdjudication = z.infer; + +function emptyProposal(skipped: QueryHistoryFilterProposal['skipped'], warnings: string[] = []): QueryHistoryFilterProposal { + return { excludedRoles: [], consideredRoleCount: 0, skipped, warnings }; +} + +function displayTableRef(ref: KtxTableRef): string { + return [ref.catalog, ref.db, ref.name].filter((part): part is string => !!part && part.length > 0).join('.'); +} + +function redactTemplateSqlForPicker( + template: AggregatedTemplate, + redactors: readonly HistoricSqlRedactionPattern[], +): AggregatedTemplate { + if (redactors.length === 0) { + return template; + } + return { + ...template, + canonicalSql: redactHistoricSqlText(template.canonicalSql, redactors), + }; +} + +/** @internal */ +export function regexEscapeForExactRolePattern(role: string): string { + return `^${role.replace(/[\\^$.*+?()[\]{}|]/g, '\\$&')}$`; +} + +function recordRole( + acc: RoleAccumulator, + template: AggregatedTemplate, + tables: readonly KtxTableRef[], + executions: number, +): void { + acc.executions += executions; + acc.distinctUsers = Math.max(acc.distinctUsers, template.stats.distinctUsers); + acc.lastSeen = template.stats.lastSeen > acc.lastSeen ? template.stats.lastSeen : acc.lastSeen; + for (const table of tables) { + acc.tables.set(tableRefKey(table), table); + } + acc.templates.push(template); +} + +function roleRecords(parsedTemplates: readonly ParsedTemplateForPicker[], now: Date): QueryHistoryRoleRecord[] { + const byRole = new Map(); + for (const parsed of parsedTemplates) { + for (const entry of parsed.template.topUsers) { + if (!entry.user || entry.user.trim().length === 0 || entry.executions <= 0) { + continue; + } + const role = entry.user.trim(); + const acc = + byRole.get(role) ?? + { + role, + executions: 0, + distinctUsers: 0, + lastSeen: '1970-01-01T00:00:00.000Z', + tables: new Map(), + templates: [], + }; + recordRole(acc, parsed.template, parsed.includedTables, entry.executions); + byRole.set(role, acc); + } + } + + return [...byRole.values()] + .sort((left, right) => right.executions - left.executions || left.role.localeCompare(right.role)) + .map((acc) => ({ + role: acc.role, + inScopeTables: [...acc.tables.entries()] + .sort(([left], [right]) => left.localeCompare(right)) + .slice(0, 25) + .map(([, ref]) => displayTableRef(ref)), + executionsBucket: bucketExecutions(acc.executions), + distinctUsersBucket: bucketDistinctUsers(acc.distinctUsers), + recencyBucket: bucketRecency(acc.lastSeen, now), + representativeTemplates: [...acc.templates] + .sort((left, right) => right.stats.executions - left.stats.executions || left.templateId.localeCompare(right.templateId)) + .slice(0, 3) + .map((template) => ({ + id: template.templateId, + canonicalSql: template.canonicalSql, + dialect: template.dialect, + })), + })); +} + +function adjudicationSystemPrompt(): string { + return [ + 'You are helping ktx decide whether observed query-history roles are operational service accounts.', + 'Default every role to keep. Mark exclude true only when the aggregate evidence clearly shows loader, ELT, reverse-ETL, export, refresh, or maintenance traffic rather than analyst or BI-dashboard usage.', + 'Use only the observed role records. Do not rely on a hardcoded denylist. Return structured output only.', + ].join('\n'); +} + +export async function proposeQueryHistoryServiceAccountFilters( + input: ProposeQueryHistoryServiceAccountFiltersInput, +): Promise { + if (!input.llmRuntime) { + return emptyProposal({ reason: 'no-llm' }); + } + + const config = historicSqlUnifiedPullConfigSchema.parse(input.pullConfig); + const redactors = compileHistoricSqlRedactionPatterns(config.redactionPatterns); + const now = input.now ?? new Date(); + const windowDays = 'windowDays' in config ? config.windowDays : 90; + const windowStart = new Date(now.getTime() - windowDays * 24 * 60 * 60 * 1000); + const warnings: string[] = []; + const snapshot: AggregatedTemplate[] = []; + + try { + for await (const row of input.reader.fetchAggregated(input.queryClient, { start: windowStart, end: now }, config)) { + snapshot.push(aggregatedTemplateSchema.parse(row)); + } + } catch (error) { + return emptyProposal(null, [ + `query_history_filter_picker_read_failed:${error instanceof Error ? error.message : String(error)}`, + ]); + } + + if (snapshot.length === 0) { + return emptyProposal({ reason: 'no-in-scope-history' }); + } + + const analysisItems = snapshot.map((template) => ({ id: template.templateId, sql: template.canonicalSql })); + const analysisOptions = + config.modeledTableCatalog.length > 0 ? { catalog: { tables: config.modeledTableCatalog } } : undefined; + let analysis: Awaited>; + try { + analysis = await input.sqlAnalysis.analyzeBatch(analysisItems, input.dialect, analysisOptions); + } catch (error) { + return emptyProposal({ reason: 'no-daemon' }, [ + `query_history_filter_picker_analysis_failed:${error instanceof Error ? error.message : String(error)}`, + ]); + } + + const parsedTemplates: ParsedTemplateForPicker[] = []; + for (const template of snapshot) { + const parsed = analysis.get(template.templateId); + if (!parsed || parsed.error) { + warnings.push(`query_history_filter_picker_parse_failed:${template.templateId}`); + continue; + } + const tablesTouched = [...new Map(parsed.tablesTouched.map((ref) => [tableRefKey(ref), ref])).values()] + .filter((ref) => ref.name.length > 0) + .sort((left, right) => tableRefKey(left).localeCompare(tableRefKey(right))); + const includedTables = includedQueryHistoryTableRefs(tablesTouched, config); + if (includedTables.length === 0) { + continue; + } + parsedTemplates.push({ + template: redactTemplateSqlForPicker(template, redactors), + tablesTouched, + includedTables, + }); + } + + const records = roleRecords(parsedTemplates, now); + if (records.length <= 1) { + return { + excludedRoles: [], + consideredRoleCount: records.length, + skipped: { reason: 'no-in-scope-history' }, + warnings, + }; + } + + let generated: QueryHistoryFilterAdjudication; + try { + generated = await input.llmRuntime.generateObject({ + role: 'candidateExtraction', + system: adjudicationSystemPrompt(), + prompt: JSON.stringify({ connectionId: input.connectionId, dialect: input.dialect, roles: records }), + schema: queryHistoryFilterAdjudicationSchema, + }); + } catch (error) { + return { + excludedRoles: [], + consideredRoleCount: records.length, + skipped: { reason: 'no-llm' }, + warnings: [ + ...warnings, + `query_history_filter_picker_llm_failed:${error instanceof Error ? error.message : String(error)}`, + ], + }; + } + + const knownRoles = new Set(records.map((record) => record.role)); + const excludedRoles = generated.roles + .filter((role) => role.exclude && knownRoles.has(role.role)) + .sort((left, right) => left.role.localeCompare(right.role)) + .map((role) => ({ + role: role.role, + reason: role.reason, + pattern: regexEscapeForExactRolePattern(role.role), + })); + + return { + excludedRoles, + consideredRoleCount: records.length, + skipped: input.userServiceAccountsPresent ? { reason: 'user-block-present' } : null, + warnings, + }; +} diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/scope-floor.ts b/packages/cli/src/context/ingest/adapters/historic-sql/scope-floor.ts new file mode 100644 index 00000000..23b36a0e --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/historic-sql/scope-floor.ts @@ -0,0 +1,260 @@ +import type { Dirent } from 'node:fs'; +import { access, readdir, readFile } from 'node:fs/promises'; +import { join, relative } from 'node:path'; +import YAML from 'yaml'; +import { getDriverRegistration } from '../../../connections/drivers.js'; +import { parseDottedTableEntry } from '../../../scan/enabled-tables.js'; +import { tableRefKey, tableRefSet, type KtxTableRefKey } from '../../../scan/table-ref.js'; +import type { KtxTableRef } from '../../../scan/types.js'; +import { readLiveDatabaseTableFiles } from '../live-database/stage.js'; + +export interface QueryHistoryScopeFloorInput { + projectDir: string; + connectionId: string; + driver: string; + connection: Record; + storedQueryHistory: Record; +} + +export interface QueryHistoryScopeFloor { + enabledTables: KtxTableRef[]; + enabledTableKeys: ReadonlySet | null; + enabledSchemas: string[]; + modeledTableCatalog: KtxTableRef[]; + floorDisabled: boolean; + warnings: string[]; +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function stringArray(value: unknown): string[] { + return Array.isArray(value) + ? value + .filter((item): item is string => typeof item === 'string' && item.trim().length > 0) + .map((item) => item.trim()) + : []; +} + +function tableRefsFromValues(values: unknown): KtxTableRef[] { + if (!Array.isArray(values)) return []; + return values.flatMap((value) => { + if (typeof value === 'string') { + const ref = parseDottedTableEntry(value); + return ref ? [ref] : []; + } + if (isRecord(value) && typeof value.name === 'string' && value.name.length > 0) { + return [ + { + catalog: typeof value.catalog === 'string' ? value.catalog : null, + db: typeof value.db === 'string' ? value.db : null, + name: value.name, + }, + ]; + } + return []; + }); +} + +function declaredSchemas(driver: string, connection: Record): string[] { + const key = getDriverRegistration(driver)?.scopeConfigKey; + if (!key) return []; + return [...new Set(stringArray(connection[key]))].sort(); +} + +function uniqueSortedTableRefs(refs: readonly KtxTableRef[]): KtxTableRef[] { + const byKey = new Map(); + for (const ref of refs) { + byKey.set(tableRefKey(ref), ref); + } + return [...byKey.entries()] + .sort(([left], [right]) => left.localeCompare(right)) + .map(([, ref]) => ref); +} + +async function latestLiveDatabaseScanDir(projectDir: string, connectionId: string): Promise { + const root = join(projectDir, 'raw-sources', connectionId, 'live-database'); + let entries: Dirent[]; + try { + entries = await readdir(root, { withFileTypes: true }); + } catch (error) { + if (error && typeof error === 'object' && 'code' in error && error.code === 'ENOENT') return null; + throw error; + } + const syncDirs = entries + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name) + .sort() + .reverse(); + for (const syncDir of syncDirs) { + const absolute = join(root, syncDir); + try { + await access(join(absolute, 'connection.json')); + return absolute; + } catch { + continue; + } + } + return null; +} + +async function scannedTableRefs( + projectDir: string, + connectionId: string, +): Promise<{ refs: KtxTableRef[]; catalogAvailable: boolean; warnings: string[] }> { + const scanDir = await latestLiveDatabaseScanDir(projectDir, connectionId); + if (!scanDir) { + return { refs: [], catalogAvailable: false, warnings: [] }; + } + try { + const tableFiles = await readLiveDatabaseTableFiles(scanDir); + return { + refs: uniqueSortedTableRefs( + tableFiles.map(({ table }) => ({ catalog: table.catalog, db: table.db, name: table.name })), + ), + catalogAvailable: true, + warnings: [], + }; + } catch (error) { + return { + refs: [], + catalogAvailable: false, + warnings: [ + `query_history_scope_floor_catalog_read_failed:live_database_scan:${error instanceof Error ? error.message : String(error)}`, + ], + }; + } +} + +async function listYamlFiles(root: string): Promise { + try { + const entries = await readdir(root, { withFileTypes: true, recursive: true }); + return entries + .filter((entry) => entry.isFile() && /\.ya?ml$/i.test(entry.name)) + .map((entry) => relative(root, join(entry.parentPath, entry.name)).replace(/\\/g, '/')) + .sort(); + } catch (error) { + if (error && typeof error === 'object' && 'code' in error && error.code === 'ENOENT') return []; + throw error; + } +} + +function refsFromManifest(content: string): KtxTableRef[] { + const parsed = YAML.parse(content) as unknown; + if (!isRecord(parsed) || !isRecord(parsed.tables)) return []; + return Object.values(parsed.tables).flatMap((entry) => { + if (!isRecord(entry) || typeof entry.table !== 'string') return []; + const ref = parseDottedTableEntry(entry.table); + return ref ? [ref] : []; + }); +} + +function refsFromStandaloneSource(content: string): KtxTableRef[] { + const parsed = YAML.parse(content) as unknown; + if (!isRecord(parsed) || typeof parsed.table !== 'string') return []; + const ref = parseDottedTableEntry(parsed.table); + return ref ? [ref] : []; +} + +async function semanticTableRefs( + projectDir: string, + connectionId: string, +): Promise<{ refs: KtxTableRef[]; warnings: string[] }> { + const root = join(projectDir, 'semantic-layer', connectionId); + const files = await listYamlFiles(root); + const refs: KtxTableRef[] = []; + const warnings: string[] = []; + for (const file of files) { + try { + const content = await readFile(join(root, file), 'utf-8'); + refs.push(...(file.startsWith('_schema/') ? refsFromManifest(content) : refsFromStandaloneSource(content))); + } catch (error) { + warnings.push( + `query_history_scope_floor_catalog_read_failed:${file}:${error instanceof Error ? error.message : String(error)}`, + ); + } + } + return { refs: uniqueSortedTableRefs(refs), warnings }; +} + +export async function resolveQueryHistoryScopeFloor(input: QueryHistoryScopeFloorInput): Promise { + const explicitEnabledTables = [ + ...tableRefsFromValues(input.storedQueryHistory.enabledTables), + ...tableRefsFromValues(input.connection.enabled_tables), + ]; + const semanticTables = await semanticTableRefs(input.projectDir, input.connectionId); + const scannedTables = await scannedTableRefs(input.projectDir, input.connectionId); + const modeledTables = uniqueSortedTableRefs([ + ...semanticTables.refs, + ...scannedTables.refs, + ...explicitEnabledTables, + ]); + const warnings = [...semanticTables.warnings, ...scannedTables.warnings]; + + if (explicitEnabledTables.length > 0) { + return { + enabledTables: explicitEnabledTables, + enabledTableKeys: tableRefSet(explicitEnabledTables), + enabledSchemas: [], + modeledTableCatalog: modeledTables, + floorDisabled: false, + warnings, + }; + } + + const explicitSchemas = stringArray(input.storedQueryHistory.enabledSchemas); + if (explicitSchemas.includes('*')) { + return { + enabledTables: [], + enabledTableKeys: null, + enabledSchemas: ['*'], + modeledTableCatalog: modeledTables, + floorDisabled: true, + warnings, + }; + } + if (explicitSchemas.length > 0) { + if (!scannedTables.catalogAvailable || modeledTables.length === 0) { + return { + enabledTables: [], + enabledTableKeys: null, + enabledSchemas: ['*'], + modeledTableCatalog: modeledTables, + floorDisabled: true, + warnings: [...warnings, 'query_history_scope_floor_disabled:catalog_unavailable'], + }; + } + return { + enabledTables: [], + enabledTableKeys: null, + enabledSchemas: [...new Set(explicitSchemas)].sort(), + modeledTableCatalog: modeledTables, + floorDisabled: false, + warnings, + }; + } + + const schemas = new Set(declaredSchemas(input.driver, input.connection)); + for (const ref of semanticTables.refs) { + if (ref.db) schemas.add(ref.db); + } + if (schemas.size > 0 && (!scannedTables.catalogAvailable || modeledTables.length === 0)) { + return { + enabledTables: [], + enabledTableKeys: null, + enabledSchemas: ['*'], + modeledTableCatalog: modeledTables, + floorDisabled: true, + warnings: [...warnings, 'query_history_scope_floor_disabled:catalog_unavailable'], + }; + } + return { + enabledTables: [], + enabledTableKeys: null, + enabledSchemas: [...schemas].sort(), + modeledTableCatalog: modeledTables, + floorDisabled: false, + warnings, + }; +} diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/scope-membership.ts b/packages/cli/src/context/ingest/adapters/historic-sql/scope-membership.ts new file mode 100644 index 00000000..8852a82d --- /dev/null +++ b/packages/cli/src/context/ingest/adapters/historic-sql/scope-membership.ts @@ -0,0 +1,45 @@ +import { tableRefKey, tableRefSet } from '../../../scan/table-ref.js'; +import type { KtxTableRef } from '../../../scan/types.js'; + +export interface QueryHistoryScopeMembershipConfig { + enabledTables: readonly KtxTableRef[]; + enabledSchemas: readonly string[]; +} + +function schemaNameForRef(ref: KtxTableRef): string | null { + return ref.db && ref.db.length > 0 ? ref.db : null; +} + +function schemaNamesFromConfig(enabledSchemas: readonly string[]): Set { + return new Set(enabledSchemas.filter((schema) => schema !== '*')); +} + +export function isQueryHistoryScopeFloorDisabled(config: QueryHistoryScopeMembershipConfig): boolean { + return config.enabledSchemas.includes('*'); +} + +export function shouldFailOpenQueryHistoryScope(config: QueryHistoryScopeMembershipConfig): boolean { + return ( + config.enabledTables.length === 0 && + !isQueryHistoryScopeFloorDisabled(config) && + config.enabledSchemas.length === 0 + ); +} + +export function includedQueryHistoryTableRefs( + tablesTouched: readonly KtxTableRef[], + config: QueryHistoryScopeMembershipConfig, +): KtxTableRef[] { + if (config.enabledTables.length > 0) { + const enabled = tableRefSet(config.enabledTables); + return tablesTouched.filter((ref) => enabled.has(tableRefKey(ref))); + } + if (isQueryHistoryScopeFloorDisabled(config) || shouldFailOpenQueryHistoryScope(config)) { + return [...tablesTouched]; + } + const schemas = schemaNamesFromConfig(config.enabledSchemas); + return tablesTouched.filter((ref) => { + const schema = schemaNameForRef(ref); + return schema !== null && schemas.has(schema); + }); +} diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/snowflake-query-history-reader.ts b/packages/cli/src/context/ingest/adapters/historic-sql/snowflake-query-history-reader.ts index 539df3c3..65aafb12 100644 --- a/packages/cli/src/context/ingest/adapters/historic-sql/snowflake-query-history-reader.ts +++ b/packages/cli/src/context/ingest/adapters/historic-sql/snowflake-query-history-reader.ts @@ -188,26 +188,75 @@ export class SnowflakeHistoricSqlQueryHistoryReader { config: HistoricSqlUnifiedPullConfig, ): AsyncIterable { const sql = ` +WITH filtered_queries AS ( + SELECT + query_hash, + query_text, + user_name, + start_time, + total_elapsed_time, + execution_status, + rows_produced + FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY + WHERE query_text IS NOT NULL + AND query_type IN ('SELECT', 'MERGE') + AND start_time >= ${timestampLiteral(window.start)} + AND start_time < ${timestampLiteral(window.end)} +), +template_stats AS ( + SELECT + query_hash AS template_id, + MIN(query_text) AS canonical_sql, + COUNT(*) AS executions, + COUNT(DISTINCT user_name) AS distinct_users, + MIN(start_time) AS first_seen, + MAX(start_time) AS last_seen, + APPROX_PERCENTILE(total_elapsed_time, 0.50) AS p50_ms, + APPROX_PERCENTILE(total_elapsed_time, 0.95) AS p95_ms, + DIV0(COUNT_IF(execution_status != 'SUCCESS'), COUNT(*)) AS error_rate, + SUM(rows_produced) AS rows_produced + FROM filtered_queries + GROUP BY query_hash + HAVING COUNT(*) >= ${config.minExecutions} +), +template_users AS ( + SELECT + query_hash AS template_id, + user_name AS user, + COUNT(*) AS executions, + MAX(start_time) AS last_seen + FROM filtered_queries + GROUP BY query_hash, user_name +) SELECT - query_hash AS template_id, - MIN(query_text) AS canonical_sql, - COUNT(*) AS executions, - COUNT(DISTINCT user_name) AS distinct_users, - MIN(start_time) AS first_seen, - MAX(start_time) AS last_seen, - APPROX_PERCENTILE(total_elapsed_time, 0.50) AS p50_ms, - APPROX_PERCENTILE(total_elapsed_time, 0.95) AS p95_ms, - DIV0(COUNT_IF(execution_status != 'SUCCESS'), COUNT(*)) AS error_rate, - SUM(rows_produced) AS rows_produced, - ARRAY_AGG(OBJECT_CONSTRUCT('user', user_name, 'executions', 1)) WITHIN GROUP (ORDER BY start_time DESC)::string AS top_users -FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY -WHERE query_text IS NOT NULL - AND query_type IN ('SELECT', 'MERGE') - AND start_time >= ${timestampLiteral(window.start)} - AND start_time < ${timestampLiteral(window.end)} -GROUP BY query_hash -HAVING COUNT(*) >= ${config.minExecutions} -ORDER BY executions DESC`.trim(); + stats.template_id, + stats.canonical_sql, + stats.executions, + stats.distinct_users, + stats.first_seen, + stats.last_seen, + stats.p50_ms, + stats.p95_ms, + stats.error_rate, + stats.rows_produced, + ARRAY_AGG( + OBJECT_CONSTRUCT('user', users.user, 'executions', users.executions) + ) WITHIN GROUP (ORDER BY users.executions DESC, users.last_seen DESC)::string AS top_users +FROM template_stats AS stats +JOIN template_users AS users + ON users.template_id = stats.template_id +GROUP BY + stats.template_id, + stats.canonical_sql, + stats.executions, + stats.distinct_users, + stats.first_seen, + stats.last_seen, + stats.p50_ms, + stats.p95_ms, + stats.error_rate, + stats.rows_produced +ORDER BY stats.executions DESC`.trim(); const result = await queryClient(client).executeQuery(sql); if (result.error) { throw grantsError(result.error); diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/stage-unified.ts b/packages/cli/src/context/ingest/adapters/historic-sql/stage-unified.ts index 853a3e68..84ec75a7 100644 --- a/packages/cli/src/context/ingest/adapters/historic-sql/stage-unified.ts +++ b/packages/cli/src/context/ingest/adapters/historic-sql/stage-unified.ts @@ -1,6 +1,8 @@ import { mkdir, writeFile } from 'node:fs/promises'; import { dirname, join } from 'node:path'; import type { SqlAnalysisPort } from '../../../../context/sql-analysis/ports.js'; +import { tableRefKey, type KtxTableRefKey } from '../../../scan/table-ref.js'; +import type { KtxTableRef } from '../../../scan/types.js'; import { bucketDistinctUsers, bucketErrorRate, @@ -15,6 +17,11 @@ import { redactHistoricSqlText, type HistoricSqlRedactionPattern, } from './redaction.js'; +import { + includedQueryHistoryTableRefs, + isQueryHistoryScopeFloorDisabled, + shouldFailOpenQueryHistoryScope, +} from './scope-membership.js'; import { HISTORIC_SQL_SOURCE_KEY, aggregatedTemplateSchema, @@ -38,17 +45,13 @@ interface StageHistoricSqlAggregatedSnapshotInput { interface ParsedTemplate { template: AggregatedTemplate; - tablesTouched: string[]; - includedTables: string[]; + tablesTouched: KtxTableRef[]; + includedTables: KtxTableRef[]; columnsByClause: Record; } -interface EnabledTableFilter { - exact: Set; - uniqueUnqualified: Set; -} - interface TableAccumulator { + tableRef: KtxTableRef; table: string; executions: number; distinctUsers: number; @@ -105,8 +108,7 @@ function shouldDropByUsers(template: AggregatedTemplate, config: HistoricSqlUnif const matchingExecutions = template.topUsers .filter((entry) => matchesAny(entry.user, patterns)) .reduce((sum, entry) => sum + entry.executions, 0); - const allExecutions = template.topUsers.reduce((sum, entry) => sum + entry.executions, 0); - const serviceOnly = allExecutions > 0 && matchingExecutions >= allExecutions; + const serviceOnly = template.stats.executions > 0 && matchingExecutions >= template.stats.executions; return service.mode === 'exclude' ? serviceOnly : !serviceOnly; } @@ -122,90 +124,8 @@ function shouldDropTemplate(template: AggregatedTemplate, config: HistoricSqlUni return false; } -function normalizeTableIdentifier(value: string): string { - return value.trim().toLowerCase(); -} - -function unqualifiedTableIdentifier(value: string): string { - const parts = normalizeTableIdentifier(value).split('.').filter(Boolean); - return parts.at(-1) ?? ''; -} - -function buildEnabledTableFilter(enabledTables: string[]): EnabledTableFilter | null { - if (enabledTables.length === 0) { - return null; - } - const exact = new Set(enabledTables.map(normalizeTableIdentifier).filter((value) => value.length > 0)); - const unqualifiedCounts = new Map(); - for (const table of exact) { - const unqualified = unqualifiedTableIdentifier(table); - if (unqualified.length > 0) { - unqualifiedCounts.set(unqualified, (unqualifiedCounts.get(unqualified) ?? 0) + 1); - } - } - return { - exact, - uniqueUnqualified: new Set( - [...unqualifiedCounts.entries()] - .filter(([, count]) => count === 1) - .map(([table]) => table), - ), - }; -} - -function isEnabledTable(table: string, filter: EnabledTableFilter | null): boolean { - if (!filter) { - return true; - } - const normalized = normalizeTableIdentifier(table); - return filter.exact.has(normalized) || filter.uniqueUnqualified.has(unqualifiedTableIdentifier(normalized)); -} - -/** - * pg_stat_statements records queries as written, so the same physical table can appear - * both bare (`accounts`, resolved via search_path) and schema-qualified - * (`orbit_raw.accounts`). Collapse a bare identifier into its schema-qualified form when - * exactly one qualified form shares its unqualified name, so the two never become separate - * work units. Ambiguous bare names (two qualified forms) are left untouched. - */ -function canonicalizeTableIdentifiers(parsedTemplates: ParsedTemplate[]): void { - const all = new Set(); - for (const parsed of parsedTemplates) { - for (const table of parsed.includedTables) { - all.add(table); - } - } - const qualifiedByUnqualified = new Map>(); - for (const table of all) { - if (!table.includes('.')) { - continue; - } - const unqualified = unqualifiedTableIdentifier(table); - if (unqualified.length === 0) { - continue; - } - const forms = qualifiedByUnqualified.get(unqualified) ?? new Set(); - forms.add(table); - qualifiedByUnqualified.set(unqualified, forms); - } - const canonical = new Map(); - for (const table of all) { - if (table.includes('.')) { - continue; - } - const forms = qualifiedByUnqualified.get(unqualifiedTableIdentifier(table)); - if (forms && forms.size === 1) { - canonical.set(table, [...forms][0]); - } - } - if (canonical.size === 0) { - return; - } - const remap = (table: string): string => canonical.get(table) ?? table; - for (const parsed of parsedTemplates) { - parsed.includedTables = [...new Set(parsed.includedTables.map(remap))].sort(); - parsed.tablesTouched = [...new Set(parsed.tablesTouched.map(remap))].sort(); - } +function displayTableRef(ref: KtxTableRef): string { + return [ref.catalog, ref.db, ref.name].filter((part): part is string => !!part && part.length > 0).join('.'); } function historicSqlWindowDays(config: HistoricSqlUnifiedPullConfig): number { @@ -240,9 +160,10 @@ function recordJoin(acc: TableAccumulator, otherTable: string, columns: string[] } } -function accumulatorFor(table: string): TableAccumulator { +function accumulatorFor(tableRef: KtxTableRef): TableAccumulator { return { - table, + tableRef, + table: displayTableRef(tableRef), executions: 0, distinctUsers: 0, errorRateNumerator: 0, @@ -272,8 +193,8 @@ function addTemplate(acc: TableAccumulator, parsed: ParsedTemplate): void { } } const joinColumns = parsed.columnsByClause.join ?? []; - for (const otherTable of parsed.tablesTouched.filter((table) => table !== acc.table)) { - recordJoin(acc, otherTable, joinColumns, executions); + for (const otherTable of parsed.tablesTouched.filter((table) => tableRefKey(table) !== tableRefKey(acc.tableRef))) { + recordJoin(acc, displayTableRef(otherTable), joinColumns, executions); } acc.topTemplates.push(parsed.template); } @@ -310,6 +231,7 @@ function toStagedTable(acc: TableAccumulator, now: Date): StagedTableInput { return { table: acc.table, + tableRef: acc.tableRef, stats: { executionsBucket: bucketExecutions(acc.executions), distinctUsersBucket: bucketDistinctUsers(acc.distinctUsers), @@ -329,7 +251,7 @@ function toPatternsInput(parsedTemplates: ParsedTemplate[]): StagedPatternsInput .map(({ template, tablesTouched }) => ({ id: template.templateId, canonicalSql: template.canonicalSql, - tablesTouched: [...tablesTouched].sort(), + tablesTouched: [...tablesTouched].sort((left, right) => tableRefKey(left).localeCompare(tableRefKey(right))), executionsBucket: bucketExecutions(template.stats.executions), distinctUsersBucket: bucketDistinctUsers(template.stats.distinctUsers), dialect: template.dialect, @@ -340,7 +262,6 @@ function toPatternsInput(parsedTemplates: ParsedTemplate[]): StagedPatternsInput export async function stageHistoricSqlAggregatedSnapshot(input: StageHistoricSqlAggregatedSnapshotInput): Promise { const config = historicSqlUnifiedPullConfigSchema.parse(input.pullConfig); - const enabledTableFilter = buildEnabledTableFilter(config.enabledTables); const redactors = compileHistoricSqlRedactionPatterns(config.redactionPatterns); const now = input.now ?? new Date(); const windowStart = new Date(now.getTime() - historicSqlWindowDays(config) * 24 * 60 * 60 * 1000); @@ -356,11 +277,25 @@ export async function stageHistoricSqlAggregatedSnapshot(input: StageHistoricSql } } - const analysis = await input.sqlAnalysis.analyzeBatch( - snapshot.map((template) => ({ id: template.templateId, sql: template.canonicalSql })), - config.dialect, - ); - const warnings: string[] = []; + const analysisItems = snapshot.map((template) => ({ id: template.templateId, sql: template.canonicalSql })); + const analysisOptions = + config.modeledTableCatalog.length > 0 ? { catalog: { tables: config.modeledTableCatalog } } : undefined; + const warnings: string[] = [ + ...config.scopeFloorWarnings, + ...(shouldFailOpenQueryHistoryScope(config) ? ['query_history_scope_floor_disabled:empty_modeled_scope'] : []), + ]; + let scopeDisabledByQualificationFailure = false; + let analysis: Awaited>; + try { + analysis = await input.sqlAnalysis.analyzeBatch(analysisItems, config.dialect, analysisOptions); + } catch (error) { + if (!analysisOptions || config.enabledTables.length > 0 || isQueryHistoryScopeFloorDisabled(config)) { + throw error; + } + warnings.push('query_history_scope_floor_disabled:catalog_qualification_failed'); + scopeDisabledByQualificationFailure = true; + analysis = await input.sqlAnalysis.analyzeBatch(analysisItems, config.dialect, undefined); + } const parsedTemplates: ParsedTemplate[] = []; for (const template of snapshot) { const parsed = analysis.get(template.templateId); @@ -368,8 +303,12 @@ export async function stageHistoricSqlAggregatedSnapshot(input: StageHistoricSql warnings.push(`parse_failed:${template.templateId}`); continue; } - const tablesTouched = [...new Set(parsed.tablesTouched)].filter((table) => table.length > 0).sort(); - const includedTables = tablesTouched.filter((table) => isEnabledTable(table, enabledTableFilter)); + const tablesTouched = [...new Map(parsed.tablesTouched.map((ref) => [tableRefKey(ref), ref])).values()] + .filter((ref) => ref.name.length > 0) + .sort((left, right) => tableRefKey(left).localeCompare(tableRefKey(right))); + const includedTables = scopeDisabledByQualificationFailure + ? [...tablesTouched] + : includedQueryHistoryTableRefs(tablesTouched, config); if (includedTables.length === 0) { continue; } @@ -383,24 +322,23 @@ export async function stageHistoricSqlAggregatedSnapshot(input: StageHistoricSql }); } - canonicalizeTableIdentifiers(parsedTemplates); - - const byTable = new Map(); + const byTable = new Map(); for (const parsed of parsedTemplates) { - for (const table of parsed.includedTables) { - const acc = byTable.get(table) ?? accumulatorFor(table); + for (const tableRef of parsed.includedTables) { + const key = tableRefKey(tableRef); + const acc = byTable.get(key) ?? accumulatorFor(tableRef); addTemplate(acc, parsed); - byTable.set(table, acc); + byTable.set(key, acc); } } await mkdir(input.stagedDir, { recursive: true }); - for (const [table, acc] of [...byTable.entries()].sort(([left], [right]) => left.localeCompare(right))) { - await writeJson(input.stagedDir, `tables/${table}.json`, toStagedTable(acc, now)); + for (const [, acc] of [...byTable.entries()].sort((left, right) => left[0].localeCompare(right[0]))) { + await writeJson(input.stagedDir, `tables/${acc.table}.json`, toStagedTable(acc, now)); } const patternsInput = toPatternsInput(parsedTemplates); const patternInputSplit = splitHistoricSqlPatternInputs(patternsInput); - const allWarnings = [...warnings, ...patternInputSplit.warnings]; + const allWarnings = [...new Set([...warnings, ...patternInputSplit.warnings])]; await writeJson(input.stagedDir, 'patterns-input.json', patternInputSplit.auditInput); for (const shard of patternInputSplit.shards) { await writeJson(input.stagedDir, shard.path, shard.input); diff --git a/packages/cli/src/context/ingest/adapters/historic-sql/types.ts b/packages/cli/src/context/ingest/adapters/historic-sql/types.ts index 1d256b13..aca50c4e 100644 --- a/packages/cli/src/context/ingest/adapters/historic-sql/types.ts +++ b/packages/cli/src/context/ingest/adapters/historic-sql/types.ts @@ -8,9 +8,22 @@ export type HistoricSqlDialect = z.infer; const filterModeSchema = z.enum(['exclude', 'include', 'mark-only']); +const ktxTableRefSchema = z.object({ + catalog: z.string().nullable(), + db: z.string().nullable(), + name: z.string().min(1), +}).strict(); + +const ktxTableRefWithColumnsSchema = ktxTableRefSchema.extend({ + columns: z.array(z.string().min(1)).optional(), +}).strict(); + const historicSqlCommonPullConfigSchema = z.object({ minExecutions: z.number().int().nonnegative().default(5), - enabledTables: z.array(z.string().min(1)).default([]), + enabledTables: z.array(ktxTableRefSchema).default([]), + enabledSchemas: z.array(z.string().min(1)).default([]), + modeledTableCatalog: z.array(ktxTableRefWithColumnsSchema).default([]), + scopeFloorWarnings: z.array(z.string()).default([]), filters: z.object({ serviceAccounts: z.object({ patterns: z.array(z.string()).default([]), @@ -68,6 +81,7 @@ export type AggregatedTemplate = z.infer; export const stagedTableInputSchema = z.object({ table: z.string().min(1), + tableRef: ktxTableRefSchema, stats: z.object({ executionsBucket: z.string(), distinctUsersBucket: z.string(), @@ -93,7 +107,7 @@ export const stagedPatternsInputSchema = z.object({ templates: z.array(z.object({ id: z.string(), canonicalSql: z.string(), - tablesTouched: z.array(z.string()), + tablesTouched: z.array(ktxTableRefSchema), executionsBucket: z.string(), distinctUsersBucket: z.string(), dialect: historicSqlDialectSchema, diff --git a/packages/cli/src/context/ingest/local-adapters.ts b/packages/cli/src/context/ingest/local-adapters.ts index 4739f4e4..3cd8a998 100644 --- a/packages/cli/src/context/ingest/local-adapters.ts +++ b/packages/cli/src/context/ingest/local-adapters.ts @@ -9,6 +9,7 @@ import { DbtSourceAdapter } from './adapters/dbt/dbt.adapter.js'; import { FakeSourceAdapter } from './adapters/fake/fake.adapter.js'; import { HistoricSqlSourceAdapter } from './adapters/historic-sql/historic-sql.adapter.js'; import { PostgresPgssReader } from './adapters/historic-sql/postgres-pgss-reader.js'; +import { resolveQueryHistoryScopeFloor } from './adapters/historic-sql/scope-floor.js'; import { HISTORIC_SQL_SOURCE_KEY, historicSqlUnifiedPullConfigSchema, @@ -179,12 +180,39 @@ function queryHistoryRecord(connection: unknown): Record | null return queryHistory; } -function queryHistoryPullConfig(connection: unknown): Record | null { +async function queryHistoryPullConfig( + project: KtxLocalProject, + connectionId: string, + connection: unknown, +): Promise | null> { const queryHistory = queryHistoryRecord(connection); if (queryHistory?.enabled !== true || !isRecord(connection)) return null; - const dialect = historicSqlDialectByDriver.get(String(connection.driver ?? '').toLowerCase()); + const driver = String(connection.driver ?? '').toLowerCase(); + const dialect = historicSqlDialectByDriver.get(driver); if (!dialect) return null; - return { ...queryHistory, dialect }; + const scopeFloor = await resolveQueryHistoryScopeFloor({ + projectDir: project.projectDir, + connectionId, + driver, + connection, + storedQueryHistory: queryHistory, + }); + const { + enabled: _enabled, + dialect: _dialect, + enabledTables: _enabledTables, + enabledSchemas: _enabledSchemas, + scopeFloorWarnings: _scopeFloorWarnings, + ...stored + } = queryHistory; + return { + ...stored, + dialect, + ...(scopeFloor.enabledTables.length > 0 ? { enabledTables: scopeFloor.enabledTables } : {}), + ...(scopeFloor.enabledSchemas.length > 0 ? { enabledSchemas: scopeFloor.enabledSchemas } : {}), + ...(scopeFloor.modeledTableCatalog.length > 0 ? { modeledTableCatalog: scopeFloor.modeledTableCatalog } : {}), + ...(scopeFloor.warnings.length > 0 ? { scopeFloorWarnings: scopeFloor.warnings } : {}), + }; } function stringField(value: unknown): string | null { @@ -245,7 +273,7 @@ export async function localPullConfigForAdapter( if (options.historicSqlPullConfigOverride) { return historicSqlUnifiedPullConfigSchema.parse(options.historicSqlPullConfigOverride); } - const queryHistory = queryHistoryPullConfig(connection); + const queryHistory = await queryHistoryPullConfig(project, connectionId, connection); if (!queryHistory) { throw new Error(`Connection "${connectionId}" does not have context.queryHistory.enabled: true`); } diff --git a/packages/cli/src/context/sql-analysis/http-sql-analysis-port.ts b/packages/cli/src/context/sql-analysis/http-sql-analysis-port.ts index 238b8863..3093816d 100644 --- a/packages/cli/src/context/sql-analysis/http-sql-analysis-port.ts +++ b/packages/cli/src/context/sql-analysis/http-sql-analysis-port.ts @@ -1,8 +1,10 @@ import { request as httpRequest } from 'node:http'; import { request as httpsRequest } from 'node:https'; import { URL } from 'node:url'; +import type { KtxTableRef } from '../scan/types.js'; import type { SqlAnalysisBatchItem, + SqlAnalysisBatchOptions, SqlAnalysisBatchResult, SqlAnalysisDialect, SqlAnalysisFingerprintResult, @@ -89,6 +91,14 @@ function optionalString(raw: Record, field: string): string | n throw new Error(`sql analysis response has invalid optional string field ${field}`); } +function optionalNullableStringField(raw: Record, field: string): string | null { + const value = raw[field]; + if (value === null || value === undefined || typeof value === 'string') { + return value ?? null; + } + throw new Error(`sql analysis response has invalid optional nullable string field ${field}`); +} + function requiredStringArray(raw: Record, field: string): string[] { const value = raw[field]; if (!Array.isArray(value) || value.some((item) => typeof item !== 'string')) { @@ -175,10 +185,34 @@ function mapColumnsByClause(raw: Record): SqlAnalysisBatchResul return result; } +function requiredTableRef(raw: unknown, field: string): KtxTableRef { + if (!raw || typeof raw !== 'object' || Array.isArray(raw)) { + throw new Error(`sql analysis response contains invalid table ref in ${field}`); + } + const record = raw as Record; + const name = record.name; + if (typeof name !== 'string' || name.length === 0) { + throw new Error(`sql analysis response table ref in ${field} is missing name`); + } + return { + catalog: optionalNullableStringField(record, 'catalog'), + db: optionalNullableStringField(record, 'db'), + name, + }; +} + +function requiredTableRefArray(raw: Record, field: string): KtxTableRef[] { + const value = raw[field]; + if (!Array.isArray(value)) { + throw new Error(`sql analysis response is missing table-ref[] field ${field}`); + } + return value.map((item, index) => requiredTableRef(item, `${field}.${index}`)); +} + function mapBatchResult(raw: Record): SqlAnalysisBatchResult { const error = optionalString(raw, 'error'); return { - tablesTouched: requiredStringArray(raw, 'tables_touched'), + tablesTouched: requiredTableRefArray(raw, 'tables_touched'), columnsByClause: mapColumnsByClause(raw), ...(error !== undefined ? { error } : {}), }; @@ -215,10 +249,11 @@ export function createHttpSqlAnalysisPort(options: HttpSqlAnalysisPortOptions): }); return mapResult(raw); }, - async analyzeBatch(items: SqlAnalysisBatchItem[], dialect: SqlAnalysisDialect) { + async analyzeBatch(items: SqlAnalysisBatchItem[], dialect: SqlAnalysisDialect, options?: SqlAnalysisBatchOptions) { const raw = await requestJson('/sql/analyze-batch', { dialect, items, + ...(options?.catalog ? { catalog: options.catalog } : {}), }); return mapBatchResponse(raw); }, diff --git a/packages/cli/src/context/sql-analysis/ports.ts b/packages/cli/src/context/sql-analysis/ports.ts index 887be605..898fca38 100644 --- a/packages/cli/src/context/sql-analysis/ports.ts +++ b/packages/cli/src/context/sql-analysis/ports.ts @@ -1,3 +1,5 @@ +import type { KtxTableRef } from '../scan/types.js'; + export type SqlAnalysisDialect = | 'bigquery' | 'snowflake' @@ -32,8 +34,20 @@ export interface SqlAnalysisBatchItem { sql: string; } +interface SqlAnalysisCatalogTable extends KtxTableRef { + columns?: string[]; +} + +interface SqlAnalysisCatalog { + tables: SqlAnalysisCatalogTable[]; +} + +export interface SqlAnalysisBatchOptions { + catalog?: SqlAnalysisCatalog; +} + export interface SqlAnalysisBatchResult { - tablesTouched: string[]; + tablesTouched: KtxTableRef[]; columnsByClause: Partial>; error?: string | null; } @@ -48,6 +62,7 @@ export interface SqlAnalysisPort { analyzeBatch( items: SqlAnalysisBatchItem[], dialect: SqlAnalysisDialect, + options?: SqlAnalysisBatchOptions, ): Promise>; validateReadOnly(sql: string, dialect: SqlAnalysisDialect): Promise; } diff --git a/packages/cli/src/local-adapters.ts b/packages/cli/src/local-adapters.ts index 0cd2d940..3e3b0486 100644 --- a/packages/cli/src/local-adapters.ts +++ b/packages/cli/src/local-adapters.ts @@ -15,7 +15,7 @@ import { BigQueryHistoricSqlQueryHistoryReader } from './context/ingest/adapters import { historicSqlDialectForConnectionDriver } from './context/ingest/adapters/historic-sql/connection-dialect.js'; import { createDaemonLiveDatabaseIntrospection } from './context/ingest/adapters/live-database/daemon-introspection.js'; import { createDefaultLocalIngestAdapters, type DefaultLocalIngestAdaptersOptions } from './context/ingest/local-adapters.js'; -import type { HistoricSqlReader } from './context/ingest/adapters/historic-sql/types.js'; +import type { HistoricSqlDialect, HistoricSqlReader } from './context/ingest/adapters/historic-sql/types.js'; import type { LiveDatabaseIntrospectionOptions, LiveDatabaseIntrospectionPort, @@ -31,7 +31,7 @@ import { createManagedDaemonLookerTableIdentifierParser, createManagedDaemonSqlAnalysisPort, managedDaemonDatabaseIntrospectionOptions, - type ManagedPythonCoreDaemonOptions, + type ManagedPythonDaemonHttpOptions, } from './managed-python-http.js'; import type { KtxOperationalLogger } from './io/logger.js'; import { resolveKtxConfigReference } from './context/core/config-reference.js'; @@ -161,10 +161,17 @@ export interface KtxCliLocalIngestAdaptersOptions extends DefaultLocalIngestAdap historicSqlConnectionId?: string; sqlAnalysis?: SqlAnalysisPort; sqlAnalysisUrl?: string; - managedDaemon?: ManagedPythonCoreDaemonOptions; + managedDaemon?: ManagedPythonDaemonHttpOptions; logger?: KtxOperationalLogger; } +export interface KtxCliHistoricSqlRuntime { + dialect: HistoricSqlDialect; + sqlAnalysis: SqlAnalysisPort; + reader: HistoricSqlReader; + queryClient: unknown; +} + function createEphemeralPostgresHistoricSqlClient(project: KtxLocalProject, connectionId: string) { const connection = project.config.connections[connectionId] as KtxPostgresConnectionConfig | undefined; const inputDriver = connection?.driver ?? 'unknown'; @@ -262,7 +269,10 @@ function bigQueryRegion(connection: KtxBigQueryConnectionConfig): string { : 'us'; } -function historicSqlOptionsForLocalRun(project: KtxLocalProject, options: KtxCliLocalIngestAdaptersOptions) { +function historicSqlOptionsForLocalRun( + project: KtxLocalProject, + options: KtxCliLocalIngestAdaptersOptions, +): KtxCliHistoricSqlRuntime | undefined { const connectionId = options.historicSqlConnectionId; if (!connectionId) { return undefined; @@ -285,6 +295,7 @@ function historicSqlOptionsForLocalRun(project: KtxLocalProject, options: KtxCli if (dialect === 'postgres') { return { ...base, + dialect, reader: new PostgresPgssReader() satisfies HistoricSqlReader, queryClient: createEphemeralPostgresHistoricSqlClient(project, connectionId), }; @@ -297,6 +308,7 @@ function historicSqlOptionsForLocalRun(project: KtxLocalProject, options: KtxCli } return { ...base, + dialect, reader: new BigQueryHistoricSqlQueryHistoryReader({ projectId: bigQueryProjectId(connection, process.env), region: bigQueryRegion(connection), @@ -307,6 +319,7 @@ function historicSqlOptionsForLocalRun(project: KtxLocalProject, options: KtxCli return { ...base, + dialect, reader: new SnowflakeHistoricSqlQueryHistoryReader() satisfies HistoricSqlReader, queryClient: { async executeQuery(query: string) { @@ -318,11 +331,24 @@ function historicSqlOptionsForLocalRun(project: KtxLocalProject, options: KtxCli }; } +export function createKtxCliHistoricSqlRuntime( + project: KtxLocalProject, + connectionId: string, + options: KtxCliLocalIngestAdaptersOptions = {}, +): KtxCliHistoricSqlRuntime | undefined { + return historicSqlOptionsForLocalRun(project, { + ...options, + historicSqlConnectionId: connectionId, + }); +} + export function createKtxCliLocalIngestAdapters( project: KtxLocalProject, options: KtxCliLocalIngestAdaptersOptions = {}, ): SourceAdapter[] { - const historicSql = historicSqlOptionsForLocalRun(project, options); + const historicSql = options.historicSqlConnectionId + ? createKtxCliHistoricSqlRuntime(project, options.historicSqlConnectionId, options) + : undefined; const base = createDefaultLocalIngestAdapters(project, { ...options, databaseIntrospection: ktxCliDaemonDatabaseIntrospectionOptions(options), diff --git a/packages/cli/src/public-ingest.ts b/packages/cli/src/public-ingest.ts index 7fc43ac4..44a2b024 100644 --- a/packages/cli/src/public-ingest.ts +++ b/packages/cli/src/public-ingest.ts @@ -5,6 +5,7 @@ import type { KtxProgressPort } from './context/scan/types.js'; import type { KtxCliIo } from './index.js'; import type { KtxIngestArgs, KtxIngestDeps, KtxIngestProgressUpdate } from './ingest.js'; import { isDatabaseDriver, normalizeConnectionDriver } from './connection-drivers.js'; +import { resolveQueryHistoryScopeFloor } from './context/ingest/adapters/historic-sql/scope-floor.js'; import { ensureManagedPythonCommandRuntime, type KtxManagedPythonInstallPolicy, @@ -19,6 +20,7 @@ import { import { createAggregateProgressPort } from './progress-port-adapter.js'; import { resolvePublicIngestRuntimeRequirements } from './runtime-requirements.js'; import type { KtxScanArgs, KtxScanDeps } from './scan.js'; +import type { KtxTableRef } from './context/scan/types.js'; import { profileMark } from './startup-profile.js'; import { isDemoConnection } from './telemetry/demo-detect.js'; import { emitProjectStackSnapshot, emitTelemetryEvent } from './telemetry/index.js'; @@ -281,26 +283,35 @@ function positiveInteger(value: unknown): number | undefined { return typeof value === 'number' && Number.isInteger(value) && value > 0 ? value : undefined; } -function enabledTablesForConnection(connection: KtxProjectConnectionConfig): string[] | undefined { - const raw = connection.enabled_tables; - if (!Array.isArray(raw)) { - return undefined; - } - const tables = raw.filter((value): value is string => typeof value === 'string' && value.trim().length > 0); - return tables.length > 0 ? tables : undefined; -} - -function queryHistoryPullConfig(input: { +/** @internal */ +export function queryHistoryPullConfig(input: { stored: Record; dialect: HistoricSqlDialect; windowDays?: number; - enabledTables?: string[]; + enabledTables?: KtxTableRef[]; + enabledSchemas?: string[]; + modeledTableCatalog?: KtxTableRef[]; + scopeFloorWarnings?: string[]; }): Record { - const { enabled: _enabled, dialect: _dialect, ...storedConfig } = input.stored; + const { + enabled: _enabled, + dialect: _dialect, + enabledTables: _enabledTables, + enabledSchemas: _enabledSchemas, + scopeFloorWarnings: _scopeFloorWarnings, + ...storedConfig + } = input.stored; return { ...storedConfig, dialect: input.dialect, - ...(input.enabledTables ? { enabledTables: input.enabledTables } : {}), + ...(input.enabledTables && input.enabledTables.length > 0 ? { enabledTables: input.enabledTables } : {}), + ...(input.enabledSchemas && input.enabledSchemas.length > 0 ? { enabledSchemas: input.enabledSchemas } : {}), + ...(input.modeledTableCatalog && input.modeledTableCatalog.length > 0 + ? { modeledTableCatalog: input.modeledTableCatalog } + : {}), + ...(input.scopeFloorWarnings && input.scopeFloorWarnings.length > 0 + ? { scopeFloorWarnings: input.scopeFloorWarnings } + : {}), ...(input.windowDays !== undefined ? { windowDays: input.windowDays } : {}), }; } @@ -361,7 +372,6 @@ function resolveDatabaseTargetOptions(input: { stored: storedQh, dialect, windowDays: queryHistory.windowDays, - enabledTables: enabledTablesForConnection(input.connection), }), }, steps: ['database-schema', 'query-history'], @@ -374,6 +384,43 @@ function resolveDatabaseTargetOptions(input: { }; } +async function resolvedQueryHistoryPullConfigForTarget( + target: KtxPublicIngestPlanTarget, + project: KtxPublicIngestProject, +): Promise | null> { + if (target.operation !== 'database-ingest' || target.queryHistory?.enabled !== true || !target.queryHistory.dialect) { + return null; + } + const connection = project.config.connections[target.connectionId]; + if (!connection) { + return ( + target.queryHistory.pullConfig ?? + queryHistoryPullConfig({ + stored: {}, + dialect: target.queryHistory.dialect, + windowDays: target.queryHistory.windowDays, + }) + ); + } + const stored = storedQueryHistory(connection); + const scopeFloor = await resolveQueryHistoryScopeFloor({ + projectDir: project.projectDir, + connectionId: target.connectionId, + driver: target.driver, + connection: connection as Record, + storedQueryHistory: stored, + }); + return queryHistoryPullConfig({ + stored, + dialect: target.queryHistory.dialect, + windowDays: target.queryHistory.windowDays, + enabledTables: scopeFloor.enabledTables, + enabledSchemas: scopeFloor.enabledSchemas, + modeledTableCatalog: scopeFloor.modeledTableCatalog, + scopeFloorWarnings: scopeFloor.warnings, + }); +} + function enrichmentReadinessGaps(config: KtxProjectConfig): string[] { const gaps: string[] = []; if (config.llm.provider.backend === 'none' || !config.llm.models.default) { @@ -877,7 +924,7 @@ export async function executePublicIngestTarget( project: KtxPublicIngestProject, ): Promise { const startedAt = performance.now(); - const result = await runIngestTargetSteps(target, args, io, deps); + const result = await runIngestTargetSteps(target, args, io, deps, project); // `io` may be a capture buffer for the scan/ingest step output; the telemetry // debug echo belongs on the real user-facing stream, which callers expose as // `deps.runtimeIo` (falling back to `io` when the step io is already real). @@ -890,6 +937,7 @@ async function runIngestTargetSteps( args: Extract, io: KtxCliIo, deps: KtxPublicIngestDeps, + project: KtxPublicIngestProject, ): Promise { if (target.preflightFailure) { if (target.operation === 'database-ingest') { @@ -959,6 +1007,11 @@ async function runIngestTargetSteps( if (target.queryHistory?.enabled === true) { const { runKtxIngest } = await import('./ingest.js'); const runIngest = deps.runIngest ?? runKtxIngest; + const historicSqlPullConfigOverride = + (await resolvedQueryHistoryPullConfigForTarget(target, project)) ?? { + dialect: target.queryHistory.dialect, + ...(target.queryHistory.windowDays !== undefined ? { windowDays: target.queryHistory.windowDays } : {}), + }; const ingestArgs: KtxIngestArgs = { command: 'run', projectDir: args.projectDir, @@ -969,11 +1022,7 @@ async function runIngestTargetSteps( ...(args.cliVersion ? { cliVersion: args.cliVersion } : {}), ...(args.runtimeInstallPolicy ? { runtimeInstallPolicy: args.runtimeInstallPolicy } : {}), allowImplicitAdapter: true, - historicSqlPullConfigOverride: - target.queryHistory.pullConfig ?? { - dialect: target.queryHistory.dialect, - ...(target.queryHistory.windowDays !== undefined ? { windowDays: target.queryHistory.windowDays } : {}), - }, + historicSqlPullConfigOverride, }; // Query history runs after the schema scan has already written its report // into the shared target io, so it needs a phase-local capture. Reusing diff --git a/packages/cli/src/setup-databases.ts b/packages/cli/src/setup-databases.ts index 1fd93486..3cb6c5d2 100644 --- a/packages/cli/src/setup-databases.ts +++ b/packages/cli/src/setup-databases.ts @@ -4,7 +4,15 @@ import { delimiter, dirname, join } from 'node:path'; import { fileURLToPath } from 'node:url'; import { promisify } from 'node:util'; import { getDriverRegistration } from './context/connections/drivers.js'; +import { createLocalKtxLlmRuntimeFromConfig } from './context/llm/local-config.js'; +import type { KtxLlmRuntimePort } from './context/llm/runtime-port.js'; import { queryHistoryDialectForConnection } from './context/ingest/adapters/historic-sql/connection-dialect.js'; +import { + proposeQueryHistoryServiceAccountFilters, + type ProposeQueryHistoryServiceAccountFiltersInput, + type QueryHistoryFilterProposal, +} from './context/ingest/adapters/historic-sql/query-history-filter-picker.js'; +import { resolveQueryHistoryScopeFloor } from './context/ingest/adapters/historic-sql/scope-floor.js'; import type { HistoricSqlDialect } from './context/ingest/adapters/historic-sql/types.js'; import { runHistoricSqlReadinessProbe, @@ -15,7 +23,7 @@ import { type KtxProjectConnectionConfig, serializeKtxProjectConfig } from './co import { loadKtxProject } from './context/project/project.js'; import { markKtxSetupStateStepComplete, setKtxSetupDatabaseConnectionIds } from './context/project/setup-config.js'; import type { KtxTableListEntry } from './context/scan/types.js'; -import type { KtxCliIo } from './cli-runtime.js'; +import { getKtxCliPackageInfo, type KtxCliIo } from './cli-runtime.js'; import { errorMessage, flushPrefixedBufferedCommandOutput, @@ -35,6 +43,10 @@ import { type PickDatabaseScopeArgs, } from './database-tree-picker.js'; import { withMultiselectNavigation, withTextInputNavigation } from './prompt-navigation.js'; +import { createKtxCliHistoricSqlRuntime } from './local-adapters.js'; +import type { KtxManagedPythonInstallPolicy } from './managed-python-command.js'; +import type { ManagedPythonCoreDaemonOptions } from './managed-python-http.js'; +import { queryHistoryPullConfig } from './public-ingest.js'; import { runKtxScan } from './scan.js'; import { writeProjectLocalSecretReference } from './setup-secrets.js'; import { isDemoConnection } from './telemetry/demo-detect.js'; @@ -61,6 +73,9 @@ export type KtxSetupDatabaseDriver = export interface KtxSetupDatabasesArgs { projectDir: string; inputMode: 'auto' | 'disabled'; + yes?: boolean; + cliVersion?: string; + runtimeInstallPolicy?: KtxManagedPythonInstallPolicy; databaseDrivers?: KtxSetupDatabaseDriver[]; databaseConnectionIds?: string[]; databaseConnectionId?: string; @@ -123,6 +138,13 @@ export interface KtxSetupDatabasesDeps { listTables?: (projectDir: string, connectionId: string, schemas?: string[]) => Promise; pickDatabaseScope?: (args: PickDatabaseScopeArgs, io: KtxCliIo) => Promise; historicSqlReadinessProbe?: HistoricSqlReadinessProbe; + queryHistoryFilterPicker?: ( + input: ProposeQueryHistoryServiceAccountFiltersInput, + ) => Promise; + createQueryHistoryLlmRuntime?: ( + projectDir: string, + project: Awaited>, + ) => KtxLlmRuntimePort | null; } const DRIVER_OPTIONS: Array<{ value: KtxSetupDatabaseDriver; label: string }> = [ @@ -947,10 +969,14 @@ async function maybeApplyHistoricSqlConfig(input: { return withQueryHistoryConfig(input.connection, { ...existing, enabled: false }); } + const existingFilters = + existing.filters && typeof existing.filters === 'object' && !Array.isArray(existing.filters) + ? (existing.filters as Record) + : {}; const common: Record = { ...existing, enabled: true, - filters: historicSqlFiltersForSetup(input.args.queryHistoryServiceAccountPatterns), + filters: historicSqlFiltersForSetup(input.args.queryHistoryServiceAccountPatterns, existingFilters), }; if (dialect === 'postgres') { @@ -967,9 +993,13 @@ async function maybeApplyHistoricSqlConfig(input: { }); } -function historicSqlFiltersForSetup(patterns: string[] | undefined) { +function historicSqlFiltersForSetup( + patterns: string[] | undefined, + existingFilters: Record = {}, +) { const serviceAccountPatterns = patterns ?? []; return { + ...existingFilters, dropTrivialProbes: true, ...(serviceAccountPatterns.length > 0 ? { @@ -1587,6 +1617,189 @@ async function maybeRunHistoricSqlSetupProbe(input: { return result.ok; } +function hasServiceAccountsBlock(connection: KtxProjectConnectionConfig | undefined): boolean { + const queryHistory = queryHistoryConfigRecord(connection); + const filters = queryHistory?.filters; + if (!filters || typeof filters !== 'object' || Array.isArray(filters)) { + return false; + } + return 'serviceAccounts' in filters; +} + +function printQueryHistoryFilterProposal(io: KtxCliIo, proposal: QueryHistoryFilterProposal): void { + if (proposal.excludedRoles.length === 0) { + if (proposal.skipped?.reason === 'no-llm') { + io.stdout.write('│ Query-history filter picker skipped: no LLM is configured.\n'); + } else if (proposal.skipped?.reason === 'no-daemon') { + io.stdout.write('│ Query-history filter picker skipped: SQL analysis is unavailable.\n'); + } else if (proposal.skipped?.reason === 'no-in-scope-history') { + io.stdout.write('│ Query-history filter picker found no in-scope service-account exclusions.\n'); + } + for (const warning of proposal.warnings) { + io.stdout.write(`│ ! ${warning}\n`); + } + return; + } + + io.stdout.write('│ Proposed query-history service-account filters:\n'); + for (const excluded of proposal.excludedRoles) { + io.stdout.write(`│ - ${excluded.role}: ${excluded.reason}\n`); + } +} + +async function shouldApplyQueryHistoryFilterProposal(input: { + args: KtxSetupDatabasesArgs; + prompts: KtxSetupDatabasesPromptAdapter; + proposal: QueryHistoryFilterProposal; +}): Promise { + if (input.proposal.excludedRoles.length === 0 || input.proposal.skipped?.reason === 'user-block-present') { + return false; + } + if (input.args.yes === true || input.args.inputMode === 'disabled') { + return true; + } + const choice = await input.prompts.select({ + message: `Apply ${input.proposal.excludedRoles.length} derived query-history service-account exclusion${ + input.proposal.excludedRoles.length === 1 ? '' : 's' + }?`, + options: [ + { value: 'apply', label: 'Apply derived filters (recommended)' }, + { value: 'skip', label: 'Leave query history filters unchanged' }, + ], + }); + return choice === 'apply'; +} + +function createSetupQueryHistoryLlmRuntime(input: { + projectDir: string; + project: Awaited>; + deps: KtxSetupDatabasesDeps; +}): KtxLlmRuntimePort | null { + try { + return ( + input.deps.createQueryHistoryLlmRuntime?.(input.projectDir, input.project) ?? + createLocalKtxLlmRuntimeFromConfig(input.project.config.llm, { + projectDir: input.projectDir, + }) + ); + } catch { + return null; + } +} + +/** @internal */ +export function managedDaemonOptionsForSetupQueryHistoryPicker(input: { + projectDir: string; + args: Pick; + io: KtxCliIo; +}): ManagedPythonCoreDaemonOptions { + return { + cliVersion: input.args.cliVersion ?? getKtxCliPackageInfo().version, + projectDir: input.projectDir, + installPolicy: input.args.runtimeInstallPolicy ?? (input.args.inputMode === 'disabled' ? 'never' : 'prompt'), + io: input.io, + }; +} + +async function maybeProposeQueryHistoryFilters(input: { + projectDir: string; + connectionId: string; + io: KtxCliIo; + deps: KtxSetupDatabasesDeps; + args: KtxSetupDatabasesArgs; + prompts: KtxSetupDatabasesPromptAdapter; +}): Promise { + const project = await loadKtxProject({ projectDir: input.projectDir }); + const connection = project.config.connections[input.connectionId]; + const queryHistory = queryHistoryConfigRecord(connection); + if (!connection || queryHistory?.enabled !== true) { + return; + } + const dialect = queryHistoryDialectForConnection(connection); + if (!dialect) { + return; + } + + const picker = input.deps.queryHistoryFilterPicker ?? proposeQueryHistoryServiceAccountFilters; + const llmRuntime = createSetupQueryHistoryLlmRuntime({ + projectDir: input.projectDir, + project, + deps: input.deps, + }); + if (!llmRuntime && !input.deps.queryHistoryFilterPicker) { + printQueryHistoryFilterProposal(input.io, { + excludedRoles: [], + consideredRoleCount: 0, + skipped: { reason: 'no-llm' }, + warnings: [], + }); + return; + } + + const runtime = createKtxCliHistoricSqlRuntime(project, input.connectionId, { + managedDaemon: managedDaemonOptionsForSetupQueryHistoryPicker({ + projectDir: input.projectDir, + args: input.args, + io: input.io, + }), + }); + if (!runtime) { + return; + } + const userServiceAccountsPresent = hasServiceAccountsBlock(connection); + const scopeFloor = await resolveQueryHistoryScopeFloor({ + projectDir: input.projectDir, + connectionId: input.connectionId, + driver: String(connection.driver ?? ''), + connection: connection as Record, + storedQueryHistory: queryHistory, + }); + const pullConfig = queryHistoryPullConfig({ + stored: queryHistory, + dialect, + enabledTables: scopeFloor.enabledTables, + enabledSchemas: scopeFloor.enabledSchemas, + modeledTableCatalog: scopeFloor.modeledTableCatalog, + scopeFloorWarnings: scopeFloor.warnings, + }); + const proposal = await picker({ + connectionId: input.connectionId, + dialect, + queryClient: runtime.queryClient, + reader: runtime.reader, + sqlAnalysis: runtime.sqlAnalysis, + llmRuntime, + pullConfig, + userServiceAccountsPresent, + }); + + printQueryHistoryFilterProposal(input.io, proposal); + if (proposal.skipped?.reason === 'user-block-present') { + input.io.stdout.write('│ Existing query-history service-account filters left unchanged.\n'); + return; + } + if (!(await shouldApplyQueryHistoryFilterProposal({ args: input.args, prompts: input.prompts, proposal }))) { + return; + } + + await writeConnectionConfig({ + projectDir: input.projectDir, + connectionId: input.connectionId, + connection: withQueryHistoryConfig(connection, { + ...queryHistory, + filters: { + ...(queryHistory.filters && typeof queryHistory.filters === 'object' && !Array.isArray(queryHistory.filters) + ? queryHistory.filters + : {}), + serviceAccounts: { + mode: 'exclude', + patterns: proposal.excludedRoles.map((role) => role.pattern), + }, + }, + }), + }); +} + async function applyHistoricSqlConfigToExistingConnection(input: { projectDir: string; connectionId: string; @@ -1725,6 +1938,16 @@ async function validateAndScanConnection(input: { `Schema context complete for ${input.connectionId}`, [`Changes: ${summarizeScanChanges(scanOutput)}`], ); + if (queryHistoryAvailable) { + await maybeProposeQueryHistoryFilters({ + projectDir: input.projectDir, + connectionId: input.connectionId, + io: input.io, + deps: input.deps, + args: input.args, + prompts: input.prompts, + }); + } writeSetupSection(input.io, 'Database ready', [ `${input.connectionId} · ${driverDisplay} · schema context complete`, ]); diff --git a/packages/cli/src/setup.ts b/packages/cli/src/setup.ts index 7d4fdb0e..a991367e 100644 --- a/packages/cli/src/setup.ts +++ b/packages/cli/src/setup.ts @@ -735,6 +735,9 @@ async function runKtxSetupInner(args: KtxSetupArgs, io: KtxCliIo, deps: KtxSetup { projectDir: projectResult.projectDir, inputMode: args.inputMode, + yes: args.yes, + cliVersion: args.cliVersion, + runtimeInstallPolicy: setupRuntimeInstallPolicy(args), ...(args.databaseDrivers ? { databaseDrivers: args.databaseDrivers } : {}), ...(args.databaseConnectionIds ? { databaseConnectionIds: args.databaseConnectionIds } : {}), ...(args.databaseConnectionId ? { databaseConnectionId: args.databaseConnectionId } : {}), diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts index 74393e4b..e3222ab5 100644 --- a/packages/cli/test/context/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts +++ b/packages/cli/test/context/ingest/adapters/historic-sql/bigquery-query-history-reader.test.ts @@ -91,7 +91,10 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => { 40, 0.05, null, - JSON.stringify([{ user: 'analyst@example.test', executions: 1 }]), + JSON.stringify([ + { user: 'svc-loader@example.test', executions: 40 }, + { user: 'analyst@example.test', executions: 2 }, + ]), ], ], totalRows: 1, @@ -103,15 +106,25 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => { for await (const row of reader.fetchAggregated( client, { start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') }, - { dialect: 'bigquery', minExecutions: 5, windowDays: 90, enabledTables: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 }, + { dialect: 'bigquery', minExecutions: 5, windowDays: 90, enabledTables: [], enabledSchemas: [], modeledTableCatalog: [], scopeFloorWarnings: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 }, )) { rows.push(row); } const sql = firstQuery(client); + expect(sql).toContain('WITH filtered_jobs AS'); + expect(sql).toContain('query_info.query_hashes.normalized_literals'); + expect(sql).toContain('TO_HEX(SHA256(query))'); + expect(sql).toContain('AS template_id'); + expect(sql).toContain('template_stats AS'); + expect(sql).toContain('template_users AS'); expect(sql).toContain('COUNT(*) AS executions'); expect(sql).toContain('COUNT(DISTINCT user_email) AS distinct_users'); - expect(sql).toContain('GROUP BY query_hash'); + expect(sql).toContain('GROUP BY template_id'); + expect(sql).toContain('GROUP BY template_id, user_email'); + expect(sql).toContain('ORDER BY users.executions DESC'); + expect(sql).not.toMatch(/\bquery_hash\b/); + expect(sql).not.toContain('LIMIT 5'); expect(sql).toContain('HAVING COUNT(*) >= 5'); expect(rows).toMatchObject([ { @@ -120,7 +133,10 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => { executions: 42, errorRate: 0.05, }, - topUsers: [{ user: 'analyst@example.test', executions: 1 }], + topUsers: [ + { user: 'svc-loader@example.test', executions: 40 }, + { user: 'analyst@example.test', executions: 2 }, + ], }, ]); }); @@ -137,6 +153,9 @@ describe('BigQueryHistoricSqlQueryHistoryReader', () => { minExecutions: 5, windowDays: 90, enabledTables: [], + enabledSchemas: [], + modeledTableCatalog: [], + scopeFloorWarnings: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90, diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/chunk-unified.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/chunk-unified.test.ts index a8e99c39..f3d7d293 100644 --- a/packages/cli/test/context/ingest/adapters/historic-sql/chunk-unified.test.ts +++ b/packages/cli/test/context/ingest/adapters/historic-sql/chunk-unified.test.ts @@ -30,6 +30,7 @@ async function writeUnifiedStagedDir(root: string): Promise { }); await writeJson(root, 'tables/public.orders.json', { table: 'public.orders', + tableRef: { catalog: null, db: 'public', name: 'orders' }, stats: { executionsBucket: '10-100', distinctUsersBucket: '2-5', @@ -46,7 +47,10 @@ async function writeUnifiedStagedDir(root: string): Promise { { id: 'orders', canonicalSql: 'select * from public.orders join public.customers on true', - tablesTouched: ['public.orders', 'public.customers'], + tablesTouched: [ + { catalog: null, db: 'public', name: 'orders' }, + { catalog: null, db: 'public', name: 'customers' }, + ], executionsBucket: '10-100', distinctUsersBucket: '2-5', dialect: 'postgres', @@ -58,7 +62,10 @@ async function writeUnifiedStagedDir(root: string): Promise { { id: 'orders', canonicalSql: 'select * from public.orders join public.customers on true', - tablesTouched: ['public.orders', 'public.customers'], + tablesTouched: [ + { catalog: null, db: 'public', name: 'orders' }, + { catalog: null, db: 'public', name: 'customers' }, + ], executionsBucket: '10-100', distinctUsersBucket: '2-5', dialect: 'postgres', @@ -155,7 +162,10 @@ describe('chunkHistoricSqlUnifiedStagedDir', () => { { id: 'line-items', canonicalSql: 'select * from public.orders join public.line_items on true', - tablesTouched: ['public.orders', 'public.line_items'], + tablesTouched: [ + { catalog: null, db: 'public', name: 'orders' }, + { catalog: null, db: 'public', name: 'line_items' }, + ], executionsBucket: '10-100', distinctUsersBucket: '2-5', dialect: 'postgres', diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/historic-sql.adapter.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/historic-sql.adapter.test.ts index 850be576..dcd00d32 100644 --- a/packages/cli/test/context/ingest/adapters/historic-sql/historic-sql.adapter.test.ts +++ b/packages/cli/test/context/ingest/adapters/historic-sql/historic-sql.adapter.test.ts @@ -76,7 +76,10 @@ describe('HistoricSqlSourceAdapter', () => { [ 'pg:1', { - tablesTouched: ['public.orders', 'public.customers'], + tablesTouched: [ + { catalog: null, db: 'public', name: 'orders' }, + { catalog: null, db: 'public', name: 'customers' }, + ], columnsByClause: { select: ['status'], join: ['customer_id', 'id'], groupBy: ['status'] }, }, ], diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts index 48e5744b..e818f1c9 100644 --- a/packages/cli/test/context/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts +++ b/packages/cli/test/context/ingest/adapters/historic-sql/local-ingest-acceptance.test.ts @@ -126,7 +126,10 @@ function acceptanceSqlAnalysis(): SqlAnalysisPort { items.map((item) => [ item.id, { - tablesTouched: ['public.orders', 'public.customers'], + tablesTouched: [ + { catalog: null, db: 'public', name: 'orders' }, + { catalog: null, db: 'public', name: 'customers' }, + ], columnsByClause: { select: ['status', 'segment'], where: ['status'], diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/pattern-inputs.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/pattern-inputs.test.ts index 9fae5e08..780fcf3d 100644 --- a/packages/cli/test/context/ingest/adapters/historic-sql/pattern-inputs.test.ts +++ b/packages/cli/test/context/ingest/adapters/historic-sql/pattern-inputs.test.ts @@ -9,11 +9,18 @@ import type { StagedPatternsInput } from '../../../../../src/context/ingest/adap type PatternTemplate = StagedPatternsInput['templates'][number]; +function tableRef(value: string): { catalog: string | null; db: string | null; name: string } { + const parts = value.split('.'); + if (parts.length === 3) return { catalog: parts[0]!, db: parts[1]!, name: parts[2]! }; + if (parts.length === 2) return { catalog: null, db: parts[0]!, name: parts[1]! }; + return { catalog: null, db: null, name: value }; +} + function template(id: string, tablesTouched: string[], canonicalSql = 'select 1'): PatternTemplate { return { id, canonicalSql, - tablesTouched, + tablesTouched: tablesTouched.map(tableRef), executionsBucket: '10-100', distinctUsersBucket: '2-5', dialect: 'postgres', @@ -32,7 +39,7 @@ describe('historic-SQL pattern input sharding', () => { ], }; - const result = splitHistoricSqlPatternInputs(input, { maxBytes: 760 }); + const result = splitHistoricSqlPatternInputs(input, { maxBytes: 1200 }); expect(result.auditInput.templates.map((entry) => entry.id)).toEqual([ 'orders-customers-1', @@ -51,7 +58,7 @@ describe('historic-SQL pattern input sharding', () => { 'orders-customers-1', 'orders-customers-2', ]); - expect(result.shards.every((shard) => shard.byteLength <= 760)).toBe(true); + expect(result.shards.every((shard) => shard.byteLength <= 1200)).toBe(true); expect(result.shards.flatMap((shard) => shard.input.templates).some((entry) => entry.id === 'single-table-orders')).toBe(false); expect(result.warnings).toEqual([]); }); diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts index 41baf331..4c9fc7bb 100644 --- a/packages/cli/test/context/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts +++ b/packages/cli/test/context/ingest/adapters/historic-sql/postgres-pgss-reader.test.ts @@ -215,7 +215,7 @@ describe('PostgresPgssReader aggregate path', () => { for await (const row of reader.fetchAggregated( { executeQuery }, { start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') }, - { dialect: 'postgres', minExecutions: 5, enabledTables: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 }, + { dialect: 'postgres', minExecutions: 5, enabledTables: [], enabledSchemas: [], modeledTableCatalog: [], scopeFloorWarnings: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 }, )) { rows.push(row); } diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/query-history-filter-picker.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/query-history-filter-picker.test.ts new file mode 100644 index 00000000..4c295092 --- /dev/null +++ b/packages/cli/test/context/ingest/adapters/historic-sql/query-history-filter-picker.test.ts @@ -0,0 +1,274 @@ +import { describe, expect, it, vi } from 'vitest'; +import type { KtxLlmRuntimePort } from '../../../../../src/context/llm/runtime-port.js'; +import type { + SqlAnalysisBatchItem, + SqlAnalysisBatchResult, + SqlAnalysisPort, +} from '../../../../../src/context/sql-analysis/ports.js'; +import { + proposeQueryHistoryServiceAccountFilters, + regexEscapeForExactRolePattern, +} from '../../../../../src/context/ingest/adapters/historic-sql/query-history-filter-picker.js'; +import type { + AggregatedTemplate, + HistoricSqlReader, +} from '../../../../../src/context/ingest/adapters/historic-sql/types.js'; + +function aggregate(overrides: Partial & { templateId: string; canonicalSql: string }): AggregatedTemplate { + return { + templateId: overrides.templateId, + canonicalSql: overrides.canonicalSql, + dialect: overrides.dialect ?? 'postgres', + stats: overrides.stats ?? { + executions: 25, + distinctUsers: 1, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-06-01T00:00:00.000Z', + p50RuntimeMs: 50, + p95RuntimeMs: 100, + errorRate: 0, + rowsProduced: 10, + }, + topUsers: overrides.topUsers ?? [{ user: 'analyst', executions: 25 }], + }; +} + +function reader(...templates: AggregatedTemplate[]): HistoricSqlReader { + return { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + for (const template of templates) { + yield template; + } + }, + }; +} + +function sqlAnalysis(tablesById: Record>): SqlAnalysisPort { + return { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async (items: SqlAnalysisBatchItem[]): Promise> => + new Map( + items.map((item) => [ + item.id, + { + tablesTouched: tablesById[item.id] ?? [], + columnsByClause: {}, + }, + ]), + ), + ), + validateReadOnly: vi.fn(async () => ({ ok: true })), + }; +} + +function llm(decisions: Array<{ role: string; exclude: boolean; reason: string }>): KtxLlmRuntimePort { + const generateObject = vi.fn(async () => ({ roles: decisions })) as KtxLlmRuntimePort['generateObject']; + return { + generateText: vi.fn(), + generateObject, + runAgentLoop: vi.fn(), + }; +} + +describe('query-history filter picker', () => { + it('emits anchored escaped patterns for excluded roles from one batched LLM call', async () => { + const runtime = llm([ + { role: 'svc.loader+prod', exclude: true, reason: 'Runs recurring loader traffic only.' }, + { role: 'analyst', exclude: false, reason: 'Interactive analytic usage.' }, + ]); + const analysis = sqlAnalysis({ + loader: [{ catalog: null, db: 'analytics', name: 'orders' }], + analyst: [{ catalog: null, db: 'analytics', name: 'orders' }], + }); + + const proposal = await proposeQueryHistoryServiceAccountFilters({ + connectionId: 'warehouse', + dialect: 'postgres', + queryClient: {}, + reader: reader( + aggregate({ + templateId: 'loader', + canonicalSql: 'merge into analytics.orders using staging.orders_delta on orders.id = orders_delta.id', + topUsers: [{ user: 'svc.loader+prod', executions: 40 }], + }), + aggregate({ + templateId: 'analyst', + canonicalSql: 'select status, count(*) from analytics.orders group by status', + topUsers: [{ user: 'analyst', executions: 25 }], + }), + ), + sqlAnalysis: analysis, + llmRuntime: runtime, + pullConfig: { + dialect: 'postgres', + enabledSchemas: ['analytics'], + enabledTables: [], + modeledTableCatalog: [{ catalog: null, db: 'analytics', name: 'orders' }], + filters: { dropTrivialProbes: true }, + }, + now: new Date('2026-06-03T00:00:00.000Z'), + }); + + expect(runtime.generateObject).toHaveBeenCalledTimes(1); + expect(proposal).toMatchObject({ + excludedRoles: [ + { + role: 'svc.loader+prod', + pattern: '^svc\\.loader\\+prod$', + reason: 'Runs recurring loader traffic only.', + }, + ], + consideredRoleCount: 2, + skipped: null, + warnings: [], + }); + }); + + it('redacts representative SQL before sending role records to the LLM', async () => { + const originalSql = + "select * from public.api_events where api_key = 'sk_live_abc123' and note = 'Secret_Token_9f'"; // pragma: allowlist secret + const runtime = llm([ + { role: 'svc_loader', exclude: false, reason: 'Keep by default.' }, + { role: 'analyst', exclude: false, reason: 'Interactive analytic usage.' }, + ]); + const analysis = sqlAnalysis({ + secret: [{ catalog: null, db: 'public', name: 'api_events' }], + analyst: [{ catalog: null, db: 'public', name: 'orders' }], + }); + + await proposeQueryHistoryServiceAccountFilters({ + connectionId: 'warehouse', + dialect: 'postgres', + queryClient: {}, + reader: reader( + aggregate({ + templateId: 'secret', + canonicalSql: originalSql, + topUsers: [{ user: 'svc_loader', executions: 30 }], + }), + aggregate({ + templateId: 'analyst', + canonicalSql: 'select status, count(*) from public.orders group by status', + topUsers: [{ user: 'analyst', executions: 25 }], + }), + ), + sqlAnalysis: analysis, + llmRuntime: runtime, + pullConfig: { + dialect: 'postgres', + enabledSchemas: ['public'], + enabledTables: [], + modeledTableCatalog: [], + redactionPatterns: ['sk_live_[A-Za-z0-9]+', '(?i)secret_token_[a-z0-9]+'], + filters: { dropTrivialProbes: true }, + }, + now: new Date('2026-06-03T00:00:00.000Z'), + }); + + expect(analysis.analyzeBatch).toHaveBeenCalledWith( + [ + { id: 'secret', sql: originalSql }, + { id: 'analyst', sql: 'select status, count(*) from public.orders group by status' }, + ], + 'postgres', + undefined, + ); + const call = vi.mocked(runtime.generateObject).mock.calls[0]?.[0]; + expect(call?.prompt).toContain('[REDACTED]'); + expect(call?.prompt).not.toContain('sk_live_abc123'); + expect(call?.prompt).not.toContain('Secret_Token_9f'); + }); + + it('fails open with no LLM runtime', async () => { + const proposal = await proposeQueryHistoryServiceAccountFilters({ + connectionId: 'warehouse', + dialect: 'postgres', + queryClient: {}, + reader: reader(), + sqlAnalysis: sqlAnalysis({}), + llmRuntime: null, + pullConfig: { dialect: 'postgres', filters: { dropTrivialProbes: true } }, + }); + + expect(proposal).toEqual({ + excludedRoles: [], + consideredRoleCount: 0, + skipped: { reason: 'no-llm' }, + warnings: [], + }); + }); + + it('proposes nothing for a single-role stack', async () => { + const runtime = llm([{ role: 'warehouse_user', exclude: true, reason: 'Only observed role.' }]); + + const proposal = await proposeQueryHistoryServiceAccountFilters({ + connectionId: 'warehouse', + dialect: 'postgres', + queryClient: {}, + reader: reader( + aggregate({ + templateId: 'single-role', + canonicalSql: 'select * from analytics.orders', + topUsers: [{ user: 'warehouse_user', executions: 40 }], + }), + ), + sqlAnalysis: sqlAnalysis({ + 'single-role': [{ catalog: null, db: 'analytics', name: 'orders' }], + }), + llmRuntime: runtime, + pullConfig: { dialect: 'postgres', enabledSchemas: ['analytics'], filters: { dropTrivialProbes: true } }, + }); + + expect(runtime.generateObject).not.toHaveBeenCalled(); + expect(proposal.excludedRoles).toEqual([]); + expect(proposal.skipped).toEqual({ reason: 'no-in-scope-history' }); + }); + + it('keeps clean in-scope history when the model excludes nothing', async () => { + const proposal = await proposeQueryHistoryServiceAccountFilters({ + connectionId: 'warehouse', + dialect: 'bigquery', + queryClient: {}, + reader: reader( + aggregate({ + templateId: 'dashboard', + canonicalSql: 'select status, count(*) from `demo.analytics.orders` group by status', + dialect: 'bigquery', + topUsers: [{ user: 'bi_runner', executions: 1 }], + }), + aggregate({ + templateId: 'analyst', + canonicalSql: 'select * from `demo.analytics.orders` where id = @id', + dialect: 'bigquery', + topUsers: [{ user: 'analyst', executions: 1 }], + }), + ), + sqlAnalysis: sqlAnalysis({ + dashboard: [{ catalog: 'demo', db: 'analytics', name: 'orders' }], + analyst: [{ catalog: 'demo', db: 'analytics', name: 'orders' }], + }), + llmRuntime: llm([ + { role: 'bi_runner', exclude: false, reason: 'Dashboard usage is analytic.' }, + { role: 'analyst', exclude: false, reason: 'Interactive analyst usage.' }, + ]), + pullConfig: { + dialect: 'bigquery', + windowDays: 90, + enabledSchemas: ['analytics'], + filters: { dropTrivialProbes: true }, + }, + }); + + expect(proposal.excludedRoles).toEqual([]); + expect(proposal.consideredRoleCount).toBe(2); + expect(proposal.skipped).toBeNull(); + }); + + it('escapes regex metacharacters for exact role matches', () => { + expect(regexEscapeForExactRolePattern('svc.loader+prod')).toBe('^svc\\.loader\\+prod$'); + expect(regexEscapeForExactRolePattern('team[etl](west)')).toBe('^team\\[etl\\]\\(west\\)$'); + }); +}); diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/scope-floor.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/scope-floor.test.ts new file mode 100644 index 00000000..597cae46 --- /dev/null +++ b/packages/cli/test/context/ingest/adapters/historic-sql/scope-floor.test.ts @@ -0,0 +1,194 @@ +import { mkdir, mkdtemp, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { describe, expect, it } from 'vitest'; +import { resolveQueryHistoryScopeFloor } from '../../../../../src/context/ingest/adapters/historic-sql/scope-floor.js'; + +async function tempProject(): Promise { + return mkdtemp(join(tmpdir(), 'ktx-qh-scope-')); +} + +async function seedLiveScanTable( + projectDir: string, + connectionId: string, + syncId: string, + table: { catalog: string | null; db: string | null; name: string }, +): Promise { + const root = join(projectDir, 'raw-sources', connectionId, 'live-database', syncId); + await mkdir(join(root, 'tables'), { recursive: true }); + await writeFile( + join(root, 'connection.json'), + `${JSON.stringify({ connectionId, driver: 'postgres' }, null, 2)}\n`, + 'utf-8', + ); + await writeFile( + join(root, 'tables', `${table.db ?? 'default'}-${table.name}.json`), + `${JSON.stringify( + { + ...table, + kind: 'table', + comment: null, + estimatedRows: null, + columns: [], + foreignKeys: [], + }, + null, + 2, + )}\n`, + 'utf-8', + ); + await writeFile( + join(root, 'scan-report.json'), + `${JSON.stringify( + { + connectionId, + driver: 'postgres', + syncId, + runId: `scan-${syncId}`, + trigger: 'cli', + mode: 'enriched', + dryRun: false, + artifactPaths: { + rawSourcesDir: `raw-sources/${connectionId}/live-database/${syncId}`, + reportPath: `raw-sources/${connectionId}/live-database/${syncId}/scan-report.json`, + manifestShards: [], + enrichmentArtifacts: [], + }, + counts: {}, + warnings: [], + enrichment: {}, + enrichmentState: {}, + }, + null, + 2, + )}\n`, + 'utf-8', + ); +} + +describe('resolveQueryHistoryScopeFloor', () => { + it('computes modeled schemas from connection schemas plus semantic source tables', async () => { + const projectDir = await tempProject(); + await mkdir(join(projectDir, 'semantic-layer/warehouse'), { recursive: true }); + await writeFile( + join(projectDir, 'semantic-layer/warehouse/revenue.yaml'), + [ + 'name: revenue', + 'table: orbit_analytics.mart_revenue', + 'grain: [id]', + 'columns:', + ' - name: id', + ' type: string', + '', + ].join('\n'), + 'utf-8', + ); + await seedLiveScanTable(projectDir, 'warehouse', 'sync-1', { + catalog: null, + db: 'orbit_raw', + name: 'accounts', + }); + + const scope = await resolveQueryHistoryScopeFloor({ + projectDir, + connectionId: 'warehouse', + driver: 'postgres', + connection: { driver: 'postgres', schemas: ['orbit_raw'] }, + storedQueryHistory: {}, + }); + + expect(scope.enabledSchemas).toEqual(['orbit_analytics', 'orbit_raw']); + expect(scope.modeledTableCatalog).toEqual([ + { catalog: null, db: 'orbit_analytics', name: 'mart_revenue' }, + { catalog: null, db: 'orbit_raw', name: 'accounts' }, + ]); + expect(scope.enabledTables).toEqual([]); + expect(scope.floorDisabled).toBe(false); + }); + + it('uses explicit enabledTables before explicit enabledSchemas and computed scope', async () => { + const scope = await resolveQueryHistoryScopeFloor({ + projectDir: await tempProject(), + connectionId: 'warehouse', + driver: 'postgres', + connection: { driver: 'postgres', schemas: ['orbit_raw'] }, + storedQueryHistory: { + enabledTables: ['orbit_analytics.mart_revenue'], + enabledSchemas: ['orbit_raw'], + }, + }); + + expect(scope.enabledTables).toEqual([{ catalog: null, db: 'orbit_analytics', name: 'mart_revenue' }]); + expect(scope.enabledSchemas).toEqual([]); + expect(scope.floorDisabled).toBe(false); + }); + + it('disables the floor for enabledSchemas star', async () => { + const scope = await resolveQueryHistoryScopeFloor({ + projectDir: await tempProject(), + connectionId: 'warehouse', + driver: 'postgres', + connection: { driver: 'postgres', schemas: ['orbit_raw'] }, + storedQueryHistory: { enabledSchemas: ['*'] }, + }); + + expect(scope.enabledTables).toEqual([]); + expect(scope.enabledSchemas).toEqual(['*']); + expect(scope.floorDisabled).toBe(true); + }); + + it('adds latest live-database scan tables to the modeled table catalog', async () => { + const projectDir = await tempProject(); + await mkdir(join(projectDir, 'semantic-layer/warehouse'), { recursive: true }); + await writeFile( + join(projectDir, 'semantic-layer/warehouse/revenue.yaml'), + [ + 'name: revenue', + 'table: orbit_analytics.mart_revenue', + 'grain: [id]', + 'columns:', + ' - name: id', + ' type: string', + '', + ].join('\n'), + 'utf-8', + ); + await seedLiveScanTable(projectDir, 'warehouse', 'sync-1', { + catalog: null, + db: 'orbit_raw', + name: 'accounts', + }); + + const scope = await resolveQueryHistoryScopeFloor({ + projectDir, + connectionId: 'warehouse', + driver: 'postgres', + connection: { driver: 'postgres', schemas: ['orbit_raw'] }, + storedQueryHistory: {}, + }); + + expect(scope.enabledSchemas).toEqual(['orbit_analytics', 'orbit_raw']); + expect(scope.modeledTableCatalog).toEqual([ + { catalog: null, db: 'orbit_analytics', name: 'mart_revenue' }, + { catalog: null, db: 'orbit_raw', name: 'accounts' }, + ]); + expect(scope.warnings).toEqual([]); + expect(scope.floorDisabled).toBe(false); + }); + + it('fails open when schema scope exists but the scan catalog is unavailable', async () => { + const scope = await resolveQueryHistoryScopeFloor({ + projectDir: await tempProject(), + connectionId: 'warehouse', + driver: 'postgres', + connection: { driver: 'postgres', schemas: ['orbit_raw'] }, + storedQueryHistory: {}, + }); + + expect(scope.enabledTables).toEqual([]); + expect(scope.enabledSchemas).toEqual(['*']); + expect(scope.modeledTableCatalog).toEqual([]); + expect(scope.floorDisabled).toBe(true); + expect(scope.warnings).toContain('query_history_scope_floor_disabled:catalog_unavailable'); + }); +}); diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/scope-membership.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/scope-membership.test.ts new file mode 100644 index 00000000..bfcefc22 --- /dev/null +++ b/packages/cli/test/context/ingest/adapters/historic-sql/scope-membership.test.ts @@ -0,0 +1,51 @@ +import { describe, expect, it } from 'vitest'; +import { + includedQueryHistoryTableRefs, + isQueryHistoryScopeFloorDisabled, + shouldFailOpenQueryHistoryScope, +} from '../../../../../src/context/ingest/adapters/historic-sql/scope-membership.js'; +import type { KtxTableRef } from '../../../../../src/context/scan/types.js'; + +function ref(db: string | null, name: string, catalog: string | null = null): KtxTableRef { + return { catalog, db, name }; +} + +describe('query-history scope membership', () => { + it('prefers explicit enabled tables over schema scope', () => { + const orders = ref('analytics', 'orders'); + const noise = ref('metabase', 'application_table'); + + expect( + includedQueryHistoryTableRefs([orders, noise], { + enabledTables: [orders], + enabledSchemas: ['metabase'], + }), + ).toEqual([orders]); + }); + + it('matches schema scope by the db component across catalogs', () => { + const modeled = ref('orbit_analytics', 'orders', 'demo-project'); + const noise = ref('metabase', 'application_table', 'demo-project'); + + expect( + includedQueryHistoryTableRefs([modeled, noise], { + enabledTables: [], + enabledSchemas: ['orbit_analytics'], + }), + ).toEqual([modeled]); + }); + + it('keeps every touched ref when wildcard scope disables the floor', () => { + const tables = [ref('analytics', 'orders'), ref('metabase', 'application_table')]; + + expect(isQueryHistoryScopeFloorDisabled({ enabledTables: [], enabledSchemas: ['*'] })).toBe(true); + expect(includedQueryHistoryTableRefs(tables, { enabledTables: [], enabledSchemas: ['*'] })).toEqual(tables); + }); + + it('fails open when no tables, schemas, or wildcard are configured', () => { + const tables = [ref('metabase', 'application_table')]; + + expect(shouldFailOpenQueryHistoryScope({ enabledTables: [], enabledSchemas: [] })).toBe(true); + expect(includedQueryHistoryTableRefs(tables, { enabledTables: [], enabledSchemas: [] })).toEqual(tables); + }); +}); diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts index 7307fcdd..ab76c533 100644 --- a/packages/cli/test/context/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts +++ b/packages/cli/test/context/ingest/adapters/historic-sql/snowflake-query-history-reader.test.ts @@ -90,7 +90,10 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => { 40, 0.05, 100, - JSON.stringify([{ user: 'ANALYST', executions: 1 }]), + JSON.stringify([ + { user: 'SVC_LOADER', executions: 40 }, + { user: 'ANALYST', executions: 2 }, + ]), ], ], totalRows: 1, @@ -102,15 +105,20 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => { for await (const row of reader.fetchAggregated( client, { start: new Date('2026-02-10T00:00:00.000Z'), end: new Date('2026-05-11T00:00:00.000Z') }, - { dialect: 'snowflake', minExecutions: 5, windowDays: 90, enabledTables: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 }, + { dialect: 'snowflake', minExecutions: 5, windowDays: 90, enabledTables: [], enabledSchemas: [], modeledTableCatalog: [], scopeFloorWarnings: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90 }, )) { rows.push(row); } const sql = firstQuery(client); + expect(sql).toContain('WITH filtered_queries AS'); + expect(sql).toContain('template_stats AS'); + expect(sql).toContain('template_users AS'); expect(sql).toContain('SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY'); expect(sql).toContain('COUNT(*) AS executions'); - expect(sql).toContain('GROUP BY query_hash'); + expect(sql).toContain('COUNT(DISTINCT user_name) AS distinct_users'); + expect(sql).toContain('GROUP BY query_hash, user_name'); + expect(sql).toContain('ORDER BY users.executions DESC'); expect(sql).toContain('HAVING COUNT(*) >= 5'); expect(rows).toMatchObject([ { @@ -119,7 +127,10 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => { executions: 42, errorRate: 0.05, }, - topUsers: [{ user: 'ANALYST', executions: 1 }], + topUsers: [ + { user: 'SVC_LOADER', executions: 40 }, + { user: 'ANALYST', executions: 2 }, + ], }, ]); }); @@ -136,6 +147,9 @@ describe('SnowflakeHistoricSqlQueryHistoryReader', () => { minExecutions: 5, windowDays: 90, enabledTables: [], + enabledSchemas: [], + modeledTableCatalog: [], + scopeFloorWarnings: [], filters: { dropTrivialProbes: true }, redactionPatterns: [], staleArchiveAfterDays: 90, diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/stage-unified.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/stage-unified.test.ts index 630a3939..a104508d 100644 --- a/packages/cli/test/context/ingest/adapters/historic-sql/stage-unified.test.ts +++ b/packages/cli/test/context/ingest/adapters/historic-sql/stage-unified.test.ts @@ -14,6 +14,13 @@ async function readJson(root: string, relPath: string): Promise { return JSON.parse(await readFile(join(root, relPath), 'utf-8')) as T; } +function tableRef(value: string): { catalog: string | null; db: string | null; name: string } { + const parts = value.split('.'); + if (parts.length === 3) return { catalog: parts[0]!, db: parts[1]!, name: parts[2]! }; + if (parts.length === 2) return { catalog: null, db: parts[0]!, name: parts[1]! }; + return { catalog: null, db: null, name: value }; +} + function aggregate(overrides: Partial & { templateId: string; canonicalSql: string }): AggregatedTemplate { return { templateId: overrides.templateId, @@ -72,7 +79,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { [ 'orders-by-status', { - tablesTouched: ['public.orders', 'public.customers'], + tablesTouched: [tableRef('public.orders'), tableRef('public.customers')], columnsByClause: { select: ['status'], where: ['created_at'], @@ -94,6 +101,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { sqlAnalysis, pullConfig: { dialect: 'postgres', + enabledSchemas: ['public'], filters: { serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' }, }, @@ -111,6 +119,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { { id: 'bad-parse', sql: 'select broken from' }, ], 'postgres', + undefined, ); expect(await readdir(join(stagedDir, 'tables'))).toEqual(['public.customers.json', 'public.orders.json']); @@ -131,6 +140,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { const orders = await readJson>(stagedDir, 'tables/public.orders.json'); expect(orders).toMatchObject({ table: 'public.orders', + tableRef: tableRef('public.orders'), stats: { executionsBucket: '10-100', distinctUsersBucket: '2-5', @@ -159,7 +169,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { { id: 'orders-by-status', canonicalSql: expect.stringContaining('public.orders'), - tablesTouched: ['public.customers', 'public.orders'], + tablesTouched: [tableRef('public.customers'), tableRef('public.orders')], executionsBucket: '10-100', distinctUsersBucket: '2-5', dialect: 'postgres', @@ -167,6 +177,129 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { ]); }); + it('keeps templates when service-account topUsers are only a partial execution sample', async () => { + const stagedDir = await tempDir(); + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ + templateId: 'shared-bigquery-template', + canonicalSql: 'select status, count(*) from `demo.analytics.orders` group by status', + dialect: 'bigquery', + stats: { + executions: 42, + distinctUsers: 2, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 20, + p95RuntimeMs: 80, + errorRate: 0, + rowsProduced: null, + }, + topUsers: [{ user: 'svc_loader', executions: 5 }], + }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => + new Map([ + [ + 'shared-bigquery-template', + { + tablesTouched: [tableRef('demo.analytics.orders')], + columnsByClause: { select: ['status'], groupBy: ['status'] }, + }, + ], + ]), + ), + validateReadOnly: vi.fn(async () => ({ ok: true })), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 'bigquery', + windowDays: 90, + enabledSchemas: ['analytics'], + filters: { + serviceAccounts: { patterns: ['^svc_loader$'], mode: 'exclude' }, + }, + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + const patterns = await readJson>(stagedDir, 'patterns-input.json'); + expect(patterns.templates.map((template: { id: string }) => template.id)).toEqual([ + 'shared-bigquery-template', + ]); + const orders = await readJson>(stagedDir, 'tables/demo.analytics.orders.json'); + expect(orders.topTemplates).toEqual([ + { + id: 'shared-bigquery-template', + canonicalSql: 'select status, count(*) from `demo.analytics.orders` group by status', + topUsers: [{ user: 'svc_loader' }], + }, + ]); + }); + + it('drops service-account-only templates when matched users cover all executions', async () => { + const stagedDir = await tempDir(); + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ + templateId: 'service-only-template', + canonicalSql: 'merge into analytics.orders using staging.orders_delta on orders.id = orders_delta.id', + stats: { + executions: 12, + distinctUsers: 1, + firstSeen: '2026-05-01T00:00:00.000Z', + lastSeen: '2026-05-11T00:00:00.000Z', + p50RuntimeMs: 20, + p95RuntimeMs: 80, + errorRate: 0, + rowsProduced: 0, + }, + topUsers: [{ user: 'svc_loader', executions: 12 }], + }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map()), + validateReadOnly: vi.fn(async () => ({ ok: true })), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 'postgres', + enabledSchemas: ['analytics'], + filters: { + serviceAccounts: { patterns: ['^svc_loader$'], mode: 'exclude' }, + }, + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith([], 'postgres', undefined); + const patterns = await readJson>(stagedDir, 'patterns-input.json'); + expect(patterns.templates).toEqual([]); + }); + it('redacts configured SQL substrings in staged artifacts while analyzing original SQL', async () => { const stagedDir = await tempDir(); const originalSql = @@ -198,7 +331,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { [ 'api-events-with-secret', { - tablesTouched: ['public.api_events'], + tablesTouched: [tableRef('public.api_events')], columnsByClause: { select: [], where: ['api_key', 'note'], @@ -219,6 +352,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { sqlAnalysis, pullConfig: { dialect: 'postgres', + enabledSchemas: ['public'], redactionPatterns: ['sk_live_[A-Za-z0-9]+', '(?i)secret_token_[a-z0-9]+'], }, now: new Date('2026-05-11T12:00:00.000Z'), @@ -227,6 +361,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith( [{ id: 'api-events-with-secret', sql: originalSql }], 'postgres', + undefined, ); const tableJson = await readFile(join(stagedDir, 'tables/public.api_events.json'), 'utf-8'); @@ -266,21 +401,21 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { [ 'selected-qualified', { - tablesTouched: ['orbit_analytics.int_active_contract_arr'], + tablesTouched: [tableRef('orbit_analytics.int_active_contract_arr')], columnsByClause: { select: [], where: [], join: [], groupBy: [] }, }, ], [ 'selected-unqualified', { - tablesTouched: ['int_customer_health_signals'], + tablesTouched: [tableRef('orbit_analytics.int_customer_health_signals')], columnsByClause: { select: [], where: [], join: [], groupBy: [] }, }, ], [ 'unselected', { - tablesTouched: ['orbit_raw.accounts'], + tablesTouched: [tableRef('orbit_raw.accounts')], columnsByClause: { select: [], where: [], join: [], groupBy: [] }, }, ], @@ -297,16 +432,16 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { pullConfig: { dialect: 'postgres', enabledTables: [ - 'orbit_analytics.int_active_contract_arr', - 'orbit_analytics.int_customer_health_signals', + tableRef('orbit_analytics.int_active_contract_arr'), + tableRef('orbit_analytics.int_customer_health_signals'), ], }, now: new Date('2026-05-11T12:00:00.000Z'), }); expect(await readdir(join(stagedDir, 'tables'))).toEqual([ - 'int_customer_health_signals.json', 'orbit_analytics.int_active_contract_arr.json', + 'orbit_analytics.int_customer_health_signals.json', ]); const manifest = await readJson>(stagedDir, 'manifest.json'); expect(manifest.touchedTableCount).toBe(2); @@ -372,7 +507,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { [ 'orders-customers-a', { - tablesTouched: ['public.orders', 'public.customers'], + tablesTouched: [tableRef('public.orders'), tableRef('public.customers')], columnsByClause: { select: [], where: ['payload'], @@ -384,7 +519,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { [ 'orders-customers-b', { - tablesTouched: ['public.orders', 'public.customers'], + tablesTouched: [tableRef('public.orders'), tableRef('public.customers')], columnsByClause: { select: [], where: ['payload_b'], @@ -396,7 +531,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { [ 'orders-single-table', { - tablesTouched: ['public.orders'], + tablesTouched: [tableRef('public.orders')], columnsByClause: { select: [], where: [], @@ -415,7 +550,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { queryClient: {}, reader, sqlAnalysis, - pullConfig: { dialect: 'postgres' }, + pullConfig: { dialect: 'postgres', enabledSchemas: ['public'] }, now: new Date('2026-05-11T12:00:00.000Z'), }); @@ -456,7 +591,13 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { const sqlAnalysis: SqlAnalysisPort = { analyzeForFingerprint: vi.fn(), analyzeBatch: vi.fn(async () => new Map([ - ['analytic', { tablesTouched: ['public.orders'], columnsByClause: { select: ['status'], where: [], join: [], groupBy: ['status'] } }], + [ + 'analytic', + { + tablesTouched: [tableRef('public.orders')], + columnsByClause: { select: ['status'], where: [], join: [], groupBy: ['status'] }, + }, + ], ])), validateReadOnly: vi.fn(async () => ({ ok: true })), }; @@ -467,7 +608,7 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { queryClient: {}, reader, sqlAnalysis, - pullConfig: { dialect: 'postgres' }, + pullConfig: { dialect: 'postgres', enabledSchemas: ['public'] }, now: new Date('2026-05-11T12:00:00.000Z'), }); @@ -475,26 +616,27 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledWith( [{ id: 'analytic', sql: 'select status, count(*) from public.orders group by status' }], 'postgres', + undefined, ); expect(await readdir(join(stagedDir, 'tables'))).toEqual(['public.orders.json']); }); - it('merges bare and schema-qualified references to the same table into one work unit', async () => { + it('keeps modeled-schema refs and drops unmodeled-schema refs by default', async () => { const stagedDir = await tempDir(); const reader: HistoricSqlReader = { async probe() { return { warnings: [], info: [] }; }, async *fetchAggregated() { - yield aggregate({ templateId: 'qualified', canonicalSql: 'select count(*) from orbit_raw.accounts' }); - yield aggregate({ templateId: 'bare', canonicalSql: 'select id from accounts where active' }); + yield aggregate({ templateId: 'modeled', canonicalSql: 'select count(*) from orbit_raw.accounts' }); + yield aggregate({ templateId: 'noise', canonicalSql: 'select count(*) from metabase.application_table' }); }, }; const sqlAnalysis: SqlAnalysisPort = { analyzeForFingerprint: vi.fn(), analyzeBatch: vi.fn(async () => new Map([ - ['qualified', { tablesTouched: ['orbit_raw.accounts'], columnsByClause: { select: [], where: [], join: [], groupBy: [] } }], - ['bare', { tablesTouched: ['accounts'], columnsByClause: { select: ['id'], where: ['active'], join: [], groupBy: [] } }], + ['modeled', { tablesTouched: [{ catalog: null, db: 'orbit_raw', name: 'accounts' }], columnsByClause: {} }], + ['noise', { tablesTouched: [{ catalog: null, db: 'metabase', name: 'application_table' }], columnsByClause: {} }], ])), validateReadOnly: vi.fn(async () => ({ ok: true })), }; @@ -505,16 +647,213 @@ describe('stageHistoricSqlAggregatedSnapshot', () => { queryClient: {}, reader, sqlAnalysis, - pullConfig: { dialect: 'postgres' }, + pullConfig: { + dialect: 'postgres', + enabledSchemas: ['orbit_raw'], + modeledTableCatalog: [{ catalog: null, db: 'orbit_raw', name: 'accounts' }], + }, now: new Date('2026-05-11T12:00:00.000Z'), }); - // The bare `accounts` reference resolves to the unique qualified `orbit_raw.accounts`, - // so the two templates collapse into a single work unit instead of two. expect(await readdir(join(stagedDir, 'tables'))).toEqual(['orbit_raw.accounts.json']); - const merged = await readJson>(stagedDir, 'tables/orbit_raw.accounts.json'); - expect(merged.topTemplates.map((t: any) => t.id).sort()).toEqual(['bare', 'qualified']); const manifest = await readJson>(stagedDir, 'manifest.json'); expect(manifest.touchedTableCount).toBe(1); }); + + it('fails open when the implicit modeled scope is empty', async () => { + const stagedDir = await tempDir(); + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ templateId: 'any-table', canonicalSql: 'select count(*) from metabase.application_table' }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + ['any-table', { tablesTouched: [{ catalog: null, db: 'metabase', name: 'application_table' }], columnsByClause: {} }], + ])), + validateReadOnly: vi.fn(async () => ({ ok: true })), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { dialect: 'postgres', enabledSchemas: [], modeledTableCatalog: [] }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + expect(await readdir(join(stagedDir, 'tables'))).toEqual(['metabase.application_table.json']); + const manifest = await readJson>(stagedDir, 'manifest.json'); + expect(manifest.warnings).toContain('query_history_scope_floor_disabled:empty_modeled_scope'); + }); + + it('lets enabledSchemas star disable the floor', async () => { + const stagedDir = await tempDir(); + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ templateId: 'noise', canonicalSql: 'select count(*) from metabase.application_table' }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + ['noise', { tablesTouched: [{ catalog: null, db: 'metabase', name: 'application_table' }], columnsByClause: {} }], + ])), + validateReadOnly: vi.fn(async () => ({ ok: true })), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 'postgres', + enabledSchemas: ['*'], + modeledTableCatalog: [{ catalog: null, db: 'orbit_raw', name: 'accounts' }], + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + expect(await readdir(join(stagedDir, 'tables'))).toEqual(['metabase.application_table.json']); + }); + + it('matches BigQuery dataset scope even when refs include a catalog', async () => { + const stagedDir = await tempDir(); + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ templateId: 'modeled', canonicalSql: 'select count(*) from `demo-project.orbit_analytics.orders`' }); + yield aggregate({ templateId: 'noise', canonicalSql: 'select count(*) from `demo-project.metabase.application_table`' }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + ['modeled', { tablesTouched: [{ catalog: 'demo-project', db: 'orbit_analytics', name: 'orders' }], columnsByClause: {} }], + ['noise', { tablesTouched: [{ catalog: 'demo-project', db: 'metabase', name: 'application_table' }], columnsByClause: {} }], + ])), + validateReadOnly: vi.fn(async () => ({ ok: true })), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 'bigquery', + enabledSchemas: ['orbit_analytics'], + modeledTableCatalog: [{ catalog: 'demo-project', db: 'orbit_analytics', name: 'orders' }], + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + expect(await readdir(join(stagedDir, 'tables'))).toEqual(['demo-project.orbit_analytics.orders.json']); + }); + + it('writes propagated scope-floor warnings to the staged manifest', async () => { + const stagedDir = await tempDir(); + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ templateId: 'any-table', canonicalSql: 'select count(*) from metabase.application_table' }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi.fn(async () => new Map([ + ['any-table', { tablesTouched: [{ catalog: null, db: 'metabase', name: 'application_table' }], columnsByClause: {} }], + ])), + validateReadOnly: vi.fn(async () => ({ ok: true })), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 'postgres', + enabledSchemas: ['*'], + scopeFloorWarnings: ['query_history_scope_floor_disabled:catalog_unavailable'], + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + const manifest = await readJson>(stagedDir, 'manifest.json'); + expect(manifest.warnings).toContain('query_history_scope_floor_disabled:catalog_unavailable'); + expect(await readdir(join(stagedDir, 'tables'))).toEqual(['metabase.application_table.json']); + }); + + it('retries without the catalog and disables the floor when catalog qualification fails wholesale', async () => { + const stagedDir = await tempDir(); + const reader: HistoricSqlReader = { + async probe() { + return { warnings: [], info: [] }; + }, + async *fetchAggregated() { + yield aggregate({ templateId: 'noise', canonicalSql: 'select count(*) from metabase.application_table' }); + }, + }; + const sqlAnalysis: SqlAnalysisPort = { + analyzeForFingerprint: vi.fn(), + analyzeBatch: vi + .fn() + .mockRejectedValueOnce(new Error('catalog qualification failed')) + .mockResolvedValueOnce( + new Map([ + ['noise', { tablesTouched: [{ catalog: null, db: 'metabase', name: 'application_table' }], columnsByClause: {} }], + ]), + ), + validateReadOnly: vi.fn(async () => ({ ok: true })), + }; + + await stageHistoricSqlAggregatedSnapshot({ + stagedDir, + connectionId: 'warehouse', + queryClient: {}, + reader, + sqlAnalysis, + pullConfig: { + dialect: 'postgres', + enabledSchemas: ['orbit_raw'], + modeledTableCatalog: [{ catalog: null, db: 'orbit_raw', name: 'accounts' }], + }, + now: new Date('2026-05-11T12:00:00.000Z'), + }); + + expect(sqlAnalysis.analyzeBatch).toHaveBeenCalledTimes(2); + expect(sqlAnalysis.analyzeBatch).toHaveBeenNthCalledWith( + 1, + [{ id: 'noise', sql: 'select count(*) from metabase.application_table' }], + 'postgres', + { catalog: { tables: [{ catalog: null, db: 'orbit_raw', name: 'accounts' }] } }, + ); + expect(sqlAnalysis.analyzeBatch).toHaveBeenNthCalledWith( + 2, + [{ id: 'noise', sql: 'select count(*) from metabase.application_table' }], + 'postgres', + undefined, + ); + expect(await readdir(join(stagedDir, 'tables'))).toEqual(['metabase.application_table.json']); + const manifest = await readJson>(stagedDir, 'manifest.json'); + expect(manifest.warnings).toContain('query_history_scope_floor_disabled:catalog_qualification_failed'); + }); }); diff --git a/packages/cli/test/context/ingest/adapters/historic-sql/types.test.ts b/packages/cli/test/context/ingest/adapters/historic-sql/types.test.ts index d9417521..ca3d7e70 100644 --- a/packages/cli/test/context/ingest/adapters/historic-sql/types.test.ts +++ b/packages/cli/test/context/ingest/adapters/historic-sql/types.test.ts @@ -59,6 +59,7 @@ describe('historic-sql unified contracts', () => { expect( stagedTableInputSchema.parse({ table: 'public.orders', + tableRef: { catalog: null, db: 'public', name: 'orders' }, stats: { executionsBucket: '10-100', distinctUsersBucket: '2-5', @@ -81,7 +82,7 @@ describe('historic-sql unified contracts', () => { { id: 'pg:123', canonicalSql: 'select * from public.orders', - tablesTouched: ['public.orders'], + tablesTouched: [{ catalog: null, db: 'public', name: 'orders' }], executionsBucket: '10-100', distinctUsersBucket: '2-5', dialect: 'postgres', diff --git a/packages/cli/test/context/ingest/local-adapters.test.ts b/packages/cli/test/context/ingest/local-adapters.test.ts index f70c4879..a8799cee 100644 --- a/packages/cli/test/context/ingest/local-adapters.test.ts +++ b/packages/cli/test/context/ingest/local-adapters.test.ts @@ -1,4 +1,4 @@ -import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; @@ -34,6 +34,36 @@ describe('local ingest adapters', () => { }; } + async function seedLiveScanTable( + projectDir: string, + connectionId: string, + table: { catalog: string | null; db: string | null; name: string }, + ): Promise { + const rawRoot = join(projectDir, 'raw-sources', connectionId, 'live-database', 'sync-1'); + await mkdir(join(rawRoot, 'tables'), { recursive: true }); + await writeFile( + join(rawRoot, 'connection.json'), + `${JSON.stringify({ connectionId, driver: 'postgres' }, null, 2)}\n`, + 'utf-8', + ); + await writeFile( + join(rawRoot, 'tables', `${table.db ?? 'default'}-${table.name}.json`), + `${JSON.stringify( + { + ...table, + kind: 'table', + comment: null, + estimatedRows: null, + columns: [], + foreignKeys: [], + }, + null, + 2, + )}\n`, + 'utf-8', + ); + } + it('registers Metabase locally as a staged-bundle adapter', () => { const adapters = createDefaultLocalIngestAdapters(project); @@ -205,11 +235,14 @@ describe('local ingest adapters', () => { dialect: 'postgres', minExecutions: 7, enabledTables: [], + enabledSchemas: [], + modeledTableCatalog: [], filters: { serviceAccounts: { patterns: ['^svc_'], mode: 'exclude' }, dropTrivialProbes: true, }, redactionPatterns: [], + scopeFloorWarnings: [], staleArchiveAfterDays: 90, }); }); @@ -237,6 +270,71 @@ describe('local ingest adapters', () => { }); }); + it('passes computed modeled scope to direct historic-sql adapter pull config', async () => { + await mkdir(join(project.projectDir, 'semantic-layer/warehouse'), { recursive: true }); + await writeFile( + join(project.projectDir, 'semantic-layer/warehouse/revenue.yaml'), + [ + 'name: revenue', + 'table: orbit_analytics.mart_revenue', + 'grain: [id]', + 'columns:', + ' - name: id', + ' type: string', + '', + ].join('\n'), + 'utf-8', + ); + await seedLiveScanTable(project.projectDir, 'warehouse', { + catalog: null, + db: 'orbit_raw', + name: 'accounts', + }); + const projectWithQueryHistory = projectWithConnections({ + warehouse: { + driver: 'postgres', + schemas: ['orbit_raw'], + context: { + queryHistory: { + enabled: true, + minExecutions: 7, + filters: { dropTrivialProbes: true }, + }, + }, + }, + }); + const adapter = { source: 'historic-sql' } as never; + + await expect(localPullConfigForAdapter(projectWithQueryHistory, adapter, 'warehouse')).resolves.toMatchObject({ + dialect: 'postgres', + minExecutions: 7, + enabledSchemas: ['orbit_analytics', 'orbit_raw'], + modeledTableCatalog: [ + { catalog: null, db: 'orbit_analytics', name: 'mart_revenue' }, + { catalog: null, db: 'orbit_raw', name: 'accounts' }, + ], + }); + }); + + it('passes query-history scope fail-open warnings to direct historic-sql pull config', async () => { + const projectDir = await mkdtemp(join(tmpdir(), 'ktx-local-qh-scope-warning-')); + const project = await initKtxProject({ projectDir }); + project.config.connections.warehouse = { + driver: 'postgres', + schemas: ['orbit_raw'], + context: { queryHistory: { enabled: true } }, + } as never; + const adapter = { source: 'historic-sql' } as never; + + await expect(localPullConfigForAdapter(project, adapter, 'warehouse')).resolves.toMatchObject({ + dialect: 'postgres', + enabledSchemas: ['*'], + scopeFloorWarnings: ['query_history_scope_floor_disabled:catalog_unavailable'], + }); + + await rm(projectDir, { recursive: true, force: true }); + }); + it('rejects local historic-sql pulls when the connection has not enabled historic SQL', async () => { const historicSql = createDefaultLocalIngestAdapters(project, { historicSql: { diff --git a/packages/cli/test/context/sql-analysis/http-sql-analysis-port.test.ts b/packages/cli/test/context/sql-analysis/http-sql-analysis-port.test.ts index 02b275a6..df32fb8d 100644 --- a/packages/cli/test/context/sql-analysis/http-sql-analysis-port.test.ts +++ b/packages/cli/test/context/sql-analysis/http-sql-analysis-port.test.ts @@ -49,7 +49,10 @@ describe('createHttpSqlAnalysisPort', () => { const requestJson = vi.fn(async () => ({ results: { orders: { - tables_touched: ['public.orders', 'public.customers'], + tables_touched: [ + { catalog: null, db: 'public', name: 'orders' }, + { catalog: null, db: 'public', name: 'customers' }, + ], columns_by_clause: { select: ['status'], where: ['created_at'], @@ -79,7 +82,10 @@ describe('createHttpSqlAnalysisPort', () => { [ 'orders', { - tablesTouched: ['public.orders', 'public.customers'], + tablesTouched: [ + { catalog: null, db: 'public', name: 'orders' }, + { catalog: null, db: 'public', name: 'customers' }, + ], columnsByClause: { select: ['status'], where: ['created_at'], @@ -108,6 +114,62 @@ describe('createHttpSqlAnalysisPort', () => { }); }); + it('passes an optional catalog and maps structured table refs for SQL batch analysis', async () => { + const requestJson = vi.fn(async () => ({ + results: { + orders: { + tables_touched: [ + { catalog: null, db: 'orbit_raw', name: 'accounts' }, + { catalog: 'demo_project', db: 'orbit_analytics', name: 'orders' }, + ], + columns_by_clause: { select: ['id'] }, + error: null, + }, + }, + })); + const port = createHttpSqlAnalysisPort({ baseUrl: 'http://python.test', requestJson }); + + await expect( + port.analyzeBatch( + [{ id: 'orders', sql: 'select id from accounts' }], + 'postgres', + { + catalog: { + tables: [ + { catalog: null, db: 'orbit_raw', name: 'accounts', columns: ['id'] }, + { catalog: 'demo_project', db: 'orbit_analytics', name: 'orders', columns: ['id'] }, + ], + }, + }, + ), + ).resolves.toEqual( + new Map([ + [ + 'orders', + { + tablesTouched: [ + { catalog: null, db: 'orbit_raw', name: 'accounts' }, + { catalog: 'demo_project', db: 'orbit_analytics', name: 'orders' }, + ], + columnsByClause: { select: ['id'] }, + error: null, + }, + ], + ]), + ); + + expect(requestJson).toHaveBeenCalledWith('/sql/analyze-batch', { + dialect: 'postgres', + items: [{ id: 'orders', sql: 'select id from accounts' }], + catalog: { + tables: [ + { catalog: null, db: 'orbit_raw', name: 'accounts', columns: ['id'] }, + { catalog: 'demo_project', db: 'orbit_analytics', name: 'orders', columns: ['id'] }, + ], + }, + }); + }); + it('maps read-only SQL validation responses', async () => { const requests: Array<{ path: string; payload: Record }> = []; const port = createHttpSqlAnalysisPort({ @@ -150,7 +212,7 @@ describe('createHttpSqlAnalysisPort', () => { const requestJson = vi.fn(async () => ({ results: { orders: { - tables_touched: ['public.orders'], + tables_touched: [{ catalog: null, db: 'public', name: 'orders' }], columns_by_clause: { select: ['status'], where: [42] }, error: null, }, diff --git a/packages/cli/test/local-adapters.test.ts b/packages/cli/test/local-adapters.test.ts index 345f662a..467c9a56 100644 --- a/packages/cli/test/local-adapters.test.ts +++ b/packages/cli/test/local-adapters.test.ts @@ -2,8 +2,8 @@ import { mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { loadKtxProject } from '../src/context/project/project.js'; -import { afterEach, beforeEach, describe, expect, it } from 'vitest'; -import { createKtxCliLocalIngestAdapters } from '../src/local-adapters.js'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { createKtxCliHistoricSqlRuntime, createKtxCliLocalIngestAdapters } from '../src/local-adapters.js'; function sqlAnalysisStub() { return { @@ -70,6 +70,116 @@ describe('CLI local ingest adapters', () => { ]); }); + it('creates reusable query-history runtime dependencies for setup', async () => { + await writeProject( + tempDir, + [ + 'connections:', + ' warehouse:', + ' driver: postgres', + ' url: env:WAREHOUSE_DATABASE_URL', + ' readonly: true', + ' context:', + ' queryHistory:', + ' enabled: true', + '', + ].join('\n'), + ); + const project = await loadKtxProject({ projectDir: tempDir }); + const sqlAnalysis = sqlAnalysisStub(); + + const runtime = createKtxCliHistoricSqlRuntime(project, 'warehouse', { sqlAnalysis }); + + expect(runtime).toMatchObject({ + dialect: 'postgres', + sqlAnalysis, + }); + expect(runtime?.reader).toBeDefined(); + expect(runtime?.queryClient).toBeDefined(); + }); + + it('uses managed daemon SQL analysis when query-history runtime gets managed daemon options', async () => { + await writeProject( + tempDir, + [ + 'connections:', + ' warehouse:', + ' driver: postgres', + ' url: env:WAREHOUSE_DATABASE_URL', + ' readonly: true', + ' context:', + ' queryHistory:', + ' enabled: true', + '', + ].join('\n'), + ); + const project = await loadKtxProject({ projectDir: tempDir }); + const testIo = { + stdout: { write: vi.fn() }, + stderr: { write: vi.fn() }, + }; + const ensureRuntime = vi.fn(async () => ({ + layout: {} as never, + manifest: {} as never, + })); + const startDaemon = vi.fn(async () => ({ + status: 'started' as const, + layout: {} as never, + state: { pid: 1234 } as never, + baseUrl: 'http://127.0.0.1:61234', + })); + const postJson = vi.fn(async () => ({ + results: { + probe: { + tables_touched: [], + columns_by_clause: {}, + error: null, + }, + }, + })); + + const runtime = createKtxCliHistoricSqlRuntime(project, 'warehouse', { + managedDaemon: { + cliVersion: '0.2.0', + projectDir: tempDir, + installPolicy: 'auto', + io: testIo, + ensureRuntime, + startDaemon, + postJson, + }, + }); + + await expect(runtime?.sqlAnalysis.analyzeBatch([{ id: 'probe', sql: 'select 1' }], 'postgres')).resolves.toEqual( + new Map([ + [ + 'probe', + { + tablesTouched: [], + columnsByClause: {}, + error: null, + }, + ], + ]), + ); + expect(ensureRuntime).toHaveBeenCalledWith({ + cliVersion: '0.2.0', + installPolicy: 'auto', + io: testIo, + feature: 'core', + }); + expect(startDaemon).toHaveBeenCalledWith({ + cliVersion: '0.2.0', + projectDir: tempDir, + features: ['core'], + force: false, + }); + expect(postJson).toHaveBeenCalledWith('http://127.0.0.1:61234', '/sql/analyze-batch', { + dialect: 'postgres', + items: [{ id: 'probe', sql: 'select 1' }], + }); + }); + it('registers historic SQL when explicitly requested even if connection query history is disabled', async () => { await writeProject( tempDir, diff --git a/packages/cli/test/managed-python-http.test.ts b/packages/cli/test/managed-python-http.test.ts index f19b6d72..74334042 100644 --- a/packages/cli/test/managed-python-http.test.ts +++ b/packages/cli/test/managed-python-http.test.ts @@ -161,7 +161,7 @@ describe('KTX daemon ingest ports', () => { const requestJson = vi.fn(async () => ({ results: { orders: { - tables_touched: ['public.orders'], + tables_touched: [{ catalog: null, db: 'public', name: 'orders' }], columns_by_clause: { select: ['status'] }, error: null, }, @@ -175,7 +175,7 @@ describe('KTX daemon ingest ports', () => { [ 'orders', { - tablesTouched: ['public.orders'], + tablesTouched: [{ catalog: null, db: 'public', name: 'orders' }], columnsByClause: { select: ['status'] }, error: null, }, diff --git a/packages/cli/test/public-ingest.test.ts b/packages/cli/test/public-ingest.test.ts index 1a8b457e..ba35faf6 100644 --- a/packages/cli/test/public-ingest.test.ts +++ b/packages/cli/test/public-ingest.test.ts @@ -1,4 +1,4 @@ -import { mkdtemp, rm } from 'node:fs/promises'; +import { mkdir, mkdtemp, rm, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { buildDefaultKtxProjectConfig, type KtxProjectConfig } from '../src/context/project/config.js'; @@ -668,12 +668,134 @@ describe('runKtxPublicIngest', () => { dropFailedBelow: { errorRate: 0.5, executions: 3 }, }, redactionPatterns: ['(?i)secret'], - enabledTables: ['orbit_analytics.int_active_contract_arr'], + enabledTables: [{ catalog: null, db: 'orbit_analytics', name: 'int_active_contract_arr' }], }, }); expect(ingestArgs?.historicSqlPullConfigOverride).not.toHaveProperty('enabled'); }); + it('resolves query-history scope after the schema scan writes artifacts', async () => { + const io = makeIo(); + const projectDir = await mkdtemp(join(tmpdir(), 'ktx-public-qh-scope-')); + const project = deepReadyProject({ + warehouse: { + driver: 'postgres', + schemas: ['orbit_raw'], + context: { queryHistory: { enabled: true } }, + }, + }); + const runScan = vi.fn(async () => { + await mkdir(join(projectDir, 'semantic-layer/warehouse'), { recursive: true }); + await writeFile( + join(projectDir, 'semantic-layer/warehouse/revenue.yaml'), + [ + 'name: revenue', + 'table: orbit_analytics.mart_revenue', + 'grain: [id]', + 'columns:', + ' - name: id', + ' type: string', + '', + ].join('\n'), + 'utf-8', + ); + const rawRoot = join(projectDir, 'raw-sources/warehouse/live-database/sync-1'); + await mkdir(join(rawRoot, 'tables'), { recursive: true }); + await writeFile( + join(rawRoot, 'connection.json'), + `${JSON.stringify({ connectionId: 'warehouse', driver: 'postgres' }, null, 2)}\n`, + 'utf-8', + ); + await writeFile( + join(rawRoot, 'tables/accounts.json'), + `${JSON.stringify( + { + catalog: null, + db: 'orbit_raw', + name: 'accounts', + kind: 'table', + comment: null, + estimatedRows: null, + columns: [ + { + name: 'id', + nativeType: 'integer', + normalizedType: 'integer', + dimensionType: 'number', + nullable: false, + primaryKey: true, + comment: null, + }, + ], + foreignKeys: [], + }, + null, + 2, + )}\n`, + 'utf-8', + ); + await writeFile( + join(rawRoot, 'scan-report.json'), + `${JSON.stringify( + { + connectionId: 'warehouse', + driver: 'postgres', + syncId: 'sync-1', + runId: 'scan-sync-1', + trigger: 'cli', + mode: 'enriched', + dryRun: false, + artifactPaths: { + rawSourcesDir: 'raw-sources/warehouse/live-database/sync-1', + reportPath: 'raw-sources/warehouse/live-database/sync-1/scan-report.json', + manifestShards: [], + enrichmentArtifacts: [], + }, + counts: {}, + warnings: [], + enrichment: {}, + enrichmentState: {}, + }, + null, + 2, + )}\n`, + 'utf-8', + ); + return 0; + }); + const runIngest = vi.fn>(async () => 0); + + await expect( + runKtxPublicIngest( + { + command: 'run', + projectDir, + targetConnectionId: 'warehouse', + all: false, + json: false, + inputMode: 'disabled', + queryHistory: 'enabled', + }, + io.io, + { loadProject: vi.fn(async () => ({ ...project, projectDir })), runScan, runIngest }, + ), + ).resolves.toBe(0); + + const ingestArgs = runIngest.mock.calls[0]?.[0] as + | Extract>[0], { command: 'run' }> + | undefined; + expect(ingestArgs?.historicSqlPullConfigOverride).toMatchObject({ + dialect: 'postgres', + enabledSchemas: ['orbit_analytics', 'orbit_raw'], + modeledTableCatalog: [ + { catalog: null, db: 'orbit_analytics', name: 'mart_revenue' }, + { catalog: null, db: 'orbit_raw', name: 'accounts' }, + ], + }); + + await rm(projectDir, { recursive: true, force: true }); + }); + it('prints the schema-first notice for explicit query-history runs', async () => { const io = makeIo(); const project = deepReadyProject({ diff --git a/packages/cli/test/setup-databases.test.ts b/packages/cli/test/setup-databases.test.ts index 265459e2..6adb0af0 100644 --- a/packages/cli/test/setup-databases.test.ts +++ b/packages/cli/test/setup-databases.test.ts @@ -6,6 +6,7 @@ import { parseKtxProjectConfig } from '../src/context/project/config.js'; import { readKtxSetupState, writeKtxSetupState } from '../src/context/project/setup-config.js'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { + managedDaemonOptionsForSetupQueryHistoryPicker, type KtxSetupDatabaseDriver, type KtxSetupDatabasesDeps, type KtxSetupDatabasesPromptAdapter, @@ -137,6 +138,22 @@ function textInputPrompt(message: string): string { return `${title}\n│\n│ ${bodyLines.join('\n│ ')}\n│ Press Escape to go back.\n│`; } +function queryHistoryFromConfig(connection: unknown): { + filters?: { serviceAccounts?: unknown; dropTrivialProbes?: boolean }; +} | undefined { + if (!connection || typeof connection !== 'object' || Array.isArray(connection)) { + return undefined; + } + const context = (connection as { context?: unknown }).context; + if (!context || typeof context !== 'object' || Array.isArray(context)) { + return undefined; + } + const queryHistory = (context as { queryHistory?: unknown }).queryHistory; + return queryHistory && typeof queryHistory === 'object' && !Array.isArray(queryHistory) + ? (queryHistory as { filters?: { serviceAccounts?: unknown; dropTrivialProbes?: boolean } }) + : undefined; +} + describe('setup databases step', () => { let tempDir: string; @@ -150,6 +167,61 @@ describe('setup databases step', () => { await rm(tempDir, { recursive: true, force: true }); }); + it('builds managed daemon options for setup query-history SQL analysis', () => { + const io = makeIo(); + + expect( + managedDaemonOptionsForSetupQueryHistoryPicker({ + projectDir: tempDir, + args: { + inputMode: 'disabled', + cliVersion: '0.2.0', + runtimeInstallPolicy: 'auto', + }, + io: io.io, + }), + ).toEqual({ + cliVersion: '0.2.0', + projectDir: tempDir, + installPolicy: 'auto', + io: io.io, + }); + }); + + it('defaults managed daemon setup options when the database step is called directly', () => { + const io = makeIo(); + + expect( + managedDaemonOptionsForSetupQueryHistoryPicker({ + projectDir: tempDir, + args: { + inputMode: 'disabled', + }, + io: io.io, + }), + ).toMatchObject({ + cliVersion: expect.any(String), + projectDir: tempDir, + installPolicy: 'never', + io: io.io, + }); + + expect( + managedDaemonOptionsForSetupQueryHistoryPicker({ + projectDir: tempDir, + args: { + inputMode: 'auto', + }, + io: io.io, + }), + ).toMatchObject({ + cliVersion: expect.any(String), + projectDir: tempDir, + installPolicy: 'prompt', + io: io.io, + }); + }); + it('shows every supported database in the interactive checklist', async () => { const prompts = makePromptAdapter({ multiselectValues: [['back']] }); @@ -2569,6 +2641,190 @@ describe('setup databases step', () => { expect(io.stdout()).toContain('pg_stat_statements ready'); }); + it('auto-applies derived query-history service-account filters in non-interactive setup', async () => { + const io = makeIo(); + const queryHistoryFilterPicker = vi.fn(async () => ({ + excludedRoles: [ + { + role: 'svc_loader', + pattern: '^svc_loader$', + reason: 'Runs recurring loader traffic against modeled tables.', + }, + ], + consideredRoleCount: 2, + skipped: null, + warnings: [], + })); + + const result = await runKtxSetupDatabasesStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + databaseDrivers: ['postgres'], + databaseConnectionId: 'warehouse', + databaseUrl: 'env:DATABASE_URL', + databaseSchemas: ['public'], + enableQueryHistory: true, + skipDatabases: false, + }, + io.io, + { + testConnection: vi.fn(async () => 0), + scanConnection: vi.fn(async () => 0), + historicSqlReadinessProbe: vi.fn(async () => { + const runner = fakeHistoricSqlRunner('postgres', 'pg_stat_statements'); + return { + ok: true as const, + dialect: 'postgres' as const, + runner, + result: { pgServerVersion: 'PostgreSQL 16.4', warnings: [], info: [] }, + }; + }), + queryHistoryFilterPicker, + createQueryHistoryLlmRuntime: vi.fn(() => null), + }, + ); + + expect(result.status).toBe('ready'); + expect(queryHistoryFilterPicker).toHaveBeenCalledTimes(1); + const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')); + expect(config.connections.warehouse).toMatchObject({ + context: { + queryHistory: { + filters: { + dropTrivialProbes: true, + serviceAccounts: { + mode: 'exclude', + patterns: ['^svc_loader$'], + }, + }, + }, + }, + }); + expect(io.stdout()).toContain('Proposed query-history service-account filters'); + expect(io.stdout()).toContain('svc_loader'); + }); + + it('lets interactive setup skip applying derived filters', async () => { + const io = makeIo(); + const prompts = makePromptAdapter({ + selectValues: ['skip'], + }); + + const result = await runKtxSetupDatabasesStep( + { + projectDir: tempDir, + inputMode: 'auto', + yes: false, + databaseDrivers: ['postgres'], + databaseConnectionId: 'warehouse', + databaseUrl: 'env:DATABASE_URL', + databaseSchemas: ['public'], + enableQueryHistory: true, + skipDatabases: false, + }, + io.io, + { + prompts, + testConnection: vi.fn(async () => 0), + scanConnection: vi.fn(async () => 0), + historicSqlReadinessProbe: vi.fn(async () => { + const runner = fakeHistoricSqlRunner('postgres', 'pg_stat_statements'); + return { + ok: true as const, + dialect: 'postgres' as const, + runner, + result: { pgServerVersion: 'PostgreSQL 16.4', warnings: [], info: [] }, + }; + }), + queryHistoryFilterPicker: vi.fn(async () => ({ + excludedRoles: [{ role: 'svc_loader', pattern: '^svc_loader$', reason: 'Loader traffic.' }], + consideredRoleCount: 2, + skipped: null, + warnings: [], + })), + createQueryHistoryLlmRuntime: vi.fn(() => null), + }, + ); + + expect(result.status).toBe('ready'); + const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')); + expect(queryHistoryFromConfig(config.connections.warehouse)?.filters).toEqual({ dropTrivialProbes: true }); + expect(prompts.select).toHaveBeenCalledWith({ + message: 'Apply 1 derived query-history service-account exclusion?', + options: [ + { value: 'apply', label: 'Apply derived filters (recommended)' }, + { value: 'skip', label: 'Leave query history filters unchanged' }, + ], + }); + }); + + it('does not overwrite an existing serviceAccounts block', async () => { + await writeFile( + join(tempDir, 'ktx.yaml'), + [ + 'connections:', + ' warehouse:', + ' driver: postgres', + ' url: env:DATABASE_URL', + ' context:', + ' queryHistory:', + ' enabled: true', + ' filters:', + ' dropTrivialProbes: true', + ' serviceAccounts:', + ' mode: exclude', + ' patterns:', + " - '^existing$'", + '', + ].join('\n'), + 'utf-8', + ); + + const io = makeIo(); + const result = await runKtxSetupDatabasesStep( + { + projectDir: tempDir, + inputMode: 'disabled', + yes: true, + databaseConnectionIds: ['warehouse'], + databaseSchemas: [], + enableQueryHistory: true, + skipDatabases: false, + }, + io.io, + { + testConnection: vi.fn(async () => 0), + scanConnection: vi.fn(async () => 0), + historicSqlReadinessProbe: vi.fn(async () => { + const runner = fakeHistoricSqlRunner('postgres', 'pg_stat_statements'); + return { + ok: true as const, + dialect: 'postgres' as const, + runner, + result: { pgServerVersion: 'PostgreSQL 16.4', warnings: [], info: [] }, + }; + }), + queryHistoryFilterPicker: vi.fn(async () => ({ + excludedRoles: [{ role: 'svc_loader', pattern: '^svc_loader$', reason: 'Loader traffic.' }], + consideredRoleCount: 2, + skipped: { reason: 'user-block-present' as const }, + warnings: [], + })), + createQueryHistoryLlmRuntime: vi.fn(() => null), + }, + ); + + expect(result.status).toBe('ready'); + const config = parseKtxProjectConfig(await readFile(join(tempDir, 'ktx.yaml'), 'utf-8')); + expect(queryHistoryFromConfig(config.connections.warehouse)?.filters?.serviceAccounts).toEqual({ + mode: 'exclude', + patterns: ['^existing$'], + }); + expect(io.stdout()).toContain('Existing query-history service-account filters left unchanged'); + }); + it('asks interactive Postgres setup whether to enable query history', async () => { await writeFile( join(tempDir, 'ktx.yaml'), diff --git a/packages/cli/test/setup.test.ts b/packages/cli/test/setup.test.ts index e4eca44d..9b8bf689 100644 --- a/packages/cli/test/setup.test.ts +++ b/packages/cli/test/setup.test.ts @@ -1684,6 +1684,9 @@ describe('setup status', () => { expect.objectContaining({ projectDir: tempDir, inputMode: 'disabled', + yes: true, + cliVersion: '0.2.0', + runtimeInstallPolicy: 'auto', databaseDrivers: ['postgres'], databaseConnectionId: 'warehouse', databaseUrl: 'env:DATABASE_URL', diff --git a/packages/cli/test/sql.test.ts b/packages/cli/test/sql.test.ts index b48ebe5b..ef74fd49 100644 --- a/packages/cli/test/sql.test.ts +++ b/packages/cli/test/sql.test.ts @@ -33,7 +33,7 @@ function makeIo(options: { isTTY?: boolean } = {}) { function makeSqlAnalysis(result: Awaited>): SqlAnalysisPort { return { analyzeForFingerprint: vi.fn(), - analyzeBatch: vi.fn(async () => new Map([['cli-sql', { tablesTouched: ['orders'], columnsByClause: {} }]])), + analyzeBatch: vi.fn(async () => new Map([['cli-sql', { tablesTouched: [{ catalog: null, db: null, name: 'orders' }], columnsByClause: {} }]])), validateReadOnly: vi.fn(async () => result), }; } diff --git a/python/ktx-daemon/src/ktx_daemon/sql_analysis.py b/python/ktx-daemon/src/ktx_daemon/sql_analysis.py index e831e47f..54f3d0e2 100644 --- a/python/ktx-daemon/src/ktx_daemon/sql_analysis.py +++ b/python/ktx-daemon/src/ktx_daemon/sql_analysis.py @@ -2,15 +2,32 @@ from __future__ import annotations import os from concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass from typing import Literal import sqlglot from pydantic import BaseModel, Field from sqlglot import exp +from sqlglot.optimizer.normalize_identifiers import normalize_identifiers +from sqlglot.optimizer.qualify_tables import qualify_tables SqlAnalysisClause = Literal["select", "where", "join", "groupBy", "having", "orderBy"] +class SqlAnalysisTableRef(BaseModel): + catalog: str | None = None + db: str | None = None + name: str + + +class SqlAnalysisCatalogTable(SqlAnalysisTableRef): + columns: list[str] = Field(default_factory=list) + + +class AnalyzeSqlCatalog(BaseModel): + tables: list[SqlAnalysisCatalogTable] = Field(default_factory=list) + + class AnalyzeSqlBatchItem(BaseModel): id: str sql: str @@ -19,11 +36,12 @@ class AnalyzeSqlBatchItem(BaseModel): class AnalyzeSqlBatchRequest(BaseModel): dialect: str items: list[AnalyzeSqlBatchItem] + catalog: AnalyzeSqlCatalog | None = None max_workers: int | None = Field(default=None, ge=1, le=32) class AnalyzeSqlBatchResult(BaseModel): - tables_touched: list[str] = Field(default_factory=list) + tables_touched: list[SqlAnalysisTableRef] = Field(default_factory=list) columns_by_clause: dict[SqlAnalysisClause, list[str]] = Field(default_factory=dict) error: str | None = None @@ -82,17 +100,76 @@ def _ordered_unique(values: list[str]) -> list[str]: return result -def _table_ref(table: exp.Table) -> str: - parts: list[str] = [] +def _normalize_identifier(value: str | None, dialect: str) -> str | None: + if value is None: + return None + identifier = exp.to_identifier(value) + identifier.meta["is_table"] = True + normalized = normalize_identifiers(identifier, dialect=dialect) + return str(normalized.name) + + +def _normalized_ref(ref: SqlAnalysisTableRef, dialect: str) -> SqlAnalysisTableRef: + return SqlAnalysisTableRef( + catalog=_normalize_identifier(ref.catalog, dialect), + db=_normalize_identifier(ref.db, dialect), + name=_normalize_identifier(ref.name, dialect) or ref.name, + ) + + +@dataclass(frozen=True) +class _CatalogIndex: + by_full: dict[tuple[str | None, str | None, str], SqlAnalysisTableRef] + by_name: dict[str, list[SqlAnalysisTableRef]] + + +def _catalog_index( + catalog: AnalyzeSqlCatalog | None, dialect: str +) -> _CatalogIndex | None: + if catalog is None or not catalog.tables: + return None + by_full: dict[tuple[str | None, str | None, str], SqlAnalysisTableRef] = {} + by_name: dict[str, list[SqlAnalysisTableRef]] = {} + for table in catalog.tables: + ref = _normalized_ref(table, dialect) + key = (ref.catalog, ref.db, ref.name) + by_full[key] = ref + by_name.setdefault(ref.name, []).append(ref) + return _CatalogIndex(by_full=by_full, by_name=by_name) + + +def _raw_table_ref(table: exp.Table, dialect: str) -> SqlAnalysisTableRef | None: + if not table.name: + return None catalog = table.args.get("catalog") db = table.args.get("db") - if catalog is not None and getattr(catalog, "name", None): - parts.append(str(catalog.name)) - if db is not None and getattr(db, "name", None): - parts.append(str(db.name)) - if table.name: - parts.append(str(table.name)) - return ".".join(parts) + return _normalized_ref( + SqlAnalysisTableRef( + catalog=str(catalog.name) + if catalog is not None and getattr(catalog, "name", None) + else None, + db=str(db.name) if db is not None and getattr(db, "name", None) else None, + name=str(table.name), + ), + dialect, + ) + + +def _resolve_table_refs( + raw: SqlAnalysisTableRef, + catalog: _CatalogIndex | None, +) -> list[SqlAnalysisTableRef]: + if catalog is None: + return [raw] + exact = catalog.by_full.get((raw.catalog, raw.db, raw.name)) + if exact is not None: + return [exact] + if raw.db is not None: + return [raw] + matches = catalog.by_name.get(raw.name, []) + if matches: + return matches + return [SqlAnalysisTableRef(catalog=None, db=None, name=raw.name)] def _column_name(column: exp.Column) -> str: @@ -146,33 +223,48 @@ def _columns_by_clause(tree: exp.Expression) -> dict[SqlAnalysisClause, list[str return result +def _table_refs( + tree: exp.Expression, dialect: str, catalog: _CatalogIndex | None +) -> list[SqlAnalysisTableRef]: + normalized_tree = normalize_identifiers(tree, dialect=dialect) + qualified_tree = qualify_tables(normalized_tree, dialect=dialect) + cte_names = {cte.alias_or_name.lower() for cte in qualified_tree.find_all(exp.CTE)} + refs: list[SqlAnalysisTableRef] = [] + seen: set[tuple[str | None, str | None, str]] = set() + for table in qualified_tree.find_all(exp.Table): + if table.name.lower() in cte_names: + continue + raw = _raw_table_ref(table, dialect) + if raw is None: + continue + for ref in _resolve_table_refs(raw, catalog): + key = (ref.catalog, ref.db, ref.name) + if key not in seen: + seen.add(key) + refs.append(ref) + return refs + + def _analyze_one( - item_id: str, sql: str, dialect: str + item_id: str, sql: str, dialect: str, catalog: _CatalogIndex | None ) -> tuple[str, AnalyzeSqlBatchResult]: try: tree = sqlglot.parse_one(sql, read=dialect) except sqlglot.errors.SqlglotError as exc: return item_id, AnalyzeSqlBatchResult(error=str(exc)) - cte_names = {cte.alias_or_name.lower() for cte in tree.find_all(exp.CTE)} - table_refs = [ - table_ref - for table_ref in (_table_ref(table) for table in tree.find_all(exp.Table)) - if table_ref and table_ref.split(".")[-1].lower() not in cte_names - ] - return item_id, AnalyzeSqlBatchResult( - tables_touched=_ordered_unique(table_refs), + tables_touched=_table_refs(tree, dialect, catalog), columns_by_clause=_columns_by_clause(tree), error=None, ) def _analyze_payload( - payload: tuple[str, str, str], + payload: tuple[str, str, str, _CatalogIndex | None], ) -> tuple[str, AnalyzeSqlBatchResult]: - item_id, sql, dialect = payload - return _analyze_one(item_id, sql, dialect) + item_id, sql, dialect, catalog = payload + return _analyze_one(item_id, sql, dialect, catalog) def validate_read_only_sql_response( @@ -222,7 +314,8 @@ def _worker_count(request: AnalyzeSqlBatchRequest) -> int: def analyze_sql_batch_response( request: AnalyzeSqlBatchRequest, ) -> AnalyzeSqlBatchResponse: - payloads = [(item.id, item.sql, request.dialect) for item in request.items] + catalog = _catalog_index(request.catalog, request.dialect) + payloads = [(item.id, item.sql, request.dialect, catalog) for item in request.items] if _worker_count(request) == 1: analyzed = [_analyze_payload(payload) for payload in payloads] else: diff --git a/python/ktx-daemon/tests/test_app.py b/python/ktx-daemon/tests/test_app.py index 9960daaf..2c3237ad 100644 --- a/python/ktx-daemon/tests/test_app.py +++ b/python/ktx-daemon/tests/test_app.py @@ -368,7 +368,9 @@ def test_sql_analyze_batch_endpoint_returns_per_item_results() -> None: assert response.status_code == 200 body = response.json() - assert body["results"]["orders"]["tables_touched"] == ["public.orders"] + assert body["results"]["orders"]["tables_touched"] == [ + {"catalog": None, "db": "public", "name": "orders"} + ] assert body["results"]["orders"]["columns_by_clause"] == { "select": ["status"], "where": ["created_at"], diff --git a/python/ktx-daemon/tests/test_sql_analysis.py b/python/ktx-daemon/tests/test_sql_analysis.py index 855d16fd..2fb3970a 100644 --- a/python/ktx-daemon/tests/test_sql_analysis.py +++ b/python/ktx-daemon/tests/test_sql_analysis.py @@ -32,7 +32,10 @@ def test_analyze_sql_batch_extracts_tables_and_clause_columns() -> None: result = response.results["orders_by_customer"] assert result.error is None - assert result.tables_touched == ["public.orders", "public.customers"] + assert [item.model_dump() for item in result.tables_touched] == [ + {"catalog": None, "db": "public", "name": "orders"}, + {"catalog": None, "db": "public", "name": "customers"}, + ] assert result.columns_by_clause == { "select": ["status"], "where": ["created_at"], @@ -56,6 +59,114 @@ def test_analyze_sql_batch_returns_per_item_parse_errors() -> None: assert result.error is not None +def test_analyze_sql_batch_qualifies_bare_table_from_catalog() -> None: + response = analyze_sql_batch_response( + AnalyzeSqlBatchRequest( + dialect="postgres", + catalog={ + "tables": [ + { + "catalog": None, + "db": "orbit_raw", + "name": "accounts", + "columns": ["id"], + }, + { + "catalog": None, + "db": "orbit_analytics", + "name": "orders", + "columns": ["id"], + }, + ] + }, + items=[AnalyzeSqlBatchItem(id="bare", sql="select id from accounts")], + max_workers=1, + ) + ) + + assert [item.model_dump() for item in response.results["bare"].tables_touched] == [ + {"catalog": None, "db": "orbit_raw", "name": "accounts"} + ] + + +def test_analyze_sql_batch_returns_all_ambiguous_modeled_matches() -> None: + response = analyze_sql_batch_response( + AnalyzeSqlBatchRequest( + dialect="postgres", + catalog={ + "tables": [ + { + "catalog": None, + "db": "orbit_raw", + "name": "events", + "columns": ["id"], + }, + { + "catalog": None, + "db": "orbit_analytics", + "name": "events", + "columns": ["id"], + }, + ] + }, + items=[AnalyzeSqlBatchItem(id="ambiguous", sql="select id from events")], + max_workers=1, + ) + ) + + assert [ + item.model_dump() for item in response.results["ambiguous"].tables_touched + ] == [ + {"catalog": None, "db": "orbit_raw", "name": "events"}, + {"catalog": None, "db": "orbit_analytics", "name": "events"}, + ] + + +def test_analyze_sql_batch_leaves_unresolved_bare_refs_unqualified() -> None: + response = analyze_sql_batch_response( + AnalyzeSqlBatchRequest( + dialect="postgres", + catalog={ + "tables": [{"catalog": None, "db": "orbit_raw", "name": "accounts"}] + }, + items=[AnalyzeSqlBatchItem(id="missing", sql="select * from invoices")], + max_workers=1, + ) + ) + + assert [ + item.model_dump() for item in response.results["missing"].tables_touched + ] == [{"catalog": None, "db": None, "name": "invoices"}] + + +def test_analyze_sql_batch_returns_bigquery_project_dataset_table_refs() -> None: + response = analyze_sql_batch_response( + AnalyzeSqlBatchRequest( + dialect="bigquery", + catalog={ + "tables": [ + { + "catalog": "demo-project", + "db": "orbit_analytics", + "name": "orders", + } + ] + }, + items=[ + AnalyzeSqlBatchItem( + id="bq", + sql="select * from `demo-project.orbit_analytics.orders`", + ) + ], + max_workers=1, + ) + ) + + assert [item.model_dump() for item in response.results["bq"].tables_touched] == [ + {"catalog": "demo-project", "db": "orbit_analytics", "name": "orders"} + ] + + def test_columns_from_nodes_ignores_non_expression_clause_values() -> None: assert _columns_from_nodes([True, False, None]) == []