From 18245c23730d6019bea75795bd910d840052680d Mon Sep 17 00:00:00 2001
From: Mayorkun Ayanshina
Date: Mon, 8 Jun 2026 11:21:19 +0100
Subject: [PATCH] feat(mysql): implement columnStats using INFORMATION_SCHEMA.STATISTICS (#233)

* feat(mysql): implement columnStats using INFORMATION_SCHEMA.STATISTICS

Enable column cardinality statistics for the MySQL connector by querying
INFORMATION_SCHEMA.STATISTICS, which provides index-based cardinality
estimates without requiring additional permissions.

- Implement generateColumnStatisticsQuery() in KtxMysqlDialect
  (previously a stub that returned null)
- Add getColumnStatistics() and implement columnStats() in
  KtxMysqlScanConnector (columnStats() previously returned null)
- Flip columnStats capability from false to true
- Add MysqlStatsRow and KtxMysqlColumnStatisticsResult interfaces
- Add tests for dialect query generation and connector stats retrieval
- Update dialect conformance fixture for mysql

* fix(mysql): filter to leading index columns to avoid inflated cardinality

Add AND SEQ_IN_INDEX = 1 to the INFORMATION_SCHEMA.STATISTICS query so
that only leading index columns are returned. For composite indexes,
non-leading columns report the cardinality of the index prefix rather
than the column's own distinct count, which inflates distinctCount.
Columns that never appear as the leading column of an index therefore
get no estimate, and columnStats() returns null for them.

Add a regression test asserting SEQ_IN_INDEX = 1 is present in the query.
* fix: add trailing newline to dialect.test.ts --------- Co-authored-by: Andrey Avtomonov --- .../cli/src/connectors/mysql/connector.ts | 36 +++++++++++++++++-- packages/cli/src/connectors/mysql/dialect.ts | 14 ++++++-- .../test/connectors/mysql/connector.test.ts | 33 ++++++++++++++--- .../cli/test/connectors/mysql/dialect.test.ts | 22 ++++++++++++ .../test/context/connections/dialects.test.ts | 2 +- 5 files changed, 97 insertions(+), 10 deletions(-) diff --git a/packages/cli/src/connectors/mysql/connector.ts b/packages/cli/src/connectors/mysql/connector.ts index c147c7dd..080c5cdd 100644 --- a/packages/cli/src/connectors/mysql/connector.ts +++ b/packages/cli/src/connectors/mysql/connector.ts @@ -159,6 +159,15 @@ interface MysqlDistinctValueRow extends RowDataPacket { val: unknown; } +interface MysqlStatsRow extends RowDataPacket { + column_name: string; + estimated_cardinality: number | null; +} + +export interface KtxMysqlColumnStatisticsResult { + cardinalityByColumn: Map; +} + class DefaultMysqlPoolFactory implements KtxMysqlPoolFactory { createPool(config: KtxMysqlPoolConfig): KtxMysqlPool { return mysql.createPool(config) as Pool; @@ -384,7 +393,7 @@ export class KtxMysqlScanConnector implements KtxScanConnector { readonly capabilities = createKtxConnectorCapabilities({ tableSampling: true, columnSampling: true, - columnStats: false, + columnStats: true, readOnlySql: true, nestedAnalysis: true, formalForeignKeys: true, @@ -562,8 +571,29 @@ export class KtxMysqlScanConnector implements KtxScanConnector { return { values, nullCount: null, distinctCount: null }; } - async columnStats(_input: KtxColumnStatsInput, _ctx: KtxScanContext): Promise { - return null; + async columnStats(input: KtxColumnStatsInput, _ctx: KtxScanContext): Promise { + const stats = await this.getColumnStatistics(input.table); + const value = stats?.cardinalityByColumn.get(input.column); + return value === undefined + ? 
null + : { min: null, max: null, average: null, nullCount: null, distinctCount: value }; + } + + async getColumnStatistics(table: KtxTableRef): Promise { + const schema = table.db ?? this.poolConfig.database; + const sql = this.dialect.generateColumnStatisticsQuery(schema, table.name); + if (!sql) { + return null; + } + const rows = await this.queryRaw(sql); + const cardinalityByColumn = new Map(); + for (const row of rows) { + const cardinality = Number(row.estimated_cardinality); + if (Number.isFinite(cardinality) && cardinality >= 0) { + cardinalityByColumn.set(row.column_name, cardinality); + } + } + return cardinalityByColumn.size > 0 ? { cardinalityByColumn } : null; } async executeReadOnly(input: KtxMysqlReadOnlyQueryInput, _ctx: KtxScanContext): Promise { diff --git a/packages/cli/src/connectors/mysql/dialect.ts b/packages/cli/src/connectors/mysql/dialect.ts index 7f9cc725..6b26c97a 100644 --- a/packages/cli/src/connectors/mysql/dialect.ts +++ b/packages/cli/src/connectors/mysql/dialect.ts @@ -171,8 +171,18 @@ export class KtxMysqlDialect implements KtxDialect { `; } - generateColumnStatisticsQuery(_schemaName: string, _tableName: string): string | null { - return null; + generateColumnStatisticsQuery(schemaName: string, tableName: string): string | null { + return ` + SELECT + COLUMN_NAME AS column_name, + MAX(CARDINALITY) AS estimated_cardinality + FROM INFORMATION_SCHEMA.STATISTICS + WHERE TABLE_SCHEMA = '${schemaName.replace(/'/g, "''")}' + AND TABLE_NAME = '${tableName.replace(/'/g, "''")}' + AND CARDINALITY IS NOT NULL + AND SEQ_IN_INDEX = 1 + GROUP BY COLUMN_NAME + `; } generateRandomizedCardinalitySampleQuery(tableName: string, columnName: string, sampleSize: number): string { diff --git a/packages/cli/test/connectors/mysql/connector.test.ts b/packages/cli/test/connectors/mysql/connector.test.ts index c8334164..829d2b0e 100644 --- a/packages/cli/test/connectors/mysql/connector.test.ts +++ b/packages/cli/test/connectors/mysql/connector.test.ts @@ 
-74,6 +74,16 @@ function fakePoolFactory(): KtxMysqlPoolFactory { if (sql.trim() === 'SELECT 1') { return mysqlResult([{ '1': 1 }], [{ name: '1', type: 8 }]); } + if (sql.includes('INFORMATION_SCHEMA.STATISTICS')) { + expect(sql).toContain('SEQ_IN_INDEX = 1'); + return mysqlResult( + [ + { column_name: 'id', estimated_cardinality: 2 }, + { column_name: 'customer_id', estimated_cardinality: 2 }, + ], + [{ name: 'column_name' }, { name: 'estimated_cardinality' }], + ); + } throw new Error(`Unexpected SQL: ${sql} params=${JSON.stringify(params)}`); }); const release = vi.fn(); @@ -515,10 +525,25 @@ describe('KtxMysqlScanConnector', () => { { catalog: null, schema: 'analytics', name: 'orders', kind: 'table' }, { catalog: null, schema: 'analytics', name: 'order_summary', kind: 'view' }, ]); - await expect(connector.columnStats( - { connectionId: 'warehouse', table: { catalog: null, db: 'analytics', name: 'orders' }, column: 'status' }, - { runId: 'scan-run-1' }, - )).resolves.toBeNull(); + await expect( + connector.columnStats( + { connectionId: 'warehouse', table: { catalog: null, db: 'analytics', name: 'orders' }, column: 'id' }, + { runId: 'scan-run-1' }, + ), + ).resolves.toEqual({ min: null, max: null, average: null, nullCount: null, distinctCount: 2 }); + + await expect( + connector.columnStats( + { connectionId: 'warehouse', table: { catalog: null, db: 'analytics', name: 'orders' }, column: 'status' }, + { runId: 'scan-run-1' }, + ), + ).resolves.toBeNull(); + + await expect( + connector.getColumnStatistics({ catalog: null, db: 'analytics', name: 'orders' }), + ).resolves.toMatchObject({ + cardinalityByColumn: new Map([['id', 2], ['customer_id', 2]]), + }); await connector.cleanup(); }); diff --git a/packages/cli/test/connectors/mysql/dialect.test.ts b/packages/cli/test/connectors/mysql/dialect.test.ts index a00d6188..26fade92 100644 --- a/packages/cli/test/connectors/mysql/dialect.test.ts +++ b/packages/cli/test/connectors/mysql/dialect.test.ts @@ -36,4 +36,26 
@@ describe('KtxMysqlDialect', () => { expect(dialect.getLimitOffsetClause(10, 20)).toBe('LIMIT 10 OFFSET 20'); }); + + it('generates column statistics query using INFORMATION_SCHEMA.STATISTICS', () => { + const sql = dialect.generateColumnStatisticsQuery('analytics', 'orders'); + expect(sql).not.toBeNull(); + expect(sql).toContain('INFORMATION_SCHEMA.STATISTICS'); + expect(sql).toContain("TABLE_SCHEMA = 'analytics'"); + expect(sql).toContain("TABLE_NAME = 'orders'"); + expect(sql).toContain('CARDINALITY IS NOT NULL'); + expect(sql).toContain('column_name'); + expect(sql).toContain('estimated_cardinality'); + }); + + it('filters to leading index columns only (SEQ_IN_INDEX = 1) to avoid inflated cardinality from composite indexes', () => { + const sql = dialect.generateColumnStatisticsQuery('analytics', 'orders'); + expect(sql).toContain('SEQ_IN_INDEX = 1'); + }); + + it('escapes single quotes in schema and table names for statistics query', () => { + const sql = dialect.generateColumnStatisticsQuery("andy's_db", "o'rders"); + expect(sql).toContain("TABLE_SCHEMA = 'andy''s_db'"); + expect(sql).toContain("TABLE_NAME = 'o''rders'"); + }); }); diff --git a/packages/cli/test/context/connections/dialects.test.ts b/packages/cli/test/context/connections/dialects.test.ts index 0b72566e..217be1eb 100644 --- a/packages/cli/test/context/connections/dialects.test.ts +++ b/packages/cli/test/context/connections/dialects.test.ts @@ -89,7 +89,7 @@ const fixtures: DialectFixture[] = [ cardinalityContains: 'SELECT COUNT(DISTINCT val) AS cardinality', randomizedCardinalityContains: 'ORDER BY RAND()', distinctValuesContains: 'SELECT DISTINCT CAST(`status` AS CHAR) AS val', - statisticsContains: null, + statisticsContains: 'INFORMATION_SCHEMA.STATISTICS', dimensionInput: 'tinyint(1)', dimensionType: 'boolean', nativeTypeInput: 'varchar(255)',