feat(mysql): implement columnStats using INFORMATION_SCHEMA.STATISTICS (#233)

* feat(mysql): implement columnStats using INFORMATION_SCHEMA.STATISTICS

Enable column cardinality statistics for the MySQL connector by querying
INFORMATION_SCHEMA.STATISTICS, which provides index-based cardinality
estimates without requiring additional permissions.

- Add generateColumnStatisticsQuery() to KtxMysqlDialect
- Add getColumnStatistics() and columnStats() to KtxMysqlScanConnector
- Flip columnStats capability from false to true
- Add MysqlStatsRow and KtxMysqlColumnStatisticsResult interfaces
- Add tests for dialect query generation and connector stats retrieval
- Update dialect conformance fixture for mysql

* fix(mysql): filter to leading index columns to avoid inflated cardinality

Add an `AND SEQ_IN_INDEX = 1` predicate to the INFORMATION_SCHEMA.STATISTICS
query so that only leading index columns are returned. For composite indexes,
non-leading columns report the cardinality of the index prefix rather
than the column's own distinct count, which inflates distinctCount.

Add regression test asserting SEQ_IN_INDEX = 1 is present in the query.

* fix: add trailing newline to dialect.test.ts

---------

Co-authored-by: Andrey Avtomonov <andreybavt@gmail.com>
This commit is contained in:
Mayorkun Ayanshina 2026-06-08 11:21:19 +01:00 committed by GitHub
parent 0d0ea55184
commit 18245c2373
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 97 additions and 10 deletions

View file

@ -74,6 +74,16 @@ function fakePoolFactory(): KtxMysqlPoolFactory {
if (sql.trim() === 'SELECT 1') {
return mysqlResult([{ '1': 1 }], [{ name: '1', type: 8 }]);
}
if (sql.includes('INFORMATION_SCHEMA.STATISTICS')) {
expect(sql).toContain('SEQ_IN_INDEX = 1');
return mysqlResult(
[
{ column_name: 'id', estimated_cardinality: 2 },
{ column_name: 'customer_id', estimated_cardinality: 2 },
],
[{ name: 'column_name' }, { name: 'estimated_cardinality' }],
);
}
throw new Error(`Unexpected SQL: ${sql} params=${JSON.stringify(params)}`);
});
const release = vi.fn();
@ -515,10 +525,25 @@ describe('KtxMysqlScanConnector', () => {
{ catalog: null, schema: 'analytics', name: 'orders', kind: 'table' },
{ catalog: null, schema: 'analytics', name: 'order_summary', kind: 'view' },
]);
await expect(connector.columnStats(
{ connectionId: 'warehouse', table: { catalog: null, db: 'analytics', name: 'orders' }, column: 'status' },
{ runId: 'scan-run-1' },
)).resolves.toBeNull();
await expect(
connector.columnStats(
{ connectionId: 'warehouse', table: { catalog: null, db: 'analytics', name: 'orders' }, column: 'id' },
{ runId: 'scan-run-1' },
),
).resolves.toEqual({ min: null, max: null, average: null, nullCount: null, distinctCount: 2 });
await expect(
connector.columnStats(
{ connectionId: 'warehouse', table: { catalog: null, db: 'analytics', name: 'orders' }, column: 'status' },
{ runId: 'scan-run-1' },
),
).resolves.toBeNull();
await expect(
connector.getColumnStatistics({ catalog: null, db: 'analytics', name: 'orders' }),
).resolves.toMatchObject({
cardinalityByColumn: new Map([['id', 2], ['customer_id', 2]]),
});
await connector.cleanup();
});

View file

@ -36,4 +36,26 @@ describe('KtxMysqlDialect', () => {
expect(dialect.getLimitOffsetClause(10, 20)).toBe('LIMIT 10 OFFSET 20');
});
it('generates column statistics query using INFORMATION_SCHEMA.STATISTICS', () => {
const sql = dialect.generateColumnStatisticsQuery('analytics', 'orders');
expect(sql).not.toBeNull();
expect(sql).toContain('INFORMATION_SCHEMA.STATISTICS');
expect(sql).toContain("TABLE_SCHEMA = 'analytics'");
expect(sql).toContain("TABLE_NAME = 'orders'");
expect(sql).toContain('CARDINALITY IS NOT NULL');
expect(sql).toContain('column_name');
expect(sql).toContain('estimated_cardinality');
});
it('filters to leading index columns only (SEQ_IN_INDEX = 1) to avoid inflated cardinality from composite indexes', () => {
const sql = dialect.generateColumnStatisticsQuery('analytics', 'orders');
expect(sql).toContain('SEQ_IN_INDEX = 1');
});
it('escapes single quotes in schema and table names for statistics query', () => {
const sql = dialect.generateColumnStatisticsQuery("andy's_db", "o'rders");
expect(sql).toContain("TABLE_SCHEMA = 'andy''s_db'");
expect(sql).toContain("TABLE_NAME = 'o''rders'");
});
});

View file

@ -89,7 +89,7 @@ const fixtures: DialectFixture[] = [
cardinalityContains: 'SELECT COUNT(DISTINCT val) AS cardinality',
randomizedCardinalityContains: 'ORDER BY RAND()',
distinctValuesContains: 'SELECT DISTINCT CAST(`status` AS CHAR) AS val',
statisticsContains: null,
statisticsContains: 'INFORMATION_SCHEMA.STATISTICS',
dimensionInput: 'tinyint(1)',
dimensionType: 'boolean',
nativeTypeInput: 'varchar(255)',