mirror of
https://github.com/Kaelio/ktx.git
synced 2026-06-10 08:05:14 +02:00
feat(mysql): implement columnStats using INFORMATION_SCHEMA.STATISTICS (#233)
* feat(mysql): implement columnStats using INFORMATION_SCHEMA.STATISTICS

  Enable column cardinality statistics for the MySQL connector by querying INFORMATION_SCHEMA.STATISTICS, which provides index-based cardinality estimates without requiring additional permissions.

  - Add generateColumnStatisticsQuery() to KtxMysqlDialect
  - Add getColumnStatistics() and columnStats() to KtxMysqlScanConnector
  - Flip columnStats capability from false to true
  - Add MysqlStatsRow and KtxMysqlColumnStatisticsResult interfaces
  - Add tests for dialect query generation and connector stats retrieval
  - Update dialect conformance fixture for mysql

* fix(mysql): filter to leading index columns to avoid inflated cardinality

  Add `AND SEQ_IN_INDEX = 1` to the INFORMATION_SCHEMA.STATISTICS query to ensure only leading index columns are returned. For composite indexes, non-leading columns report the cardinality of the index prefix rather than the column's own distinct count, which inflates distinctCount.

  Add regression test asserting `SEQ_IN_INDEX = 1` is present in the query.

* fix: add trailing newline to dialect.test.ts

---------

Co-authored-by: Andrey Avtomonov <andreybavt@gmail.com>
This commit is contained in:
parent
0d0ea55184
commit
18245c2373
5 changed files with 97 additions and 10 deletions
|
|
@ -159,6 +159,15 @@ interface MysqlDistinctValueRow extends RowDataPacket {
|
||||||
val: unknown;
|
val: unknown;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface MysqlStatsRow extends RowDataPacket {
|
||||||
|
column_name: string;
|
||||||
|
estimated_cardinality: number | null;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface KtxMysqlColumnStatisticsResult {
|
||||||
|
cardinalityByColumn: Map<string, number>;
|
||||||
|
}
|
||||||
|
|
||||||
class DefaultMysqlPoolFactory implements KtxMysqlPoolFactory {
|
class DefaultMysqlPoolFactory implements KtxMysqlPoolFactory {
|
||||||
createPool(config: KtxMysqlPoolConfig): KtxMysqlPool {
|
createPool(config: KtxMysqlPoolConfig): KtxMysqlPool {
|
||||||
return mysql.createPool(config) as Pool;
|
return mysql.createPool(config) as Pool;
|
||||||
|
|
@ -384,7 +393,7 @@ export class KtxMysqlScanConnector implements KtxScanConnector {
|
||||||
readonly capabilities = createKtxConnectorCapabilities({
|
readonly capabilities = createKtxConnectorCapabilities({
|
||||||
tableSampling: true,
|
tableSampling: true,
|
||||||
columnSampling: true,
|
columnSampling: true,
|
||||||
columnStats: false,
|
columnStats: true,
|
||||||
readOnlySql: true,
|
readOnlySql: true,
|
||||||
nestedAnalysis: true,
|
nestedAnalysis: true,
|
||||||
formalForeignKeys: true,
|
formalForeignKeys: true,
|
||||||
|
|
@ -562,8 +571,29 @@ export class KtxMysqlScanConnector implements KtxScanConnector {
|
||||||
return { values, nullCount: null, distinctCount: null };
|
return { values, nullCount: null, distinctCount: null };
|
||||||
}
|
}
|
||||||
|
|
||||||
async columnStats(_input: KtxColumnStatsInput, _ctx: KtxScanContext): Promise<KtxColumnStatsResult | null> {
|
async columnStats(input: KtxColumnStatsInput, _ctx: KtxScanContext): Promise<KtxColumnStatsResult | null> {
|
||||||
return null;
|
const stats = await this.getColumnStatistics(input.table);
|
||||||
|
const value = stats?.cardinalityByColumn.get(input.column);
|
||||||
|
return value === undefined
|
||||||
|
? null
|
||||||
|
: { min: null, max: null, average: null, nullCount: null, distinctCount: value };
|
||||||
|
}
|
||||||
|
|
||||||
|
async getColumnStatistics(table: KtxTableRef): Promise<KtxMysqlColumnStatisticsResult | null> {
|
||||||
|
const schema = table.db ?? this.poolConfig.database;
|
||||||
|
const sql = this.dialect.generateColumnStatisticsQuery(schema, table.name);
|
||||||
|
if (!sql) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
const rows = await this.queryRaw<MysqlStatsRow>(sql);
|
||||||
|
const cardinalityByColumn = new Map<string, number>();
|
||||||
|
for (const row of rows) {
|
||||||
|
const cardinality = Number(row.estimated_cardinality);
|
||||||
|
if (Number.isFinite(cardinality) && cardinality >= 0) {
|
||||||
|
cardinalityByColumn.set(row.column_name, cardinality);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return cardinalityByColumn.size > 0 ? { cardinalityByColumn } : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async executeReadOnly(input: KtxMysqlReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
|
async executeReadOnly(input: KtxMysqlReadOnlyQueryInput, _ctx: KtxScanContext): Promise<KtxQueryResult> {
|
||||||
|
|
|
||||||
|
|
@ -171,8 +171,18 @@ export class KtxMysqlDialect implements KtxDialect {
|
||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
|
|
||||||
generateColumnStatisticsQuery(_schemaName: string, _tableName: string): string | null {
|
generateColumnStatisticsQuery(schemaName: string, tableName: string): string | null {
|
||||||
return null;
|
return `
|
||||||
|
SELECT
|
||||||
|
COLUMN_NAME AS column_name,
|
||||||
|
MAX(CARDINALITY) AS estimated_cardinality
|
||||||
|
FROM INFORMATION_SCHEMA.STATISTICS
|
||||||
|
WHERE TABLE_SCHEMA = '${schemaName.replace(/'/g, "''")}'
|
||||||
|
AND TABLE_NAME = '${tableName.replace(/'/g, "''")}'
|
||||||
|
AND CARDINALITY IS NOT NULL
|
||||||
|
AND SEQ_IN_INDEX = 1
|
||||||
|
GROUP BY COLUMN_NAME
|
||||||
|
`;
|
||||||
}
|
}
|
||||||
|
|
||||||
generateRandomizedCardinalitySampleQuery(tableName: string, columnName: string, sampleSize: number): string {
|
generateRandomizedCardinalitySampleQuery(tableName: string, columnName: string, sampleSize: number): string {
|
||||||
|
|
|
||||||
|
|
@ -74,6 +74,16 @@ function fakePoolFactory(): KtxMysqlPoolFactory {
|
||||||
if (sql.trim() === 'SELECT 1') {
|
if (sql.trim() === 'SELECT 1') {
|
||||||
return mysqlResult([{ '1': 1 }], [{ name: '1', type: 8 }]);
|
return mysqlResult([{ '1': 1 }], [{ name: '1', type: 8 }]);
|
||||||
}
|
}
|
||||||
|
if (sql.includes('INFORMATION_SCHEMA.STATISTICS')) {
|
||||||
|
expect(sql).toContain('SEQ_IN_INDEX = 1');
|
||||||
|
return mysqlResult(
|
||||||
|
[
|
||||||
|
{ column_name: 'id', estimated_cardinality: 2 },
|
||||||
|
{ column_name: 'customer_id', estimated_cardinality: 2 },
|
||||||
|
],
|
||||||
|
[{ name: 'column_name' }, { name: 'estimated_cardinality' }],
|
||||||
|
);
|
||||||
|
}
|
||||||
throw new Error(`Unexpected SQL: ${sql} params=${JSON.stringify(params)}`);
|
throw new Error(`Unexpected SQL: ${sql} params=${JSON.stringify(params)}`);
|
||||||
});
|
});
|
||||||
const release = vi.fn();
|
const release = vi.fn();
|
||||||
|
|
@ -515,10 +525,25 @@ describe('KtxMysqlScanConnector', () => {
|
||||||
{ catalog: null, schema: 'analytics', name: 'orders', kind: 'table' },
|
{ catalog: null, schema: 'analytics', name: 'orders', kind: 'table' },
|
||||||
{ catalog: null, schema: 'analytics', name: 'order_summary', kind: 'view' },
|
{ catalog: null, schema: 'analytics', name: 'order_summary', kind: 'view' },
|
||||||
]);
|
]);
|
||||||
await expect(connector.columnStats(
|
await expect(
|
||||||
{ connectionId: 'warehouse', table: { catalog: null, db: 'analytics', name: 'orders' }, column: 'status' },
|
connector.columnStats(
|
||||||
{ runId: 'scan-run-1' },
|
{ connectionId: 'warehouse', table: { catalog: null, db: 'analytics', name: 'orders' }, column: 'id' },
|
||||||
)).resolves.toBeNull();
|
{ runId: 'scan-run-1' },
|
||||||
|
),
|
||||||
|
).resolves.toEqual({ min: null, max: null, average: null, nullCount: null, distinctCount: 2 });
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
connector.columnStats(
|
||||||
|
{ connectionId: 'warehouse', table: { catalog: null, db: 'analytics', name: 'orders' }, column: 'status' },
|
||||||
|
{ runId: 'scan-run-1' },
|
||||||
|
),
|
||||||
|
).resolves.toBeNull();
|
||||||
|
|
||||||
|
await expect(
|
||||||
|
connector.getColumnStatistics({ catalog: null, db: 'analytics', name: 'orders' }),
|
||||||
|
).resolves.toMatchObject({
|
||||||
|
cardinalityByColumn: new Map([['id', 2], ['customer_id', 2]]),
|
||||||
|
});
|
||||||
|
|
||||||
await connector.cleanup();
|
await connector.cleanup();
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -36,4 +36,26 @@ describe('KtxMysqlDialect', () => {
|
||||||
expect(dialect.getLimitOffsetClause(10, 20)).toBe('LIMIT 10 OFFSET 20');
|
expect(dialect.getLimitOffsetClause(10, 20)).toBe('LIMIT 10 OFFSET 20');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
it('generates column statistics query using INFORMATION_SCHEMA.STATISTICS', () => {
|
||||||
|
const sql = dialect.generateColumnStatisticsQuery('analytics', 'orders');
|
||||||
|
expect(sql).not.toBeNull();
|
||||||
|
expect(sql).toContain('INFORMATION_SCHEMA.STATISTICS');
|
||||||
|
expect(sql).toContain("TABLE_SCHEMA = 'analytics'");
|
||||||
|
expect(sql).toContain("TABLE_NAME = 'orders'");
|
||||||
|
expect(sql).toContain('CARDINALITY IS NOT NULL');
|
||||||
|
expect(sql).toContain('column_name');
|
||||||
|
expect(sql).toContain('estimated_cardinality');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('filters to leading index columns only (SEQ_IN_INDEX = 1) to avoid inflated cardinality from composite indexes', () => {
|
||||||
|
const sql = dialect.generateColumnStatisticsQuery('analytics', 'orders');
|
||||||
|
expect(sql).toContain('SEQ_IN_INDEX = 1');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('escapes single quotes in schema and table names for statistics query', () => {
|
||||||
|
const sql = dialect.generateColumnStatisticsQuery("andy's_db", "o'rders");
|
||||||
|
expect(sql).toContain("TABLE_SCHEMA = 'andy''s_db'");
|
||||||
|
expect(sql).toContain("TABLE_NAME = 'o''rders'");
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -89,7 +89,7 @@ const fixtures: DialectFixture[] = [
|
||||||
cardinalityContains: 'SELECT COUNT(DISTINCT val) AS cardinality',
|
cardinalityContains: 'SELECT COUNT(DISTINCT val) AS cardinality',
|
||||||
randomizedCardinalityContains: 'ORDER BY RAND()',
|
randomizedCardinalityContains: 'ORDER BY RAND()',
|
||||||
distinctValuesContains: 'SELECT DISTINCT CAST(`status` AS CHAR) AS val',
|
distinctValuesContains: 'SELECT DISTINCT CAST(`status` AS CHAR) AS val',
|
||||||
statisticsContains: null,
|
statisticsContains: 'INFORMATION_SCHEMA.STATISTICS',
|
||||||
dimensionInput: 'tinyint(1)',
|
dimensionInput: 'tinyint(1)',
|
||||||
dimensionType: 'boolean',
|
dimensionType: 'boolean',
|
||||||
nativeTypeInput: 'varchar(255)',
|
nativeTypeInput: 'varchar(255)',
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue