ktx/packages/context/src/scan/description-generation.ts

583 lines
19 KiB
TypeScript
Raw Normal View History

2026-05-10 23:51:24 +02:00
import type { KtxLlmProvider } from '@ktx/llm';
import { generateKtxText } from '../llm/index.js';
2026-05-10 23:12:26 +02:00
import type {
2026-05-10 23:51:24 +02:00
KtxColumnSampleInput,
KtxColumnSampleResult,
KtxScanContext,
KtxScanLoggerPort,
KtxTableRef,
KtxTableSampleInput,
KtxTableSampleResult,
2026-05-10 23:12:26 +02:00
} from './types.js';
2026-05-10 23:51:24 +02:00
/**
 * Cache port consulted by the description generator before it samples data or
 * calls the LLM, and written to after a description has been generated.
 *
 * The `build*Key` methods produce the string keys that are then passed to
 * `get` and `set`.
 */
export interface KtxDescriptionCachePort {
  /** Builds the cache key for a table-level description. */
  buildTableKey(table: KtxTableRef): string;
  /** Builds the cache key for the description of column `columnName` of `table`. */
  buildColumnKey(table: KtxTableRef, columnName: string): string;
  /** Builds the cache key for a data-source description from the connection name. */
  buildConnectionKey(connectionName: string): string;
  /** Resolves to the value stored under `key`; the generator treats a falsy result as a cache miss. */
  get(key: string): Promise<string | null>;
  /** Stores `value` under `key`. */
  set(key: string, value: string): Promise<void>;
}
2026-05-10 23:51:24 +02:00
/**
 * Connector-side port the generator uses to fetch sample data.
 *
 * Both sampling methods are optional. When the one needed for an operation is
 * missing, the generator logs a warning and returns a fallback result instead
 * of calling the LLM.
 */
export interface KtxDescriptionSamplingPort {
  /** Connector identifier; reported as `connectorId` in warning logs. */
  id: string;
  /** Samples values of a single column. */
  sampleColumn?(input: KtxColumnSampleInput, ctx: KtxScanContext): Promise<KtxColumnSampleResult>;
  /** Samples rows of a table. */
  sampleTable?(input: KtxTableSampleInput, ctx: KtxScanContext): Promise<KtxTableSampleResult>;
}
2026-05-10 23:51:24 +02:00
/** Caller-facing settings for description generation. */
export interface KtxDescriptionGenerationSettings {
  /** Word limit requested from the LLM for column descriptions. */
  columnMaxWords: number;
  /** Word limit requested from the LLM for table descriptions. */
  tableMaxWords: number;
  /** Word limit requested from the LLM for data-source descriptions. */
  dataSourceMaxWords: number;
  /** Sampling temperature forwarded to the LLM call; left unset when omitted. */
  temperature?: number;
  /** Maximum number of concurrent sampling/LLM tasks; the generator defaults this to 5. */
  concurrencyLimit?: number;
}
2026-05-10 23:51:24 +02:00
/**
 * Settings as stored by the generator after defaults have been applied:
 * identical to the caller-facing settings except that `concurrencyLimit`
 * is always present.
 */
interface ResolvedKtxDescriptionGenerationSettings {
  /** Word limit requested from the LLM for column descriptions. */
  columnMaxWords: number;
  /** Word limit requested from the LLM for table descriptions. */
  tableMaxWords: number;
  /** Word limit requested from the LLM for data-source descriptions. */
  dataSourceMaxWords: number;
  /** Sampling temperature forwarded to the LLM call, when configured. */
  temperature?: number;
  /** Maximum number of concurrent sampling/LLM tasks. */
  concurrencyLimit: number;
}
2026-05-10 23:51:24 +02:00
/** A column for which a description can be generated. */
export interface KtxDescriptionColumn {
  /** Column name; used in prompts, cache keys, and results. */
  name: string;
  /** Column data type, when known. */
  type?: string;
  /**
   * Existing descriptions keyed by their source. Entries keyed `ai` or `user`
   * and entries with empty text are not forwarded to the prompt.
   */
  rawDescriptions?: Record<string, string>;
  /** Pre-fetched sample values; when missing or empty the generator samples the column through the connector. */
  sampleValues?: unknown[];
}
2026-05-10 23:51:24 +02:00
/** A table reference together with the columns that should be described. */
export interface KtxDescriptionColumnTable extends KtxTableRef {
  /** Columns to generate descriptions for. */
  columns: KtxDescriptionColumn[];
}
2026-05-10 23:51:24 +02:00
/** A table reference plus any descriptions that already exist for the table. */
export interface KtxDescriptionTableInput extends KtxTableRef {
  /**
   * Existing table descriptions keyed by their source. Entries keyed `ai` or
   * `user` and entries with empty text are not forwarded to the prompt.
   */
  rawDescriptions?: Record<string, string>;
}
2026-05-10 23:51:24 +02:00
/** Outcome of generating descriptions for the columns of one table. */
export interface KtxColumnAnalysisResult {
  /** One `[columnName, description]` pair per input column; `null` when no description is available. */
  columnDescriptions: Array<[string, string | null]>;
  /** Names of columns for which a new description was generated. */
  processedColumns: string[];
  /** Names of columns whose description came from an existing value or the cache. */
  skippedColumns: string[];
}
2026-05-10 23:51:24 +02:00
/** Input for building the column-description prompt. */
export interface KtxColumnDescriptionPromptInput {
  /** Name of the column being described. */
  columnName: string;
  /** Sampled values of the column; only a handful are rendered into the prompt. */
  columnValues: unknown[];
  /** Free-form context line about the owning table, embedded verbatim in the prompt. */
  tableContext: string;
  /** Data source type; the value `BIGQUERY` enables BigQuery-specific instructions. */
  dataSourceType: string;
  /** Together with a `BIGQUERY` data source, adds the nested/structured-data instruction. */
  supportsNestedAnalysis: boolean;
  /** Existing descriptions keyed by their source (entries keyed `ai` or `user` are ignored). */
  rawDescriptions?: Record<string, string>;
}
2026-05-10 23:51:24 +02:00
/** Input for building the table-description prompt. */
export interface KtxTableDescriptionPromptInput {
  /** Name of the table being described. */
  tableName: string;
  /** Sampled headers and rows of the table. */
  sampleData: KtxTableSampleResult;
  /** Data source type; the value `BIGQUERY` adds a BigQuery-specific note. */
  dataSourceType: string;
  /** Existing descriptions keyed by their source (entries keyed `ai` or `user` are ignored). */
  rawDescriptions?: Record<string, string>;
}
2026-05-10 23:51:24 +02:00
/** Input for building the data-source-description prompt. */
export interface KtxDataSourceDescriptionPromptInput {
  /** `[tableName, sample]` pairs for the tables that could be sampled. */
  tableSamples: Array<[string, KtxTableSampleResult]>;
  /** Data source type; the value `BIGQUERY` adds a BigQuery-specific note. */
  dataSourceType: string;
}
2026-05-10 23:51:24 +02:00
/** Input for `KtxDescriptionGenerator.generateColumnDescriptions`. */
export interface KtxGenerateColumnDescriptionsInput {
  /** Connection identifier forwarded to the connector when sampling. */
  connectionId: string;
  /** Connector used to sample column values when a column carries none. */
  connector: KtxDescriptionSamplingPort;
  /** Scan context forwarded to the connector. */
  context: KtxScanContext;
  /** Data source type; embedded in the table context and used to select prompt instructions. */
  dataSourceType: string;
  /** Whether the nested/structured-data instruction may be added to the prompt. */
  supportsNestedAnalysis: boolean;
  /** Table and columns to describe. */
  table: KtxDescriptionColumnTable;
  /** When true, columns with a non-empty entry in `existingDescriptions` are returned as-is and reported as skipped. */
  skipExisting?: boolean;
  /** Already-known descriptions keyed by column name. */
  existingDescriptions?: Record<string, string | null>;
}
2026-05-10 23:51:24 +02:00
/** Input for `KtxDescriptionGenerator.generateTableDescription`. */
export interface KtxGenerateTableDescriptionInput {
  /** Connection identifier forwarded to the connector when sampling. */
  connectionId: string;
  /** Connector used to sample rows of the table. */
  connector: KtxDescriptionSamplingPort;
  /** Scan context forwarded to the connector. */
  context: KtxScanContext;
  /** Data source type; embedded in the prompt and used to select prompt instructions. */
  dataSourceType: string;
  /** Table to describe, optionally with existing descriptions. */
  table: KtxDescriptionTableInput;
}
2026-05-10 23:51:24 +02:00
/** Input for `KtxDescriptionGenerator.generateDataSourceDescription`. */
export interface KtxGenerateDataSourceDescriptionInput {
  /** Connection identifier forwarded to the connector when sampling. */
  connectionId: string;
  /** Connector used to sample the tables. */
  connector: KtxDescriptionSamplingPort;
  /** Scan context forwarded to the connector. */
  context: KtxScanContext;
  /** Data source type; embedded in the prompt and used to select prompt instructions. */
  dataSourceType: string;
  /** Tables of the data source; only a leading subset is sampled. */
  tables: KtxTableRef[];
  /** Connection name used to build the cache key; caching is skipped when omitted. */
  connectionName?: string;
}
2026-05-10 23:51:24 +02:00
/** Constructor options for `KtxDescriptionGenerator`. */
export interface KtxDescriptionGeneratorOptions {
  /** LLM provider used for every description request. */
  llmProvider: KtxLlmProvider;
  /** Optional cache; when omitted nothing is read from or written to a cache. */
  cache?: KtxDescriptionCachePort;
  /** Optional logger for warnings and errors. */
  logger?: KtxScanLoggerPort;
  /** Word limits, temperature, and concurrency settings. */
  settings: KtxDescriptionGenerationSettings;
}
/** Per-column outcome produced while generating column descriptions. */
interface ColumnTaskResult {
  /** Name of the column this result belongs to. */
  columnName: string;
  /** Resulting description, or `null` when none is available. */
  description: string | null;
  /** True when the description was taken from an existing value or the cache. */
  skipped: boolean;
  /** True when a new description was generated for the column. */
  processed: boolean;
}
/**
 * Lists the `[source, text]` pairs of `rawDescriptions` that may be shown to
 * the LLM as external documentation.
 *
 * Entries keyed `ai` or `user` and entries with empty text are left out.
 * Returns an empty list when no descriptions are given.
 */
function descriptionSources(rawDescriptions: Record<string, string> | undefined): Array<[string, string]> {
  const usable: Array<[string, string]> = [];
  for (const entry of Object.entries(rawDescriptions ?? {})) {
    const [source, text] = entry;
    const isExcludedSource = source === 'ai' || source === 'user';
    if (isExcludedSource || !text) {
      continue;
    }
    usable.push(entry);
  }
  return usable;
}
/**
 * Extracts a printable message from a caught value: the `message` of an
 * `Error`, otherwise the value converted with `String`.
 */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
2026-05-10 23:51:24 +02:00
/**
 * Returns a new table reference holding only `catalog`, `db`, and `name`.
 *
 * Callers pass richer table shapes (for example with columns or raw
 * descriptions); this strips those extra properties before the reference is
 * handed to the cache or the connector.
 */
function toTableRef(table: KtxTableRef): KtxTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
/**
 * Maps `worker` over `items` with at most `concurrencyLimit` invocations in
 * flight at any time.
 *
 * @param items Inputs to process.
 * @param concurrencyLimit Desired number of concurrent workers. Values below 1
 *   and `NaN` fall back to a single worker; the effective count never exceeds
 *   `items.length`.
 * @param worker Async function invoked once per item with the item and its index.
 * @returns The worker results, positioned at the index of their input item.
 *   Rejects with the first worker rejection.
 */
async function runWithConcurrency<TInput, TOutput>(
  items: readonly TInput[],
  concurrencyLimit: number,
  worker: (item: TInput, index: number) => Promise<TOutput>,
): Promise<TOutput[]> {
  const results = new Array<TOutput>(items.length);

  // NaN would otherwise survive Math.min/Math.max and yield an array-like of
  // length 0, i.e. no runners at all and an empty result.
  const requested = Number.isNaN(concurrencyLimit) ? 1 : Math.floor(concurrencyLimit);
  const runnerCount = Math.max(1, Math.min(requested, items.length));

  // Shared cursor: every runner claims the next unclaimed index synchronously
  // (before awaiting), so no index is processed twice.
  let nextIndex = 0;
  const runner = async (): Promise<void> => {
    while (nextIndex < items.length) {
      const index = nextIndex;
      nextIndex += 1;
      // The index is in bounds, so the element is a TInput even when that
      // type includes `undefined`; such items must not be skipped.
      results[index] = await worker(items[index] as TInput, index);
    }
  };

  await Promise.all(Array.from({ length: runnerCount }, () => runner()));
  return results;
}
/** A prompt split into its system instructions and user message. */
export interface KtxDescriptionPrompt {
  /** System instructions: task description, style rules, and optional word limit. */
  system: string;
  /** User message carrying the names and sample data to describe. */
  user: string;
}
/** Builds the prompt sentence that asks the LLM to stay within `maxWords` words. */
function wordLimitLine(maxWords: number): string {
  const parts = ['Please provide a concise description in', String(maxWords), 'words or less.'];
  return parts.join(' ');
}
/**
 * Builds the system/user prompt pair used to describe a single column.
 *
 * The user message carries the table context, the column name, and up to five
 * non-null sample values. When `rawDescriptions` contains entries from sources
 * other than `ai`/`user`, they are appended as `<source_documentation>` tags
 * together with an instruction to synthesize them.
 *
 * @param input Column data plus an optional `maxWords` limit; when `maxWords`
 *   is given, a word-limit sentence is appended to the system prompt.
 * @returns The prompt; the user part is trimmed.
 */
export function buildKtxColumnDescriptionPrompt(
  input: KtxColumnDescriptionPromptInput & { maxWords?: number },
): KtxDescriptionPrompt {
  // Plain `String(value)` renders nested values (objects, arrays of objects)
  // as "[object Object]", which tells the LLM nothing. Fall back to JSON only
  // in that case so every other rendering stays as it was.
  const renderValue = (value: unknown): string => {
    const text = String(value);
    if (typeof value !== 'object' || value === null || !text.includes('[object Object]')) {
      return text;
    }
    try {
      return JSON.stringify(value) ?? text;
    } catch {
      // Circular structures or BigInt members cannot be serialized.
      return text;
    }
  };

  // Drop nulls before taking the first five, so leading nulls do not shrink
  // (or empty) the sample shown to the LLM.
  const valuesStr = input.columnValues
    .filter((value) => value !== null && value !== undefined)
    .slice(0, 5)
    .map(renderValue)
    .join(', ');

  const systemParts: string[] = [
    `Analyze database columns and provide a concise description.
Provide a brief description of what the column contains without repeating the column name.
Focus on the data's meaning and business purpose. Start directly with the content description.
Example:
"first names of individuals, likely employees or contacts" instead of "The column contains first names..."
"Job titles or roles of individuals..." instead of "This column contains job titles..."`,
  ];
  if (input.dataSourceType === 'BIGQUERY' && input.supportsNestedAnalysis) {
    systemParts.push(
      'If the sampled values indicate nested/structured data (JSON, STRUCT, or ARRAY), describe its general business purpose and data organization.',
    );
  }
  if (input.maxWords !== undefined) {
    systemParts.push(wordLimitLine(input.maxWords));
  }

  let user = `<table_context> ${input.tableContext} </table_context>
<column_name> ${input.columnName} </column_name>
<sample_values> ${valuesStr} </sample_values>
`;
  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    user += '\nExisting descriptions from other sources:\n';
    for (const [source, text] of sources) {
      user += `<${source}_documentation> ${text} </${source}_documentation>\n`;
    }
    user +=
      '\nSynthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.\n';
  }

  return { system: systemParts.join('\n\n'), user: user.trim() };
}
/**
 * Builds the system/user prompt pair used to describe a table.
 *
 * The user message lists at most the first ten headers of the sample, each
 * with the non-null values found in the first three sample rows, followed by
 * the sample row count and the data source type. When `rawDescriptions`
 * contains entries from sources other than `ai`/`user`, they are appended
 * together with an instruction to synthesize them.
 *
 * @param input Table data plus an optional `maxWords` limit; when `maxWords`
 *   is given, a word-limit sentence is appended to the system prompt.
 * @returns The prompt; the user part is trimmed.
 */
export function buildKtxTableDescriptionPrompt(
  input: KtxTableDescriptionPromptInput & { maxWords?: number },
): KtxDescriptionPrompt {
  // Plain `String(value)` renders nested values (objects, arrays of objects)
  // as "[object Object]"; fall back to JSON only in that case so every other
  // rendering stays as it was.
  const renderValue = (value: unknown): string => {
    const text = String(value);
    if (typeof value !== 'object' || value === null || !text.includes('[object Object]')) {
      return text;
    }
    try {
      return JSON.stringify(value) ?? text;
    } catch {
      // Circular structures or BigInt members cannot be serialized.
      return text;
    }
  };

  const columnInfo: string[] = [];
  const headerCount = Math.min(input.sampleData.headers.length, 10);
  for (let index = 0; index < headerCount; index += 1) {
    const header = input.sampleData.headers[index];
    const sampleValues = input.sampleData.rows
      .slice(0, 3)
      .map((row) => row[index])
      .filter((value) => value !== null && value !== undefined);
    columnInfo.push(`${header}: ${sampleValues.map(renderValue).join(', ')}`);
  }

  const systemParts: string[] = [
    `Analyze database tables and provide a concise description.
Provide a brief description of what the table represents and its business purpose.
Do NOT list or describe individual columns or fields.
Start directly with the content description without mentioning the table name.
Focus on the data's meaning and business purpose.
Example: "Information about healthcare professionals used for workforce management" instead of "The blahblah table contains information about healthcare professionals including their names, titles..."`,
  ];
  if (input.dataSourceType === 'BIGQUERY') {
    systemParts.push(
      "Note (don't include in the final answer): BigQuery tables may contain nested structures, arrays, or other complex data types.",
    );
  }
  if (input.maxWords !== undefined) {
    systemParts.push(wordLimitLine(input.maxWords));
  }

  let user = `Table: ${input.tableName}
Columns and sample data: ${columnInfo.join(' | ')}
Total rows in sample: ${input.sampleData.rows.length}
Data source type: ${input.dataSourceType}`;
  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    user += '\n\nExisting descriptions from other sources:\n';
    for (const [source, text] of sources) {
      user += `${source}: ${text}\n`;
    }
    user +=
      '\nSynthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.';
  }

  return { system: systemParts.join('\n\n'), user: user.trim() };
}
/**
 * Builds the system/user prompt pair used to describe a whole data source.
 *
 * The user message lists every sampled table with its column count and sample
 * row count, followed by the number of tables and the data source type.
 *
 * @param input Table samples plus an optional `maxWords` limit; when
 *   `maxWords` is given, a word-limit sentence is appended to the system prompt.
 * @returns The prompt.
 */
export function buildKtxDataSourceDescriptionPrompt(
  input: KtxDataSourceDescriptionPromptInput & { maxWords?: number },
): KtxDescriptionPrompt {
  const tablesText = input.tableSamples
    .map(
      ([tableName, sampleData]) =>
        `${tableName} (${sampleData.headers.length} columns, ${sampleData.rows.length} sample rows)`,
    )
    .join(' | ');

  const systemParts: string[] = [
    `Analyze databases and provide a concise description.
Provide a direct, concise description of what the database represents and its business purpose.
Do NOT start with phrases like "This database appears to represent" or "This BigQuery dataset".
Start directly with the domain or business area description.
Focus on the overall data model and its intended use.
Example: "Healthcare-related database with a focus on patient management..." instead of "This database appears to represent a healthcare-related system..."`,
  ];
  if (input.dataSourceType === 'BIGQUERY') {
    systemParts.push(
      "Note (don't include in the final answer): BigQuery datasets may contain large-scale analytics data, nested structures, and complex data types.",
    );
  }
  if (input.maxWords !== undefined) {
    systemParts.push(wordLimitLine(input.maxWords));
  }

  const user = `Tables: ${tablesText}
Total tables analyzed: ${input.tableSamples.length}
Data source type: ${input.dataSourceType}`;

  return { system: systemParts.join('\n\n'), user };
}
2026-05-10 23:51:24 +02:00
/**
 * Generates natural-language descriptions for columns, tables, and whole data
 * sources by sampling data through a connector and prompting an LLM.
 *
 * Descriptions are looked up in the optional cache first and written back
 * after generation. Sampling and LLM failures are logged and reported through
 * the return value (a `null` description or a fixed fallback string) rather
 * than thrown.
 */
export class KtxDescriptionGenerator {
  private readonly llmProvider: KtxLlmProvider;
  private readonly cache?: KtxDescriptionCachePort;
  private readonly logger?: KtxScanLoggerPort;
  private readonly settings: ResolvedKtxDescriptionGenerationSettings;

  /**
   * @param options LLM provider, optional cache and logger, and generation
   *   settings. `concurrencyLimit` defaults to 5 when not provided.
   */
  constructor(options: KtxDescriptionGeneratorOptions) {
    this.llmProvider = options.llmProvider;
    this.cache = options.cache;
    this.logger = options.logger;
    this.settings = {
      columnMaxWords: options.settings.columnMaxWords,
      tableMaxWords: options.settings.tableMaxWords,
      dataSourceMaxWords: options.settings.dataSourceMaxWords,
      // Only set the key when a temperature was supplied, so the property stays absent otherwise.
      ...(options.settings.temperature !== undefined ? { temperature: options.settings.temperature } : {}),
      concurrencyLimit: options.settings.concurrencyLimit ?? 5,
    };
  }

  /**
   * Generates descriptions for every column of `input.table`, running up to
   * `concurrencyLimit` columns concurrently.
   *
   * @returns One `[name, description]` pair per column in input order, plus
   *   the names of the columns that were newly processed and of those that
   *   were skipped (existing description or cache hit).
   */
  async generateColumnDescriptions(input: KtxGenerateColumnDescriptionsInput): Promise<KtxColumnAnalysisResult> {
    const columnsToProcess = input.table.columns;
    const tableContext = `Table: ${input.table.name} | Columns: ${columnsToProcess.map((column) => column.name).join(', ')} | Data source: ${input.dataSourceType}`;
    const results = await runWithConcurrency(columnsToProcess, this.settings.concurrencyLimit, async (column) =>
      this.generateOneColumnDescription(input, column, tableContext),
    );

    const columnDescriptions: Array<[string, string | null]> = [];
    const processedColumns: string[] = [];
    const skippedColumns: string[] = [];
    for (const result of results) {
      columnDescriptions.push([result.columnName, result.description]);
      if (result.skipped) {
        skippedColumns.push(result.columnName);
      } else if (result.processed) {
        processedColumns.push(result.columnName);
      }
    }

    return {
      columnDescriptions,
      processedColumns,
      skippedColumns,
    };
  }

  /**
   * Generates a description for one table from a sample of up to 20 rows.
   *
   * @returns The cached or generated description; `null` when the LLM call
   *   failed or produced no text; the literal `'Table not found'` when the
   *   connector cannot sample tables or sampling throws.
   */
  async generateTableDescription(input: KtxGenerateTableDescriptionInput): Promise<string | null> {
    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildTableKey(tableRef);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    if (!input.connector.sampleTable) {
      this.logger?.warn('KTX scan connector does not support table sampling for table description generation', {
        connectorId: input.connector.id,
        table: input.table.name,
      });
      return 'Table not found';
    }

    try {
      const sampleData = await input.connector.sampleTable(
        {
          connectionId: input.connectionId,
          table: tableRef,
          limit: 20,
        },
        input.context,
      );
      const prompt = buildKtxTableDescriptionPrompt({
        tableName: input.table.name,
        sampleData,
        dataSourceType: input.dataSourceType,
        rawDescriptions: input.table.rawDescriptions,
        maxWords: this.settings.tableMaxWords,
      });
      const description = await this.generateAiDescription(prompt, 'ktx-table-description');
      await this.storeInCache(cacheKey, description);
      return description;
    } catch (error) {
      this.logger?.error(`Error generating table description: ${errorMessage(error)}`);
      return 'Table not found';
    }
  }

  /**
   * Generates a description for the whole data source from samples (up to
   * 5 rows each) of at most the first 10 tables.
   *
   * Tables that fail to sample are logged and left out. The cache is only
   * used when `input.connectionName` is provided.
   *
   * @returns The cached or generated description; `null` when the LLM call
   *   failed or produced no text; or one of the fixed fallback strings when
   *   there are no tables, none could be sampled, or prompt generation throws.
   */
  async generateDataSourceDescription(input: KtxGenerateDataSourceDescriptionInput): Promise<string | null> {
    if (input.tables.length === 0) {
      return 'No tables found in database';
    }

    const cacheKey = input.connectionName ? this.cache?.buildConnectionKey(input.connectionName) : undefined;
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    if (!input.connector.sampleTable) {
      this.logger?.warn('KTX scan connector does not support table sampling for data-source description generation', {
        connectorId: input.connector.id,
      });
      return 'No accessible tables found in database';
    }

    const tablesToAnalyze = input.tables.slice(0, 10);
    const tableSamples = await runWithConcurrency(tablesToAnalyze, this.settings.concurrencyLimit, async (table) => {
      try {
        // Non-null assertion: presence of `sampleTable` was checked above, but
        // that narrowing does not carry into this callback.
        const sampleData = await input.connector.sampleTable!(
          {
            connectionId: input.connectionId,
            table: toTableRef(table),
            limit: 5,
          },
          input.context,
        );
        return [table.name, sampleData] as [string, KtxTableSampleResult];
      } catch (error) {
        this.logger?.warn(`Failed to sample table '${table.name}' for data source analysis - ${errorMessage(error)}`);
        return null;
      }
    });

    const accessibleSamples = tableSamples.filter(
      (sample): sample is [string, KtxTableSampleResult] => sample !== null,
    );
    if (accessibleSamples.length === 0) {
      return 'No accessible tables found in database';
    }

    try {
      const prompt = buildKtxDataSourceDescriptionPrompt({
        tableSamples: accessibleSamples,
        dataSourceType: input.dataSourceType,
        maxWords: this.settings.dataSourceMaxWords,
      });
      const description = await this.generateAiDescription(prompt, 'ktx-data-source-description');
      await this.storeInCache(cacheKey, description);
      return description;
    } catch (error) {
      this.logger?.error(`Error generating data source description: ${errorMessage(error)}`);
      return 'Failed to generate data source description';
    }
  }

  /**
   * Produces the result for a single column.
   *
   * Order of precedence: an existing description (when `skipExisting` is
   * set), then a cache hit, then a freshly generated description. Columns
   * without any non-null sample value, or whose connector cannot sample
   * columns, yield a `null` description and are neither skipped nor processed.
   */
  private async generateOneColumnDescription(
    input: KtxGenerateColumnDescriptionsInput,
    column: KtxDescriptionColumn,
    tableContext: string,
  ): Promise<ColumnTaskResult> {
    const existingDescription = input.existingDescriptions?.[column.name];
    if (input.skipExisting && existingDescription) {
      return {
        columnName: column.name,
        description: existingDescription,
        skipped: true,
        processed: false,
      };
    }

    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildColumnKey(tableRef, column.name);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        // A cache hit is reported as skipped, like an existing description.
        return {
          columnName: column.name,
          description: cached,
          skipped: true,
          processed: false,
        };
      }
    }

    try {
      let columnValues = column.sampleValues;
      if (!columnValues || columnValues.length === 0) {
        if (!input.connector.sampleColumn) {
          this.logger?.warn('KTX scan connector does not support column sampling for column description generation', {
            connectorId: input.connector.id,
            table: input.table.name,
            column: column.name,
          });
          return {
            columnName: column.name,
            description: null,
            skipped: false,
            processed: false,
          };
        }
        const sample = await input.connector.sampleColumn(
          {
            connectionId: input.connectionId,
            table: tableRef,
            column: column.name,
            limit: 50,
          },
          input.context,
        );
        columnValues = sample.values;
      }

      const nonNullValues = (columnValues ?? []).filter((value) => value !== null && value !== undefined);
      if (nonNullValues.length === 0) {
        // Nothing to show the LLM; do not spend a request on an empty sample.
        return {
          columnName: column.name,
          description: null,
          skipped: false,
          processed: false,
        };
      }

      const prompt = buildKtxColumnDescriptionPrompt({
        columnName: column.name,
        columnValues: nonNullValues,
        tableContext,
        dataSourceType: input.dataSourceType,
        supportsNestedAnalysis: input.supportsNestedAnalysis,
        rawDescriptions: column.rawDescriptions,
        maxWords: this.settings.columnMaxWords,
      });
      const description = await this.generateAiDescription(prompt, 'ktx-column-description');
      await this.storeInCache(cacheKey, description);
      return {
        columnName: column.name,
        description,
        skipped: false,
        processed: description !== null,
      };
    } catch (error) {
      this.logger?.error(`Error analyzing column '${column.name}': ${errorMessage(error)}`);
      return {
        columnName: column.name,
        description: null,
        skipped: false,
        processed: false,
      };
    }
  }

  /**
   * Writes a generated description to the cache on a best-effort basis.
   *
   * Does nothing without a cache key or description. A failing cache write is
   * logged and swallowed so that it cannot discard a description that was
   * already generated successfully.
   */
  private async storeInCache(cacheKey: string | undefined, description: string | null): Promise<void> {
    if (!cacheKey || !description) {
      return;
    }
    try {
      await this.cache?.set(cacheKey, description);
    } catch (error) {
      this.logger?.warn(`Failed to cache generated description: ${errorMessage(error)}`);
    }
  }

  /**
   * Sends the prompt to the LLM and returns the trimmed response text.
   *
   * @param prompt System and user prompt to send.
   * @param _operationName Label of the calling operation; currently unused.
   * @returns The trimmed text, or `null` when it is empty or the call fails
   *   (the failure is logged).
   */
  private async generateAiDescription(prompt: KtxDescriptionPrompt, _operationName: string): Promise<string | null> {
    try {
      const text = await generateKtxText({
        llmProvider: this.llmProvider,
        role: 'candidateExtraction',
        system: prompt.system,
        prompt: prompt.user,
        temperature: this.settings.temperature,
      });
      const description = text.trim();
      return description || null;
    } catch (error) {
      this.logger?.error(`Error generating AI description: ${errorMessage(error)}`);
      return null;
    }
  }
}