mirror of
https://github.com/Kaelio/ktx.git
synced 2026-07-04 10:52:13 +02:00
583 lines
19 KiB
TypeScript
583 lines
19 KiB
TypeScript
|
|
import type { KloLlmProvider } from '@klo/llm';
|
||
|
|
import { generateKloText } from '../llm/index.js';
|
||
|
|
import type {
|
||
|
|
KloColumnSampleInput,
|
||
|
|
KloColumnSampleResult,
|
||
|
|
KloScanContext,
|
||
|
|
KloScanLoggerPort,
|
||
|
|
KloTableRef,
|
||
|
|
KloTableSampleInput,
|
||
|
|
KloTableSampleResult,
|
||
|
|
} from './types.js';
|
||
|
|
|
||
|
|
/**
 * Cache port through which the description generator reuses previously generated text.
 *
 * Keys are opaque strings: the generator obtains them from the `build*Key`
 * methods and hands them back unchanged to `get` and `set`.
 */
export interface KloDescriptionCachePort {
  /** Looks up the value stored under `key`; the generator treats `null` or an empty string as a miss. */
  get(key: string): Promise<string | null>;

  /** Stores `value` under `key`. */
  set(key: string, value: string): Promise<void>;

  /** Builds the key under which a table-level description is stored. */
  buildTableKey(table: KloTableRef): string;

  /** Builds the key under which the description of one column of `table` is stored. */
  buildColumnKey(table: KloTableRef, columnName: string): string;

  /** Builds the key under which a data-source description is stored, from the connection name. */
  buildConnectionKey(connectionName: string): string;
}
|
||
|
|
|
||
|
|
/**
 * Connector surface used to fetch sample data for prompts.
 *
 * Both sampling methods are optional; the generator checks for their presence
 * before calling them and degrades (with a warning) when one is missing.
 */
export interface KloDescriptionSamplingPort {
  /** Connector identifier; included in warning logs when a sampling method is missing. */
  id: string;

  /** Samples whole rows of a table. */
  sampleTable?(input: KloTableSampleInput, ctx: KloScanContext): Promise<KloTableSampleResult>;

  /** Samples the values of a single column. */
  sampleColumn?(input: KloColumnSampleInput, ctx: KloScanContext): Promise<KloColumnSampleResult>;
}
|
||
|
|
|
||
|
|
/** Caller-facing tuning knobs for description generation. */
export interface KloDescriptionGenerationSettings {
  /** Word limit quoted in the instruction appended to data-source prompts. */
  dataSourceMaxWords: number;

  /** Word limit quoted in the instruction appended to table prompts. */
  tableMaxWords: number;

  /** Word limit quoted in the instruction appended to column prompts. */
  columnMaxWords: number;

  /** Maximum number of columns/tables worked on at once; the generator falls back to 5 when omitted. */
  concurrencyLimit?: number;

  /** Sampling temperature forwarded to the LLM call when set. */
  temperature?: number;
}
|
||
|
|
|
||
|
|
/**
 * Settings as stored on the generator after defaults are applied: identical to
 * the caller-facing settings except that `concurrencyLimit` is always present.
 */
interface ResolvedKloDescriptionGenerationSettings extends KloDescriptionGenerationSettings {
  /** Effective concurrency limit (never undefined once resolved). */
  concurrencyLimit: number;
}
|
||
|
|
|
||
|
|
/** One column handed to the generator for description. */
export interface KloDescriptionColumn {
  /** Column name; used in prompts, cache keys, sampling requests and results. */
  name: string;

  /** Optional column type label (not read by the generator in this module). */
  type?: string;

  /**
   * Values already sampled for this column. When absent or empty the
   * generator asks the connector to sample the column instead.
   */
  sampleValues?: Array<unknown>;

  /**
   * Existing descriptions keyed by the source they came from. Entries keyed
   * `ai` or `user`, and entries with empty text, are not quoted in prompts.
   */
  rawDescriptions?: Record<string, string>;
}
|
||
|
|
|
||
|
|
/** A table reference together with the columns whose descriptions should be generated. */
export interface KloDescriptionColumnTable extends KloTableRef {
  /** Columns to describe; their names are also listed in the table context given to each column prompt. */
  columns: Array<KloDescriptionColumn>;
}
|
||
|
|
|
||
|
|
/** A table reference plus any descriptions that already exist for the table. */
export interface KloDescriptionTableInput extends KloTableRef {
  /**
   * Existing table descriptions keyed by source. Entries keyed `ai` or
   * `user`, and entries with empty text, are not quoted in the prompt.
   */
  rawDescriptions?: Record<string, string>;
}
|
||
|
|
|
||
|
|
/** Outcome of describing the columns of one table. */
export interface KloColumnAnalysisResult {
  /** Names of columns whose existing or cached description was reused. */
  skippedColumns: string[];

  /** Names of columns reported as processed by the generation step. */
  processedColumns: string[];

  /** One `[columnName, description]` pair per input column; the description is `null` when none was produced. */
  columnDescriptions: [string, string | null][];
}
|
||
|
|
|
||
|
|
/** Everything needed to render the prompt for a single column. */
export interface KloColumnDescriptionPromptInput {
  /** Free-text summary of the surrounding table, embedded in `<table_context>`. */
  tableContext: string;

  /** Column name, embedded in `<column_name>`. */
  columnName: string;

  /** Sample values for the column; only a handful are quoted in the prompt. */
  columnValues: Array<unknown>;

  /** Existing descriptions keyed by source, quoted as authoritative context when usable. */
  rawDescriptions?: Record<string, string>;

  /** Data source type label; the value `'BIGQUERY'` enables the nested-data note. */
  dataSourceType: string;

  /** Must be true (together with a BigQuery source) for the nested-data note to be considered. */
  supportsNestedAnalysis: boolean;
}
|
||
|
|
|
||
|
|
/** Everything needed to render the prompt for a single table. */
export interface KloTableDescriptionPromptInput {
  /** Table name shown on the `Table:` line of the prompt. */
  tableName: string;

  /** Data source type label; the value `'BIGQUERY'` adds a BigQuery-specific note. */
  dataSourceType: string;

  /** Sampled headers and rows; a limited preview of them is quoted in the prompt. */
  sampleData: KloTableSampleResult;

  /** Existing descriptions keyed by source, quoted as authoritative context when usable. */
  rawDescriptions?: Record<string, string>;
}
|
||
|
|
|
||
|
|
/** Everything needed to render the prompt for a whole data source. */
export interface KloDataSourceDescriptionPromptInput {
  /** Data source type label; the value `'BIGQUERY'` adds a BigQuery-specific note. */
  dataSourceType: string;

  /** `[tableName, sample]` pairs; each contributes its name, column count and sample row count to the prompt. */
  tableSamples: [string, KloTableSampleResult][];
}
|
||
|
|
|
||
|
|
/** Arguments for generating descriptions for all columns of one table. */
export interface KloGenerateColumnDescriptionsInput {
  /** Table and columns to describe. */
  table: KloDescriptionColumnTable;

  /** Connector used to sample a column when it carries no sample values of its own. */
  connector: KloDescriptionSamplingPort;

  /** Connection identifier forwarded in every sampling request. */
  connectionId: string;

  /** Scan context forwarded as the second argument of every sampling call. */
  context: KloScanContext;

  /** Data source type label embedded in prompts. */
  dataSourceType: string;

  /** Forwarded to the column prompt builder to gate the nested-data note. */
  supportsNestedAnalysis: boolean;

  /** When true, columns with a non-empty entry in `existingDescriptions` are returned as skipped. */
  skipExisting?: boolean;

  /** Descriptions that already exist, keyed by column name. */
  existingDescriptions?: Record<string, string | null>;
}
|
||
|
|
|
||
|
|
/** Arguments for generating the description of one table. */
export interface KloGenerateTableDescriptionInput {
  /** Table to describe, optionally with descriptions that already exist for it. */
  table: KloDescriptionTableInput;

  /** Connector used to sample rows of the table. */
  connector: KloDescriptionSamplingPort;

  /** Connection identifier forwarded in the sampling request. */
  connectionId: string;

  /** Scan context forwarded as the second argument of the sampling call. */
  context: KloScanContext;

  /** Data source type label embedded in the prompt. */
  dataSourceType: string;
}
|
||
|
|
|
||
|
|
/** Arguments for generating the description of a whole data source. */
export interface KloGenerateDataSourceDescriptionInput {
  /** Tables of the data source; only the first ten are sampled. */
  tables: Array<KloTableRef>;

  /** Connector used to sample rows of each analysed table. */
  connector: KloDescriptionSamplingPort;

  /** Connection identifier forwarded in every sampling request. */
  connectionId: string;

  /** Scan context forwarded as the second argument of every sampling call. */
  context: KloScanContext;

  /** Data source type label embedded in the prompt. */
  dataSourceType: string;

  /** Used to build the cache key; without it the result is neither read from nor written to the cache. */
  connectionName?: string;
}
|
||
|
|
|
||
|
|
/** Constructor options of the description generator. */
export interface KloDescriptionGeneratorOptions {
  /** Word limits, temperature and concurrency for generation. */
  settings: KloDescriptionGenerationSettings;

  /** LLM provider handed to the text generation helper. */
  llmProvider: KloLlmProvider;

  /** Optional cache; when omitted nothing is read from or written to a cache. */
  cache?: KloDescriptionCachePort;

  /** Optional logger for warnings and errors; when omitted they are dropped. */
  logger?: KloScanLoggerPort;
}
|
||
|
|
|
||
|
|
/** Internal per-column outcome, later folded into the public analysis result. */
interface ColumnTaskResult {
  /** Name of the column this outcome belongs to. */
  columnName: string;

  /** Description text, or `null` when none could be produced. */
  description: string | null;

  /** True when an existing or cached description was reused. */
  skipped: boolean;

  /** True when the generation step completed for this column. */
  processed: boolean;
}
|
||
|
|
|
||
|
|
/**
 * Lists the `[source, text]` pairs of `rawDescriptions` that may be quoted in a prompt.
 *
 * Entries keyed `ai` or `user` are left out, as are entries whose text is empty.
 *
 * @param rawDescriptions Existing descriptions keyed by source, or `undefined`.
 * @returns The usable pairs in the object's own enumeration order; empty when nothing qualifies.
 */
function descriptionSources(rawDescriptions: Record<string, string> | undefined): Array<[string, string]> {
  const usable: Array<[string, string]> = [];
  if (rawDescriptions) {
    for (const [origin, body] of Object.entries(rawDescriptions)) {
      const excludedOrigin = origin === 'ai' || origin === 'user';
      if (excludedOrigin || !body) {
        continue;
      }
      usable.push([origin, body]);
    }
  }
  return usable;
}
|
||
|
|
|
||
|
|
/**
 * Turns an arbitrary thrown value into loggable text.
 *
 * @param error Whatever was caught.
 * @returns The `message` of an `Error` instance, otherwise the value's string conversion.
 */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
||
|
|
|
||
|
|
/**
 * Copies just the `catalog`, `db` and `name` fields of a table-like object,
 * dropping any additional properties it may carry.
 *
 * @param table Object that carries at least the table reference fields.
 * @returns A fresh object holding only those three fields.
 */
function toTableRef(table: KloTableRef): KloTableRef {
  const { catalog, db, name } = table;
  return { catalog, db, name };
}
|
||
|
|
|
||
|
|
/**
 * Maps `items` through an async `worker`, running at most `concurrencyLimit`
 * invocations at a time, and returns the outputs in input order.
 *
 * Every index of `items` is processed exactly once, including entries whose
 * value is `undefined`, so the result always has the same length as `items`.
 *
 * @param items Inputs to process.
 * @param concurrencyLimit Desired number of parallel workers. Values below 1
 *   and `NaN` fall back to a single worker; fractional values are rounded
 *   down; values above `items.length` are capped.
 * @param worker Called with each item and its index.
 * @returns Worker outputs, positioned at the index of their input.
 * @throws Rejects with the first error thrown by `worker`; other workers keep
 *   draining the queue in the background, as with any `Promise.all`.
 */
async function runWithConcurrency<TInput, TOutput>(
  items: readonly TInput[],
  concurrencyLimit: number,
  worker: (item: TInput, index: number) => Promise<TOutput>,
): Promise<TOutput[]> {
  const results = new Array<TOutput>(items.length);
  if (items.length === 0) {
    return results;
  }

  // NaN would otherwise propagate through Math.min/Math.max and produce zero
  // workers, silently skipping every item.
  const requested = Number.isNaN(concurrencyLimit) ? 1 : Math.floor(concurrencyLimit);
  const workerCount = Math.max(1, Math.min(requested, items.length));

  // Shared cursor: claiming an index and advancing it happen in the same
  // synchronous step, so no two workers ever receive the same index.
  let nextIndex = 0;
  const drainQueue = async (): Promise<void> => {
    while (nextIndex < items.length) {
      const index = nextIndex;
      nextIndex += 1;
      // The index is within bounds, so the element is a genuine TInput even
      // when TInput itself admits undefined.
      results[index] = await worker(items[index] as TInput, index);
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => drainQueue()));
  return results;
}
|
||
|
|
|
||
|
|
/**
 * Appends the word-limit instruction to a prompt, separated by a blank line.
 *
 * @param prompt Prompt text to extend.
 * @param maxWords Word limit quoted in the instruction.
 * @returns The prompt followed by the instruction sentence.
 */
export function appendKloWordLimitInstruction(prompt: string, maxWords: number): string {
  const instruction = `Please provide a concise description in ${maxWords} words or less.`;
  return [prompt, instruction].join('\n\n');
}
|
||
|
|
|
||
|
|
/**
 * Renders the LLM prompt used to describe a single column.
 *
 * The prompt contains the table context, the column name, up to five
 * non-null sample values, any usable existing descriptions, and fixed
 * guidance on phrasing. For BigQuery sources with nested analysis enabled, a
 * note is added when the samples look like nested/structured data.
 *
 * @param input Column name, sample values and surrounding context.
 * @returns The trimmed prompt text.
 */
export function buildKloColumnDescriptionPrompt(input: KloColumnDescriptionPromptInput): string {
  // Drop null/undefined before taking the first five so leading nulls do not
  // crowd out usable samples.
  const sampleValues = input.columnValues.filter((value) => value !== null && value !== undefined).slice(0, 5);

  // Plain objects and arrays are rendered as JSON so their content reaches the
  // model; String() would yield "[object Object]" or a bracket-less list.
  // Everything else (primitives, Date, other class instances) keeps String().
  const renderValue = (value: unknown): string => {
    if (typeof value === 'object' && value !== null) {
      const proto: unknown = Object.getPrototypeOf(value);
      if (Array.isArray(value) || proto === Object.prototype || proto === null) {
        try {
          return JSON.stringify(value) ?? String(value);
        } catch {
          // e.g. circular references or BigInt members
          return String(value);
        }
      }
    }
    return String(value);
  };

  const renderedValues = sampleValues.map(renderValue);
  const valuesStr = renderedValues.join(', ');

  let prompt = `Analyze this database column and provide a concise description:

<table_context> ${input.tableContext} </table_context>

<column_name> ${input.columnName} </column_name>

<sample_values> ${valuesStr} </sample_values>
`;

  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    prompt += '\nExisting descriptions from other sources:\n';
    for (const [source, text] of sources) {
      prompt += `<${source}_documentation> ${text} </${source}_documentation>\n`;
    }
    prompt +=
      '\nSynthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.\n';
  }

  prompt += `
Provide a brief description of what this column contains without repeating the column name.
Focus on the data's meaning and business purpose. Start directly with the content description.
Example:
"first names of individuals, likely employees or contacts" instead of "The column contains first names..."
"Job titles or roles of individuals..." instead of "This column contains job titles..."
`;

  if (input.dataSourceType === 'BIGQUERY' && input.supportsNestedAnalysis) {
    // Heuristic on the rendered text: JSON-rendered objects/arrays start with
    // "{" or "[", and serialized strings containing those markers also match.
    const hasNestedData = renderedValues.some(
      (text) => text.includes('nested') || text.includes('{') || text.includes('['),
    );
    if (hasNestedData) {
      prompt +=
        '\nNote: This column contains nested/structured data (JSON, STRUCT, or ARRAY) - describe its general business purpose and data organization.';
    }
  }

  return prompt.trim();
}
|
||
|
|
|
||
|
|
/**
 * Renders the LLM prompt used to describe a single table.
 *
 * At most the first ten headers are listed, each with the non-null values
 * found at that position in the first three sample rows. Usable existing
 * descriptions and a BigQuery note are added when applicable.
 *
 * @param input Table name, sampled data and surrounding context.
 * @returns The trimmed prompt text.
 */
export function buildKloTableDescriptionPrompt(input: KloTableDescriptionPromptInput): string {
  const { headers, rows } = input.sampleData;
  const previewRows = rows.slice(0, 3);
  const shownHeaderCount = Math.min(headers.length, 10);

  const columnInfo: string[] = [];
  for (let position = 0; position < shownHeaderCount; position += 1) {
    const rendered: string[] = [];
    for (const row of previewRows) {
      const cell = row[position];
      if (cell !== null && cell !== undefined) {
        rendered.push(String(cell));
      }
    }
    columnInfo.push(`${headers[position]}: ${rendered.join(', ')}`);
  }

  const sections: string[] = [
    `
Analyze this database table and provide a concise description:

Table: ${input.tableName}
Columns and sample data: ${columnInfo.join(' | ')}
Total rows in sample: ${rows.length}
Data source type: ${input.dataSourceType}
`,
  ];

  const sources = descriptionSources(input.rawDescriptions);
  if (sources.length > 0) {
    sections.push('\n Existing descriptions from other sources:\n');
    sources.forEach(([source, text]) => {
      sections.push(` ${source}: ${text}\n`);
    });
    sections.push(
      '\n Synthesize a description that captures the most important information from all sources. Prioritize the sources as authoritative context.\n',
    );
  }

  if (input.dataSourceType === 'BIGQUERY') {
    sections.push(
      "\nNote (Don't include this note in the final answer.): This is a BigQuery table which may contain nested structures, arrays, or other complex data types.",
    );
  }

  sections.push(`

Provide a brief description of what this table represents and its business purpose.
Do NOT list or describe individual columns or fields.
Start directly with the content description without mentioning the table name.
Focus on the data's meaning and business purpose.
Example: "Information about healthcare professionals used for workforce management" instead of "The blahblah table contains information about healthcare professionals including their names, titles..."
`);

  return sections.join('').trim();
}
|
||
|
|
|
||
|
|
/**
 * Renders the LLM prompt used to describe a whole data source.
 *
 * Each sampled table contributes its name, column count and sample row count;
 * a BigQuery note is added when the source type is `'BIGQUERY'`.
 *
 * @param input Sampled tables and the data source type.
 * @returns The trimmed prompt text.
 */
export function buildKloDataSourceDescriptionPrompt(input: KloDataSourceDescriptionPromptInput): string {
  const tableSummaries: string[] = [];
  for (const [tableName, sampleData] of input.tableSamples) {
    const columnCount = sampleData.headers.length;
    const rowCount = sampleData.rows.length;
    tableSummaries.push(`${tableName} (${columnCount} columns, ${rowCount} sample rows)`);
  }
  const tablesText = tableSummaries.join(' | ');

  const header = `
Analyze this database and provide a concise description:

Tables: ${tablesText}
Total tables analyzed: ${input.tableSamples.length}
Data source type: ${input.dataSourceType}
`;

  const bigQueryNote =
    input.dataSourceType === 'BIGQUERY'
      ? "\nNote (Don't include this note in the final answer): This is a BigQuery dataset which may contain large-scale analytics data, nested structures, and complex data types."
      : '';

  const guidance = `

Provide a direct, concise description of what this database represents and its business purpose.
Do NOT start with phrases like "This database appears to represent" or "This BigQuery dataset".
Start directly with the domain or business area description.
Focus on the overall data model and its intended use.
Example: "Healthcare-related database with a focus on patient management..." instead of "This database appears to represent a healthcare-related system..."
`;

  return (header + bigQueryNote + guidance).trim();
}
|
||
|
|
|
||
|
|
/**
 * Generates natural-language descriptions for columns, tables and whole data
 * sources by sampling data through a connector and prompting an LLM.
 *
 * Results are read from and written to the optional cache. Only successful
 * generations are cached: failure placeholders (LLM errors, empty responses)
 * are returned to the caller but never stored, so a transient failure cannot
 * be served as a description on later runs.
 */
export class KloDescriptionGenerator {
  private readonly llmProvider: KloLlmProvider;
  private readonly cache?: KloDescriptionCachePort;
  private readonly logger?: KloScanLoggerPort;
  private readonly settings: ResolvedKloDescriptionGenerationSettings;

  /**
   * @param options LLM provider, optional cache and logger, and generation
   *   settings. `concurrencyLimit` defaults to 5 when not provided.
   */
  constructor(options: KloDescriptionGeneratorOptions) {
    this.llmProvider = options.llmProvider;
    this.cache = options.cache;
    this.logger = options.logger;
    this.settings = {
      columnMaxWords: options.settings.columnMaxWords,
      tableMaxWords: options.settings.tableMaxWords,
      dataSourceMaxWords: options.settings.dataSourceMaxWords,
      // Only materialize the key when a value was supplied, so the optional
      // property is absent rather than explicitly undefined.
      ...(options.settings.temperature !== undefined ? { temperature: options.settings.temperature } : {}),
      concurrencyLimit: options.settings.concurrencyLimit ?? 5,
    };
  }

  /**
   * Describes every column of `input.table`, working on up to
   * `concurrencyLimit` columns at a time.
   *
   * @param input Table, connector and context for the run.
   * @returns One `[name, description]` pair per column, plus the names of the
   *   columns that were freshly generated (`processedColumns`) and of those
   *   whose existing or cached description was reused (`skippedColumns`).
   *   Columns that failed or had no usable samples appear in neither list.
   */
  async generateColumnDescriptions(input: KloGenerateColumnDescriptionsInput): Promise<KloColumnAnalysisResult> {
    const columnsToProcess = input.table.columns;
    const tableContext = `Table: ${input.table.name} | Columns: ${columnsToProcess.map((column) => column.name).join(', ')} | Data source: ${input.dataSourceType}`;

    const results = await runWithConcurrency(columnsToProcess, this.settings.concurrencyLimit, async (column) =>
      this.generateOneColumnDescription(input, column, tableContext),
    );

    const columnDescriptions: Array<[string, string | null]> = [];
    const processedColumns: string[] = [];
    const skippedColumns: string[] = [];

    for (const result of results) {
      columnDescriptions.push([result.columnName, result.description]);
      if (result.skipped) {
        skippedColumns.push(result.columnName);
      } else if (result.processed) {
        processedColumns.push(result.columnName);
      }
    }

    return {
      columnDescriptions,
      processedColumns,
      skippedColumns,
    };
  }

  /**
   * Describes one table from a sample of up to 20 rows.
   *
   * @param input Table, connector and context for the run.
   * @returns The cached or generated description. Returns `'Table not found'`
   *   when the connector cannot sample tables or sampling fails, and a
   *   failure placeholder when the LLM call fails or returns no text.
   */
  async generateTableDescription(input: KloGenerateTableDescriptionInput): Promise<string> {
    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildTableKey(tableRef);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    if (!input.connector.sampleTable) {
      this.logger?.warn('KLO scan connector does not support table sampling for table description generation', {
        connectorId: input.connector.id,
        table: input.table.name,
      });
      return 'Table not found';
    }

    try {
      const sampleData = await input.connector.sampleTable(
        {
          connectionId: input.connectionId,
          table: tableRef,
          limit: 20,
        },
        input.context,
      );
      const prompt = buildKloTableDescriptionPrompt({
        tableName: input.table.name,
        sampleData,
        dataSourceType: input.dataSourceType,
        rawDescriptions: input.table.rawDescriptions,
      });
      const outcome = await this.generateAiDescription(prompt, this.settings.tableMaxWords, 'klo-table-description');
      // Never persist a failure placeholder; it would mask the table's real
      // description on every later run.
      if (outcome.ok && cacheKey) {
        await this.cache?.set(cacheKey, outcome.description);
      }
      return outcome.description;
    } catch (error) {
      this.logger?.error(`Error generating table description: ${errorMessage(error)}`);
      return 'Table not found';
    }
  }

  /**
   * Describes a whole data source from samples (5 rows each) of its first ten
   * tables. Tables that cannot be sampled are logged and left out.
   *
   * @param input Tables, connector and context for the run. The cache is
   *   only consulted when `connectionName` is provided.
   * @returns The cached or generated description, or a fixed message when
   *   there are no tables, none can be sampled, or generation fails.
   */
  async generateDataSourceDescription(input: KloGenerateDataSourceDescriptionInput): Promise<string> {
    if (input.tables.length === 0) {
      return 'No tables found in database';
    }

    const cacheKey = input.connectionName ? this.cache?.buildConnectionKey(input.connectionName) : undefined;
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return cached;
      }
    }

    const { connector } = input;
    // Captured in a const so the presence check stays valid inside the
    // worker closure without a non-null assertion.
    const sampleTable = connector.sampleTable;
    if (!sampleTable) {
      this.logger?.warn('KLO scan connector does not support table sampling for data-source description generation', {
        connectorId: connector.id,
      });
      return 'No accessible tables found in database';
    }

    const tablesToAnalyze = input.tables.slice(0, 10);
    const tableSamples = await runWithConcurrency(tablesToAnalyze, this.settings.concurrencyLimit, async (table) => {
      try {
        // Invoke through `call` so the connector remains the method receiver.
        const sampleData = await sampleTable.call(
          connector,
          {
            connectionId: input.connectionId,
            table: toTableRef(table),
            limit: 5,
          },
          input.context,
        );
        return [table.name, sampleData] as [string, KloTableSampleResult];
      } catch (error) {
        this.logger?.warn(`Failed to sample table '${table.name}' for data source analysis - ${errorMessage(error)}`);
        return null;
      }
    });

    const accessibleSamples = tableSamples.filter(
      (sample): sample is [string, KloTableSampleResult] => sample !== null,
    );
    if (accessibleSamples.length === 0) {
      return 'No accessible tables found in database';
    }

    try {
      const prompt = buildKloDataSourceDescriptionPrompt({
        tableSamples: accessibleSamples,
        dataSourceType: input.dataSourceType,
      });
      const outcome = await this.generateAiDescription(
        prompt,
        this.settings.dataSourceMaxWords,
        'klo-data-source-description',
      );
      // Never persist a failure placeholder.
      if (outcome.ok && cacheKey) {
        await this.cache?.set(cacheKey, outcome.description);
      }
      return outcome.description;
    } catch (error) {
      this.logger?.error(`Error generating data source description: ${errorMessage(error)}`);
      return 'Failed to generate data source description';
    }
  }

  /**
   * Produces the description for a single column.
   *
   * Order of precedence: an existing description (when `skipExisting` is
   * set), a cached description, then fresh generation from the column's own
   * sample values or, failing that, up to 50 values sampled via the
   * connector.
   *
   * @returns The column outcome. `processed` is true only when the LLM
   *   produced a usable description; `skipped` is true when an existing or
   *   cached description was reused.
   */
  private async generateOneColumnDescription(
    input: KloGenerateColumnDescriptionsInput,
    column: KloDescriptionColumn,
    tableContext: string,
  ): Promise<ColumnTaskResult> {
    const existingDescription = input.existingDescriptions?.[column.name];
    if (input.skipExisting && existingDescription) {
      return {
        columnName: column.name,
        description: existingDescription,
        skipped: true,
        processed: false,
      };
    }

    const tableRef = toTableRef(input.table);
    const cacheKey = this.cache?.buildColumnKey(tableRef, column.name);
    if (cacheKey) {
      const cached = await this.cache?.get(cacheKey);
      if (cached) {
        return {
          columnName: column.name,
          description: cached,
          skipped: true,
          processed: false,
        };
      }
    }

    try {
      let columnValues = column.sampleValues;
      if (!columnValues || columnValues.length === 0) {
        if (!input.connector.sampleColumn) {
          this.logger?.warn('KLO scan connector does not support column sampling for column description generation', {
            connectorId: input.connector.id,
            table: input.table.name,
            column: column.name,
          });
          return {
            columnName: column.name,
            description: null,
            skipped: false,
            processed: false,
          };
        }

        const sample = await input.connector.sampleColumn(
          {
            connectionId: input.connectionId,
            table: tableRef,
            column: column.name,
            limit: 50,
          },
          input.context,
        );
        columnValues = sample.values;
      }

      // Without any non-null value there is nothing to describe.
      const nonNullValues = (columnValues ?? []).filter((value) => value !== null && value !== undefined);
      if (nonNullValues.length === 0) {
        return {
          columnName: column.name,
          description: null,
          skipped: false,
          processed: false,
        };
      }

      const prompt = buildKloColumnDescriptionPrompt({
        columnName: column.name,
        columnValues: nonNullValues,
        tableContext,
        dataSourceType: input.dataSourceType,
        supportsNestedAnalysis: input.supportsNestedAnalysis,
        rawDescriptions: column.rawDescriptions,
      });
      const outcome = await this.generateAiDescription(prompt, this.settings.columnMaxWords, 'klo-column-description');

      // Never persist a failure placeholder.
      if (outcome.ok && cacheKey) {
        await this.cache?.set(cacheKey, outcome.description);
      }

      return {
        columnName: column.name,
        description: outcome.description,
        skipped: false,
        // A failed LLM call is reported like any other failure in this
        // method: description carries the message, processed stays false.
        processed: outcome.ok,
      };
    } catch (error) {
      this.logger?.error(`Error analyzing column '${column.name}': ${errorMessage(error)}`);
      return {
        columnName: column.name,
        description: `Error generating description: ${errorMessage(error)}`,
        skipped: false,
        processed: false,
      };
    }
  }

  /**
   * Sends `prompt` (with the word-limit instruction appended) to the LLM.
   *
   * Never throws: errors are logged and reported through the outcome.
   *
   * @param prompt Prompt text without the word-limit instruction.
   * @param maxWords Word limit quoted in the appended instruction.
   * @param _operationName Label of the calling operation (currently unused).
   * @returns `ok: true` with the trimmed LLM text, or `ok: false` with a
   *   placeholder message when the call failed or returned no text.
   */
  private async generateAiDescription(
    prompt: string,
    maxWords: number,
    _operationName: string,
  ): Promise<{ description: string; ok: boolean }> {
    try {
      const text = await generateKloText({
        llmProvider: this.llmProvider,
        role: 'candidateExtraction',
        prompt: appendKloWordLimitInstruction(prompt, maxWords),
        temperature: this.settings.temperature,
      });
      const description = text.trim();
      if (!description) {
        return { description: 'Failed to generate description', ok: false };
      }
      return { description, ok: true };
    } catch (error) {
      this.logger?.error(`Error generating AI description: ${errorMessage(error)}`);
      return { description: `Error generating description: ${errorMessage(error)}`, ok: false };
    }
  }
}
|