diff --git a/metagpt/prompts/ml_engineer.py b/metagpt/prompts/ml_engineer.py index 33eb9c40c..ff446281c 100644 --- a/metagpt/prompts/ml_engineer.py +++ b/metagpt/prompts/ml_engineer.py @@ -6,7 +6,7 @@ # @Desc : UPDATE_DATA_COLUMNS = """ # Background -Keep dataset column information updated to reflect changes in training or testing datasets, aiding in informed decision-making during data analysis. +Keep dataset column information updated before model train. ## Done Tasks ```python {history_code} @@ -18,15 +18,13 @@ Update and print the dataset's column information only if the train or test data from metagpt.tools.functions.libs.data_preprocess import get_column_info column_info = get_column_info(df) -print("df_column_info") +print("column_info") print(column_info) ```end # Constraints: - Use the DataFrame variable from 'Done Tasks' in place of df. - Import `get_column_info` only if it's not already imported. -- Skip update if no changes in training/testing data, except for initial data load. -- No need to update info if only model evaluation is performed. """ GEN_DATA_DESC_PROMPT = """ @@ -185,7 +183,7 @@ ojb_cols = train.select_dtypes(include='object').columns.tolist() for col in obj_cols: encoder = LabelEncoder() - train[col] = encoder.fit_transform(train[col]) + train[col] = encoder.fit_transform(train[col].unique().tolist() + ['unknown']) test[col] = test[col].apply(lambda x: x if x in encoder.classes_ else 'unknown') test[col] = encoder.transform(test[col]) @@ -241,6 +239,8 @@ from metagpt.tools.functions.libs.data_preprocess import FillMissingValue train_processed = train.copy() test_processed = test.copy() num_cols = train_processed.select_dtypes(include='number').columns.tolist() +if 'label' in num_cols: + num_cols.remove('label') fill_missing_value = FillMissingValue(features=num_cols, strategy='mean') fill_missing_value.fit(train_processed) train_processed = fill_missing_value.transform(train_processed) @@ -266,23 +266,29 @@ The current task is about data preprocessing, please note the following: - Monitor data types per column, applying appropriate methods. - Ensure operations are on existing dataset columns. - Avoid writing processed data to files. +- Avoid any change to label column, such as standardization, etc. - Prefer alternatives to one-hot encoding for categorical data. -- Only encode necessary categorical columns to allow for potential feature-specific engineering tasks later. +- Only encode or scale necessary columns to allow for potential feature-specific engineering tasks (like time_extract, binning, extraction, etc.) later. +- Each step do data preprocessing to train, must do same for test separately at the same time. """ FEATURE_ENGINEERING_PROMPT = """ The current task is about feature engineering. when performing it, please adhere to the following principles: -- Ensure operations are on existing dataset columns and consider the data type (numerical, categorical, etc.) and application scenario (classification, regression tasks, etc.). -- Create impactful features based on real-world knowledge and column info. -- Generate as diverse features as possible to improve the model's performance. +- Generate as diverse features as possible to improve the model's performance step-by-step. - If potential impactful features are not included in 'Code Steps', add new steps to generate them. +- Avoid creating redundant or excessively numerous features in one step. +- Exclude ID columns from feature generation and remove them. +- Each step do feature engineering to train, must do same for test separately at the same time. +- Avoid using the label column to create features, except for cat encoding. +- Use the data from previous task result if exist, do not mock or reload data yourself. """ MODEL_TRAIN_PROMPT = """ The current task is about training a model, please ensure high performance: - Keep in mind that your user prioritizes results and is highly focused on model performance. So, when needed, feel free to use models of any complexity to improve effectiveness, such as lightGBM, XGBoost, CatBoost, etc. -- Before training, first check not is_numeric_dtype columns and use label encoding to convert them to numeric columns. +- If non-numeric columns exist, perform label encode together with all steps. - Use the data from previous task result directly, do not mock or reload data yourself. +- Set suitable hyperparameters for the model, make metrics as high as possible. """ MODEL_EVALUATE_PROMPT = """ diff --git a/metagpt/roles/ml_engineer.py b/metagpt/roles/ml_engineer.py index 73aba1fe8..8ad7f43c9 100644 --- a/metagpt/roles/ml_engineer.py +++ b/metagpt/roles/ml_engineer.py @@ -80,8 +80,8 @@ class MLEngineer(Role): task.result = result self.plan.finish_current_task() self.working_memory.clear() - - if self.use_tools: + + if self.use_tools and task.task_type not in ['model_train', 'model_evaluate']: success, new_code = await self._update_data_columns() if success: task.code = task.code + "\n\n" + new_code @@ -120,6 +120,7 @@ class MLEngineer(Role): if is_update: result, success = await self.execute_code.run(code) if success: + print(result) self.data_desc["column_info"] = result return success, code @@ -269,7 +270,7 @@ if __name__ == "__main__": # requirement = f"This is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions.The target column is Class. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report f1 score on the eval data. Train data path: {data_path}/split_train.csv, eval data path: {data_path}/split_eval.csv." # data_path = f"{DATA_PATH}/santander-customer-transaction-prediction" - # requirement = f"This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report F1 Score on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv' ." + # requirement = f"This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report AUC Score on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv' ." data_path = f"{DATA_PATH}/house-prices-advanced-regression-techniques" requirement = f"This is a house price dataset, your goal is to predict the sale price of a property based on its features. The target column is SalePrice. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSE between the logarithm of the predicted value and the logarithm of the observed sales price on the eval data. Train data path: '{data_path}/split_train.csv', eval data path: '{data_path}/split_eval.csv'."