diff --git a/expo/README.md b/expo/README.md index 856d616b8..6e4081031 100644 --- a/expo/README.md +++ b/expo/README.md @@ -1,10 +1,17 @@ # Expo +## Setup +In the root directory, `pip install -e .` + +`cd expo` + +`pip install -r requirements.txt` + ## Instruction - 下载数据集:https://deepwisdom.feishu.cn/drive/folder/RVyofv9cvlvtxKdddt2cyn3BnTc?from=from_copylink - +- 修改`data.yaml`的`datasets_dir`为数据集合集根目录存储位置 ## Examples @@ -29,6 +36,11 @@ ### Run DI MCTS - `python run_experiment.py --exp_mode mcts --task househouse_prices --rollout 5 --low_is_better` +## Custom Experimenter + + + + ## Code and Configs Explanation diff --git a/expo/data.yaml b/expo/data.yaml index df26e29e8..050b0b893 100644 --- a/expo/data.yaml +++ b/expo/data.yaml @@ -2,34 +2,35 @@ datasets_dir: "D:/work/automl/datasets" # path to the datasets directory datasets: titanic: - dataset: "04_titanic" - user_requirement: "This is a titanic passenger survival dataset, your goal is to predict passenger survival outcome. The target column is Survived. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report accuracy on the eval data. Don't plot." - metric: "accuracy" + dataset: 04_titanic + metric: f1 + user_requirement: "This is a 04_titanic dataset. Your goal is to predict the target\ + \ column `Survived`.\nPerform data analysis, data preprocessing, feature engineering,\ + \ and modeling to predict the target. \nReport f1 on the eval data. Do not plot\ + \ or make any visualizations.\n" house_prices: - dataset: "05_house-prices-advanced-regression-techniques" - user_requirement: "This is a house price dataset, your goal is to predict the sale price of a property based on its features. Make sure to generate at least 5 tasks each time, including eda, data preprocessing, feature engineering, model training to predict the target, and model evaluation. Report RMSE between the logarithm of the predicted value and the logarithm of the observed sale prices on the eval data. The target column is 'SalePrice'. Please do not include any processing of the target column in the data preprocessing and feature engineering stages. Don't plot." - metric: "log rmse" + dataset: 05_house-prices-advanced-regression-techniques + metric: rmse + user_requirement: "This is a 05_house-prices-advanced-regression-techniques dataset.\ + \ Your goal is to predict the target column `SalePrice`.\nPerform data analysis,\ + \ data preprocessing, feature engineering, and modeling to predict the target.\ + \ \nReport rmse on the eval data. Do not plot or make any visualizations.\n" santander_customers: - dataset: "06_santander-customer-transaction-prediction" - user_requirement: "This is a customers financial dataset. Your goal is to predict which customers will make a specific transaction in the future. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report AUC on the eval data. Don't plot." - metric: "auc" - + dataset: 06_santander-customer-transaction-prediction + metric: f1 + user_requirement: "This is a 06_santander-customer-transaction-prediction dataset.\ + \ Your goal is to predict the target column `target`.\nPerform data analysis,\ + \ data preprocessing, feature engineering, and modeling to predict the target.\ + \ \nReport f1 on the eval data. Do not plot or make any visualizations.\n" icr: - dataset: "07_icr-identify-age-related-conditions" - user_requirement: "ICR dataset is a medical dataset with over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions. Make sure to generate at least 5 tasks each time, including eda, data preprocessing, feature engineering, model training to predict the target, and model evaluation. The target column is Class. Report F1 Score on the eval data. Don't plot." - metric: "f1" - - santander_value: - dataset: "08_santander-value-prediction-challenge" - user_requirement: "This is a regression problem. Your goal is to predict the value of transactions for potential customers. The target column is target. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Report RMSE on the eval data. Don't plot." - metric: "rmse" - - load_wine: - dataset: None - user_requirement: "Analyze the 'load_wine' dataset from sklearn to predict wine quality. Visualize relationships between features, use machine learning for classification, and report model accuracy. Include analysis and prediction visualizations. Perform data analysis, data preprocessing, feature engineering, and modeling to predict the target. Don't plot!" - metric: "accuracy" + dataset: 07_icr-identify-age-related-conditions + metric: f1 + user_requirement: "This is a 07_icr-identify-age-related-conditions dataset. Your\ + \ goal is to predict the target column `Class`.\nPerform data analysis, data\ + \ preprocessing, feature engineering, and modeling to predict the target. \n\ + Report f1 on the eval data. Do not plot or make any visualizations.\n" lick_prediction_small: dataset: Click_prediction_small diff --git a/expo/requirements.txt b/expo/requirements.txt new file mode 100644 index 000000000..04de1a8bb --- /dev/null +++ b/expo/requirements.txt @@ -0,0 +1,5 @@ +# expo +openml==0.14.2 +# ml module to run in DI +xgboost +catboost diff --git a/requirements.txt b/requirements.txt index 271fade14..8bf0ee399 100644 --- a/requirements.txt +++ b/requirements.txt @@ -79,9 +79,3 @@ gymnasium==0.29.1 boto3~=1.34.69 spark_ai_python~=0.3.30 agentops -openml==0.14.2 - -# ml module to run in DI -xgboost -catboost -lightgbm