diff --git a/expo/data/dataset.py b/expo/data/dataset.py index 8b0c5b980..91490dcd7 100644 --- a/expo/data/dataset.py +++ b/expo/data/dataset.py @@ -1,3 +1,4 @@ +import argparse import asyncio import json import os @@ -18,22 +19,22 @@ Report {metric} on the eval data. Do not plot or make any visualizations. """ USE_AG = """ -7. Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric. +- Please use autogluon for model training with presets='medium_quality', time_limit=None, give dev dataset to tuning_data, and use right eval_metric. """ TEXT_MODALITY = """ -7. You could use models from transformers library for this text dataset. -8. Use gpu if available for faster training. +- You could use models from transformers library for this text dataset. +- Use gpu if available for faster training. """ IMAGE_MODALITY = """ -7. You could use models from transformers/torchvision library for this image dataset. -8. Use gpu if available for faster training. +- You could use models from transformers/torchvision library for this image dataset. +- Use gpu if available for faster training. """ STACKING = """ -7. To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor. -8. You could do some quick model prototyping to see which models work best and then use them in the ensemble. +- To avoid overfitting, train a weighted ensemble model such as StackingClassifier or StackingRegressor. +- You could do some quick model prototyping to see which models work best and then use them in the ensemble. """ @@ -361,10 +362,22 @@ async def process_dataset(dataset, solution_designer: SolutionDesigner, save_ana datasets_dict["datasets"][dataset.name] = dataset_dict +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--force_update", action="store_true", help="Force update datasets") + parser.add_argument("--save_analysis_pool", action="store_true", help="Save analysis pool") + parser.add_argument( + "--no_save_analysis_pool", dest="save_analysis_pool", action="store_false", help="Do not save analysis pool" + ) + parser.set_defaults(save_analysis_pool=True) + return parser.parse_args() + + if __name__ == "__main__": datasets_dir = DATA_CONFIG["datasets_dir"] - force_update = False - save_analysis_pool = True + args = parse_args() + force_update = args.force_update + save_analysis_pool = args.save_analysis_pool datasets_dict = {"datasets": {}} solution_designer = SolutionDesigner() for dataset_id in OPENML_DATASET_IDS: diff --git a/expo/data/hf_data.py b/expo/data/hf_data.py index 133fbdfa6..a18517d49 100644 --- a/expo/data/hf_data.py +++ b/expo/data/hf_data.py @@ -7,7 +7,12 @@ import pandas as pd from datasets import load_dataset from PIL import Image -from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml +from expo.data.dataset import ( + ExpDataset, + parse_args, + process_dataset, + save_datasets_dict_to_yaml, +) from expo.insights.solution_designer import SolutionDesigner from expo.utils import DATA_CONFIG @@ -116,8 +121,9 @@ class HFExpDataset(ExpDataset): if __name__ == "__main__": dataset_dir = DATA_CONFIG["datasets_dir"] - save_analysis_pool = True - force_update = False + args = parse_args() + force_update = args.force_update + save_analysis_pool = args.save_analysis_pool datasets_dict = {"datasets": {}} solution_designer = SolutionDesigner() for dataset_meta in HFDATSETS: