mirror of
https://github.com/FoundationAgents/MetaGPT.git
synced 2026-06-11 15:15:18 +02:00
1. Add support for Hugging Face (HF) datasets
2. Add support for datasets that ship both train and test splits
3. Create the data folder
4. Fix the new-instruction bug
This commit is contained in:
parent
376d1b7661
commit
c0262bcd8f
5 changed files with 97 additions and 25 deletions
|
|
@ -6,7 +6,7 @@ import random
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from expo.dataset import generate_task_requirement, get_split_dataset_path
|
||||
from expo.data.dataset import generate_task_requirement, get_split_dataset_path
|
||||
from expo.evaluation.evaluation import evaluate_score
|
||||
from expo.insights.instruction_generator import InstructionGenerator
|
||||
from expo.research_assistant import ResearchAssistant
|
||||
|
|
|
|||
|
|
@ -86,6 +86,8 @@ CUSTOM_DATASETS = [
|
|||
("07_icr-identify-age-related-conditions", "Class"),
|
||||
]
|
||||
|
||||
DSAGENT_DATASETS = [("concrete-strength", "Strength"), ("smoker-status", "smoking"), ("software-defects", "defects")]
|
||||
|
||||
|
||||
def get_split_dataset_path(dataset_name, config):
|
||||
datasets_dir = config["datasets_dir"]
|
||||
|
|
@ -121,8 +123,8 @@ def get_user_requirement(task_name, config):
|
|||
)
|
||||
|
||||
|
||||
def save_datasets_dict_to_yaml(datasets_dict):
|
||||
with open("datasets.yaml", "w") as file:
|
||||
def save_datasets_dict_to_yaml(datasets_dict, name="datasets.yaml"):
|
||||
with open(name, "w") as file:
|
||||
yaml.dump(datasets_dict, file)
|
||||
|
||||
|
||||
|
|
@ -201,11 +203,15 @@ class ExpDataset:
|
|||
|
||||
def get_raw_dataset(self):
|
||||
raw_dir = Path(self.dataset_dir, self.name, "raw")
|
||||
train_df = None
|
||||
test_df = None
|
||||
if not os.path.exists(Path(raw_dir, "train.csv")):
|
||||
raise FileNotFoundError(f"Raw dataset `train.csv` not found in {raw_dir}")
|
||||
else:
|
||||
df = pd.read_csv(Path(raw_dir, "train.csv"))
|
||||
return df
|
||||
train_df = pd.read_csv(Path(raw_dir, "train.csv"))
|
||||
if os.path.exists(Path(raw_dir, "test.csv")):
|
||||
test_df = pd.read_csv(Path(raw_dir, "test.csv"))
|
||||
return train_df, test_df
|
||||
|
||||
def get_dataset_info(self):
|
||||
raw_df = pd.read_csv(Path(self.dataset_dir, self.name, "raw", "train.csv"))
|
||||
|
|
@ -249,10 +255,10 @@ class ExpDataset:
|
|||
return req
|
||||
|
||||
def save_dataset(self, target_col):
|
||||
df = self.get_raw_dataset()
|
||||
df, test_df = self.get_raw_dataset()
|
||||
if not self.check_dataset_exists() or self.force_update:
|
||||
print(f"Saving Dataset {self.name} in {self.dataset_dir}")
|
||||
self.split_and_save(df, target_col)
|
||||
self.split_and_save(df, target_col, test_df=test_df)
|
||||
else:
|
||||
print(f"Dataset {self.name} already exists")
|
||||
if not self.check_datasetinfo_exists() or self.force_update:
|
||||
|
|
@ -278,10 +284,13 @@ class ExpDataset:
|
|||
df_target = df_target.drop(columns=[target_col])
|
||||
df_target.to_csv(Path(path, f"split_{split}_target.csv"), index=False)
|
||||
|
||||
def split_and_save(self, df, target_col):
|
||||
def split_and_save(self, df, target_col, test_df=None):
|
||||
if not target_col:
|
||||
raise ValueError("Target column not provided")
|
||||
train, test = train_test_split(df, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED)
|
||||
if test_df is None:
|
||||
train, test = train_test_split(df, test_size=1 - TRAIN_TEST_SPLIT, random_state=SEED)
|
||||
else:
|
||||
train = df
|
||||
train, dev = train_test_split(train, test_size=1 - TRAIN_DEV_SPLIT, random_state=SEED)
|
||||
self.save_split_datasets(train, "train")
|
||||
self.save_split_datasets(dev, "dev", target_col)
|
||||
|
|
@ -304,7 +313,7 @@ class OpenMLExpDataset(ExpDataset):
|
|||
raw_dir = Path(self.dataset_dir, self.name, "raw")
|
||||
os.makedirs(raw_dir, exist_ok=True)
|
||||
dataset_df.to_csv(Path(raw_dir, "train.csv"), index=False)
|
||||
return dataset_df
|
||||
return dataset_df, None
|
||||
|
||||
def get_dataset_info(self):
|
||||
dataset_info = super().get_dataset_info()
|
||||
|
|
@ -315,14 +324,9 @@ class OpenMLExpDataset(ExpDataset):
|
|||
return dataset_info
|
||||
|
||||
|
||||
# class HFExpDataset(ExpDataset):
|
||||
# def __init__(self, name, dataset_dir, dataset_name, **kwargs):
|
||||
# super().__init__(name, dataset_dir, **kwargs)
|
||||
|
||||
|
||||
async def process_dataset(dataset, solution_designer, save_analysis_pool, datasets_dict):
|
||||
async def process_dataset(dataset, solution_designer: SolutionDesigner, save_analysis_pool, datasets_dict):
|
||||
if save_analysis_pool:
|
||||
asyncio.run(solution_designer.generate_solutions(dataset.get_dataset_info(), dataset.name))
|
||||
await solution_designer.generate_solutions(dataset.get_dataset_info(), dataset.name)
|
||||
dataset_dict = create_dataset_dict(dataset)
|
||||
datasets_dict["datasets"][dataset.name] = dataset_dict
|
||||
|
||||
|
|
@ -330,14 +334,18 @@ async def process_dataset(dataset, solution_designer, save_analysis_pool, datase
|
|||
if __name__ == "__main__":
|
||||
datasets_dir = "D:/work/automl/datasets"
|
||||
force_update = False
|
||||
save_analysis_pool = False
|
||||
save_analysis_pool = True
|
||||
datasets_dict = {"datasets": {}}
|
||||
solution_designer = SolutionDesigner()
|
||||
for dataset_id in OPENML_DATASET_IDS:
|
||||
openml_dataset = OpenMLExpDataset("", datasets_dir, dataset_id, force_update=force_update)
|
||||
asyncio.run(process_dataset(openml_dataset, solution_designer, save_analysis_pool, datasets_dict))
|
||||
# for dataset_id in OPENML_DATASET_IDS:
|
||||
# openml_dataset = OpenMLExpDataset("", datasets_dir, dataset_id, force_update=force_update)
|
||||
# asyncio.run(process_dataset(openml_dataset, solution_designer, save_analysis_pool, datasets_dict))
|
||||
|
||||
for dataset_name, target_col in CUSTOM_DATASETS:
|
||||
# for dataset_name, target_col in CUSTOM_DATASETS:
|
||||
# custom_dataset = ExpDataset(dataset_name, datasets_dir, target_col=target_col, force_update=force_update)
|
||||
# asyncio.run(process_dataset(custom_dataset, solution_designer, save_analysis_pool, datasets_dict))
|
||||
|
||||
for dataset_name, target_col in DSAGENT_DATASETS:
|
||||
custom_dataset = ExpDataset(dataset_name, datasets_dir, target_col=target_col, force_update=force_update)
|
||||
asyncio.run(process_dataset(custom_dataset, solution_designer, save_analysis_pool, datasets_dict))
|
||||
|
||||
64
expo/data/hf_data.py
Normal file
64
expo/data/hf_data.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from datasets import load_dataset
|
||||
|
||||
from expo.data.dataset import ExpDataset, process_dataset, save_datasets_dict_to_yaml
|
||||
from expo.insights.solution_designer import SolutionDesigner
|
||||
|
||||
HFDATSETS = [
|
||||
{"name": "sms_spam", "dataset_name": "ucirvine/sms_spam", "target_col": "label"},
|
||||
{"name": "banking77", "dataset_name": "PolyAI/banking77", "target_col": "label"},
|
||||
{"name": "gnad10", "dataset_name": "community-datasets/gnad10", "target_col": "label"},
|
||||
{"name": "oxford-iiit-pet", "dataset_name": "timm/oxford-iiit-pet", "target_col": "label"},
|
||||
{"name": "stanford_cars", "dataset_name": "tanganke/stanford_cars", "target_col": "label"},
|
||||
{"name": "fashion_mnist", "dataset_name": "zalando-datasets/fashion_mnist", "target_col": "label"},
|
||||
]
|
||||
|
||||
|
||||
class HFExpDataset(ExpDataset):
    """Experiment dataset whose raw data comes from a Hugging Face dataset.

    The dataset is loaded with ``load_dataset`` when the object is built.
    Its ``train`` split (and ``test`` split, when one exists) is cached as
    CSV under ``<dataset_dir>/<name>/raw`` the first time
    ``get_raw_dataset`` runs; later calls read the cached files.
    """

    # NOTE(review): these ratios are not referenced anywhere in the visible
    # code; confirm whether the base class or callers consume them.
    train_ratio = 0.6
    dev_ratio = 0.2
    test_ratio = 0.2

    def __init__(self, name, dataset_dir, dataset_name, **kwargs):
        """Load the Hugging Face dataset and initialise the base dataset.

        Args:
            name: Local name; used as the sub-directory of ``dataset_dir``.
            dataset_dir: Root directory that holds all datasets.
            dataset_name: Identifier passed to ``load_dataset``.
            **kwargs: Forwarded to ``ExpDataset.__init__``. ``target_col``
                is also read here and defaults to ``"label"``.
        """
        # These attributes are assigned before calling the base initialiser;
        # presumably it may invoke get_raw_dataset(), which needs them --
        # TODO confirm against ExpDataset.__init__.
        self.name = name
        self.dataset_dir = dataset_dir
        self.dataset_name = dataset_name
        self.target_col = kwargs.get("target_col", "label")
        self.dataset = load_dataset(dataset_name)
        super().__init__(self.name, dataset_dir, **kwargs)

    def get_raw_dataset(self):
        """Return ``(train_df, test_df)``, caching both splits as CSV.

        Returns:
            A tuple ``(train_df, test_df)``. ``test_df`` is ``None`` when no
            cached ``test.csv`` exists and the loaded dataset has no
            ``"test"`` split.
        """
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        raw_dir.mkdir(parents=True, exist_ok=True)
        train_path = raw_dir / "train.csv"
        test_path = raw_dir / "test.csv"

        if train_path.exists():
            df = pd.read_csv(train_path)
        else:
            df = self.dataset["train"].to_pandas()
            # index=False keeps the cached file's columns identical to the
            # in-memory frame; otherwise a re-read gains an unnamed index
            # column that the first run did not have.
            df.to_csv(train_path, index=False)

        if test_path.exists():
            test_df = pd.read_csv(test_path)
        elif "test" in self.dataset:
            test_df = self.dataset["test"].to_pandas()
            test_df.to_csv(test_path, index=False)
        else:
            test_df = None
        return df, test_df
|
||||
|
||||
|
||||
# Script entry point: builds an HFExpDataset for every entry in HFDATSETS,
# runs process_dataset on it via asyncio.run, and writes the accumulated
# datasets_dict to "hf_datasets.yaml".
# NOTE(review): dataset_dir is a hard-coded absolute path; adjust it for the
# local environment before running.
if __name__ == "__main__":
|
||||
dataset_dir = "D:/work/automl/datasets"
|
||||
save_analysis_pool = True
|
||||
datasets_dict = {"datasets": {}}
|
||||
solution_designer = SolutionDesigner()
|
||||
for dataset_meta in HFDATSETS:
|
||||
hf_dataset = HFExpDataset(
|
||||
dataset_meta["name"], dataset_dir, dataset_meta["dataset_name"], target_col=dataset_meta["target_col"]
|
||||
)
|
||||
asyncio.run(process_dataset(hf_dataset, solution_designer, save_analysis_pool, datasets_dict))
|
||||
save_datasets_dict_to_yaml(datasets_dict, "hf_datasets.yaml")
|
||||
|
|
@ -22,8 +22,8 @@ class MCTSExperimenter(Experimenter):
|
|||
|
||||
text, num_generated_codes = get_tree_text(mcts.root_node)
|
||||
text += f"Generated {num_generated_codes} unique codes.\n"
|
||||
text += f"Best node: {best_node}, score: {best_node.raw_reward}\n"
|
||||
text += f"Dev best node: {dev_best_node}, score: {dev_best_node.raw_reward}\n"
|
||||
text += f"Best node: {best_node.id}, score: {best_node.raw_reward}\n"
|
||||
text += f"Dev best node: {dev_best_node.id}, score: {dev_best_node.raw_reward}\n"
|
||||
print(text)
|
||||
self.save_tree(text)
|
||||
|
||||
|
|
|
|||
|
|
@ -84,7 +84,7 @@ class InstructionGenerator:
|
|||
new_instructions = []
|
||||
if len(data) == 0:
|
||||
mcts_logger.log("MCTS", f"No insights available for task {task_id}")
|
||||
return [original_instruction] # Return the original instruction if no insights are available
|
||||
# return [original_instruction] # Return the original instruction if no insights are available
|
||||
for i in range(max_num):
|
||||
if len(data) == 0:
|
||||
insights = "No insights available"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue