diff --git a/expo/MCTS.py b/expo/MCTS.py
index dd4ad50b1..9787ea5e9 100644
--- a/expo/MCTS.py
+++ b/expo/MCTS.py
@@ -177,6 +177,8 @@ class Node():
         preds.to_csv(pred_node_path, index=False)
         gt = pd.read_csv(gt_path)["target"]
         metric = self.state["dataset_config"]["metric"]
+        # remove original predictions.csv
+        os.remove(pred_path)
         return evaluate_score(preds, gt, metric)
 
     def evaluate_simulation(self, score_dict):
diff --git a/expo/experimenter/custom.py b/expo/experimenter/custom.py
index ff5ba3546..c3cb97b9c 100644
--- a/expo/experimenter/custom.py
+++ b/expo/experimenter/custom.py
@@ -16,9 +16,9 @@ class CustomExperimenter(Experimenter):
         self.result_path = f"results/custom_{self.name}"
         self.state = create_initial_state(self.task, start_task_id=1, data_config=self.data_config, low_is_better=self.low_is_better, name=self.name)
 
-    async def run_experiment(self):
+    def run_experiment(self):
         user_requirement = self.state["requirement"]
-        preds = await self.framework.run(user_requirement)
+        preds = self.framework.run(user_requirement)
         test_preds = preds["test_preds"]
         dev_preds = preds["dev_preds"]
         score_dict = {
diff --git a/expo/experimenter/experimenter.py b/expo/experimenter/experimenter.py
index 678d48d6a..949ab97f1 100644
--- a/expo/experimenter/experimenter.py
+++ b/expo/experimenter/experimenter.py
@@ -33,11 +33,22 @@ class Experimenter:
                 "user_requirement": user_requirement,
                 "args": vars(self.args)
             })
-        scores = [result["score_dict"]["test_score"] for result in results]
-        avg_score = sum(scores) / len(scores)
-        best_score = max(scores) if not self.args.low_is_better else min(scores)
-        best_score_idx = scores.index(best_score)
-        results.insert(0, {"avg_score": avg_score, "best_score": best_score, "best_score_idx": best_score_idx})
+        self.save_result(results) # save intermediate results
+        dev_scores = [result["score_dict"]["dev_score"] for result in results]
+        best_dev_score = max(dev_scores) if not self.args.low_is_better else min(dev_scores)
+        best_score_idx = dev_scores.index(best_dev_score)
+
+        test_scores = [result["score_dict"]["test_score"] for result in results]
+        avg_score = sum(test_scores) / len(test_scores)
+        global_best_score = max(test_scores) if not self.args.low_is_better else min(test_scores)
+
+        results.insert(0, {
+            "best_dev_score": best_dev_score,
+            "best_score_idx": best_score_idx,
+            "best_test_score": test_scores[best_score_idx],
+            "avg_test_score": avg_score,
+            "best_score": global_best_score
+        })
         self.save_result(results)
 
     def evaluate_prediction(self, split, state):
@@ -49,6 +60,7 @@ class Experimenter:
         preds.to_csv(pred_node_path, index=False)
         gt = pd.read_csv(gt_path)["target"]
         metric = state["dataset_config"]["metric"]
+        os.remove(pred_path)
         return evaluate_score(preds, gt, metric)
 
     def evaluate(self, score_dict, state):
@@ -61,6 +73,26 @@ class Experimenter:
 
 
     def save_result(self, result):
+        """Write `result` as JSON under `self.result_path`, prefixed with timing info.
+
+        A copy of `result` is made so the caller's list is not mutated.
+        """
+        time_format = "%Y%m%d%H%M"
+        end_time = datetime.datetime.now().strftime(time_format)
+        # The timestamps are calendar strings, so parse them before subtracting:
+        # float("202401011301") - float("202401011259") is 42, not 120 seconds.
+        # NOTE(review): assumes self.start_time was produced with the same format - confirm.
+        elapsed = (
+            datetime.datetime.strptime(end_time, time_format)
+            - datetime.datetime.strptime(self.start_time, time_format)
+        )
+        time_info = {
+            "start_time": self.start_time,
+            "end_time": end_time,
+            "duration (seconds)": elapsed.total_seconds()
+        }
+        result = result.copy()
+        result.insert(0, time_info)
         os.makedirs(self.result_path, exist_ok=True)
         with open(f"{self.result_path}/{self.args.exp_mode}-{self.args.task}_{self.start_time}.json", "w") as f:
             json.dump(result, f, indent=4)
diff --git a/expo/experimenter/mcts.py b/expo/experimenter/mcts.py
index 43c5f9868..e41f94d58 100644
--- a/expo/experimenter/mcts.py
+++ b/expo/experimenter/mcts.py
@@ -24,15 +24,16 @@ class MCTSExperimenter(Experimenter):
         print(text)
         self.save_tree(text)
 
-        results = {
+        results = [{
             "best_node": best_node.id,
             "best_node_score": best_node.raw_reward,
             "dev_best_node": dev_best_node.id,
             "dev_best_node_score": dev_best_node.raw_reward,
             "num_generated_codes": num_generated_codes,
             "user_requirement": best_node.state["requirement"],
+            "tree_text": text,
             "args": vars(self.args)
-        }
+        }]
         self.save_result(results)
 
 