Use pathlib.Path instead of os.path.join

2026-05-27 14:25:20 +02:00 · 2024-11-04 21:05:53 +08:00 · 2024-11-04 21:05:53 +08:00 · 4002e6f07e
commit 4002e6f07e
parent 5edeb9b5a8
9 changed files with 54 additions and 49 deletions
--- a/metagpt/ext/sela/data/custom_task.py
+++ b/metagpt/ext/sela/data/custom_task.py
@ -1,4 +1,5 @@
 import os
+from pathlib import Path

 from metagpt.ext.sela.data.dataset import SPECIAL_INSTRUCTIONS
 from metagpt.ext.sela.runner.mle_bench.instructions import (
@ -62,7 +63,7 @@ def get_mle_bench_requirements(dataset_dir, data_config, special_instruction, ob
        instructions = INSTRUCTIONS.format(dataset_dir=dataset_dir, output_dir=output_dir)
        task_file = "description.md"

-    with open(os.path.join(dataset_dir, task_file), encoding="utf-8") as f:
+    with open(Path(dataset_dir) / task_file, encoding="utf-8") as f:
        task_description = f.read()
    mle_requirement = MLE_REQUIREMENTS.format(
        instructions=instructions,
--- a/metagpt/ext/sela/data/dataset.py
+++ b/metagpt/ext/sela/data/dataset.py
@ -113,15 +113,15 @@ def get_split_dataset_path(dataset_name, config):
    datasets_dir = config["datasets_dir"]
    if dataset_name in config["datasets"]:
        dataset = config["datasets"][dataset_name]
-        data_path = os.path.join(datasets_dir, dataset["dataset"])
+        data_path = Path(datasets_dir) / dataset["dataset"]
        split_datasets = {
-            "train": os.path.join(data_path, "split_train.csv"),
-            "dev": os.path.join(data_path, "split_dev.csv"),
-            "dev_wo_target": os.path.join(data_path, "split_dev_wo_target.csv"),
-            "dev_target": os.path.join(data_path, "split_dev_target.csv"),
-            "test": os.path.join(data_path, "split_test.csv"),
-            "test_wo_target": os.path.join(data_path, "split_test_wo_target.csv"),
-            "test_target": os.path.join(data_path, "split_test_target.csv"),
+            "train": data_path / "split_train.csv",
+            "dev": data_path / "split_dev.csv",
+            "dev_wo_target": data_path / "split_dev_wo_target.csv",
+            "dev_target": data_path / "split_dev_target.csv",
+            "test": data_path / "split_test.csv",
+            "test_wo_target": data_path / "split_test_wo_target.csv",
+            "test_target": data_path / "split_test_target.csv",
        }
        return split_datasets
    else:
@ -131,10 +131,8 @@ def get_split_dataset_path(dataset_name, config):


 def get_user_requirement(task_name, config):
-    # datasets_dir = config["datasets_dir"]
    if task_name in config["datasets"]:
        dataset = config["datasets"][task_name]
-        # data_path = os.path.join(datasets_dir, dataset["dataset"])
        user_requirement = dataset["user_requirement"]
        return user_requirement
    else:
@ -220,22 +218,22 @@ class ExpDataset:
            "split_test_target.csv",
        ]
        for fname in fnames:
-            if not os.path.exists(Path(self.dataset_dir, self.name, fname)):
+            if not Path(self.dataset_dir, self.name, fname).exists():
                return False
        return True

    def check_datasetinfo_exists(self):
-        return os.path.exists(Path(self.dataset_dir, self.name, "dataset_info.json"))
+        return Path(self.dataset_dir, self.name, "dataset_info.json").exists()

    def get_raw_dataset(self):
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        train_df = None
        test_df = None
-        if not os.path.exists(Path(raw_dir, "train.csv")):
+        if not Path(raw_dir, "train.csv").exists():
            raise FileNotFoundError(f"Raw dataset `train.csv` not found in {raw_dir}")
        else:
            train_df = pd.read_csv(Path(raw_dir, "train.csv"))
-        if os.path.exists(Path(raw_dir, "test.csv")):
+        if Path(raw_dir, "test.csv").exists():
            test_df = pd.read_csv(Path(raw_dir, "test.csv"))
        return train_df, test_df

--- a/metagpt/ext/sela/data/hf_data.py
+++ b/metagpt/ext/sela/data/hf_data.py
@ -1,6 +1,5 @@
 import asyncio
 import io
-import os
 from pathlib import Path

 import pandas as pd
@ -63,7 +62,7 @@ class HFExpDataset(ExpDataset):
        raw_dir = Path(self.dataset_dir, self.name, "raw")
        raw_dir.mkdir(parents=True, exist_ok=True)

-        if os.path.exists(Path(raw_dir, "train.csv")):
+        if Path(raw_dir, "train.csv").exists():
            df = pd.read_csv(Path(raw_dir, "train.csv"), encoding="utf-8")
        else:
            df = self.dataset["train"].to_pandas()
@ -73,7 +72,7 @@ class HFExpDataset(ExpDataset):

            df.to_csv(Path(raw_dir, "train.csv"), index=False, encoding="utf-8")

-        if os.path.exists(Path(raw_dir, "test.csv")):
+        if Path(raw_dir, "test.csv").exists():
            test_df = pd.read_csv(Path(raw_dir, "test.csv"), encoding="utf-8")
        else:
            if self.dataset and "test" in self.dataset:
--- a/metagpt/ext/sela/experimenter.py
+++ b/metagpt/ext/sela/experimenter.py
@ -2,7 +2,7 @@ from __future__ import annotations

 import asyncio
 import json
-import os
+from pathlib import Path

 from pydantic import model_validator

@ -172,7 +172,7 @@ class Experimenter(DataInterpreter):
            mcts_logger.log("MCTS", "Static Saving")
        stg_path = self.role_dir
        name = self.get_node_name()
-        role_path = os.path.join(stg_path, f"{name}.json")
+        role_path = Path(stg_path) / f"{name}.json"
        # save state as json file
        write_json_file(role_path, self.model_dump())

--- a/metagpt/ext/sela/runner/autogluon.py
+++ b/metagpt/ext/sela/runner/autogluon.py
@ -1,9 +1,10 @@
-import os
 from datetime import datetime
+from pathlib import Path

 import pandas as pd

 from metagpt.ext.sela.runner.custom import CustomRunner
+from metagpt.ext.sela.utils import DATA_CONFIG


 class AGRunner:
@ -80,7 +81,7 @@ class AGRunner:
        """

        # Define the root path to append
-        root_folder = os.path.join("F:/Download/Dataset/", self.state["task"])
+        root_folder = Path(DATA_CONFIG["datasets_dir"]) / self.state["task"]

        # Load the datasets
        train_data = pd.read_csv(train_path)
@ -92,12 +93,10 @@ class AGRunner:
        image_column = train_data.columns[0]

        # Append root folder path to the image column in each dataset
-        train_data[image_column] = train_data[image_column].apply(lambda x: os.path.join(root_folder, x))
-        dev_data[image_column] = dev_data[image_column].apply(lambda x: os.path.join(root_folder, x))
-        dev_wo_target_data[image_column] = dev_wo_target_data[image_column].apply(
-            lambda x: os.path.join(root_folder, x)
-        )
-        test_data[image_column] = test_data[image_column].apply(lambda x: os.path.join(root_folder, x))
+        train_data[image_column] = train_data[image_column].apply(lambda x: str(root_folder / x))  # keep str paths
+        dev_data[image_column] = dev_data[image_column].apply(lambda x: str(root_folder / x))
+        dev_wo_target_data[image_column] = dev_wo_target_data[image_column].apply(lambda x: str(root_folder / x))
+        test_data[image_column] = test_data[image_column].apply(lambda x: str(root_folder / x))

        return train_data, dev_data, dev_wo_target_data, test_data

--- a/metagpt/ext/sela/runner/custom.py
+++ b/metagpt/ext/sela/runner/custom.py
@ -1,4 +1,4 @@
-import os
+from pathlib import Path

 import pandas as pd

@ -47,7 +47,7 @@ class CustomRunner(Runner):

    def evaluate_predictions(self, preds, split):
        metric = self.state["dataset_config"]["metric"]
-        gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"])
+        gt_path = Path(self.state["datasets_dir"][f"{split}_target"])
        gt = pd.read_csv(gt_path)["target"]
        score = evaluate_score(preds, gt, metric)
        return score
--- a/metagpt/ext/sela/runner/runner.py
+++ b/metagpt/ext/sela/runner/runner.py
@ -1,6 +1,7 @@
 import datetime
 import json
 import os
+from pathlib import Path

 import numpy as np
 import pandas as pd
@ -94,10 +95,10 @@ class Runner:
        self.save_result(results)

    def evaluate_prediction(self, split, state):
-        pred_path = os.path.join(state["work_dir"], state["task"], f"{split}_predictions.csv")
+        pred_path = Path(state["work_dir"]) / state["task"] / f"{split}_predictions.csv"
        os.makedirs(state["node_dir"], exist_ok=True)
-        pred_node_path = os.path.join(state["node_dir"], f"{self.start_time}-{split}_predictions.csv")
-        gt_path = os.path.join(state["datasets_dir"][f"{split}_target"])
+        pred_node_path = Path(state["node_dir"]) / f"{self.start_time}-{split}_predictions.csv"
+        gt_path = Path(state["datasets_dir"][f"{split}_target"])  # datasets_dir maps split keys to file paths
        preds = pd.read_csv(pred_path)
        preds = preds[preds.columns.tolist()[-1]]
        preds.to_csv(pred_node_path, index=False)
--- a/metagpt/ext/sela/search/tree_search.py
+++ b/metagpt/ext/sela/search/tree_search.py
@ -2,6 +2,7 @@ import json
 import os
 import pickle
 import shutil
+from pathlib import Path

 import numpy as np
 import pandas as pd
@ -95,7 +96,9 @@ def create_initial_state(task: str, start_task_id: int, data_config: dict, args)
    initial_state = {
        "task": task,
        "work_dir": data_config["work_dir"],
-        "node_dir": os.path.join(data_config["work_dir"], data_config["role_dir"], f"{task}{args.name}"),
+        "node_dir": os.path.join(
+            data_config["work_dir"], data_config["role_dir"], f"{task}{args.name}"
+        ),  # kept as a str: Path cannot be used here because of Pydantic
        "dataset_config": dataset_config,
        "datasets_dir": datasets_dir,  # won't be used if external eval is used
        "exp_pool_path": exp_pool_path,
@ -145,12 +148,15 @@ class Node:
        return hash(self.id)

    def save_node(self):
-        os.makedirs(self.state["node_dir"], exist_ok=True)
-        with open(os.path.join(self.state["node_dir"], f"Node-{self.id}.pkl"), "wb") as f:
+        node_dir = Path(self.state["node_dir"])
+        node_dir.mkdir(parents=True, exist_ok=True)
+        node_path = node_dir / f"Node-{self.id}.pkl"
+        with node_path.open("wb") as f:
            pickle.dump(self, f)

    def load_node(self):
-        with open(os.path.join(self.state["node_dir"], f"Node-{self.id}.pkl"), "rb") as f:
+        node_path = Path(self.state["node_dir"]) / f"Node-{self.id}.pkl"
+        with node_path.open("rb") as f:
            return pickle.load(f)

    def get_depth(self):
@ -195,7 +201,7 @@ class Node:

    def get_role_path(self):
        fname = f"Node-{self.id}.json"
-        role_path = os.path.join(self.state["node_dir"], fname)
+        role_path = Path(self.state["node_dir"]) / fname
        return role_path

    def load_role(self):
@ -239,17 +245,17 @@ class Node:
            self.add_child(node)

    def get_predictions_path(self, split):
-        return os.path.join(self.state["node_dir"], f"Node-{self.id}-{split}_predictions.csv")
+        return Path(self.state["node_dir"]) / f"Node-{self.id}-{split}_predictions.csv"

    def get_and_move_predictions(self, split):
-        if not os.path.exists(self.get_predictions_path(split)):
-            pred_path = os.path.join(self.state["work_dir"], self.state["task"], f"{split}_predictions.csv")
+        if not self.get_predictions_path(split).exists():
+            pred_path = Path(self.state["work_dir"]) / self.state["task"] / f"{split}_predictions.csv"
            shutil.copy(pred_path, self.get_predictions_path(split))
            os.remove(pred_path)
        return pd.read_csv(self.get_predictions_path(split))

    def get_gt(self, split):
-        gt_path = os.path.join(self.state["datasets_dir"][f"{split}_target"])
+        gt_path = Path(self.state["datasets_dir"][f"{split}_target"])
        return pd.read_csv(gt_path)

    def evaluate_prediction(self, split):
@ -391,11 +397,11 @@ class BaseTreeSearch:

    def save_node_order(self, node_id: str):
        self.node_order.append(node_id)
-        with open(os.path.join(self.root_node.state["node_dir"], "node_order.json"), "w") as f:
+        with open(Path(self.root_node.state["node_dir"]) / "node_order.json", "w") as f:
            json.dump(self.node_order, f)

    def load_node_order(self):
-        with open(os.path.join(self.root_node.state["node_dir"], "node_order.json"), "r") as f:
+        with open(Path(self.root_node.state["node_dir"]) / "node_order.json", "r") as f:
            self.node_order = json.load(f)

    def get_score_order_dict(self):
@ -481,8 +487,9 @@ class BaseTreeSearch:
        # Load all pkl files in the node_dir
        all_pkl_files = os.listdir(self.root_node.state["node_dir"])
        all_pkl_files = [f for f in all_pkl_files if f.endswith(".pkl")]
-        if os.path.exists(os.path.join(self.root_node.state["node_dir"], "Node-0.pkl")):
-            with open(os.path.join(self.root_node.state["node_dir"], "Node-0.pkl"), "rb") as f:
+        node_0_path = Path(self.root_node.state["node_dir"]) / "Node-0.pkl"
+        if node_0_path.exists():
+            with open(node_0_path, "rb") as f:
                self.root_node = pickle.load(f)
            self.children[self.root_node] = self.root_node.children
            load_children_node(self.root_node)
--- a/metagpt/ext/sela/utils.py
+++ b/metagpt/ext/sela/utils.py
@ -45,13 +45,13 @@ def get_exp_pool_path(task_name, data_config, pool_name="analysis_pool"):
    datasets_dir = data_config["datasets_dir"]
    if task_name in data_config["datasets"]:
        dataset = data_config["datasets"][task_name]
-        data_path = os.path.join(datasets_dir, dataset["dataset"])
+        data_path = Path(datasets_dir) / dataset["dataset"]
    else:
        raise ValueError(
            f"Dataset {task_name} not found in config file. Available datasets: {data_config['datasets'].keys()}"
        )
-    exp_pool_path = os.path.join(data_path, f"{pool_name}.json")
-    if not os.path.exists(exp_pool_path):
+    exp_pool_path = Path(data_path) / f"{pool_name}.json"
+    if not exp_pool_path.exists():
        return None
    return exp_pool_path