update code

change dir, add new role
2026-06-17 15:35:21 +02:00 · 2024-03-22 10:22:49 +08:00 · 2024-03-22 10:22:49 +08:00 · 7bf4505d90
commit 7bf4505d90
parent 3fac156d66
11 changed files with 338 additions and 158 deletions
--- a/data/inference/const.py
+++ b/data/inference/const.py
@@ -28,22 +28,38 @@ SCIKIT_LEARN_IDS = [
    "scikit-learn__scikit-learn-10459",
 ]

+MATPLOTLIB_IDS = [
+    "matplotlib__matplotlib-24362",
+    "matplotlib__matplotlib-20584",
+    "matplotlib__matplotlib-23188",
+    "matplotlib__matplotlib-24403",
+    # 'matplotlib__matplotlib-21443',
+    # 'matplotlib__matplotlib-23047'
+]
+

 def read_sub_set_instance(path=SUBSET_DATASET, tag="scikit-learn"):
    try:
        df = pd.read_excel(path)
+        pass_filters = df["instance_id_pass"].tolist()
+        fail_filters = df["instance_id_fail"].tolist()
+        pass_filters = [s for s in pass_filters if tag in s]
+        fail_filters = [s for s in fail_filters if tag in s]
+        print(pass_filters)
+        print(fail_filters)
        # Filter for instances containing the tag in either column
-        pass_filter = df["instance_id_pass"].str.contains(tag, na=False)
-        fail_filter = df["instance_id_fail"].str.contains(tag, na=False)
+        # pass_filter = df["instance_id_pass"].str.contains(tag, na=False)
+        # fail_filter = df["instance_id_fail"].str.contains(tag, na=False)

        # Combine the filters using | (OR operator) for efficiency
-        combined_filter = pass_filter | fail_filter
+        # combined_filter = pass_filters | fail_filters

+        # print(df[combined_filter])
        # Apply combined filter and select the specific columns
-        filtered_df = df[combined_filter][["instance_id_pass", "instance_id_fail"]]
+        # filtered_df = df[combined_filter][["instance_id_pass", "instance_id_fail"]]

        # Flatten the DataFrame into a list and remove NaN values
-        subset_instance = filtered_df.stack().dropna().tolist()
+        subset_instance = pass_filters + fail_filters

        return subset_instance
    except FileNotFoundError:
@@ -52,3 +68,7 @@ def read_sub_set_instance(path=SUBSET_DATASET, tag="scikit-learn"):
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
+
+
+if __name__ == "__main__":
+    print(read_sub_set_instance(tag="matplotlib__matplotlib"))
--- a/data/inference/make_datasets/parse_utils.py
+++ b/data/inference/make_datasets/parse_utils.py
@@ -1,38 +0,0 @@
-import re
-
-
-def extract_scripts_from_codetext(codetext: str):
-    """
-    Extracts Python script file names from a given text that contains multiple sections.
-    Each section starts with '[start of <script_name>.py]' and ends with '[end of <script_name>.py]'.
-
-    Parameters:
-    - codetext (str): A string that may contain multiple sections, each indicating the start of a Python script file.
-
-    Returns:
-    - list: A list of extracted Python script file names.
-
-    Example of codetext:
-    '''
-    [end of README.rst]
-    [start of sklearn/compose/_target.py]
-    ... file content ...
-    [end of sklearn/compose/_target.py]
-    [start of another_module/example.py]
-    ... file content ...
-    [end of another_module/example.py]
-    '''
-    """
-    script_names = []
-
-    # Match all occurrences of '[start of <script_name>.py]'
-    matches = re.findall(r"\[start of ([^\]]+\.py)\]", codetext)
-
-    if matches:
-        for script_name in matches:
-            print("Extracted script name:", script_name)
-            script_names.append(script_name)
-    else:
-        print("No script names found in the text.")
-
-    return script_names
--- a/data/inference/make_datasets/repo_utils.py
+++ b/data/inference/make_datasets/repo_utils.py
@@ -1,87 +0,0 @@
-import os
-import subprocess
-from pathlib import Path
-from traceback import format_exc
-from typing import Dict
-
-import git
-from git.exc import GitError
-
-from metagpt.logs import logger
-
-KEY_INSTANCE_ID = "instance_id"
-RESET_FAILED = ">>>>> Reset Failed"
-
-
-class ExecWrapper:
-    def __init__(self, subprocess_args: Dict = None):
-        self.subprocess_args = subprocess_args or {}
-
-    def __call__(self, cmd, raise_error=True, **kwargs):
-        try:
-            combined_args = {**self.subprocess_args, **kwargs}
-            output = subprocess.run(cmd, **combined_args)
-            return output
-        except subprocess.CalledProcessError as e:
-            if raise_error:
-                error_message = (
-                    f"Error: {e}\nError stdout: {e.stdout}\nError stderr: {e.stderr}\nError traceback: {format_exc()}"
-                )
-                logger.error(error_message)
-                raise e
-
-
-class EnvManager:
-    def __init__(self, testbed):
-        shellenv = os.environ.copy()
-        self.testbed = testbed
-
-        self.exec = ExecWrapper(
-            subprocess_args={
-                "check": True,
-                "shell": False,
-                "capture_output": True,
-                "text": True,
-                "env": shellenv,
-            }
-        )
-
-    def clone_repo(self, repo_name: str, path: str, token: str = None):
-        if token is None:
-            token = os.environ.get("GITHUB_TOKEN", "git")
-            if not token:
-                raise ValueError("GitHub token is required for cloning repositories.")
-
-        repo_url = f"https://{token}@github.com/swe-bench/{repo_name.replace('/', '__')}.git"
-
-        try:
-            # Ensure the destination directory exists
-            os.makedirs(path, exist_ok=True)
-
-            # Clone the repository
-            git.Repo.clone_from(repo_url, path)
-            print(f"Repository '{repo_name}' cloned successfully.")
-        except GitError as e:
-            print(f"Failed to clone repository '{repo_name}': {e}")
-
-    def reset_task_env(self, instance: Dict):
-        """
-        Reset task environment + testbed and checkout base commit of given task instance
-        """
-        try:
-            gitignore_path = Path(".gitignore")
-            if gitignore_path.exists():
-                self.exec(["git", "ls-files", "--ignored", "--exclude-standard", "-o", "-z"], raise_error=False)
-                # fixme: need detect platform and change this cmd
-                # self.exec(["xargs", "-0", "-r", "rm", "-rf"], input=gitignore_path.read_text())
-
-            self.exec(["git", "restore", "."])
-            self.exec(["git", "reset", "HEAD", "."])
-            self.exec(["git", "clean", "-fdx"])
-            self.exec(["git", "-c", "advice.detachedHead=false", "checkout", instance["base_commit"]])
-            logger.info(f"[{instance['instance_id']}] Reset task environment to {instance['base_commit']}")
-            return True
-        except Exception as e:
-            err_msg = f"{RESET_FAILED}; Failed to reset task environment to {instance['base_commit']}: {e}"
-            logger.error(err_msg)
-            return False
--- a/data/inference/make_datasets/utils.py
+++ b/data/inference/make_datasets/utils.py
@@ -1,28 +0,0 @@
-import re
-
-
-def extract_diff(response):
-    """
-    Extracts the diff from a response formatted in different ways
-    """
-    if response is None:
-        return None
-    diff_matches = []
-    other_matches = []
-    pattern = re.compile(r"\<([\w-]+)\>(.*?)\<\/\1\>", re.DOTALL)
-    for code, match in pattern.findall(response):
-        if code in {"diff", "patch"}:
-            diff_matches.append(match)
-        else:
-            other_matches.append(match)
-    pattern = re.compile(r"```(\w+)?\n(.*?)```", re.DOTALL)
-    for code, match in pattern.findall(response):
-        if code in {"diff", "patch"}:
-            diff_matches.append(match)
-        else:
-            other_matches.append(match)
-    if diff_matches:
-        return diff_matches[0]
-    if other_matches:
-        return other_matches[0]
-    return response.split("</s>")[0]