# --- data/inference/make_datasets/repo_utils.py ---

KEY_INSTANCE_ID = "instance_id"
RESET_FAILED = ">>>>> Reset Failed"


class ExecWrapper:
    """Callable wrapper around ``subprocess.run`` with preset keyword arguments."""

    def __init__(self, subprocess_args: Dict = None):
        """Store the keyword arguments applied to every ``subprocess.run`` call.

        Args:
            subprocess_args: Default keyword arguments for ``subprocess.run``;
                ``None`` means no defaults.
        """
        # Copy so that later mutation of the caller's dict cannot silently
        # change the defaults used by this wrapper.
        self.subprocess_args = dict(subprocess_args or {})

    def __call__(self, cmd, raise_error=True, **kwargs):
        """Run ``cmd`` through ``subprocess.run``.

        Args:
            cmd: Command passed straight to ``subprocess.run``.
            raise_error: When true, a ``CalledProcessError`` is logged and
                re-raised; when false it is suppressed.
            **kwargs: Per-call keyword arguments; they override the presets
                given to the constructor.

        Returns:
            The ``subprocess.CompletedProcess`` of the call, or ``None`` when
            the command raised ``CalledProcessError`` and ``raise_error`` is
            false.

        Raises:
            subprocess.CalledProcessError: The command failed (this is only
                raised by ``subprocess.run`` when ``check`` is true) and
                ``raise_error`` is true.
        """
        combined_args = {**self.subprocess_args, **kwargs}
        try:
            return subprocess.run(cmd, **combined_args)
        except subprocess.CalledProcessError as e:
            if not raise_error:
                return None
            logger.error(
                f"Error: {e}\nError stdout: {e.stdout}\nError stderr: {e.stderr}\n"
                f"Error traceback: {format_exc()}"
            )
            # Bare ``raise`` keeps the original traceback intact.
            raise


class EnvManager:
    """Clone task repositories and reset a checkout to a task's base commit.

    ``testbed`` is stored on the instance; the methods of this class do not
    read it and run their git commands in the current working directory.
    """

    def __init__(self, testbed):
        """Create the manager and its command runner.

        Args:
            testbed: Stored as ``self.testbed``.
        """
        self.testbed = testbed
        # Every command is checked, runs without a shell, captures text output
        # and sees a snapshot of the environment taken at construction time.
        self.exec = ExecWrapper(
            subprocess_args={
                "check": True,
                "shell": False,
                "capture_output": True,
                "text": True,
                "env": os.environ.copy(),
            }
        )

    def clone_repo(self, repo_name: str, path: str, token: str = None):
        """Clone ``swe-bench/<owner>__<repo>`` from GitHub into ``path``.

        Args:
            repo_name: Repository name; every ``/`` is replaced by ``__`` to
                build the name under the ``swe-bench`` organisation.
            path: Destination directory; created if it does not exist.
            token: Credential embedded in the clone URL. Defaults to the
                ``GITHUB_TOKEN`` environment variable, or ``"git"`` when that
                variable is unset.

        Returns:
            ``True`` when the clone succeeded, ``False`` when GitPython
            reported a ``GitError``.

        Raises:
            ValueError: The resolved token is empty.
        """
        if token is None:
            token = os.environ.get("GITHUB_TOKEN", "git")
        if not token:
            raise ValueError("GitHub token is required for cloning repositories.")

        repo_url = f"https://{token}@github.com/swe-bench/{repo_name.replace('/', '__')}.git"

        # Ensure the destination directory exists.
        os.makedirs(path, exist_ok=True)

        try:
            git.Repo.clone_from(repo_url, path)
        except GitError as e:
            # The error text may quote the clone URL; never log the credential.
            reason = str(e).replace(f"{token}@", "***@")
            logger.error(f"Failed to clone repository '{repo_name}': {reason}")
            return False

        logger.info(f"Repository '{repo_name}' cloned successfully.")
        return True

    def _remove_ignored_files(self):
        """Delete untracked files that match the ignore rules, if a .gitignore exists."""
        if not Path(".gitignore").exists():
            return

        listing = self.exec(
            ["git", "ls-files", "--ignored", "--exclude-standard", "-o", "-z"],
            raise_error=False,
        )
        # ``self.exec`` returns None when the listing failed with errors
        # suppressed; an empty listing means there is nothing to delete.
        if listing is None or not listing.stdout:
            return

        # Feed the NUL-separated paths reported by git to xargs. (Passing the
        # text of .gitignore itself would hand rm a single bogus path.)
        self.exec(["xargs", "-0", "-r", "rm", "-rf"], input=listing.stdout)

    def reset_task_env(self, instance: Dict):
        """Reset the working tree and check out the task's base commit.

        Args:
            instance: Task instance mapping; ``"base_commit"`` is required and
                ``"instance_id"`` is used for the log message.

        Returns:
            ``True`` on success, ``False`` when any step failed (the failure is
            logged with the ``RESET_FAILED`` marker).
        """
        try:
            # Read the target first so that a malformed instance fails before
            # any destructive git command has run.
            base_commit = instance["base_commit"]

            self._remove_ignored_files()

            self.exec(["git", "restore", "."])
            self.exec(["git", "reset", "HEAD", "."])
            self.exec(["git", "clean", "-fdx"])
            self.exec(["git", "-c", "advice.detachedHead=false", "checkout", base_commit])
            logger.info(f"[{instance.get(KEY_INSTANCE_ID)}] Reset task environment to {base_commit}")
            return True
        except Exception as e:
            # ``.get`` keeps the handler itself from raising on a missing key.
            err_msg = (
                f"{RESET_FAILED}; Failed to reset task environment to "
                f"{instance.get('base_commit')}: {e}"
            )
            logger.error(err_msg)
            return False


# --- data/inference/make_datasets/utils.py ---

# <tag>...</tag> sections; the closing tag must repeat the opening tag name.
_TAGGED_BLOCK = re.compile(r"<([\w-]+)>(.*?)</\1>", re.DOTALL)
# ```lang\n...``` fenced sections; the language label is optional.
_FENCED_BLOCK = re.compile(r"```(\w+)?\n(.*?)```", re.DOTALL)
_DIFF_LABELS = frozenset({"diff", "patch"})


def extract_diff(response):
    """Extract the diff from a response formatted in different ways.

    Tagged sections are scanned first, then fenced code blocks. The first
    section labelled ``diff`` or ``patch`` wins; otherwise the first section
    with any other label is returned.

    Args:
        response: Response text, or ``None``.

    Returns:
        ``None`` when ``response`` is ``None``; otherwise the selected section
        content, or -- when the response contains no section at all -- the
        text preceding the first ``</s>`` marker (the whole response if the
        marker is absent).
    """
    if response is None:
        return None

    diff_matches = []
    other_matches = []
    for pattern in (_TAGGED_BLOCK, _FENCED_BLOCK):
        for label, content in pattern.findall(response):
            if label in _DIFF_LABELS:
                diff_matches.append(content)
            else:
                other_matches.append(content)

    if diff_matches:
        return diff_matches[0]
    if other_matches:
        return other_matches[0]
    # NOTE(review): the separator was an empty string, which makes str.split
    # raise ValueError; "</s>" (end-of-sequence marker) is assumed to be the
    # intended separator -- confirm against the upstream implementation.
    return response.split("</s>")[0]
# -*- coding: utf-8 -*-
# @Author  : stellahong (stellahong@fuzhi.ai)
# @Desc    : Run run_api.py through runpy with a preset command line.
import runpy
import sys

# Keep a snapshot of the current argument vector so it can be put back later.
original_argv = sys.argv[:]

try:
    # Command-line arguments handed to the script being launched.
    sys.argv = [
        "run_api.py",
        "--dataset_name_or_path",
        "princeton-nlp/SWE-bench_oracle",
        "--output_dir",
        "./outputs",
    ]
    # Further optional arguments can be appended to sys.argv, for example:
    # sys.argv.extend(['--some_option', 'some_value'])

    # Execute the script as if it had been started as the main program.
    runpy.run_path("run_api.py", run_name="__main__")
finally:
    # Put the snapshot back so that later code is not affected.
    sys.argv = original_argv