add repo utils

2026-04-28 02:23:52 +02:00 · 2024-03-19 17:50:10 +08:00 · 2024-03-19 17:50:10 +08:00 · 45eb352054
commit 45eb352054
parent 1e44d70202
3 changed files with 133 additions and 0 deletions
--- a/data/inference/make_datasets/repo_utils.py
+++ b/data/inference/make_datasets/repo_utils.py
@ -0,0 +1,83 @@
+import os
+import subprocess
+from typing import Dict
+from traceback import format_exc
+from pathlib import Path
+import git
+from git.exc import GitError
+
+from metagpt.logs import logger
+
+KEY_INSTANCE_ID = "instance_id"
+RESET_FAILED = ">>>>> Reset Failed"
+
+
+class ExecWrapper:
+    """Callable front-end for ``subprocess.run`` carrying default keyword arguments.
+
+    The defaults supplied at construction time are merged with the keyword
+    arguments of each call; per-call values win on conflict.
+    """
+
+    def __init__(self, subprocess_args: Dict = None):
+        """Remember the default ``subprocess.run`` keyword arguments.
+
+        Args:
+            subprocess_args: Defaults applied to every call. ``None`` (or any
+                falsy value) is treated as "no defaults".
+        """
+        self.subprocess_args = subprocess_args if subprocess_args else {}
+
+    def __call__(self, cmd, raise_error=True, **kwargs):
+        """Run ``cmd`` via ``subprocess.run`` and return its result object.
+
+        Args:
+            cmd: Command passed straight through to ``subprocess.run``.
+            raise_error: When true, a ``CalledProcessError`` is logged and
+                re-raised. When false, it is swallowed and ``None`` is returned.
+            **kwargs: Per-call overrides for the stored default arguments.
+
+        Returns:
+            The ``subprocess.run`` result, or ``None`` if the command raised
+            ``CalledProcessError`` and ``raise_error`` is false.
+
+        Raises:
+            subprocess.CalledProcessError: Only when ``raise_error`` is true.
+        """
+        run_kwargs = dict(self.subprocess_args)
+        run_kwargs.update(kwargs)
+        try:
+            return subprocess.run(cmd, **run_kwargs)
+        except subprocess.CalledProcessError as exc:
+            if not raise_error:
+                # Best-effort mode: the caller asked not to be interrupted.
+                return None
+            error_message = f"Error: {exc}\nError stdout: {exc.stdout}\nError stderr: {exc.stderr}\nError traceback: {format_exc()}"
+            logger.error(error_message)
+            raise exc
+
+
+class EnvManager:
+    """Clone a task repository and reset a checkout to a task's base commit.
+
+    Commands are run through an ``ExecWrapper`` configured with ``check=True``,
+    ``shell=False``, captured text output, and a copy of the process
+    environment taken at construction time.
+    """
+
+    def __init__(self, testbed):
+        """Store the testbed and build the command runner.
+
+        Args:
+            testbed: Kept as ``self.testbed``. The methods in this class do not
+                read it; commands run in the current working directory.
+        """
+        shellenv = os.environ.copy()
+        self.testbed = testbed
+
+        self.exec = ExecWrapper(
+            subprocess_args={
+                "check": True,
+                "shell": False,
+                "capture_output": True,
+                "text": True,
+                "env": shellenv,
+            }
+        )
+
+    def clone_repo(self, repo_name: str, path: str, token: str = None):
+        """Clone ``swe-bench/<repo_name with '/' replaced by '__'>`` from GitHub into ``path``.
+
+        Clone failures (``GitError``) are reported with ``print`` and swallowed.
+
+        Args:
+            repo_name: Repository name; every ``/`` is replaced by ``__``.
+            path: Destination directory, created if it does not exist.
+            token: Credential embedded in the clone URL. When ``None``, the
+                ``GITHUB_TOKEN`` environment variable is used, falling back to
+                the literal ``"git"`` if the variable is unset.
+
+        Raises:
+            ValueError: If ``token`` is ``None`` and ``GITHUB_TOKEN`` is set to
+                an empty string.
+        """
+        if token is None:
+            token = os.environ.get("GITHUB_TOKEN", "git")
+            if not token:
+                raise ValueError("GitHub token is required for cloning repositories.")
+
+        repo_url = f"https://{token}@github.com/swe-bench/{repo_name.replace('/', '__')}.git"
+
+        try:
+            # Ensure the destination directory exists
+            os.makedirs(path, exist_ok=True)
+
+            # Clone the repository
+            git.Repo.clone_from(repo_url, path)
+            print(f"Repository '{repo_name}' cloned successfully.")
+        except GitError as e:
+            # NOTE(review): the error text may include the clone URL, which
+            # embeds the token -- confirm before routing this to persistent logs.
+            print(f"Failed to clone repository '{repo_name}': {e}")
+
+    def reset_task_env(self, instance: Dict):
+        """
+        Reset task environment + testbed and checkout base commit of given task instance
+
+        All commands run in the current working directory.
+
+        Args:
+            instance: Task record; ``base_commit`` and ``instance_id`` are read.
+
+        Returns:
+            ``True`` when every step succeeded, ``False`` otherwise (the
+            failure is logged, never raised).
+        """
+        try:
+            gitignore_path = Path(".gitignore")
+            if gitignore_path.exists():
+                # Equivalent of `git ls-files ... -z | xargs -0 -r rm -rf`:
+                # list untracked ignored paths (NUL-separated) and pipe that
+                # listing -- not the .gitignore text -- into xargs.
+                listing = self.exec(
+                    ["git", "ls-files", "--ignored", "--exclude-standard", "-o", "-z"],
+                    raise_error=False,
+                )
+                # self.exec returns None when the listing failed (best-effort).
+                ignored_paths = listing.stdout if listing is not None else ""
+                if ignored_paths:
+                    # "--" stops rm from treating a path starting with "-" as an option.
+                    self.exec(["xargs", "-0", "-r", "rm", "-rf", "--"], input=ignored_paths)
+
+            self.exec(["git", "restore", "."])
+            self.exec(["git", "reset", "HEAD", "."])
+            self.exec(["git", "clean", "-fdx"])
+            self.exec(["git", "-c", "advice.detachedHead=false", "checkout", instance['base_commit']])
+            logger.info(f"[{instance['instance_id']}] Reset task environment to {instance['base_commit']}")
+            return True
+        except Exception as e:
+            # Use .get so a missing 'base_commit' (possibly the very error being
+            # handled) cannot raise again from inside the handler.
+            err_msg = f"{RESET_FAILED}; Failed to reset task environment to {instance.get('base_commit')}: {e}"
+            logger.error(err_msg)
+            return False
--- a/data/inference/make_datasets/utils.py
+++ b/data/inference/make_datasets/utils.py
@ -0,0 +1,28 @@
+import re
+
+
+def extract_diff(response):
+    """
+    Pull the most diff-like payload out of a model response.
+
+    Two wrappers are recognised: tag pairs such as ``<diff>...</diff>`` and
+    triple-backtick fences with an optional language label. Candidates are
+    collected from tag pairs first, then from fences. The result is, in order
+    of preference:
+
+    1. the first candidate labelled ``diff`` or ``patch``;
+    2. the first candidate with any other (or no) label;
+    3. the response text up to the first ``</s>``.
+
+    Returns ``None`` when ``response`` is ``None``.
+    """
+    if response is None:
+        return None
+
+    tag_pattern = re.compile(r"\<([\w-]+)\>(.*?)\<\/\1\>", re.DOTALL)
+    fence_pattern = re.compile(r"```(\w+)?\n(.*?)```", re.DOTALL)
+
+    preferred, fallback = [], []
+    # Tag pairs are scanned before fences, so they take precedence within each bucket.
+    for regex in (tag_pattern, fence_pattern):
+        for label, payload in regex.findall(response):
+            bucket = preferred if label in {"diff", "patch"} else fallback
+            bucket.append(payload)
+
+    for candidates in (preferred, fallback):
+        if candidates:
+            return candidates[0]
+    return response.split("</s>")[0]
--- a/data/inference/run.py
+++ b/data/inference/run.py
@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+# @Author  : stellahong (stellahong@fuzhi.ai)
+# @Desc    :
+import runpy
+import sys
+
+# Keep a copy of the caller's argv so it can be put back afterwards.
+original_argv = sys.argv[:]
+
+# Command line handed to run_api.py.
+# Append further optional arguments here, e.g.:
+# script_argv.extend(['--some_option', 'some_value'])
+script_argv = [
+    'run_api.py',
+    '--dataset_name_or_path', 'princeton-nlp/SWE-bench_oracle',
+    '--output_dir', './outputs',
+]
+
+try:
+    sys.argv = script_argv
+    # Execute the script as if it had been launched as the main program.
+    runpy.run_path(path_name='run_api.py', run_name='__main__')
+finally:
+    # Restore the original argv so later code is unaffected.
+    sys.argv = original_argv
+