update: SWE Agent

This commit is contained in:
seeker 2024-07-03 20:11:32 +08:00
parent 92f94862cb
commit fafe2ce68f
8 changed files with 103 additions and 91 deletions

View file

@ -12,7 +12,7 @@ from metagpt.tools.swe_agent_commands.swe_agent_utils import load_hf_dataset
# Specify by yourself
TEST_REPO_DIR = Path("/Users/seeker/Projects/sdfz/mg/mg-swe-agent") / "benchmark" / "swe_bench" / "data" / "test_repo"
DATA_DIR = METAGPT_ROOT / "benchmark" / "swe_bench" / "data"
DATA_DIR = METAGPT_ROOT / "data/hugging_face"
INSTANCE_TEMPLATE = """
## User Requirement
@ -27,8 +27,9 @@ hints text is the comment under issue:
{hints_text}
The repository may already exist at the path `{repo_path}`. If it doesn't, please download the repository to this path.
All your subsequent actions should use the project path as your root directory, and you should never leave that directory to execute any actions.
Your first action must be to navigate to the repository path `{repo_path}`.
This issue occurred in version {version}, with the corresponding base commit being {base_commit}. You need to switch to the code version associated with this commit.
All subsequent actions must be performed within this repository path. Do not leave this directory to execute any actions at any time.
# INSTRUCTIONS:
Now, you're going to solve this issue on your own from the perspective of a programmer. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need.
@ -36,38 +37,6 @@ Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for f
"""
def split_dataset_equally(dataset):
    """Split *dataset* into three parts by repository name.

    Part 1: every instance whose repo is NOT django, sympy, or pytest.
    Part 2: sympy and pytest instances.
    Part 3: django instances only.

    Args:
        dataset: A Hugging Face ``Dataset`` (any object exposing
            ``filter(fn, desc=..., load_from_cache_file=...)``).

    Returns:
        list: ``[part1, part2, part3]`` in the order described above.
    """
    # FIXME: the django split is hard-coded; make the grouping configurable.
    # Name the repo groups once instead of repeating the literals per filter.
    heavy_repos = ["sympy/sympy", "pytest-dev/pytest"]
    django_repos = ["django/django"]

    part1 = dataset.filter(
        lambda x: x["repo"] not in django_repos + heavy_repos,
        desc="Filtering out existing ids",
        load_from_cache_file=True,
    )
    part2 = dataset.filter(
        lambda x: x["repo"] in heavy_repos,
        desc="Filtering out existing ids",
        load_from_cache_file=True,
    )
    part3 = dataset.filter(
        # NOTE(review): cache deliberately disabled only for the django
        # partition in the original — preserved as-is, reason unknown.
        lambda x: x["repo"] in django_repos,
        desc="Filtering out existing ids",
        load_from_cache_file=False,
    )
    print(len(part1), len(part2), len(part3))
    return [part1, part2, part3]
def check_instance_status(instance, swe_result_dir):
output_file = swe_result_dir / "all_preds.jsonl"
res = True
@ -87,12 +56,20 @@ async def run(instance, swe_result_dir):
logger.info(f"Instance {instance['instance_id']} already exists, skipping execution.")
return
repo_path = TEST_REPO_DIR / (instance["repo"].replace("-", "_").replace("/", "__") + "_" + instance["version"])
"""
All your subsequent actions should use the project path as your root directory, and you should never leave that directory to execute any actions.
"""
repo_path = Path("/Users/seeker/Projects/other/test_repo") / (
instance["repo"].replace("-", "_").replace("/", "__") + "_" + instance["version"]
)
# repo_path = Path("/Users/seeker/Projects/other/test_repo") / instance["repo"].split("/")[-1]
# Pre-processing: reset the repository working tree before running the agent
terminal = Terminal()
terminal.run_command(f"cd {repo_path} && git checkout . && git clean -n -d && git clean -f -d")
terminal.run_command(f"cd {repo_path} && git reset --hard && git clean -n -d && git clean -f -d")
terminal.run_command("BRANCH=$(git remote show origin | awk '/HEAD branch/ {print $NF}')")
logger.info(terminal.run_command("echo $BRANCH"))
# logger.info(terminal.run_command(f'Branch name: $BRANCH'))
logger.info(terminal.run_command('git checkout "$BRANCH"'))
logger.info(terminal.run_command("git branch"))
user_requirement_and_issue = INSTANCE_TEMPLATE.format(
issue=instance["problem_statement"],
hints_text=instance["hints_text"],
@ -126,16 +103,14 @@ async def async_main():
dataset_path = "manna-ai/SWE-bench_Nano" # "princeton-nlp/SWE-bench_Lite" #"manna-ai/SWE-bench_Nano"
dataset = load_hf_dataset(dataset_name_or_path=dataset_path, cache_dir=DATA_DIR, split="test")
sample_datasets = split_dataset_equally(dataset)
date_time = datetime.now().strftime("%m-%d")
round_ = "third"
for idx, sub_dataset in enumerate(sample_datasets):
exp_name = f"nano_mgx_{date_time}_{round_}_part_{idx}"
swe_result_dir = DEFAULT_WORKSPACE_ROOT / f"result_{config.llm.model}" / exp_name
swe_result_dir.mkdir(parents=True, exist_ok=True)
for instance in sub_dataset:
await run(instance, swe_result_dir)
# _round = "first"
_round = "second"
exp_name = f"nano_mgx_{date_time}_{_round}"
swe_result_dir = DEFAULT_WORKSPACE_ROOT / f"result_{config.llm.model.replace('/', '_')}" / exp_name
swe_result_dir.mkdir(parents=True, exist_ok=True)
for instance in dataset:
await run(instance, swe_result_dir)
if __name__ == "__main__":