update: SWE Agent

This commit is contained in:
seeker 2024-07-03 20:11:32 +08:00
parent 92f94862cb
commit fafe2ce68f
8 changed files with 103 additions and 91 deletions

View file

@ -12,7 +12,7 @@ from metagpt.tools.swe_agent_commands.swe_agent_utils import load_hf_dataset
# Specify by yourself
TEST_REPO_DIR = Path("/Users/seeker/Projects/sdfz/mg/mg-swe-agent") / "benchmark" / "swe_bench" / "data" / "test_repo"
DATA_DIR = METAGPT_ROOT / "benchmark" / "swe_bench" / "data"
DATA_DIR = METAGPT_ROOT / "data/hugging_face"
INSTANCE_TEMPLATE = """
## User Requirement
@ -27,8 +27,9 @@ hints text is the comment under issue:
{hints_text}
The repository may already exist at the path `{repo_path}`. If it doesn't, please download the repository to this path.
All your subsequent actions should use the project path as your root directory, and you should never leave that directory to execute any actions.
Your first action must be to navigate to the repository path `{repo_path}`.
This issue occurred in version {version}, with the corresponding base commit being {base_commit}. You need to switch to the code version associated with this commit.
All subsequent actions must be performed within this repository path. Do not leave this directory to execute any actions at any time.
# INSTRUCTIONS:
Now, you're going to solve this issue on your own from the perspective of a programmer. Your terminal session has started and you're in the repository's root directory. You can use any bash commands or the special interface to help you. Edit all the files you need.
@ -36,38 +37,6 @@ Remember, YOU CAN ONLY ENTER ONE COMMAND AT A TIME. You should always wait for f
"""
def split_dataset_equally(dataset):
    """Split *dataset* into three parts by repository name.

    Part 1: every instance whose repo is NOT django, sympy, or pytest.
    Part 2: sympy and pytest instances.
    Part 3: django instances only.

    Args:
        dataset: A Hugging Face ``Dataset`` (any object exposing
            ``filter(fn, desc=..., load_from_cache_file=...)``).

    Returns:
        list: ``[part1, part2, part3]`` in the order described above.
    """
    # FIXME: the django split is hard-coded; make the grouping configurable.
    # Name the repo groups once instead of repeating the literals per filter.
    heavy_repos = ["sympy/sympy", "pytest-dev/pytest"]
    django_repos = ["django/django"]

    part1 = dataset.filter(
        lambda x: x["repo"] not in django_repos + heavy_repos,
        desc="Filtering out existing ids",
        load_from_cache_file=True,
    )
    part2 = dataset.filter(
        lambda x: x["repo"] in heavy_repos,
        desc="Filtering out existing ids",
        load_from_cache_file=True,
    )
    part3 = dataset.filter(
        # NOTE(review): cache deliberately disabled only for the django
        # partition in the original — preserved as-is, reason unknown.
        lambda x: x["repo"] in django_repos,
        desc="Filtering out existing ids",
        load_from_cache_file=False,
    )
    print(len(part1), len(part2), len(part3))
    return [part1, part2, part3]
def check_instance_status(instance, swe_result_dir):
output_file = swe_result_dir / "all_preds.jsonl"
res = True
@ -87,12 +56,20 @@ async def run(instance, swe_result_dir):
logger.info(f"Instance {instance['instance_id']} already exists, skipping execution.")
return
repo_path = TEST_REPO_DIR / (instance["repo"].replace("-", "_").replace("/", "__") + "_" + instance["version"])
"""
All your subsequent actions should use the project path as your root directory, and you should never leave that directory to execute any actions.
"""
repo_path = Path("/Users/seeker/Projects/other/test_repo") / (
instance["repo"].replace("-", "_").replace("/", "__") + "_" + instance["version"]
)
# repo_path = Path("/Users/seeker/Projects/other/test_repo") / instance["repo"].split("/")[-1]
# Pre-processing: reset the repository working tree before running the agent
terminal = Terminal()
terminal.run_command(f"cd {repo_path} && git checkout . && git clean -n -d && git clean -f -d")
terminal.run_command(f"cd {repo_path} && git reset --hard && git clean -n -d && git clean -f -d")
terminal.run_command("BRANCH=$(git remote show origin | awk '/HEAD branch/ {print $NF}')")
logger.info(terminal.run_command("echo $BRANCH"))
# logger.info(terminal.run_command(f'Branch name: $BRANCH'))
logger.info(terminal.run_command('git checkout "$BRANCH"'))
logger.info(terminal.run_command("git branch"))
user_requirement_and_issue = INSTANCE_TEMPLATE.format(
issue=instance["problem_statement"],
hints_text=instance["hints_text"],
@ -126,16 +103,14 @@ async def async_main():
dataset_path = "manna-ai/SWE-bench_Nano" # "princeton-nlp/SWE-bench_Lite" #"manna-ai/SWE-bench_Nano"
dataset = load_hf_dataset(dataset_name_or_path=dataset_path, cache_dir=DATA_DIR, split="test")
sample_datasets = split_dataset_equally(dataset)
date_time = datetime.now().strftime("%m-%d")
round_ = "third"
for idx, sub_dataset in enumerate(sample_datasets):
exp_name = f"nano_mgx_{date_time}_{round_}_part_{idx}"
swe_result_dir = DEFAULT_WORKSPACE_ROOT / f"result_{config.llm.model}" / exp_name
swe_result_dir.mkdir(parents=True, exist_ok=True)
for instance in sub_dataset:
await run(instance, swe_result_dir)
# _round = "first"
_round = "second"
exp_name = f"nano_mgx_{date_time}_{_round}"
swe_result_dir = DEFAULT_WORKSPACE_ROOT / f"result_{config.llm.model.replace('/', '_')}" / exp_name
swe_result_dir.mkdir(parents=True, exist_ok=True)
for instance in dataset:
await run(instance, swe_result_dir)
if __name__ == "__main__":